From 1bf7a69095b4e8fd7a379bfba4f1558a116e1f67 Mon Sep 17 00:00:00 2001 From: "Fang.Che" Date: Fri, 15 May 2026 02:54:22 +0000 Subject: [PATCH 1/6] fix(pa): support block_id > 65535 in ASM paged-attention kernels (#3062) Use min-based anchor rebase in K_V_window_rebase to allow pages within the same wave to span different 65536 windows, as long as max_page_id - min_page_id < 65536. This removes the previous constraint that all pages in a load group must share the same high-16-bit window. Changes: - Replace v_mul_u32_u24 with v_mul_lo_u32 to remove 24-bit truncation - Update K/V buffer descriptor num_records for full offset range - Update all 36 PA kernel .co binaries (gfx942 + gfx950) - Add test_pa_block_id_truncation.py regression test --- csrc/py_itfs_cu/asm_pa.cu | 9 +- hsa/gfx942/pa/pa_bf16_noquant_gqa16_1tg_4w.co | Bin 21816 -> 24144 bytes hsa/gfx942/pa/pa_bf16_noquant_gqa8_1tg_4w.co | Bin 22248 -> 24584 bytes .../pa_bf16_noquant_gqa8_1tg_4w_mtp_msk0.co | Bin 67464 -> 73616 bytes .../pa_bf16_noquant_gqa8_1tg_4w_mtp_msk1.co | Bin 76640 -> 82800 bytes ..._bf16_pertokenFp8_gqa16_1tg_4w_mtp_msk0.co | Bin 160856 -> 165808 bytes ..._bf16_pertokenFp8_gqa16_1tg_4w_mtp_msk1.co | Bin 190712 -> 195656 bytes .../pa/pa_bf16_pertokenFp8_gqa16_2tg_4w.co | Bin 23152 -> 25008 bytes ...a_bf16_pertokenFp8_gqa8_1tg_4w_mtp_msk0.co | Bin 70440 -> 75168 bytes ...a_bf16_pertokenFp8_gqa8_1tg_4w_mtp_msk1.co | Bin 79584 -> 84312 bytes .../pa/pa_bf16_pertokenFp8_gqa8_2tg_4w.co | Bin 19960 -> 21816 bytes .../pa/pa_bf16_pertokenFp8_gqa8_2tg_4w_hp.co | Bin 22152 -> 24008 bytes .../pa/pa_bf16_pertokenFp8_gqa8_2tg_4w_uhp.co | Bin 25912 -> 27768 bytes ...bf16_pertokenInt8_gqa16_1tg_4w_mtp_msk0.co | Bin 173176 -> 178128 bytes ...bf16_pertokenInt8_gqa16_1tg_4w_mtp_msk1.co | Bin 203040 -> 207984 bytes .../pa/pa_bf16_pertokenInt8_gqa16_2tg_4w.co | Bin 24384 -> 26240 bytes ..._bf16_pertokenInt8_gqa8_1tg_4w_mtp_msk0.co | Bin 74840 -> 79568 bytes ..._bf16_pertokenInt8_gqa8_1tg_4w_mtp_msk1.co | Bin 
83984 -> 88712 bytes .../pa/pa_bf16_pertokenInt8_gqa8_2tg_4w.co | Bin 20608 -> 22464 bytes hsa/gfx942/pa/pa_fp16_noquant_gqa16_1tg_4w.co | Bin 19528 -> 21856 bytes hsa/gfx942/pa/pa_fp16_noquant_gqa8_1tg_4w.co | Bin 20928 -> 23264 bytes .../pa_fp16_noquant_gqa8_1tg_4w_mtp_msk0.co | Bin 58928 -> 65080 bytes .../pa_fp16_noquant_gqa8_1tg_4w_mtp_msk1.co | Bin 68104 -> 74264 bytes ..._fp16_pertokenFp8_gqa16_1tg_4w_mtp_msk0.co | Bin 159416 -> 164368 bytes ..._fp16_pertokenFp8_gqa16_1tg_4w_mtp_msk1.co | Bin 189272 -> 194216 bytes .../pa/pa_fp16_pertokenFp8_gqa16_2tg_4w.co | Bin 23008 -> 24864 bytes ...a_fp16_pertokenFp8_gqa8_1tg_4w_mtp_msk0.co | Bin 69600 -> 74328 bytes ...a_fp16_pertokenFp8_gqa8_1tg_4w_mtp_msk1.co | Bin 78744 -> 83472 bytes .../pa/pa_fp16_pertokenFp8_gqa8_2tg_4w.co | Bin 19904 -> 21760 bytes .../pa/pa_fp16_pertokenFp8_gqa8_2tg_4w_hp.co | Bin 22096 -> 23952 bytes .../pa/pa_fp16_pertokenFp8_gqa8_2tg_4w_uhp.co | Bin 25856 -> 27712 bytes ...fp16_pertokenInt8_gqa16_1tg_4w_mtp_msk0.co | Bin 171736 -> 176688 bytes ...fp16_pertokenInt8_gqa16_1tg_4w_mtp_msk1.co | Bin 201600 -> 206544 bytes .../pa/pa_fp16_pertokenInt8_gqa16_2tg_4w.co | Bin 24240 -> 26096 bytes ..._fp16_pertokenInt8_gqa8_1tg_4w_mtp_msk0.co | Bin 74000 -> 78728 bytes ..._fp16_pertokenInt8_gqa8_1tg_4w_mtp_msk1.co | Bin 83144 -> 87872 bytes .../pa/pa_fp16_pertokenInt8_gqa8_2tg_4w.co | Bin 20552 -> 22408 bytes hsa/gfx950/pa/pa_bf16_noquant_gqa16_1tg_4w.co | Bin 21968 -> 24144 bytes hsa/gfx950/pa/pa_bf16_noquant_gqa8_1tg_4w.co | Bin 22400 -> 24584 bytes .../pa_bf16_noquant_gqa8_1tg_4w_mtp_msk0.co | Bin 67616 -> 73616 bytes .../pa_bf16_noquant_gqa8_1tg_4w_mtp_msk1.co | Bin 76792 -> 82800 bytes ..._bf16_pertokenFp8_gqa16_1tg_4w_mtp_msk0.co | Bin 161008 -> 165808 bytes ..._bf16_pertokenFp8_gqa16_1tg_4w_mtp_msk1.co | Bin 190864 -> 195656 bytes .../pa/pa_bf16_pertokenFp8_gqa16_2tg_4w.co | Bin 23304 -> 25008 bytes ...a_bf16_pertokenFp8_gqa8_1tg_4w_mtp_msk0.co | Bin 70592 -> 75168 bytes 
...a_bf16_pertokenFp8_gqa8_1tg_4w_mtp_msk1.co | Bin 79736 -> 84312 bytes .../pa/pa_bf16_pertokenFp8_gqa8_2tg_4w.co | Bin 20112 -> 21816 bytes .../pa/pa_bf16_pertokenFp8_gqa8_2tg_4w_hp.co | Bin 22304 -> 24008 bytes .../pa/pa_bf16_pertokenFp8_gqa8_2tg_4w_uhp.co | Bin 26064 -> 27768 bytes ...bf16_pertokenInt8_gqa16_1tg_4w_mtp_msk0.co | Bin 173328 -> 178128 bytes ...bf16_pertokenInt8_gqa16_1tg_4w_mtp_msk1.co | Bin 203192 -> 207984 bytes .../pa/pa_bf16_pertokenInt8_gqa16_2tg_4w.co | Bin 24536 -> 26240 bytes ..._bf16_pertokenInt8_gqa8_1tg_4w_mtp_msk0.co | Bin 74992 -> 79568 bytes ..._bf16_pertokenInt8_gqa8_1tg_4w_mtp_msk1.co | Bin 84136 -> 88712 bytes .../pa/pa_bf16_pertokenInt8_gqa8_2tg_4w.co | Bin 20760 -> 22464 bytes hsa/gfx950/pa/pa_fp16_noquant_gqa16_1tg_4w.co | Bin 19680 -> 21856 bytes hsa/gfx950/pa/pa_fp16_noquant_gqa8_1tg_4w.co | Bin 21080 -> 23264 bytes .../pa_fp16_noquant_gqa8_1tg_4w_mtp_msk0.co | Bin 59080 -> 65080 bytes .../pa_fp16_noquant_gqa8_1tg_4w_mtp_msk1.co | Bin 68256 -> 74264 bytes ..._fp16_pertokenFp8_gqa16_1tg_4w_mtp_msk0.co | Bin 159568 -> 164368 bytes ..._fp16_pertokenFp8_gqa16_1tg_4w_mtp_msk1.co | Bin 189424 -> 194216 bytes .../pa/pa_fp16_pertokenFp8_gqa16_2tg_4w.co | Bin 23160 -> 24864 bytes ...a_fp16_pertokenFp8_gqa8_1tg_4w_mtp_msk0.co | Bin 69752 -> 74328 bytes ...a_fp16_pertokenFp8_gqa8_1tg_4w_mtp_msk1.co | Bin 78896 -> 83472 bytes .../pa/pa_fp16_pertokenFp8_gqa8_2tg_4w.co | Bin 20056 -> 21760 bytes .../pa/pa_fp16_pertokenFp8_gqa8_2tg_4w_hp.co | Bin 22248 -> 23952 bytes .../pa/pa_fp16_pertokenFp8_gqa8_2tg_4w_uhp.co | Bin 26008 -> 27712 bytes ...fp16_pertokenInt8_gqa16_1tg_4w_mtp_msk0.co | Bin 171888 -> 176688 bytes ...fp16_pertokenInt8_gqa16_1tg_4w_mtp_msk1.co | Bin 201752 -> 206544 bytes .../pa/pa_fp16_pertokenInt8_gqa16_2tg_4w.co | Bin 24392 -> 26096 bytes ..._fp16_pertokenInt8_gqa8_1tg_4w_mtp_msk0.co | Bin 74152 -> 78728 bytes ..._fp16_pertokenInt8_gqa8_1tg_4w_mtp_msk1.co | Bin 83296 -> 87872 bytes 
.../pa/pa_fp16_pertokenInt8_gqa8_2tg_4w.co | Bin 20704 -> 22408 bytes op_tests/test_pa_block_id_truncation.py | 260 ++++++++++++++++++ 74 files changed, 266 insertions(+), 3 deletions(-) mode change 100755 => 100644 hsa/gfx950/pa/pa_bf16_noquant_gqa8_1tg_4w.co create mode 100644 op_tests/test_pa_block_id_truncation.py diff --git a/csrc/py_itfs_cu/asm_pa.cu b/csrc/py_itfs_cu/asm_pa.cu index eebe86dcab..782e2c7f2f 100644 --- a/csrc/py_itfs_cu/asm_pa.cu +++ b/csrc/py_itfs_cu/asm_pa.cu @@ -37,10 +37,12 @@ struct __attribute__((packed)) KernelArgs p3 _p16; unsigned int KVs; p3 _p17; - unsigned int GQA; + unsigned int mtp; p3 _p18; + unsigned int GQA; + p3 _p19; void* ptr_QTP; - p2 _p19; + p2 _p20; }; @@ -198,7 +200,7 @@ void pa_fwd(aiter_tensor_t* Q, // [num_seqs, num_heads, head_size float k_scalar = sqrt(dim); k_scalar = (float)((double)k_log2e / (double)k_scalar); - KernelArgs args; + KernelArgs args = {}; size_t arg_size = sizeof(args); args.ptr_O = out_->data_ptr(); args.ptr_Q = Q->data_ptr(); @@ -222,6 +224,7 @@ void pa_fwd(aiter_tensor_t* Q, // [num_seqs, num_heads, head_size args.Qs = stride_Q; args.Bs = stride_KV_blk; args.KVs = stride_KV_head; + args.mtp = max_qlen - 1; args.GQA = gqa_ratio; args.ptr_QTP = (qo_indptr != nullptr) ? 
qo_indptr->data_ptr() : nullptr; diff --git a/hsa/gfx942/pa/pa_bf16_noquant_gqa16_1tg_4w.co b/hsa/gfx942/pa/pa_bf16_noquant_gqa16_1tg_4w.co index 8dcd2225081fb9aa69ed8361ae9d3394180372f0..b2e2fa614ed740fd824a009faf9743cbdfc4a2b4 100755 GIT binary patch delta 4012 zcmeHKO>7%Q6rSDu*-0JmI6o9Cq;5d=I!YYJt-YqWG$lz9Qj|D$NSr8Ws!HJFS z(1?TAShPe40qI;&;ZVe(1y!PwF9@Odk|TvAHzW>#IB=<&1BlR|n%UjiD6yK-$`Rra zOFQ3tZ{EzjnQz{D-&bEy>)%j%X>jNQO;LT-EBX`k`qWBiLNWp6mm>%mRi`l2$YPVK zgQxf7@_?#qxOZGF?pmH8{TKIRQdPl{P8B+|fi%`1JP@6stR4S8AIei=d&sCxt6?=+ zt`6f1MpR3)l&jD~a}8DVnIwuC{z5+)bcPp4&tG>WL@_O7zTHAU>-wnQm3CWc|Hf?> zx?kf#A2lAMZ&N5qyQ^+fI`lf-gzh#*DW(uIG8F2Awe6-VypncQWbICj5bv$egx+Ot zq0fVj1wU+~s4X;a`k-MT?|a|mGv!)+t%V)a9a>_*cdHP5zuFAeS6jfwYRNKW1)x+i zf?UyBvy}CLuUCEaP`5AAsGW%W|}Hlfm_{boCs?XnyQ zn3aX+Xkfzu*f6pfS&Vj!c8pd-SJY;3L|KC~YB#u|N&urIfB*^LWfH)+9Dw9L+yIPB z%w{yjSYva{ZfuPy0Zftrrbqy8m^M@glQ!zB?C z5+NZG5)z?GV)+qD6_R)wljy-DdNGM-6cQmJ5fTz1ArUGhmUAdoc#Dh{OkxKn@fl2_ zRUr`)5+NZG?j^CD!|6TO_hBT$14y(QSsXwo4xkGM;E28SVSVuz{iR}EzPh+oSUFy* zlm6p0J#2;_H%$+n;m1tVa)!@6BE!r93?t@b8+EbJeNv|KC|N6cK_ zcfb4#K@}TR0LKX`g$R~^CH?ipK2GqD`=x{|UL*b5SN|GZd8-vIjWCkS<#0Nj7Pcpz zSeO%1d@`JgE`;X=KAqq*(Qt-OCFj$8B)q_<7n8|EDuX>DuZ8Dg3v;+%;Ae%I#W_B< zASQS*k>bPgIG;%K$y7p&#f5ZFIu+@O$KROm#+FxwNJddCqT4&;^vp(t-V@@pkzP0F z-37VPNz%g^+sIbNJQiWJNlF|OLEBtKpnUdRD|8;m?G7I-_Grt6yXVCXS?>> zJBaxuT!%Qx;c~`wd3b19Wo+wz)`PVoe)HUfTv!& z3D+RHg4H#0G1eg340xb+k2D-dSA*?{3!3z#3dW^cAjmu*h}eo1A~Ke3x-O10dSEfn|% DP4g9= delta 1856 zcmd6oPe_wt9LL}1b-F*g?Rlo?jG;38Gl$J7bAKd75eHGYcUJ!0ih^VlMeUM@U?LSA zg0x=;gCHvCAVJA{@nB5o>cLZV@ZhD31bGQW$@T1ge~&Z?I_hBi{{BAu?Dw+Yv%Nvq zSBW{(e>Ex-(o?i-u9es0%ifs`leE_Yszi04a4F844&9gBg=*;1eW8E%mGG1)rsqvF zg*vOd(gCVlG|D{i??&h(jqd-S2a9B)MzQJV3N_%@7`a}uL5b`kKDAX2mBOU04DMO? 
z;kY;muk2;=);`S10eS0yOF6tYRD+Gih@{@6l)x-ASBnAQiC(UZ6ZbH9&OSjnQK>2p z7a_YaUH(ipRs4`MhZ9#RRY>k|Vt~@?c~vMK$?G7c36g8HwxpV@zLdx6Pc>WHQ(Ao- z>j$y^2G$SpdOJ-w*&S(*y&~OguS{$8qgX$N^+~KBCM2hBI-e2^gCPb(42Bp?!w7JU zAjb%Ej7|-MAqGPXh8Rr4sN)!Jj#1As8Z``t7z{BOVz6OS!?``Y!OI(3c!Mu}dC8J} zZ+et9E*i3fYJ)3h)ELinRIuCNA|Ca31Hzo)z(@|(TkEWAM~hiWTF~Ax^?U2NnbKGF zbL5;%$yNOE=wWtEecAW3JemJv<~Ko_+C7<*Y5K;?zOk0kNbNA5ozl7 zox4-F{2kGdFxz@NgxL`d2(vd36lQ-UD$Kr65T?hz$chcx$9)#@WE2XM<3WoU^a_Pl z?rz}RHz@3JS3IU23f@G>jCT}Yxs!e?Q3KlvY7xKH6ob7)kj>*AG>>=E l6JA&gO44Z-kt&!=hUBFftR}nV6$SQ^-DXo7%Q6vt;b&WD{k-gT3ZS|+tjWv{E$**NtF!EH@GTv4pVN$il6wp+*X#s()g zwu1@>uWO~KDpg4|mx@z4l>>-M76NgIIply4g@zNyB7w?{9zcQyVrF+|OB^*N#Ss!x zEP4NL-pqL3&Trm&-%lSBmp>=e!cf3Z5`<5+tUgFyKC^r{CFp_lDPRSZ+$NCJi{^g0 z4W6l1qhD?-x_Mp$>M~8q{iZjnH7z&6U2zcln3ahnYCL)(>?17ZfBrpKB>2vdPWn}# zs7h{vMx;I+Y#lGS&v(gIgth9l(m5M1~ z0iaURfkN3*efg){d#S?6F}fJ^wiZT@ zwG|(rc~p?7JV0})kf^~tH7V*?o;pKREWjzOSrz~&1Of&taIRTJdwAHuAe#~GV9;Pg zwoSCh#thQJvm~&h0IWzEqzqCUQX5i>wkvMc+T)CtjoY-2xD>!B4qyxi5XJ#ahye)B z-3>rTC9FDqg3+}mY`XS@6u=Y?;4}{4taRt{x+9Od^X=|d+P;^F%I2L>cA#{={VE|; z%)1Mb#t>-?k;V{d8RFJ06l#Qc6hU+$h#my7Pl8Beh%|;sV~Df_aVtZi##5v-A&6!K z@eqP&ks#6-B8?%^n;~vxcy`nAeH=u38;BMig912=0_Z{k*b^@-Y09@Wm&%PrRr#8# zbfu%OQTUJZbhi;dX`b#n;wQ}0t%%R<5HYna;tqt^fDjuI;?9JJB;$baf8sx>D*MSr zc;dOX(go8Y#Cb23YxzT{HI+~7kn?t_42_JbWnsNoMvo-v`?jidirLHGnjg_M=I>4) z(f=lYKmD_0W<|t%srx@qU1LLbeX+c-UfOWK+i^~O!Nzq2n55;`?Pgg(2Ue+wLG};0 zUxWQ2tiNtlQlfbS_bU~{D!qKM9X5?pg2Q2F?W~FEPNwE(V;L?T$;Ia*b1^QP;&Smw zj?1LyvRpJW&t(_V=~O0%MnqqY%qHe%QGbk^iA^uea*269#qp^O7fB|$RF+F;QhXvA z%XVio(e7mO^|>R+^GYn5lNu)8>6vC-GtpSjLH^ljkJHV%qfXw#diWWZb;No+QAZ>i z<=rtGe2*{)Vz~xTmGj+%WIVj=ldMKCdHkVEGM>VIQ&+X$jQxQvsJG_9|w-s|TAs>`=er9q3iO?2x|~vf~W;UME93_~p1=qnxlw8j>NG zMmaDyXxNXMtN8q^(y$nE(aLdCGz3qDJK?>MUX2TFR1K10H?7o)xXd>C(Kx$C*#*-S zEQVeBC0s-=;i5X-0ACD04ex~ys1?^oP(9%$eWzjn#Bufcm&Hv1AT2F=bK-#d4)%O3 HdI<0r(61VG delta 1894 zcmd6oUr19?9LLYO)7@>k?VLIC4{OOXol)&dgfsRzk9wkw&jI8wEWG zrN5U;1d-1L!X@ZISZ}@f(z_2n^bkSdTOy*Jd+#~JNzkKSxci>p_k7RcbI)bJ`|2zH 
z`3c*Lms5ih#!1Vf{iKw?vDo>jNRe^Pb4A3gF_T(-=rL;)s8+SltcC7hXJu2W!slnJ zm6s^*HvNEDD6#tA3n78q0{`C+N_fhzxXf*Z)|e%dnI>Fdm4Y~=os(k6VA@p&kDZ&a zTN{OU?mB606Xv9DX>E&1J-n}J0vEZCk(MO{!z{Hoi38v}b}*%kI7wlNt}HItV1w4e zv#ecHJU)y!;9>o1&F1|f6}P8SgeZ7#duoV~5nUBRZtFTs$T;3>bDqh!J45-PGo0^m zcIAzHn&*dk{wB|lvb>uV+TET)(CsaBxQ`c%e1_-m@cabN-@KIEtuj7gRWmzuO zls7G!-=hC>5hI@S}FOq(V=$_{Za^YX8h7RfvXumRe3`bonRHxo(6c6A=XGd6b^;q18&y; zx<%F_MuyBC#BZH3kWnfw98(*(zYwhU%@!e}7P1<7ms zU4%?N84Zk9`ycgEvLRKpS3aF#>anoqcc6JaXTI6I%wQ5B7#n-4T#Q7~%)QC`Gr4e2 z?|Ctd^jC&GynLUe;k9AZI;x{=H)DH;|1{@qGMirS|b`vqKz-s=#tK>aoq4 wPi#w@y5)ig*%Kh=n(V@AFxFnkoswG8Fq!MKCx^}KYz5#Oi?49^vv4u`3yQGBj{pDw diff --git a/hsa/gfx942/pa/pa_bf16_noquant_gqa8_1tg_4w_mtp_msk0.co b/hsa/gfx942/pa/pa_bf16_noquant_gqa8_1tg_4w_mtp_msk0.co index 7019e9c5332c50240099cd270620e5b9755feab4..00cad5b7ea73e8c07c3c67bf6148542964e3bd32 100755 GIT binary patch delta 10515 zcmeHNdu&rx7(eH(k3HDBdp8D*0d*j28`y5k`hXN1TQ{aLOPS*-SVw8sVz-v9LqbSf z=O!j14lc2X#_$S?L0*>_LdZ(|NB;m7LLz^Nndl!zqhQ4FQtG+)p4;_qYmJ0QM%twJ z`_At>=brE0-~G<{&dGW69a(=&mJ_ajzJY+SMz%d?8c|rbUFQ!|fd8Gv5)=w$h=iO| zi-ad)uR$_ zD@qyePKji*ict_6Xke=db6!?^Fu-{8Z!?4W+4+kWFZ`&|?{;^&g6FO?KV;Xy8~pVa zfBhR>XEFEWCT3T`IO05H0)#Q^Jj`a+6GhCm0uL-?m#frp1_0cnQdy`k@nzxO4M>vc z=^_-fyKpdiTMi4De)7BA>zutGjkywR% zW5^0l4_Jw&kd3g1wBU`46^jW_0s43;*5Ra9zA02`EsRvnCwpjXaYR3Ve9s-!js$(rKMDdY&vAC$glKh@@bDY-{8^ZS9^E|7N>F`Y!8vmX&hLR#(||N z98g8P8dZ;1tLpXYRBw2B2UetT;05$b$(3mwSe?d!WurU5o+;T1!|@ppmP9Hwi6Lqv zhv5W?R2mYglq8xEi55iSJVc_ECy`1?qCG^7;4nP&6;WwOq*9XDk4W5sNZg4?e3vJY zN<|{#kQ&b6s-(}DN<$)*g2W2dF66*&sbXeLw8%Cr7)GpX>%RcrE)e7~tMe3gCsC63C>;uUX!jMdpyT~L&i z3I8T!!feelkDR^U-iWp*GQlctay{%h^`Wg>w{W@eO32dd4=6GrJLVU!WpWZF#6@iP z#+yaNim7bJ*5_oIeE136AGU5km)ReF%=^QulU7ytv)MG-{_y98LO%R*HhYlyz92Kl zCWhp(yEj}VLwmzbkPRH@&(F-U2|~wiTDo{a{Fr~%;kH9VS}NPuB6qPDxbfj&DH4$v<5vLt?e#Hm){Zev<4lW zf%Yzkv$ey~)g1`%Z$}%lyBup>Yr5MU-VV3l;r4erT75o;zsnKm^t-)2 zS65Y6r?blE+t5B6A?sYuAYV|sjg~dEX|2;E`51c!BS2IH3x@k-+I1;(rQCLmt9i1E!wrT7($ 
z&$=zee?~>rzY-k&R%-ZDZ19;BzlZTX!|LboFH8y4!#XI&_|D1FH#!aDo?OV2kmyB+ z4PUCI4R&)yA{$)B$H&A5TyeA$R&Im~C#<4ZA?yq5^>gciuXT;f=P($IbqTe0E>>w9 zEmojS^)?YT7#l>@RKHMEYt0s{>h-iiMC+s2cROX0cg=C4tT{&={R#v?QL$W=N>jbqTe~W)L3~RZgh&29tP;xwcwS zZ7|ParjUB{H%9Md!4ATsimsEv%3N`r^pMegGHA*b$Gu}@^eR`rgvY^4WVBokO{6$( ziqakZMGmJ%x5y!tgO3+RsOUa9G*jXMIefYhog$3>Du=W4M845+zvy%YG*XQw;lpSw z5#*^HuHuyF76qJ?BZ5bxhZNA9b*ltMe^kKvIkzSWl>>qUi$F}AEiFM^`O^@$FeC_$!K{W6AWa5@%taI1 zU|D!Ewd|aS3fXjIlEp;u-YjO>EsIfK;2U3FFByq;S7Ze&$P1S$bCrqM2E@Ic} z*R2_vqp&?_7jj7_a!XH>_4&exy+9bZrG(8KzcA-0AXiet7+FWItYcFsoHLgScKQ$^ zq@#2t65@=tRPO+&(TgpE6~`Ij6J~WLX%XY)$Sx$=n|Z5BNKx*!6d{}_{6eTJn%#52 zF(to6T)EoVO~_7eN=|@3$xR7A&EJx~dL)iw3HrP=<|D5iw5AcF-NEXZJ08GbAyfMqmb8I7t87G$s>g9RC^DkF(yoWU~Q z!!q7iWw0QF1sSX+gJydWN(-xzqPz*~luS*U<&;U(W*90-!65N(o0z1 zXmuHRA4%_2H&9v7Tgi#XZ5xv^LB!S1?z z7j8(m_n#>tkL;6f&(|PfwAUh??{%Ws#Q(!knQe?=XuW9BhGD1lcOLWdS^ew++eyc%8xvU^|jdewwsp$$Y+Q!fPLv#9szU zyZu&G+C%GG15l>$3W(*cf15Qu#%DjRU+ ziNK~jUrQ1tVAT@=_SYak5BMP1zc$my6?YNx_3ihHFvAv$ZI2>4*AsB_ZeI8pJtoq{ z$P!CMpe&dI)l$UhTqZ9D{>ywO*S6&pvDBW)pMvRCNwReEeu89{@ggAd{;dLr}!|?(H16|2h=tv%{GP@~40&rneU$h$QgF*E0D> z!0#Q)a=G?r7i&6u9|;>KI?&01wO8Z6pD9C(Nv5NH?eL%BeukI#H$gEkAMonG0|z$)t_=hXnRT_gh}vL_pAZnk zi{I@*7s!ZR94|xeJbfmL(Bh>sP*mPJ~;tE5+8rdgpyS2<4WGt(L^j+CQ%D_m>D z8SxTMGW!1ow;BU$;#fK2M4Cf%zJ5i4uNuUC9P%+Y;LhR(++{e$u?kcnCOK4M)mgE) c%%OTy3YYnfL(h;YB)Tflvt)!6M=DUwKX%3BE&u=k diff --git a/hsa/gfx942/pa/pa_bf16_noquant_gqa8_1tg_4w_mtp_msk1.co b/hsa/gfx942/pa/pa_bf16_noquant_gqa8_1tg_4w_mtp_msk1.co index 3f6012a25e8f446159b9d2c5ec2f13779bec21e9..25aa5a0e5489bc974df5c9a476b559952439a32d 100755 GIT binary patch delta 11139 zcmeHN4{RG(8Gm=N6DO_X+%;=UHrgmn>p!H4eLnjxk-Me&HxMXtLuk8-&2b$&_Ugo5 z;*6GY zyPE$0eBu-l+7>otr^Ql-maBqtz$8|Z7J`x6NNz*LXd*5~j30>$MuYMGeS4p8jfFxJ z!Nl8(;-3wj#1-}Pn)>-4_}L=f(7VM`wHwKI2r*8&D&8Rs;v?iH@x$6MQ70cXH4@S` z3sJd^u7FG7vqFnAP*QMnDOx;Tw;*|~M6EbW{h?-2vDa5#0EGA`{i6*%Q=SvFhfZzw zY!=O%zH_&gn#p_vypi_o2bputGnr|m*Y2DVKlEC~1?g4!wfYnx)Pa}N6gZcz2XpDW 
zz$@u-)A7u5Q~xdimX}Q+l`-odOM1Zhw1@0Z_K@DB6&!odypIHCFr!M`N0C2B$h4EXi3_lcJlgE z24*Jp|0$jPoJHaGlS{1SI9;7r3%R}NIDPHEa?)Oo(}HL2ZON^LT;6P)R!&@>&P-mH z5AjcLCF7j@f%jo@RY9C67sQ3>t;93=mgLuRL7XfX#2=?1PH)H-#DCJmg1D(dJR8_3 zhcEJaIr>n7C}-SUHFd{U#>5Bm<3sb+<-#~eO5u-=% zZON+Z@rro7K3fsW$@?qRt^ZzH)Mj zI{CzAiEkw81cqs~x7r&mZPD1+NO0U2_b0+*{?VXsBIZkk{R!WAe00JW@Q?W>Cgbtg zcmnna9Py7t#zvq$=o=0mo*eN-#zHY)C^qi%N29*jgfBiG3q_*AiMEOHKwC8W=;&^U zJRA%p)J9{-#UHl2hXX;!_E3Ak;o|IEz!l={d}!EiXMzqs!1x1!5ErzH-y<57TPc-7 z6-Y$TwvgDU$|`W`Ct7?9!W&=L;!Ox=4`}h-2!HinEzTqS>n%_b3T)lT@WP4#{Rm(1 z7IcQ{0|?iRXz^i$Hx6oXl`~KUntgd(UH>Q=z}}y|xrEI#2-ls(NLJ1w{P>ev{JRLR ze?*Hvhj2Eb#b3$pzY3&QO!NjCaJle!Aog>FUwc+N(R&Dg=_xJ#M}!YaTKqD?bw6IQ zz`v2<_>0;BHxRz~eJx(C9#|C!|4@t9BmCX-T6{agJ2$CHo_`xM4E^khW>aLs-T3g_N>}5IWU(Ljf;Bd6}f~fy+ zFzRDm43|^+cBI;S?6`y7-P?mv#?^;ackf=Tc5ysX9S%E#(QX!__C7oA%eE`?3hK&^ zJ{EU&w7Xe6g_Cz~`SgoQ|H{Y}{UqGQ2y7wmYyBurKzV>EWGdGN#vw`#QVTV-=KxcQU!z zoK6?!z{z!Cf1Esrmt)-=UW0Y8_%NM4cD!e251;GHuz9r$`@?eG-N1LK-{p@ag5oa9 zB7T-~N$;A8I2krcT^6FX2IuN9B^|X8?i!qX=P2pC(!PXp@g+)HQrb9oU!kN9E5TKx z%dRO}ItrT<=W~kX8evpW!21GL!=6iR~d)pBc|6KpD z$@V<&@A-J2d+y$Q@2{p{GDgg7;lRz39{yk7!h8ETTcRn$Lb}WcHwERSx zU^)wyYaL=a?LtlRQrdr?xYJQ7?zR`jyJnv_I? znK~qNIBtoQ~yA@v84L)_2(Xq!;%)#al_N_?j3oan7SHh+yXmw%DE z`0?^iDu1&sm(POU@u2*3bvK>b?HtHYCfgZyJ(`|Ou4mW{itJ$6O^V#Wuw7_+vAwC! 
zW#{{r*oD4EySGpEe+>LP!2faZ@5KJjbi(D#B$ha{iALwXgzCQ;{I`JrR@L9kup7~| z{QlMflv3O{D1(DCI4FZt%kbec0=SH2xQyj$861?sK^YvB!Kq~&#$_DAWxRsRII5Px zK^YvB!D(eM+(v|^#rWB3+a7B!Z^@YQ#WH#C*>-e}f6F8%>)W*Q=BP_blW=+c4Yl-N zNIulC%u)i);>YV8Ra2QfMWxlh-^%3faiw85%xZOW-z&>qd1ba8$=@x)=kTimIEQn4 ziSgsL_S{j!X zIw2oyScl|+T+||`A1;xrs?Kp~a%L^AFypmYUna(1s+5Dhy=J-gg;KeLp>6VtES$?Z zUy9~WCbRiV@}DoCsG+U(^3hYfO(nAkvqx4{@cj66t~o zZ-)2=KajhM#NqQ5GV?{RvjjS(Y>kQAGs{u3;^kN}T@&u$Z4uubl!g`mD3ONMmOptr zn@`;SP$)w-uT6OxSAKCuTJ1**=o^f*6HE3qJi3!->w-AQ)kq_L#FV=OO?(qCzK-Tg z@enFcS*T)M&7uFRR5c4VbV^66QUm;b6iL?MD76_@53Kxi+*=*Ik&WX4xVT zFmz9Vme)dcwas$I>SpjycN=*x@S!zEz6=^>hFPfOYerrT{OY?#uDx6Il&H$OBXU2de@VYT0e-`*CIRY?efM>ro`X2<|bc8{R~b=q}5=9D@xw~}#+7@Th?_Ynle%P*1im)7;UOnO;i-tNvHzGv`S-A1)&s`{*3J2=lfzi z+d8SrW?SM)e&@XJo_p_k=bd-YdC%w4b?Ixz%B9SPts4=97RiWoIa0?*RtFdr5ZYNv zfg*JdF)1c&Rq7mkBb&am7YTq!Gsl(Sp7Cc>{o_B*ra?MQM+ycF;y`l#?+3DL;i`uJ zJ|7!{-meEMQ}c^dLYFJW)LF@vphl?WT9C1#s3#I+yXBv<*W_jLO^;bmGzYxiuqSeE zhW$&K1%50X?+V8+@Ue-#S8QNM7s=2&kPRYz(L1n=-H)o+n~Quu4R@{j9Ap&W^0X3MnXU#`rx$}iPe&?;lC$;ERRGM+R)T2K zQhcz_3{Fp*QMAv2VtqC+c+qkg0Sg!qs3=3It@xQfr&$r*@rW|E(_9^OJ+e6V86Kx- zhqIsIv5LnDjrS=!2p#NGI3Y7@0s0G;0{}#$(K;*Ut%Q%o>TH;|5k4NTvt!;)_`$(C zVc{MGtXKgnrZ!A%nA$P5V``~r@>wgIeYOg<&t9SR2?{($75ENSU^`V{2Ll&g0lC>{ zm9O*J-TE63&}idz}1;M0;cR!Tu`X!|+D&eY17o6><~X-ty5D~Tr$f}teQ zUr7z9qy|({11g;huD%`{P=^g@zy>s71FjWxRc7g5@-GzW(*K3+vYnJ$7H%qLoeeT}?4e6c#pkgke%gfJ9ZBr}?T2mjLH-JU_-<1k_~C0U z*bfi?qoDk7B;Wk7e%*Tw($C{b;)|Q&=40k9AlXLDTS>J^} zu)Q6x-Vgl?9|OK(=>I^%oW#i*3Hz7I(%VMc{_S$+p-<95|4(HF6!iH{8O$#?9L8?Q;@BnT?*-t7Bl+Nl zKin!`@!%)%BymMC=3m3_LzyxE2KU^{qe6Ji>3-9GVr&t-!QFZzrAsa!UmEJ0su8mW zNBAd-?nm}KiR{il&Xh36YEzN@ZCQFlD1^vClQTH{(g-lwxKOtjf*| z-sHQLXjBNlx%?PFQi8Mf@J;@fk!q9c&jBPEIMTF})CZ7K2-1T1KU~r(QUugcD%56R z{tMb)PI0lm;?BL<0D&fHzgP%ySvk_bB+;=CDHyG`S>3FzZ*1`gdiQuju3%Th*W1j%D*G^fjK|3ew(32S$va&{J631yAdU3f{wMIm>Hm#OUfmIV{ z({0dfq^M4#w^FuM*CxthRJ=I7-7X5~^+r+mHmk!bN|F(mA#46A=F$h-v04t6CZ_;K~?jUl$+G_rRa4? 
srwpT7sR*7-d_Rng=tVF6{4RAlarvbYR9o(gr6%}YY61xU3AH8n3jhEB delta 5233 zcmd5=du$X%7~h%RyLV-;Ew*5#k5nQ}ZSVTHM|1W~k#a&!PCTSw5p2$KwkN@$r1r(N z7%btAHCY0PfQcF*+PWrNN#oWBAt6>K`PR216qduvF^o?(AK+A@pi5kxkm~ zo8Nq|{bqixrMbZR8m zCr~Q*49H{hp*5arO3+g~3=`A?rG+}WNDqf6sJCDd9KMG{67{iS36(&u0|c{~%mKlG ziXLl&ie-s$C{PxTiG>wOBPzYa$b4bE2y|mSo~#<0AgG?CA1GJyPfvD}-P|5x@H@wP zf*Z~2=KNSM+t^JV+4wDUWAQL(-?^xloAh>&WF45E^mdV?i&? zB0kSk8E^7b#TR%4`xmkOOW1w`w%>&84GvF*q101pDECwuDm{~JC_X_J1`MHQiXjCK=B zq6+KO#P!q6&Zm?x9o?Z3S8T);8*#-(R53%2JCu|@Ur(8yzT{j7BPGnMFAup-rf+$_ zke+-*!$9Yq7*qkz~rvhC}A6YG~Z`**gYz9jgOp&@><~dh}nb_T6aPPSrp~!oh*inQ<6q$D;h7X}eaA>zm zkE;bYqVI4Nji@_TZ^-Pt>*x4pNF*6RPtTboxx)N>>QHnf>FvK~fL9bOB8d(#fuZPF z!lt!=8+yicv0M$ew4FCQVNdZ3YTf-wJvGGkDjnWRE@p z27#Np6(Q1_`q&aFs@`alSepra1Dxmnj$w!T4#%KFgR8*+$O_)_b9|R1hIJ&!8CI%h zemSKB>~SYBL`$?AdKN85nhun*qaKi|jOKO36U~(YTS^3uF-Nbo0eZ(laOQE~*d2|F z%Mkh1Kt`5vq5NVSjsf$WJw?CSrV`nqG8h1CFAD-{Xa0;gV@$~QnP&|I?CtOW1KHTq zeZYm-)(Crr@(}Z!SAhe%Ke-uU-X=1J%q#z6o=x*k@=f~{y-8yD0?P%lZCY+7w85+s zH~k#nIEjqUC&|fpQY(@%kPY|Gxp9P_hQ0lWj3z70O#&y`O||e))MRX3OruSw;FngD zvVlK5@|O=EDC5N;k%}cga1v8^UWaiXmUkXZxA5u!o|le}BUCUkg-&gQq{?i|Gv%46 z6JRNS9MrUDH)3Bb^nz3qfopO+1>@O4{#YHk#FM0&m|7dbcyXnDJtQ~Z5Z#Jbpt|U2sPpRI&6v*lz%sNXbsGqAc$@W zZHZYlw% zJD=HHXby*{v`ZER)zwzX&sI~7RaBY{t3?$xQsBasHVZ}z3(R44rA;y@wp%5%xS&vq zV7F9GOVQ zm4*W(t?P~qM@YJM#&C+H_whsgO|Qd9pGQMu7G6^PNiXE46MJBuPFnC#6J6T_Es5?A v#Es$JF3Qgp^g)DcrEv=VTMx8C-*#HJ7nZ`~&2%l2TCznn|B(*%!1R9s;+!q} diff --git a/hsa/gfx942/pa/pa_bf16_pertokenFp8_gqa16_1tg_4w_mtp_msk1.co b/hsa/gfx942/pa/pa_bf16_pertokenFp8_gqa16_1tg_4w_mtp_msk1.co index 58a114a3554c54b6ed0a9febf42cb5e14a8e3cd4..d17d1af878c16ce534006d39c81871b38548ccc1 100755 GIT binary patch delta 12415 zcmeI24{#LK8NlD}U2^$L?#@FYA_@O&N(0%u+r8Vv)4Bhol|evjK@yLUKLky3CkaX| zWsi_VWhe!Xa+!A8LUEL_8AZ?Wu+m)W=#;jom61X%*0h6?@$YpCV?=?z-TmHP9<8?H zU@w{RW^(V_-}~Ns-+u4g_rATn(N~L)j20J4!9^jKVLbYsg%#|P(VcT6k{KzVAt;ei zy=HKcCyQ69*Ju`98c(naAm!M0`MAdT4%8RvR6Qu8&`n0^9N}(ZKw14W8pdy6=FI%> 
z9mczvmg&(lbpTT8VLSsa>&oK!Y!zcm*09cEU3*tFzP9w^_;6`K>B6PnN9RXcS~{D% zj%MP2EATL9mG4pI`&0Z~9lx08i1$w_VP9h6QC85s#1zEuVXumRIH{E}CYP7ZWoF7g zGwbk3#8)u36PG4@b1`%BG2=0&C?j)A{t#l~edgzj1KsZJX16(ZwfpM$!mD1qcDbcD zy#*Z}bl;BB3tYYF9*|FV^v1uDnUh>(S;b05bZXFq-War?zYb1DZw_{qZA)LOj9rh= zrAuWfmiFXr*yKjP8g#Rw61aXaG@=F)lG)_9v-_G~gPm&?9Q#yUp2af{ z0S44d%#}Bw)ZOYWy{Xk#>S^_t`mh1`&apdX!@UwV@IHN(82~0@O-I>yW@?F)F78bm zy7lSdyq@$1v@I>$ZvqFJz=0-kpsDVPn=fDo^4Nh6>_8DaaDig0iLyWc-&k@<`DVQ& zTMC{wfu~L2X;b|bw-k9w1K80a?C4tT=x?RD`_A~8QTk>0jt^CmDNQpBIlV9bofy$?40^CjCB@_=Aq^g7XEiqpGXo)m{_c4;pkN=>=$-wvt%4uaYR-Wrg%P~o^fXDGG z(%3BEc=G+{n)1g%0uVJa$CLj!rrMIrhgsSCm;8)5WJnJG-e^dk9pMe^UIB0Zcj>;` z_s7k?e^Q@X^#`T|zxbaokv^KPW&j(@vTMae$N;{d^VIvPit4-+_ZOz1B*mE-i$VIe zA?w2Nzue*+K=olRSBzBV0>hh9QI45nkRHfn6M*lQO`rrIHS7dm<~kXLcpO?R$D?<2 zBE=)#{>Mu^KGTlfH?ud>la@o0?EiHpdjB6NNOeTmtxRfYJ+mU`tI;#}v4_nGyb$SA z1>2cz1?)gUeEPDjsmcViLP=6``IwW$ATx`t58-s+11812zzl!&W5AdH6YKx_e*iTp zn5~!gSE&iXGD~*#medfXQe;kH`)pHd%$>#M^8m+N;~%Aj3?gh zNz0K*@`9pk=)7P{sy)Is8FKx@08R+1u4lItD_KBvl!-AZ{^zWksO0k5Cb@vcoI3mv zt2d-pHL-QN)U@4fi6M3R2pgSrhkTGi{zUOVfaOoITA(RV!r=?_DbyDMC#N&cUAz>L z(IKcOrzlRDc5c2papVMRlx(*7+@mNb&-~} zZOxrEo$DHF+S=}IpNGj^&5d2kMP-X1uI3y~jm_4I7FVNHaB|K@p+)4xmL`s~HCx3- zTSH@Gi?jLb@qJ9WBp=8?$bdH9s|eaxi6R-0u}IY*71sdYJxueNz%46@rdrMezSmE4 zQRNbCZd$(yc(j)0%B0~NuDVU;QnvDbMKqvQomnPJ zy})bN(0m7QOPJF865*2P%T!j z5jHSgM#ylpf&gr@aRsO?*-0#X#Pv!?|ne`{|s>VIfj1z zeghT#7wHB^fd>qVpc$l|Z^TP=8O{F=`c;=1^gi&D*J=9d{EtEfuTH8qfKDfi1~gow zHBjm%0pBLld@Ass5Y1n^u`A>mY4X9ju{tiGzj~X4?z)9er zzCd?02>j$LH2)oN{&kwa27Je9ir;`1%ZU?PfD;WxLmT)_eEFYFicZG*>PgYWFNWDZ zi4TNOTf^$+wy@ph=RtIY90G`3kkdrbCK77nHPP<$6Dry?U5CRRvS?cX~S7s+ClGsZx7uwu>_iuPSaG(5Z(s2F$bImB zJ8_WR?;}RogIYVD6N$mR%}3@7HzT(5LXb4$MV?Ft?{$*f=L24%AaI;SGz6QSXb6Hf zcY@PToGG}1WEvd-ZN415gWR*=50TptLSAwk4jWH;a0o6EYYtIRn@Me9hc`%kOK5rV!q?qNarTZPrOH z51A6n#$28_~>ocTnLBxW9EeU3|5^53lE&-H=an`afY2= zL{=r^JBeLqScf?Vt5gim?#J?=%QNiMLIR&iOdVmJEcb3=*$8`65jv|bd1Uz12s^bX M3h+C6r2_{23m#@M+5i9m delta 7257 
zcmd6s4QvzF9l-Bi9NYOmC!r7)nh;S+Qh9v)H=STqG1o&leX;-z^%FKF@VYJtc+c)R43uInk+MJd{a4H&jT) z+!B^at>R|0U7f}nr6Q#{*iQ|(gh9Zg1*aajq~5ZeW%CCWUT*9gVx(NXP;SS=5A_I3X8KNW=-v zQ=Q_Igaa6LIoWGu52iLu`D;!UUY-TgwLrQSNY{eXP4)KH=Sm0dg}Y`BH2w)?Q+TMF ze8C!1@mX69N_1|NVx?N=R2!SVpJ`7sOkDlz$Za3LDs%=OwhB^w!9e5uyW>}e!=5oz zVJJ{(tl5-Fq?JlEBTGMl8EZs6(bU5K;8^1cDTOYYq20QMKrB&*4Q`NseW|ptd z!M9W?`V+3{x1VJ@XHow6b&iZXXY3`ZEouB4KD^}S^{C~Llg6WIUVh-s=*ZvMfJHt%g)KKVcA2u# zANh!M&S16Zqiu(JI^^nSk-1Rrt7j*fnl24x`(mgzQ2b&Fn;Tie;?V{V+ili6;0_g?mjYBAdGD9s}G9%P56 zAzu)8%g3x7Kg$k&8QyFz8s+{A>?9PvF;nZDOj&lf#-no%va;s`_DS`oKia1=+&_t_ zjd$|0(I2^UtY&mq?c34zM(>>R$;!Kph??@QXFKO++4y$OLsVmFmE64yKQc6;|BPcz zaQ_@+lhtpdxV$1Cjqz-9#+~Y7=<|**Giz@D;TDfj3R% z6P)rMR9-62>Os*mwng6k8?@H+pbyiep)@;#N#6WDv?52|u@4oR;pkP@Ykg7kPNQAa#{aOvXP#oY5xMS?jFQFWAX3<$;W0OpIZ@D(L?T zRmsHEGMWaNm!1C_*ed`4UWAdojxtsRMX*GtE1J*ME!VZvZ!}qxnAIvqn}_iT^fC z4E&ikI0k(G9-6-kymb@J&jMewh2|H5N4L}b8otEwO-#>y0d9c7r@Lr_&w(>NG|y9y zf{E$=CC$r$?>$8GD&V!pM%Ewe%!Uc^ByGS0AHGF%5AaVf(|i%|@M)Sa13q}3aM=H6 zV4~?Nnb5{4@GJkO`9|QL`?{bVcx6HrbfX^t?{1)Zx7L3q##}&8ya@(v9L;|XJW)aO z-vPI(->~R$nG?Ww-WT{Uz+*@2Xh-K^;)Fo+4}rI8HP<5y0XKi0*1rwBT&uaRZ$ye{ zV)|C83fca%z{J8AXh)U6!y9Nm6FBoS&F#R4`)U3J@a}yhIgY;&CMsVYIl(aB0KQ+M z`6}RspV9m|;F}N7d;@Uv5t_#o9>b+r+ec5l1_q5+RY5P^Zs1|nQpZd$@V2?M{sG{f zmuUY-fXDHkX6ujr9fygXAJYb>fQvV2u6;XcVy@hy`DM_L-j~q72rp6vWd&ONVVG!p zOrKDfmy@FmCT4mS&5MDb)((NHtNg0~9^q;I>A*KEp!wV!HNJ_dZlEXTfx)RIG!Fs~ zucY}>;A_^={Au8YEi_*T{Ag?=J+U4puAQg(X5h8IqWN~<0|}b%0>0*Tn*S8|u^($( z-~atEF=;PtkOJO$nC9;Qulxhe-vi$EF3ksk51gU+d}guo`2?96@*OwX%uCqdi@qtI zfc3R+ij$w*K-*#tzm17L)BJ37wL=hnAiDfELahO72&h#My+pKI?L>6gfn1cCU)9>_ruya8Pd z*n%7YqlCayRA3?XnNfg|?i=^faI>7JB z)n2=wB;*!-x(gpqVz>iAQZ}zuB-O^JgJkJ->4o>Y^-_6#AwIcy7@bGa;^gdGXl@=n z(BQ|M=37W0mq)qeo)MQ9xa4JgX#wRs#E3~A$e=07h72mr!#}Pluec@IoWZI!+9mmw zE_^ciRtDAOlRHw=x#Z;xa+UVq)NWn<=d@8YgUj2fB9Fj{$>(k(0S%r@?zxQ?_l~%OGPm4&Je;E8b!P z*WpUqN^yAy?X9htODFKG{tvvRuhtKYJ^Q{r9*LyF=^NYl$LenCSLOY?^1cP{!+5H0 zsNyRM<=5*DP`GWps?k^hgQ~khrm+LEu#8Vx{EOjh{2B8JwT)kA->vP<@oyM;W43{B 
z$UkO0Um-E*qfHjAZ`PwLn|smK&0_Pg8KGj)fU*U%_FRTXA8+#XaHf|Y$ym|yx8~<* zWJXI0s%>PnYG2LRc_!=VWOE!}pLKTb&Fw%xW|S)KKwsvVjH4MQAfrFSoTQ3ov1NCOUse^rJKX8?n;LNauGL}%r63SSGvfXB^7Z~dY#-0Plj*57H&r1HC$1iB5OEe+k z!A6979;N#f5%)Nd@*zDR69XuJrsYG04d!11Xt&uYj~~1>w35H^L{=;Pypt=yC7?X} zw7CZzB9eSrsnqNeGyf$-DogiA{=GwwNQmrlK!_y&{C|hY5ABagi15A(+JissEEeQ2 z8Cc}sx^9tEpV^qF$~KBCtgD%Y3t&nk5K zj`nS&eZ5%T-beJeCPC=C&30Lq>1u@3e4W^i0NyRrmGgL#>3)8Ex#Nh}g`&Z!a8%&D z?jBWiwL4YO>-G`R>GZi(ZC`&sUVpI-o;7--4*SRLI`vdsWAMds&+6L@{$`xhwGysd zS*#0qbn4OQW%1E~w}u!Y>%JK9vT9DQ{<#d|u|Ol<4lufZ7Iy;cCOpOjS!;s)tGuPb l{s~WwT3yGh6JEOcB>r-ui)JRV?xn7pth{efK7cWs{sx7-bTa?| delta 1632 zcmb_c-%C?b96$FsH#fcAb5&5wCW_2yy0(;Ut|_LbWiwVyTliyJpj|}DB*-3WI#eWt zTIaw&vv5PU499|Dcn0{l7Rm!TUq3v`8U$IyV%}k|u7CJKVz=I1N4dZ^j4AKeuI@~%TmhIl6b3-mek6Ot~XkzQ-Yr4cYK$xV1lil?M_ zN{UaCsv)H8gp`wzawQ9s4^5hAw{0g(zE|vgIjTp0E{1tDDntcj>C99qzRL zFFp2|Pb@VltUUCBL2bv$1; zBcdA#9gK0<&}D>fge$s?e1cORjQZdT@fx3phq{o~=CoKXRy@a?#IpfySO=Ec>D!`^ zl|&urKA#%Tqw$~S)OZPvd-vp5(fD8}HQ!0&Z*LOg;;yq<*@hXwPj{4hB(JvYG_o@G z>;idr(Om$0CS8ryRxlX4-8UGp+G`!EX=`azO?zz(HEmYgcC`6IExylJ;a@#-Fvj8+ zy%s}^ibo_}>9reFEfYlHfZb3?S8E}{ZNzm`mk9qf;LuZ#?5CB0gI5{Tt?h5Z(SQ+K zgUFx`>G$Ebp#AhRoq3t=X(!#sNe!@#&%v=^zFs}e!)rkYT=!u^s2;9%;I>e`zM)gm J>pf{v^bbA(l~4cx diff --git a/hsa/gfx942/pa/pa_bf16_pertokenFp8_gqa8_1tg_4w_mtp_msk0.co b/hsa/gfx942/pa/pa_bf16_pertokenFp8_gqa8_1tg_4w_mtp_msk0.co index 3ba9bb4b82d2cefec6df7128451ffdbb99aaa646..f78c7c89cff5c6da4bb2c457e67e259d1d3ff74d 100755 GIT binary patch delta 8813 zcmeHNeQZ-z6u&7Nzl0gPe19qjfbkeQQtr*0l zuhfoAMI1i#5yAKo0y8mw?5R;hRzm!U0sfF+L`4G;KN5&SoIwGZ_r2c7==_Qbc1c~b zch3EtbKX7gop)}2xwnJ6<@~x_wM0w>_Q0YJ+#N~IJ?pz-eqvUB0A%D=uJn}nemv?g;~ zNy_nbg-G6|VGD>VXya=MSFx-sMzfvjpV{MTrMh`Z{hpd=DAXH_9X!LHP}ac_@%oc^ zy@;+B_L9QMZl6>_9D*!OIAn*QlD(HGWxt;khFZQ|Qw7<$ws@NKmhiMrBuVsG2_5^q z_9)j{3=6ntNEA9io-R5gIOz-a0myFB?JsUjdba31y2KRE6t=na1HYO8aSwQV z*h4IhHxg}eGuU+4xQqZs&?idCN04U4hIoxfo2Z>fCMl0TVVyTQc^<`;w!+c#D7Lgs zkjvwx?SgENm#%=!s1Z0mG_C_6v2>}ygmjajx3w9}NH+`m^5q5#(k+7C-fj>leuw}j 
zlz<6IGm>T`El66BG^%Z3lX`a8tagSiYIj&nUF^~K*gxKy2a4r zV(4)(^tc%M72>O4u5vNvqGEVZG3KLU)QQin&w3XH<1qs8wBZC~(08N9Wz*xb>2cZg zE3ZY>7NM%OpsFoKRcj6NiLw#}V<2!5{=1I2@_S^iQAhmA2Pu7|+~#ls@LyDXCRU>K z4;$!OR3f{7Sc|uxr*G_+GlwoD(`XS8|LyYRf88F}Vy*f+?NN|fKloPT1Za;AGir~M zzvOp&3{w-NJzDK4dBuSXskG1{%_+}go`oR&wX84Q+>i8)o=u*OAbpnq(Y1}p^}@;~ z%KG`bTdL&5S;*F$dYB)%RV5~s%Qc3#c*_cjo9z6egCof{^w5r zdPD0Ua;Jj?zisPOg1JCu2-GEIz7nXRZC0^Ma=M??!L_KFxU{L4r~65lcA0a&Ua=xs z8Z2QH=f;TA1!cYxrJ*+_LUX)@nj?YsFT@@1d~J^RCP;G}*i}*?JC{le4I&Yuqy4dPx4$ds>y7$iVSmimLwEK1 z0{(7a?>d@}_Qa4!V6A^mXZIRp5BgRISFKy)>+B9ieW7TN&mW2SqP;%4CmQOE1bb_H zdjhqQ$bDVa2)QR1h>3&pki)%-a;^>ptusS&0#=8Mas?bAH{}kkrYKv`>JHfafk4O= zG_ub@gK+w&WJY|MXOONXqEZM_dn6HtZFL=!E*CZ2WzUE5*n$+hL5492vs| z?Q@K;FVB7x-(q}U2?TeGI==jX4LfIL2dou_gc1zkvwJD#5>j+Bl<2rqZaebNoN{wR zYPgAMvNwW=e^oHzvsrD{G1ZP$s@^+CQk`z6q}m*mq&nS~eHYZ*B(%XR-E6kaDWO)c zUGlX$t#0JY2s2x$rg|KQUcSz7|HENWAY*%OR}4*FNTwaUk|9bn@Ols;U8=ybexxjE}c{p{Cf@byA19T5>^*U zMI0#O_LI=51M_gPr(-D-{HAj^>) zd%pXfbMHCt-1jB-_W!^R4zM|qK(L;Huu5|(X9Y88I#qlyVg|`if?B}IV@OPv1TFFy zIMg=&Kprc0_B<_d#dD<iQK!A-=I|^VdSu>Tb%5Ap0hTT=|`^fqL%&hr< zO=F>(2V4VwZRw9p=%B7s)J0iwSQ8W92690*T9AzvWTOSyk_Ar*u1x(1Eh8bsl8eXwA9;TnQl zy>b;9nvs{aa_dw~l$xpXzmN7ghe3S?CnF&s~NUS+d&zZ`J@FqHCS^VL0 z$Z5-brOr|(Is=ypJq1u3(ShJD>G+oZ=#x+fsyp-#$s_Nw_F$`3vkFE>U9JiY4p0UGarPtBi`^XtISJ+)(mP8j}9&0kgc&(-`- zlplI5{to4Pjp|PPPI>3U<|In}`gwDeyl;O?cyC#t+?j?h3Z3gKDil~=n_^wg>J(d3ufRIDPk}Y%H59r7E+w|i=Sx9% zppFDYq|DBGkA#y&xC5z(I$w&daVk07WnN`rmoE^Y(_Bt(3i<*{fV;-!Rf_O>Y86;p zT@Uu|-d4Wv^|lsNX(A>>HwsY)EY0PqK#6cKI-oaK5p0|jCZ+LB>c7bel?7xx zLB~YkM+#s?4!yv(n}v&HXjU$2^!^%uAaoSKe8FA_x!lyHG*_rBgkFn%t-QDPMY6ri l3Q@x6DC5P#l|op-_!bG1g>WO2FbI{ah;fNHhGO~OH{^D}(vtt( zAJ+$Cn^Fe3TOzf1I6aaF4O|XY1{tA}@~+Vw>rQd)g|pmLVQyi=-r)I)WGvPd?S5y5 z`ye*}KbD@KO3!)pv~yqRJlvVJ1=J15r6{-N2F&GtNELH`TN{T(LbIU^a=k@s9we)T zhxH(7MAJ%}xU)rf_z4$W%MBU-nm;4z%{k)$as#GoYij%aKQ;MH>Gl5gTto3&mfx@K z&pr!YAM-bXY%+C@%}$ya}wpt#o_jI&SY_Gk5d+N?*L$({hn z7iIU>7iR1WGxmiU`@-BJtpt0IeX#@i!jF8h6Zs+_P1~$mikzh!1%#C41IVInW5!-H 
zW3QR9*UX3RhtnQGPJ0wN?Q!I^J#ituu|UUKDC~r9FCf0W9ohR75dURAl0C|wil+fL zeDJ1lsU%PGM)7})q5I(tbqI0x(4dgoHmGH9eAbsm=YaI@)b@SbA#y*CtN$lNa^xdK z*i*g&LgcUwg~;^ht2;z)r&dsi>~UtaC2xP0$%x~&n0`7^PCavp^bX#_(Q@Y51;*NYSjH* zYV=KWv#*yLy+d)1Ckoi&0y#Cx&snvp(a;mmX_q@_g~U0#j_W=!#cDOH`s^2m4?Zh- zV_A=^qASQ7nYL+mc6Ko}(&wz&)Mz}qQbOcha*CasU)>>cJGD|m5cn>*SQ_R{m;DnYfhYWx@Z} zft8RpZ8Na#x>fx?K;I}03sdjQ-dN_esm#DuezKS{y_>&kQ>LNox@CXgp-&Yvh1_W6 z2-h-sihXfzb%)9A)QSm{nL?p?#s$B({gX^qj2DtS{b7wdcY06w-Q7ga3Q3zL4Dh|n zZ{$MRpyfPhI5|l=tQ7)dZJNv1H$aD(N*Vb%8{9t)92Jf&WtE9!$I)nKIMv!6?`S<14RSYx6f9}*IOYcr zDODwK0P}C|SMzO{zy6r~vop~qF(2RwBT6)W4_0hjv_)e2F+U%Zfbtl^{Fa15N#IwQ ze_%;J!%0FWM*B2E^0yi=L%u8ziea!EcEmM>q zVm^Len!mz>zhlKMlX~gC!2Hl(7VC;APpYN|0oh_jX~z8Y8ww?X&6wZ)w!+J6CEp~_ zgUpfv+_*vJPf7y`1Tg>Ml$!rO=3O7D`6kSt`G=Zsh0^$XaEp>^DB1*W;44t`KgRrM zP|f#Yz7gMvARYY#^DP_H_0M2Fz?P^LFJZ-ag_^&B`Eylj{tD(N8`L~4UZ$XO!34g1 zK@55mRWoVyfL!JvcRs)@Qpood__Gc;z}7o!L8A3YG!bT;b-_hggP|iBsv^+i^AO0m z9RzxOP6F$K41qNv8benwL|D3(Ce3KCL+qCoN2VS1^|&(4xO@b99b`D#?Ftdr=T>?P zx!ss$oQ#)12d#8gN0Yh?O_RwvL-nMYlV%wgY3EU<#CVlCGeM;bC(V#4F|~`Z-m7%X z_{ds1U3J8Er+0C3+Mb|0@)zV}h&iY!iRxYMqf$ku2hpWoz1>Azc z#g!YY_`Ob;q*4a{niE#!r|=6)e33b!C!f5)MveRq7b;I;{iKoa5zF(qj9z$GUC^6{ zcT3vD&!M7`Ty?=0geI(f$c;uJdv2nd?{PzqDe$DcHv>CmIS%=2h|eSCi#+r|FEz81 V-{FBfsj?crN0eu3_^TdR`Y*hZ8R!52 delta 4659 zcmd5=YfKbZ6ux)%1uKsM)*uuWF;WGJyF2@Wu9U|}3YcicSG#G5Zi&Q4Ocp5^ zY^{PrJ0>Nqt&KJn4LWV0)i|`3)E_pb4L>$*Nodj-e_*<`m{_u6@66o0%Mb%vNZL&{ z=iKj{d(L;yowIlDhE8e5?yBR1Zl8yOuvFC*znp4dy7Kl0Q-Jgt#1hc*7$Tt&)l_*5 za+EThA$kBnIV3X*L5b&?i zt6`3~hEx>_LaHEV0X8I45bc%}_Oq1GdoREw?guZ7V>?sF^<$!sK|2@*?9sGu*c!vl z%nga{!kggQb!ROQy0Y7aHmrAM^R~t3s9;m_abdFKMLMk-KAP;Pq0?$aSxcwYi?W_h z+X5f0Ni_TO5-tAxM4P`L(czc;H(~#mvHxc5zZLl>m;3XQUH<%JkG~+f(m&aQ;sfw# z&AaND&K>syx|)A2<9n)RudZFtwW@^yRhw`C$b<}-kO31iU_u6oP%$FpM1)EZp;9TR zK}w*F>TI|!#3w9^hs)9G44AC}vo&D02E_JM*=hF7);K<5ys2WxSAOpM1;$uBRaqXh zOg6GJ#0PR^7HPB8j30FP)ogXaK>hkis{lZY^!-BBx?lz26AFKOieemA`&LdZTp{>? 
zP0l_?5wbKZsv>QISOjmI^Q5y12-j8Zg0}gc- zyUv!D9dw-Lw^Vg&Rp%Y8tbOq+eo6IVD!A`IZjHM0LsWy<4gkC*RCVfw^hI);9M{D5 zWxj2~hwF~1=c*-7H~TcKWyi2@h>E42Z*TCCopS2g_J+?of>M|68L8ht8S9HYPf&#{t*_|e!c~9`*g{;(Gq9I$*{zZC&rymjZBFxKn;@2gbjP?x*&P!U zJGJf-8xHjL7i7V1ip}1l=6!EJoU`>=x_N*PoQTs<6BK`l{X-+xV>Ukh1uLLI!;gP3 zprW>~;?I4YtNI`G%&5BNfa>je6>oqp^L_1~pnqjs)e39*g(Tl8=^84mSt z%EU8f+a|G(db21$&HcO%E~6%BZW76qKKxot-X}-cuGDm{v;@*xlcUgBXk$jU`I_9gJsbmH277|8$^_RvOOY3C5*L{1V1*JgdaN#CSw5Rg{oPyrWWaAdK&{ z#Bc-oi?_qfnb*U=kwLy5a+wwIXwcp?k=;&IYlWGpW@8yq zt>qq~x)=vh%dI4W#bzhIrZO7|X0o{4$TuivHhQciu*qmQ;tw$cjvEs|!0YcDEI)W>>I?IGY)U1U6fujLo*_>dY<&S*W?FELyd4 zH(9gU?IL^_lZoVD7!M)A*cfrm;*u?lT_Rk9bjDR~BHW6Mq(Dr08R5gYD~M__*@>^k z;_fh^CcaL7uS o5l_H$?qDUfQwIvTE0wTB-)EG412}zp70l3Qn`E$Uchv81=-Ub> z>w*2%PsgR^$D>JU#H)z9VoDGSXt|Q z1KS`UwoX(?6!>D32Ddg3f-g7g!R^hA5l2zzjT^mu*Y9_mL$i1IzumpZFpBu|Yg7|m1T zgj9Bb&fAVT0Oms>n-g)T!oy*k3vrjiBN3Y$aks*wQJXsPGzpx@0VhHiLKi|eLN`K( zxm9$UJ)+CZh;Fl2R2>+_4!npRcnLc&ArX7;fQ1#EmLAb%=@s3U6UYIya-&-MQECh{(ps^72xVjC}9t-h262gas z2p}Q4)iu+*_Mz%w8wun?ja!6-&J2wuqp@T(mW-a*55)$MVyBQ|r;%bqB0Sf%BCj1@ zG)OiQ3-S0O;=%JM?Ndbj!-3Qf>D+m749Lq(->Pa*e;Yvip~mj`k&AY!Qo4D+P(l}g z`gRx!k2^f})-}PM~pY+8lpX*vP5O-v{NAu-LUMsL@Uy;ZAu2e-g zaMoChlcz%%)yxy|0rab-JT)4Cag4wGQBJ88*XwCsGrhXd0LM;KlGp2DJWP|TEs;u2 z^BFE3%ZbU@EYD?ATuzMTxJ-IB%f(|!E<2Y_r!qOzB7Pw@J(HY9^*lGlPtHwqGf5%E z38@ShOC-2dmP=<+!c2nCwq-N%wnXBU*<;9Zo{#6$LX*%Lm}GoYalZYi&=qg*^fUf= zrx0KQ!W6@J`Sw8E8;i#UKktSg5H?9mfsNo&2rmo60Tqql#tXac&tUtHExYYn25JQB z54BfPQGilkM&CBvEuh(hAaNF3FkDrTtOiA#wClC~h&(FSRW!i4ZuHD@cVAxzNW>=j z1jqRNj845BeLD5|26TEd(67_}t`o3ls1?0$lk0Wxn;}n)el9jBXbt*S-=g4Hkf~|L zYB$sHa?nTXheOA4_;Tn7yc@Kda66QOTSIZg3Pn3N?_!s&-Go6VS_;>I-zG;3e+H(7Kqr==C zX?4JdP!C*R9q2|VC}R&oAsPJ$^}}^5YZoe5hhSr!LM2-*DDk~CzK_PArSa!TJm(QA zxJJRod4)=@Sy;V6&Ie%KdQHE{uMY|U^@v>lCnh*#oRExBy)>^Imqvgph^K;hDu}0o zctubR5#%C*>WH8Qb@F7Q zv~z7-lDp78?z!vf1k!umgw#Go_?TzXGX|t3@tNlyv)fO^cFTRy>$U0Ol6bXMjo+#J zc-kt*&*{XMT|XHn&f$mM7RH0{+inZL!l^e$yLrX<3!fJ|gA7%54yVJ3pTI(L&-9}J 
z7{PcOy=4?~6f}akJKdj0{YhK8zl{2yZi$yKt$hAepF%c1bqX_rgWZ6RC%W`VMkW)a z=(M74&_ak=M~26isW3$RbSlJpCL8KqRUmxv(&g}ZXKho9MyuLtG+NtKuhG`(YK=D6 z)Zz$lC$AH~?j2;}EH3mrvf{LZq_2LzD@$wW3WsCFET#jq7~w^~i}z4d!+yMga`CG` zHg*ThCb~$5mp|an}jE+_qpbU;4ig_sp|j$ diff --git a/hsa/gfx942/pa/pa_bf16_pertokenFp8_gqa8_2tg_4w_hp.co b/hsa/gfx942/pa/pa_bf16_pertokenFp8_gqa8_2tg_4w_hp.co index bd6f281396091f6b6b57af5929c6abde630422a8..5e817824b125ab34e210d812bee5f656ad9c2f82 100755 GIT binary patch delta 3507 zcmd^CUrbw77(e$?pdBl{w+@uz9I2##$VP8F`-j;rfE`S2SPO-LQE%GPQdpr)TPD%i zTN`l1EE;lr(8R=SG4VyS#J-R@DaHqlG6mzCFUkHxCu^=ysC2RMK?<+yG%d=4kS}n6EX-h)E&Zf>g`&3bi|pGCzM}V4y;omo_BNMV z^FMn-1Nh1?l zkWldv(Ng<-e!$BVP99~80q?#-@Mu%m=)4KD?pVU2>rPm8y$}G*9F|iI-24Irm{nDC zH9(fGHGM-1r_~GP`l71rQq7m=(7AHOxU)jALrl6ygLPn9u2KT*8J0-K63JL187oP)(~9{(u|81j1Ss}IqSV&9w^lIFM1*^r4a!~= z?ou|~VKa&wbapB+ic0ymZzUy|e@vd8R-!t0@TPCE#5}yH60Y1Rmf--9?!&6$y-p87 zJg5|Dc1W3jlOA>Q*-_du?|yoG&;i-;{muV(c06nQk7UP|eO|?WabvzLW=Q|M_cib1 zsQfElaJi`;OCPyjr+57{Ep)x5hFJFZ%CYR`c~aW+1sj5-ENmcE(gOsg(h_VR{7Q+} z9vM)C;4gnxQj$sfl{&7SU3#e*zY$;rhoh70mR z_)K~_%SXdAe0DaSPGxf75j`EAn4Fn_{unjC(jwe4OJPF}o+~2uGuF zcg%{kzMPRRl^=SFgYW{x0Z+2Aka;aI5-rMGXi1;VRtNlHMpBqK+ z{oAg`NQY(HfC0i^-o{@Nd0LH>DqeDf#~9rD8Yrr_6~T+Z%SEv4*^Th$!F;f%$BvTW zNG!>7F4u9zbR705rpwh!Ogq=5n4GUm@wIcFo?8w*eFz71o%qwi^Yn5fo*J@g$SzxE z@P#4w(wjPbZOE=6Tc@4H#*mvKEoH++@W;CQ@Xb&IJ{!_2uB$A*5OTB1q{v^NH6QV9 z-J`6-`Uo}-yY$`9k!hZiaaE0f2<^vbhg&q=gt{n=y*%7bZ=J&54Ik4m;@ZKLz7;_K ONQ2Zwg+}(5L4@3<-u>1XfKj-s3-_Q4)d(Y-S z!0|UwA3N{!au9knlX?@EcWJVE2pfZZ3?PshkUVn~PsoG-9NTOy&1%f-#=KlIW%aA` z*5S>ThG2D;A}q1dxB3v*pPJ2N$^RA>D(dF0M_^XvUJa!9-FiB2ZBR6(vwn zLPk_Q5!FCMxrnI7%);c9Ns-*FFaId}4Nb~IRINbO3RJB?)XHhQq{S2FY2PonUi!cG zW1IeDmqjg&cGxJnUnHeht7m%K)A1w#qnY13O?>}$;I(Z}J5ZkEJN|bcyGPDFbzhz% zw57+AO7|DW+)vzXAo&eHc8hRH{^EYd?RIoHZBNMko{~JmAGxUVciL~t@_c@#kCkCn^3pyqWK!_|Yyv>TejWbYgT%LI=f5u5KU`_q`Hwd4SzyY&Jlg zA`Q7%wpl|6P-ien2i`=vUzv`>e)4Ozr$18oJo}LOo*c{A87K9@~k)7B%%8B5Qs2HCK72vsG 
z5lfs!cqJ$btS$NnU}>h@vV67^RzHq}8j2FbYNH9iY9$_@2pz_AA)}r_rJ3Ax$ilt# bV{^Dkzu2jAhqD46hK+hpmx_L3C{H$&MY?5&|U845SHFVoASk}9k5$G8(WR+R7IC#LJ}ij1`<54>h+w-E;+>g0YKpiB-5+ou z_dDm@bMKsczkBYSeEtowcbhPX1JOZ}AR=|EhLhx;uzETpT7dd!XjWAMLA@+$cAIJk z7&kAfx`yCy;~G%cCWrf%YB8&-fNoL+n>LU{`achZX9&CP{`0{S!FSwjQK!|g4!^Du z;f)s9KspJpJM+oThg zOFGdEzFVij_v^>O)_N=0UaxjW?EqA(CNN*N*UuHh;EVMz87=mcBSj}z-mzaGfgLO< zsQJj~tiMtm37h9H^iieJ@bUSvzShz`G|oIqRPUj&<_Q@`i{?og$BX7EqG|^Laoau% zzO!(hmdD4f%8QdEaG(Gj2%QL>2wezW2<^sh!C`a@ zPNP?F8U2D1z!(nTWgNgI9Kb7b0Ak>112BaJhv}T)G(`lL>3I|YS~=QIIdI1tD8PJO ztxboFQl<5P6o;CZ%8jME@~MW!@*G$xS8Yd21ciy9FcB0cf_gAS^dcgHhzJ@HaYk7) zRkIHbhizyd==QsWh|Ua!DWfoD6sC+)C_8G#`VnJ;h_PYB*m*&^iQsO=*Qa+`#mxW0nl}`Pr&|&`93_5B$Y7+=XCa%G)6Lr!jea=QH;-(4l z=lxO{T?fjqtD*F;lSP!TEsdiiqUgUQOM`loNL!xAB3UYW@t1@Dcb-_!K2~`$rSJM_ zf3Ym5$?eL4B{6qaxdFUFF7sAWo1yHU?J%jI(kNm3L9bVBB73u1j8q1@+XQaqP zrHR}y!_!O$8KvNPro*B~Uo832%C>H5b)gNu!`b0vV`(Noo6Tl&1?0iL7E4a2lc=9#5?p*X$xNsD48vz~Oe~dRGI=JO z%ka}FF7L_bSWhbT`b-b9T;|wcJ*1z?E*)$Q~9Qs)rBNukyUOetq-R<+B5H2&7_hF2#81U{DkNy?#xk zgU?{m*BcnnL|-Tf;F!q+uf#qeZ?-^d+zp3WGn|Zjn>2Fu6uce}Hg#jJyD7L64^mOA zYM6kZGOcipHNX&SG2l*AT#vIMN(&wTH`%<1ecPL~)oyZdhYecjWp&v!ui~ZwzRx}d oLy2~SW?ECmrV}=@!o%x{a|XPVAp2(kza#fyL7hQ$M`01*4=>)BZ2$lO delta 1705 zcmb_c(MwZN7(e&A&AZKY?_GsjG3OASveq`{wq%&4O~PRsR+h;$Y+kp4l0*^3bf_2% zQcn+i3kjl!sE6Cbpod#ef%srz&_fUQ50pLxGOTs(z2|5XQ3D6#_dCDyo$vgW?6&E-3_BnG(HH)d5CpySgR_{yC>ZTh<8P zPZr{cx&*5jZR8-0gxwFq2CCY&e?FL^qL!>l%}Y>&uPH`2Z-P47PC3LgwBHD0M@ryb z(;AGjZS>-r_>Qfl<_0wua}f$9wc^8Wggi3$frB~j)9efbKl1OXRruKaT(lqUE=uO2 zVUFvd^0{a`$92ir!ExO(Ug5Y~RKD8e5Nb?L!ESO1wI+|C#CPKOt2q7!j_*bBY*46S z8wEStB-FAOgxo`vAAo%IsAikriwS`1Sfw3|Xe?W&yV{JCQgKQnz0^)f4?tRyFX6E% z9-HE^DPA$cuX#i9x%1Ew@LL#B|*z)7_g(NdSx~{}gY9-#7XK2*8m_aRok*mQu(Pc?5a zl|8u$Nv^(iBKS0T2}rNClTzz6;^V=o;5d+0#QN|vdcThc!^yYW!YZ zhv~YEIMUN+=*+U}^DyF9Qif5P8i=>xiD1C`0LEW`^1}+n`{apM!Owgt3`9AdJK%IV zU2vW{F*nF0k6ptDF_b7+45rT&>ZMp;vlr?%tUuZje;VsciNg2**2nr4(!90%Qf4#* 
ztjR4&A#%iSV`CKQcx)^TPBpZ6>p`Ob_MJq;aHNl%hP6O$DmW6XeJ_DJcfP>3uS%v#r#X)n-$CJ-MZ~(4HkHeoaJ-iNfHYD1w#F#7XSbN diff --git a/hsa/gfx942/pa/pa_bf16_pertokenInt8_gqa16_1tg_4w_mtp_msk0.co b/hsa/gfx942/pa/pa_bf16_pertokenInt8_gqa16_1tg_4w_mtp_msk0.co index a4f44572a00a84d6b96bdb4254c4e7f83835ed09..f21883f13feddc953c04469f3ab32ef4f39f49f3 100755 GIT binary patch delta 10251 zcmeHNeN0=|6~FglV{B}&9Va2MB!py9RuS+A{-ALqe*j506j3d0XcgiSuwxd39dIa$ zBA(+K7FB42zD-V>Y6DVr=?YDXD|w9=Mp3h|C0SXNKTsDI+A%4Wf||5jmu6|$eed0e zd2H*Xip@sKlkhq3ch9-^yz|bx=e)v>m;9uF_d+hH6{98;v6r1Q%IdXIr(g9@5xC*oABWM-2CGK?254Uhia=jw8pxI-nHsd z!F6?4;sr1|W_<=E&Xsi~B7~oCchU3H@5W{{S0JSXH^x+8dQ1cUG`1St91G_5B<2f4 zTLGA#&jq1`P2ACG1sBGwDAZYj!krG#ea-d)0yfaaQeK9k4)L+h{Z?hD?g>@+fK?N6 zJ+V4`AL}V=;QW28tK7@bNT;%vq1~Oz{g9fs0plNRM*#?hLiu*g+ZjF_&Uawm!SInt zz7z9KhVSmqXBTcpz>W>DW9q=vfvFQyC#JUSVy`{B#Ouh`dY##NFKfV4#DJ%X0nZWx z>L|GU1}LmvyJDx;p|E+K3I{d-UpaOsW4N8d20kxJViE$z_8)kF=)!zCbxzimkc35v zT5%-N0eTXQ{VL)>6>*@7I8e2J+08d%2bN(6ny>>c*ny9;wyFg7XZ{No3+3NM6k{pz zw2F9IMLeylTXsu^h106V`6)36Qvm z=nKOy#wHzCGg7|!jE_XbbWQaTycqkkx>m&4lgbC0mLEGCtC_eV!Zs;mdHbd^(DbI* z!|5=jr*8g^-dXqhqtw6rIpAxi{t7Wwx{6Etr=-bkBmL8U|Hs8JR_NyPS4wmL_*}Ij zHWRc#>ar;5!L%;`V!rkx?B12iy#goy-W=-i=gFl16X^;{`tg~KFulU?WgLcvf2^o5 zJe>|<__LmzO--Li5=kD-iJdl z-k1qf|4tu&^*X$*i4Gx{6~D3`jzX&7(`1VOi(-YP__T<}dz$zI4LS>>-xEPOmHov^ z3eDqdQ96dEd@uYZEL!(zB*BCypY|tmg=zmINnExAy$IuheK7TJ`*EcmZOM!Gw!t0< zKZ>tE3Twsb@Wvmv9Y)93MzLM;cwP@W4&yaHhUcZ|IU6nvK7g^%&8;ws2H=H9B|_Si z2ti5w>>F@5qWoL9LcyX=?kX#xP{5nYond$>_22);=Wc7_lb7I(lqy*C8D&Kj4M$S` z#gHi%CO1%*=aMyoQN6-yg!Br<8U@0k_-Fh=%l{)U{X-EK3F#4vr@j$?$AZexrr*v* z5=>C?mBAg2urg?mOLw3tHvOjZivUv+gf_$WcwZy2Ce_~rm||d}^Z?T&!1P5hM+g2# zE^}ma1mqFQHfmsAM4C8c1aBquZx84F3_DKr`L2jrRo{s<(T>wdN$K?^+7fNywo;$J z<&eA86{rt-Tk4zLt~S3b=&cXBS_92(u7>&+SKHA*z~36g5)Fsz4>h$M!sBjNqx;~| zL$0P4kKg6-x4P$P?w20uMeLTX^?F2t>7vf9<2VR!~HO0aKCpQ3ITcWUs;M!)P{Idk#c2I~n z5`1RK1+X^_U-0a&838J(bc`tU_9roh>tatOK+6d%4E7Q{(!T()%xQuL?h5hO32wQu z2xr!Rn+((}ap@?*kGBb3{TqVc+9AZR5q!5uh))yz#uE8k7{}{Nz}XR@!e>OGkQU+( 
z2|oHOrm-k_NLeunkk<)u4Z$rMA-<8|wU^-{|Ks@{BLl@t6pRF)Ixn2aMsV;0AQcB1IWPf<5zFgk#*{-E3EG8hfzgf*FrM#5?34n&gkUzbX6vPIfJ2`NA;Cu zm1GKR8o#o!OjkwFGLvCpIg3%BM9sXKp^_IcI<>q#c2k9&pTbgUF%V(1Gr62yXG)^H ztLz3dkCvBfwb8fxk(oBAc0_OYBOeOpMxPr%B{F`$?NCKu8bGGLQ_ASn0Mg24$v!%( zimC^ZNzL!OJZ%NhKMtbx(dP$|T1xihkUIL(ATq1?<}lJ%_uebf4+im4{Oz9@j;hb# zDJ{R}!o>0xM<*be@(fxh<-v2&m(L(Gy7gA{+8MM{mN&x52FUW}*U&oINdjKtCV=3d DCGi=^ delta 5249 zcmd5=e@qis9KZKy=?`eRLmWC$5EEyR;n6}{6}4Ev%`wLeTWX>c32qIti9a%NAr!F? zHE!b0?MxurEo56Zf|ElIXSX`HMgKrU7L)M@CS?D(8K&USq4Hzz?%tywF#@B>yrlR2 zzR&yq>i50RwePy_7>BMH4c^@RJP3d@;7T9)H5LmEX%CXSNXlO4GgU~%%~@wgiB_JT6A5|&7uh0-C| zZRR535|Ab=gZ3z{)Ev$2HuZC(S{v6G6Ae52xm_>=b`Bvii#x5E%bDrd042Odvw*8- z40Dr}$mH%o3WF-HX1G?S>-a}{{^B`k=IUenjJ=XgT;#=mimRJ-i7SlzdER-ZHh7Q< z{E+n~72I#B4f?RYueg@mQFLDDo81Y#dt;6UAC^_qbO9K9SXM*Rg_88qv|EyOG`$&& zIipfN*->epwNb3c6=m_r_OE06H?aLCY`+EBo3cFFrq!OcCa1?`TI+dOh2m4xm~)>d zr0;*vL(v7o&60bNb0kC;ju~o$x*AQ8()xlGlu{Ifi(+t53@(b1i^@Pn*-%jqRMZMN zFxpQjN#wZB^$#`+P4jbMBKn35u9(3UGq_>~RV)-d2xz$g8_gvxzMk)ep%SLmmnVEr z!T-X5oSuG4!&;a6Bi#{?YJwD1Eq_KdHDPN4U!Mx5+lA_ts0$q6_Q$(tY)IU6Gv zf=9lf)QNSztYcX(Q^6u_ZE)Qoq#wwt%Q`>>TZH-R+?vVOlPh@8AvES6(!v(VFE|K} z2;XkJ8X+ zNC5%h-g7s^qYq8t<0 zO(%@D8-eIA0k2M#XsKoqEl7_MTodOX1a9pVevBldwI#zfYh;cI(SKF}e&ZSN-Lt^4 zbB4m;B86YQOlTQr%CGhS*fnWK(Z99fIMKfYb^)>ZH$dz2@90A%goA5e>MI}+PyGI0 zxE9?7u4~20PaxtQst)n455O$wUUWOeyBkaxdDp9f)9cyMB#3$z^H2DCCbKWRP3v3_ z9WZ&}ZC_Ao9AV;>I5iWCjS3Ul4e;uu`$qm(I2=GsEGUM}aiCZnc^`I9m5G({Jlb*^ zX=PuUvq?HON>@2PyGAKPY7Un2ITq=|c;1Nd-B{im9os3X9eAD`8hg0ovGKg41=5k} zjx;unh zI&4zt5bH>h6|a1kZZ26vfhIU?z<9?yvNC?#gmG%iI3z127$5#ijmv#OiT&R{j-&h^ zV29?Dk55okEy4Gw@p_E=C)hV&{OC=!{TCQ-`c;jem4;sFJU^kO0qoGZUES)-7(cm8 zjVp0=l>32fFn+s%?T=2d{{!RhCV=ukcKwYfl4{inMlnu(qr3qjE(Y05NA<5)<8c_T zj8Nl=7$3ec9$(3yf+xx+IAmfx;gmX%6XPfLs_}q?Lp`O(uWCr4uOK`&0(Cl>Ad0L3 zdh+UnH@3n$uf>`}m2E5DUbfY0Wm&A+tZX_~Sr@yOsCG7&sFsjwA2&!(w~ZU*x=1jq zB`*)VS!}rutfuE=OL4tY=JX63$$?;EVNJJLiJ!}|9IL1_5;&91BL*2(n++RSGjoWF zP-0+XmlHJ8;UX!rEjbw^nj_ol!XfOAOj5DUW+y5_iNW$b){bqLXJlt`D`V~Gvy#tg 
zgJrPH%(u3|G$VPUmDBvKHfS@FCsYT`r<_CM9{j}Wq4`p2OrCJVH2?KEXgA=Odesbn z3k?~vkg>`_>JRglc9exIyuN`iZHKn`6?h|5;CFb3$OBPw;|7>nd{;Z1V<6M<{6IUj d!}=7@KVbF$d>OsMoSec5IshG1Ir8XC>j&_8aTBgzt7VPK@P{%O)sWgWu^`Bn7pq1ynMR6xO~~|zNZ&;Mk3v9 z@uTDNyTxAetonUU{l1947s!_id3o0~3w4Z?V-#mPMi$HWQ`6;-rbS6QeOCoT$_Z=H zY?}#b!ma|{0xGR&ZV8+Gb9zkfwO%M4SE^91bD zqp~miF!ADuXC;ySlcz761et!YPu_Fs=5*tXW=f)olOtB*osk*DUq>p5QzP+;9ofrs z6Y~h-^5qI5k@Xh#Z1E6BMm$tvOMvR#;wSpw@;*!vUZPJeaT}%d7jD|J%0nmaZLs#P z_RL6x8!CG*pnh}{dHDkBOW&cC$u0Cm%xE&0* z6AZW;47gV!uf750o~W;UY1Citjf&-dWB{5ua;IXrPeKMhRFGq4fa#`n7b$sejYX0y zeOYsFLDpN8%=QpFvWopyaG(_&XaxsaS6y}UIpjbmav+Z!C?E$eQEjyv>@WHcmQK{X zR3Isqf~T$EX)Aczdhb=Yl<<}ZkfTG$(aVveZ;Pg5b9NL-b4)qEeL>(|{bkqUZsCN$ z`+{|cWlxRGCl930&Z|_NNB`F_{VJSijMDVbp#z2TteY+JICDb!?fKp;y4=*ihN9lD zI}qJ)C4Y%PoXh4xAkr=rh^H=2r9eErKJNl?VEOUdm(C}%N-!=4)6w=Ad2Q>z`g_2a-1-BB(i>IUwjU|W-8J%l+s`zWk_*Um z<6dh%Z2kudhc_K8m4*v*9?bg$AcGPBy| zNFx8$wtbY$Dlv(3fXy~z4&W)tOxsOuCo`julDjAUev~;qM0sYU(5kQw@2^7OS=0$=@m*8lgv0a|8|ERdeMRZ9xq zs?6O#rbJS3fXrBe)Xw~PGoYjf+h$8=N^?$}5=jAaB3%Pcob1#Irv2B&9c{gO(WGu}t3 zy{3$3l-gm=B+pUf)4r#isZc&W=s&^Ar%Wq|YAE3x22lV9KuUpkLHWt0%bkii3jOhk ziubHLw^pUjm`$|AvWu8S%-s6L9i1E7+q%NBmUwhy%Z9dacV{>rZHb4wVjH@{tt}hF z-4DcKon3L%qIG>s`?`(osJ<<{wr$M=?csGBBc0($XIHqTqa)ne9gcN%M%Hz-buaGj zYF*sX@!*Dqh}_rK8drbJjc~#mhF{y-W}g#rx7s-u<7(w10wYA$G7Q^h7h2ht*4But z?HlrwWKD{mX)2P;MC12iT2NTj(o7_(l>;xi&rDS`6Ye-m_| zdvX`M|8Nj128~GgI z^qau3{`pX`;LpYm9B_+G74%9kaN)d;RAB}1;`fbwCGdUfF{50a)B+VJKg&Ty>HyxO zO;N9W2zcy@tDqP7(PvaaZ~rLpXuFZ`(zryNt20(S1s#fSHS%8ouhU$jEBF=gfu+Xw zuK-_lh2l4X3yq3^{g1@IK*b%JuXF|P10M~mg3iZ*_iQloe*quVT&1@!q7)IG1=~8I zy`tX=75g8XP=WX?;IC-DQhKYOTYxh^HMVDgzy7h2Hvm8Jdm~?tIMQz>EYBG$z6%}p zA2IUPz+ZXQ$kzeC_(vm;1K)MZ$kDke^f41Jd_Yd{KSJA~qJF-1LQtJ9`uZ_&`q!#} zF);-Ec&m{=3w(#g$PWRZH63%%{}NOPH{c3Q$5G(><{A0hz-h+F{|5Z^D-8NO@WV@u z?ayIX99I@&!pEdKz)X0*W8~9-AGpWJuLHjJJ|nLKp3v4p*RUA4^x+ST9kjEVX5y|# zj64V(h{uh*3HU5+Ep!F<0N?hEv3&&i*dIb2`sFD8& 
zc!#!j`Z!Mj-=?h{aHM}fRP56hPw$Whe)6A21tY+FJ~Q&)0Us<@1$~_3zz3Af-}g5JUx#71xd7hVo7Bw`P`oDOUc@6%V5cQAMw zJg1Mq3w}HTFFNpg@gY_V1gRtJvg->b+C{9{$Lo%FGY)K}%g19KE|I}=b_Mk{5SSo# zyx>qHObIYF0ybFSgV;ZU+r>$+qJlqwr!NFqyv}ZW0FUN&>Q{%G_v;PZb{x=dQN$y- zLr!d(&+fyTeU1nrMa^Vn5> zkrfGrJc()Zvo>^*Dh2;!yqv8}+C zx=>|umB3S6!j@Vx)8w+<1os&@#g9(wryibZT3Cv2LrhKT$V?M&8-m-?P>puqYA#Gs l)h50K|9&pzt2Vi)rt4Fis!dC&xpk={N;y!Mx=?MJ_ix9mn?%+#cM4dkFreDvVV`gLiv-w|fj`xW5w17-DD!<3CJOL!E1aR!7WO z56PjFVhUNC!LxTcE9cF#>l2e_R)&VoEm9IN}AmXmEGOw(pHjAA@co9>GTD&_RD;``61 zw9q>iMRSd5ahn0*IMmkVDi)@ME=#UvB%87`*RJf&8CULPdX=78c4lx~*~N&=;OD5B zt-O{oU&-Ykr>W#t-bpK+j;#4u0GLrmtpjD;>`+cShPqz1ChAMsT%|Mbv+Pl=OTOjY zIHeq({cEK*|NO#T`K_s!sDZN;Pf@Af^43%w?9}eoM9fNmHCb0s&&2l3>PSsQn|Z#5 zzB>_Z;rUvvjq`k+)*j~h&Gg+$+o}e?t*jwnb2bERLWAD_3F!ZI=)V#AZ$bTYDjNJb zk2VByDjR}1frg2LXnu;iTe&YIZSQ}+f#PeD!TB^>d&$g!B8hX)en3l^DdT=Bjho}WUkWN%rorEcLwONyeJ_1{l3TV_rMw2==i zL*?aa+wGZ@Aut^o*7jJ6=1;^^niOl}6+ftWj7n8mT2pJ=kbS7)aK#}iRh-IJ>S0A2Kt`A)*ogHg3 zwSApxgHWjqWF|W=9LQ2To}XBXeS9##&u za=#XQ;Ha@7(-GxVcG2}4@suV9uAYdUte&Fe_MyIMRgda?gKn}aL-CB{f&X-HC&V$D zE=qd-Zj7EP9ixa2%w>vXt$ud0Uxw1z&($7TVw6pYPw&!9l+Ankr*^96i#p0O=kwIg zGWr--U1Uv%f9Ma%ke4>XPqiU>(WuQGj5S9+_c-0n)upbbE5^^yY{g7hj(2uxYFzpE z461=j=E2l(zJhWne(zd5r741orZ$f(nvyJK>-u)J@kjLWX_ro4Azh;W@hrWRyWcHM zD;IUiMY?7glebPQn19l0{WW^@OYy|{HmAxHbQM#3z0@e47Yow+H50|tPphMM=&t)d zbhHu*Ev+8NPKST^;wfR)=o>n%;;Ajv9yu~o_r0s>;@MX)v&Hiua`45f{9SZ^)^%M#>j*?4J6Voq@KS90WsIjt1 z5SisEm)u0Ru2#_$`uO1m1g)vC6gbLyisbpgFTYFj#lQ=H102UM1;^MWa)1YTiB&gDXC?5yYbMf- zuK_>*7m_~(oc|liW81+|b%S(7fgig|@_oRU+*1WDzzchH!yI%J_`FRd?>2ahvMwMU zCt$!CLo0Klw}Ho3kp15UJ}!{_JaGFxfiD4Gggficw!e4df9$$tjid7BLU4)A&w&jaz_2gj-$>@emU1l~P|gVGI*sT%FS6Axph@at~u z`55)fW6_PG7!#wNg9aFHj}v)I*~2k0;-RA3pZR8Vn>!##U`lS$2c|O=3Sd);c(Exs z!`O7Y{n(TuZftsmAU1u1fX$#w1k)u7+GH^;vdiVf*clGva4x|ag3c&499am7E}(+o z#?&PVGM-qL@MJE37_SKR#i_YM5xh9T8S~;a1-FdT6g;LVg3oDA5s>gg-9Fhg-ClFH zu!K`{yF{D>k`mABmb_SOq_CcF3@zUimT?R*EaD}(!%lNt#EaDyU11zU6x}#?Gz_m! 
zlw`B7-)Z&@dd$9&0G`()2r-Xyk3I;+Ry!(Dv0PnsWG)0lTLA(x0_TVj-{6RbnrH=EF zA~Ie+ngQ!0yIrQ~k-afUNR~`pWxuJb95R*gIZY*ef~ka0jNr&VS;h(b0!}QIFTEmP z#FQ%F$0f04n?Gb0jo&TcDg0h@3cno0y7~j==K3S%js^q)>lzTv5)DX!P~!O#_7f(W zo7k{|Ez53%0}VRpw69>j+4z8ShEIH;U5|i&gimCbqU&35jSfWXN?9ojzNl+ED#90$d?c*m$W4vzGE=14}PnMn4#E%<}pNBk<6_vCU1mB-;!!zs-z z0*Gr3hY46V84jr%f@g3dq|pjCXz^#$y9BWX-=;sV_ZPVL8@a}O6W1g@(|AcEP~eLq z4c3bL!I#AY;HzS#Wyk_RrD6j4l0|zl$APbl966NplfyY1SpMGf3JEM=Nk+Aggf{Kv zoP(qC&J%Qj;r8cUCk_<0(KsqVRJPGr>WqYe9CcR0(Hu2SR4l+F+_uaEkRKW{TY=}c zRpP_LW*g!*i3b8^JK}bUkB*w~)1v?H`5v>1=atVT!7W@KV^qdO)$ zFoGR;6+3VaI}nr{5IlS9fQgG)P2Dk@$rrPmdXWQYm$i$fi{QOdrEPD8ps^4%7J|k?&<{t54kSb;62gmw=#tk=SM7stw+-#X z^r4?f0i78dOGabKXe=2mOSadF`H^A+NU@iYVkcv8rGG`-cVtH^d_!U(9&JRZ=TUe_ z5%GWnDIL;FiP%{njy?OMtOoV30kqd@RL2jb0xO^HSaiZW+l3Ok1LXIDw(zJUM3C+( zRhc~!=D$RUPPsnBGwn}EgzPXVLVExBe@Dnk_Q{A4@ov|Sw)M*PVo3^698Pp!(rz2DIGyH5Kn&hL&G#`q``DBJq zrIXRwcqHS@q{Gg5{H?imWO*YJ&dQaRsK-0SbWVpOuIA{AVV8$x*sv$+WxUa8hH*z+ z-mp6q4oBID9ezTXArsUIwcy@scu^Sk$yf^x_f*@T#`ccqtL;kaSqrFL{o8T>Z{yYd zz1V&?gh>B=Z8s!N>OmQ2>1tCSB8?i9H1A4s-$ZsL$%P+|EqD1}=m+u8R3y$bY#*yq zcZXl4Y+nbaE_bg>wGIsQ!u;zF^j;^|8(`bG(|`}fIt9nZ+0|<%czxVuz)ByZ;pRB2 z*8DUy2mh{Z&~Pls(yE*I&r10;Yz-cT8$qg0U0Iojo59W_zPz$T-=u;YCI#gZx~Qk1 kZ{naqZC!(*36|WNgda_GlXpUJbE4Z2m{LZ3j;3h-3sirA3IG5A delta 1621 zcmb_cO-vI(6rNeyEvu|M3npNLG!TSEv1}1&Ln^5tKZeE>gONXjK-+8xQS>4lu+RW8 z#zb*AAc+yUdNElJp4=WhYJ$dt;l$M&dNCFV&UR-g5CWJm$@cwy^WOK~Zr}J9nD_{V zSYxn>gD{XWBe=Lk_sqoyF&S8gNo1Lj2)n5=f1%|ER4Of&%_P`$k0&9elg3B)%3;d{ zKTHrgMeHI+jDs)YT3AwabUqx1(Sl_$l@%~)vrOP1@{NdjPJ!im1?N4D2hU{TN8&n; zO0C@Ib$waNhchqjcm)aPSgZ7I1(7o0mO%P%^bEo^JeK`dS8_%&V#&cELLs=F9K4B8 zyD38mb(r!NLOpQ1L@d`zMYpDis#YdeYAJjh#owm*PKv)v@RDCEmFhG_3TS0gy_S4J z@&mA4^4ONvNAGI@h1NKCrhX}Ji);TQOc{>2&DgX}8YAGjVHS}oDVdUzDJeNcs)~@R zA*6hSRBb9TdCQ~-zoY9){x8mDCZcR*%2uXqWx`e;uKr}l!3!(FF)m&Dzvgj3fBM-{ zt28=fPw*X;dtK%SY997iE5={tTpYNEc#qGmx>dXgop?CR#~ffiWI@?)R!VVztgFmp$z1Bs zFWc)cqJFRsf0^NzHq^-2s*|3Q#!FTQgeQewmV0cvqqP2 piUjt^Ie0vhC$MNPUW|CT_nmk(QpYWHVP~{XK;|ULc}FU^{sEw-j3xj8 
diff --git a/hsa/gfx942/pa/pa_bf16_pertokenInt8_gqa8_1tg_4w_mtp_msk0.co b/hsa/gfx942/pa/pa_bf16_pertokenInt8_gqa8_1tg_4w_mtp_msk0.co index d822ecbba3372ba14720e975c5071960d263c554..ff65790c0fa4b306d767564ba0a7c5e6d0c4a62f 100755 GIT binary patch delta 8807 zcmeHNdu&rx7(eIkc54S+pNvoz*g7Sw>$di8eLy-g*aK9Y6L}0BG^K0lq+3f@3=gwg z=0-L{5V)GFLBmWG@e!EZKSU+#LP$h~IFLmCKt+RzF&YONf&-oB-k!r&M2)<XfGR2hjKolE-`tR!3uhVseLSuc`e%eyiD8MP z@pM^8HmSHwYBpqqGRiS2qcs}lT9m(W=aq8h!bj)7Q5y35+kMd!!`wIWD)_5(y&_$& zp{tquLuTh*%$ZD`gj|@iWt@a^Zat;re$EL%wa}cVTarXal*O{m9i=XHtgKSgX>sXe*2*cJBgjtI07plVt!}9(mv`tIMcLS) zYl7@(5wM*q+5kXo$r6JJVUviLE;SS*TrA?{%ME6P%_44WG)NQoQow{9FdrTU=<7RyaTEQ0h6jGP^_v8m{p6A1L)4t z;}jh?y@DKkfgaz z6t#L3wT6H&l($&MG87KNy~~L4?<0GcGU6XzNb-{M+XFEmv>C5LF%r7JO`y9GiRAp@ znreE4{QHm7>?>D0<7g3({=IbPy-tt2u~z+;^vF!CA3~LB0;EUSjMC%8@9CW$Im`q} zkHLy#bitmhU2!o*7IwLwb3Fj!xpa4YVGqJvUE5q+L0rLI+;ouJIK4^epZlhi)~Q@z zxuLusL0uW3DFwx?9(h)v7ZgdWek#`*p{XXAMV+z>pFFhVueDM1|Jvwwi?ds-jpP~Z z)$BxF)FDgPx`^vDoaOp9za#WN)|&G46&m)wBjbjfCv)@@XV`)Cu9LVn<7pvwa9YaO zFV51tUkD$f#mHP>yGO?Ar7f~_t(Uke(}YZ$&>6NWz3V3~hna9`v+;bvGb3GbF?C2C z)gw!;qmBv}H#O3!{y^m#_U;$**&Z2nG*LfQbf3&f^|zW}>1|R_?-vR?c55^KvbXeq z{dC)#Tenz0U84m3ON%IWi!M<_4JgvJh!Xnute4UxmES(7c-o%Benocp(sXmF^uW55 zFIc!>`d-P_ULGr!K2)S@u_W{lPK5M0hSDP+ZD&Xu;OUwkpHGzZ_~L@Hp!#xGTuc#C z6!oFc@$s8W^}>el&Q6q~=p21$xG5Xf3#&#FKBaU|$P7SCyRZO!Ey@}Qb!m{LorC4Y8VFx0lz7jcI@(LkH0)#q*xxuXG3 z)Ex=8w!6KaHh24ma5xl+A`S04&)Sx@wW#0cZuYI&u-4tu<`233p@`cP47x+@?rGMXVM!w(XTw}2}dwtfK{yAQ&&0%qPZGNZ4>2J1J7@yVY zWjtQ5-{C9bUWEqnkW$V@*Cg(NTqjE)3!FJXn1q~)`Exh$M$A9|NV0wj=H>H}c_-$> zW(ly@d#iEBwaV+11nMz=#huKr#Qd2R$$T^B4_78mn@Mj7^H0A(7*V46H{*_-W4=gC zH|Ae#k-+uKPRw`RpuY$6``^B-@F!Rqlt{enBTJCY4r#U>>OgZKcS*!+RO%?`OMkbdHj zoBwtO+{V_`)PSI8jW6hCm^#}SbYN&%z!27AuVx6#*s2M%FQ_B1uDq7OnsQ=RVV_S} zYxNwWYjrRcNS767wp!=gu^pnV7%~hqpRjdyCx+$DTH+d0#}FqNo1M&o*g06YqTESF zD6cIix)nC29Jn8^w|QEkK2EMK;kkVHIu%y&a|>XZ0Tc)jLzbYr=)=( F_!CtB!7=~< delta 3941 zcmd5F7(VCrwzWvp#>X<=RCoVEO#-bRvxW%N% 
z3UN!5LC)MYk)ZpCaSl;AOXP>{;+AD*AsPCsKQO6@ON>ENV{k@6c5d&vTrM%~3?y5V z^1knLo^#&!zUMyWoUz+%{4#5Z)CK$ugf*I8LlF}%?zQcX^gj&E!F(gJSx@IW` zC2E^6hztN^J9nN=#PjL+D{8wZH4A`ZR&q+@i42LKJP}pH7fUj)hdSYVg>5;~va+RF zfS$lxa*&RB8J6KyjAtoo&B;c4jY;%aTY>sa+01AXwKFbe^Z_vzbd1eIIoul%;CDD1 z?CsIPC_Pbz6Qx4c5yAcZ^-aXW+um z-Yr1rE9w%W)IUmmG~Pdqlg~`R$X-)Scz9?tX9>cYhld(COM_^)aF#~VHgc8_oGCRr z!ZxETyxdq3USX^Zr}&#F|1HYjO8MIe-{cG1OtsCOzB}(@%dO7 za6!B)Z<=ZNQ{N@%J6YkTCMxU)QbAT)kd+o>r3G121yz%RyriJjq@bGAqSjOb-K>vd zO>yikEQLkn>8!L`E3MW_tF@A9(U|WDvX`=WPw@yFxCfOIo~kELct+IG5+8a=s=$(_ zU)WxKl0`Kuha2;!It2haQs2Kcor~5CJXHpnr)Werizcq0L>DTu14|z-UPJJRM=B#j zH$vKyys4-lc4K|i`=xgS5bkKYg!PArf6&|QJqUzJhzHl;jL-Dlm`}@0Le$?gh&$@F zDSo=-$XPZ$0gv}T3vukteK4}`KiA1&1{H5@YnU-eS^-*RUlytr9BaCp zv6XmYTc0ivVU)BB)d%iqkysw|E}?Ro#^=|{Eb>?dP^ z$n@fM0F8L!ICwxxpP>sR{rAza=i#HMAa+9*MzdBej6rmI%^4JGAHd1(>CBH6`l3b$ z2ho-FXHf8fGACUvC=njU1z-QPgxOqzulAcY{}*vI(K*{B68WZ;K( zO{O1E96Q~Y6F9~yX&0=J;)_z+bo()M>3B}#ulE&!ylPlffehf_mESVINwL#mBc8bK zf`RSi6_x*uFSG+hAp>~yqLe><7fAZ%Nf#`+bU!MH?T{6+KTEBUYAqUE^98Q?`*k+s zj!r#Fnqc+$+e=|QMNU_NsEhLxx$s&(L$gVDFelbrI&PPux}^R0r3-ZF|GnTJc~L_LUYQ1e$({(~wtUPq`AGXA-Y(EOj$fg2ww1ESX@ z<2%%N55?p2_(v(e^O%}{g5m>Tsqu4Cemw|Hss}F63E?p{{=1C-sK&2SJT#AgjpFr1 z>Q3CGcym^>df*-%7bLk|-! 
Y;B6IfHIpplTZuI^)A#3Nap|Ie07u}CR{#J2 diff --git a/hsa/gfx942/pa/pa_bf16_pertokenInt8_gqa8_1tg_4w_mtp_msk1.co b/hsa/gfx942/pa/pa_bf16_pertokenInt8_gqa8_1tg_4w_mtp_msk1.co index 1d56b231d87126c281c764aef8dec69f2e4d84dc..f875317d421ad3fd7c82b56932cf1dfb5b9d7882 100755 GIT binary patch delta 9405 zcmeI2eQXow8NlE7{1K;++D=S>sA|$MD0bZBeEvobb!^9h78Qs>no0)CICdQ7#9rcb z6_au%O;9$`F6^id)FPFA&<;hk-EE_-suS5n3qc3dKMs&GlE^F%M)Lg9FrJcw<58j*c zbc39A{X}jY>&$w=WH?E&I&gYM58jwDfH!AWfwyLcw1;v_8#*@tu(YHF*_=`J+^7e< zJL91`M*~#XXcIU%YurTvBbbmRX(P%e)&9|Dk1pHNsvo00hODJ^)z~f6uVx2ax`q1I zbcwQiw5CUtJ)<=t$SxUy{V&E50J5E(bta^nM7^u4t_kT)qTbzIXGXeN)O&jBq=|n@ z0TVL7gk%$vO-P!NG$U!OViKk*TcWATkuX=e5|RNsu>rfV0ncCqcC+x_8=&9NGw^zo>HTyAF~2v0wc^A)?OjA%fSm z3_>JrMj>+PtMU$!^CQbBM9$hiP&U5&<#zG9P0nb6t0g$Vt zxIIfh6X;FGlJBzI#}7>ky4Hk}s-%SdZGvP_DK{D+6i8N3k324X_~b9Yo*ZSqPmaEA zcJ>XEqeY6F+qsgxyfU90?NgU+a^&s2s4RJM6q=q^ao0j~tW8nYyT4Go=Y7c=CEc^K z=a4skpSZ>zyLl%$>Qk3(ax~w+TtehXW{&-EvAjd%{K#?%k>2cFBY$%|Cx!`0jxuU_ za^zA-$6Ex%W|RV*U!wU;Z-HK<%MYqS+>9J^qGdWFstC+>{bd#C%!8EKW5o; z*hb-#W1lD}uTl8j@ePWv{jBtT=Jc(9*ZPK;Q$WQ&x;mdV9a&MXSrhka-L&xeT2v*;i4;@_6s!tSMM^>WFabb9DMbA2_; zTU5%OhQoQ%oKm1K{918l;ZbN=1@*kC71ks11QL3^_$Chk@|N+c{_ufXeySDfScYk) zo9Tv)Ey>hiUu-Cxj${&pk%3rvI2Fz$BAM_|dSEyljSPl|N7CukPzJS#{xH(lKiG%r zW8vP|-jTj=|6n{7j;Dsgkz_KQ8V;w2Qt|#|Y`A53DB6-tK0ELjBKu;|j8tfd+g*EU zM{hJ{sgJv(7Q2&nM(uGI?TYu(G!wJ9qD&+jjXPsT?gXq8Us!6`%-;MO$a$rtRDgwd z2+7OUn1grayb<#w+vWT*%r%?k+=Y4CEGcZE){iR|TMI~1p2B<~Ea%T)e&uO7?;Hm zkcBcXE3u7n`-t(3zpx~$)nB+&K_^kI3|$!5ZgZ14oB?9H)n_S87jU=X1~y+nQXfal zx7iEpu+eU^4x6=bt!xfIG0k?zG+V%F1;5bQ`Ke;n;6q3KB%Dz4WX!=y`8yZi2*Gn?IvXY z`R1Shwez#P*LYDA2xy}HE{~gnuu#<;olXUu&Ds0>Ng#Cf;1SUB9HK!ZiOKRD7?fj1 zj-&wq8OQI_O1uECZ&i*}pc5|RH@-gj7U=%MUIv0Y z4fVk~EblOg)>&sLe|2J0aI|qFol*iHjW(9jDW%d_MyGhC(MzXnfseA{SzmU%*_RV< z^(~FJ`9k^|u>Ol!zZ~mtMf!;jUv{F?my_uBElphQ8*N4Q0eF;kOdXMhCLf?n#NBBR zsL~&G^+8>&I(So67knMa0qJo-dK{1*2c!=Ll#c?kqkvYRfC@uK^&tn^u*|n@?icL~ zvS2#OPLHG2<7o9bT0M%E?{l>9fr=Y=VmPwTQ(k|{QLjo4*9Tin5vMv0)QjIHG!Dm&#`j13}_;EXS&@Y 
zwr&1FJ=Nh5YYz5m_|rT4Ji}DP^>jN#tlTZTp6uR;o(AN)s``pIbu`i*w=^@?9{gBt zznF30Pt_Atp;haNelT+t;Cr}!;e0mi5C?s(kpC%4(W*7f8m$XTr64`x(JFf+Q^j|# z?Bp{JbaZXWfmbM=-li7q%@3yjcUW`ql2~!HQ3Er@TgMr#RF28Q>1t)*QH?n7Ouvfy zWwH3z*+r`VLCX|qN^@q*Z8AGHa;K)-Cg`EkAqFlUf09<|)(DIhs=G?&!_z8p>}y)* zIWn$er`tB@!O|g~@0CND>;|FM>DF|iP})`EysLVh_m81S2c^W$Xb1hv;&|?%;_OE! znmqicU7Q~7sf*K984YROvv-E-f>MP@W37u*HrB$f6Ey4ywZ)`w6{wB6s^&mC2`=n9 zoP~ExcAun1+PeVKLd<%Yj)rVB(D;p*yh>i1=Zr&$vG1OxB}UT8lUVUkpj`N(!94`6)f+3(sP$%X?oRWWr`G8x=O%yUoMS%?bIZGUTgVdz2!nqSa3rEp{ z{8=bGR|*^aF3tohx4*KpavRIKoDtEDMJCTh6c~%!MJQ{u64Byz5Yg#06Vc%$7R*J( zMAztW5@SY-s|e}(CC`jIY}jaIi?Io$!4c8SGDPHzHX;@=M3+Sup{|IrVwVL|mSOU+ zXmVJLL}dIXk`ilSiDA~tk%(EFi)3eFOhjZIZjv4A;z-UWqm9I4GBaTxoRI`!%Ckv1 zODUVVqENsqB-zN3!kdb1q(COOg~VfKi^8HMtZU9QIfO6Epp|En^M$|5;9jaSLD;hi zE|11fhALV(xe0P)dWjZpjGJDg1%nrINmzM|Yy!d^FPtyD?S;wFN%DEBNfJ&B=cG&;~{{S-M=SBbk diff --git a/hsa/gfx942/pa/pa_bf16_pertokenInt8_gqa8_2tg_4w.co b/hsa/gfx942/pa/pa_bf16_pertokenInt8_gqa8_2tg_4w.co index 23f3b033d111edfdfd11f24feca2573358af382f..5ae898b9175d0ad4e1c1bb629002e8cc5933bb19 100755 GIT binary patch delta 3413 zcmd^CUrbw77(e$?S_+olO9pP$0CuE*+$fi3v>m&!GCG)B=^r-DG{c)xdMUHe+7`3; zr?)gfcvz-*Ud)UU5)uVNHOeR*4-U~3<<~76+i3rpipDCYWa&={s!eiJl=S! 
z=1U66YmNIT+&5xqZK{DuwOt{faRIWhjB70Z(R>4c#(YX`;@8-B8v-T%QWM`)Y~fqv z$C}1!BnEx8&Y~OZd(hYG&FJQO)_k#CX%E^Es#MIVShm$(DDdc$b)F6u0(7Y0K#R9* zVH(-cf{I!n1s%077eYK!4EM362*0No?Q1UGhH*@Us@#UL%oznE1?H@RqXp(1B~@&w z=ca8Ip<*y-u>-d&JQT7xfIAc(iCCP#oeCcvwWuGSq>&vQumd^(9e_?iC!o!AFljfr zk`5D>beh~r)qyB+;A!H(v&4ZhiQ0Jw%zV;r?oT?*{-o1<3><)!!*(i;+g||(bGpB5 z24a*qn{_qP7j+9|LrGT_b@Sy52!ycitPm^_f+a$*LG>c2#XzIt`YKR6za=#WvA9D{ucx**x@!zYv?8nwf<5gfNj9pPdw@X2i50rn5pU zl@ii9A(KsuQ>jF*E0>LTrBW|VcY@`7BA!<(?P8B_obyb?6Wwj%;dpnCm-EJZL?7o9 zCpgZX==R0kv3OkcCY*ScvfxZeFBuRYB8wvMh>8aEA-~=JD6#K7u-#rwKn>`fSoF*n5l{LaEqzA|-I1 zfkBPB4|z1|891!bA-7MXf$kw(4tB!3N?&Qj<0Gzm?NnT4@Z}ND@>>kv7~$$Wh}In} zwuC(O+R+HG__>hRNE#vSz8vzhT2fB_x-ds^EY$jDn5oy+#ze6t>}e{Jr52;gLg8l) k90+@iT6GPthP^a5ioXr_)9t6RCDLzv<8-y_2QX#ZpRE^l)c^nh delta 1643 zcmb_cUr19?7(e%HZf<(J=jwxhT9byj8M>zEHpd!cIxStY{y^g*LS+OY#^zAz zAt*ULNIm4Dhk9yoeJi;39%ygklRX4Jg%p8^wa&fw+;k#p;DO8U`+L6M_g&7n^q$G& z7*jeN>4ppw)XkadAhd_FXDnp<|7B%}>)ea~M9jdM&*pp`lMHP*2Ml3BF{p0Xmo_@jUdD;n7p&c%0qD zDXtHG-V_(P8V0W*Q;pUqO>`3E39%)Ff1u^LXQlWP|AAS@k1Ae?j>-{TdYpSAZKom# zMVZ2Osvn`aj8TM!WW0jV2vcZan*=-S6dbHeXk=RjB|b*u6EyxRjlV|XIlo}%0)m4J z3XNPy*q$Kw15jwVuiup)j0ylnMXvfcj4vB!Bx6=DZRn<@hd?F7Q$jo?#8X1NLa2=p zauY&cLdd6FN`5pcVE8ACXunZ1bb2Q%0gzF?Z@TQsb^fOLSe?JsJspm0~r~I@2DIon~#AHA`@-N~^>%53Uf*z8DiEcIFu=(SlT~650 ziI)By2=g4Sjai`|VPDM3Z*l6|(cYc~f8nceCWZ_xUZ>0H!a1fE=M#ps5wu;T&x}I3 zf=005Ew)!tJ7_Gn+o=8dzW8Qp-x@5ASHp~;K1%H2J-!85hNVED>q>KV5}`szw#UA; zAg?jH7Agj2eI1=$VC?3t+hf;V?P0Gbwz^!J*dA`z#I8_S6GNRH_#WSk3j;ZrXYuQ} z)10TnBO_glyUp4xx5$JOZnK4K0r^-E9ww@vs$}pl5*`!P$f{aPczBI5UEo3o&Lylk zm_%l+Nud`HC*5aOY0s;4Wjp93V%&{;AStb9Sp$rK0G`aFhwIu KEON>f2pnE+osdPf>75AKMLQP!S+9}g8MsWZf=qXHf4 zH5D{re3n6tG)Fm0{j{g0ZYi5WG1H%D-DEJ0&kTR+NJ>&h%zk?Z{bU%TepjD&)#qRM z*@x~oxKXinAAOTTDcV(clQN)J=>6!o)+oi6LJUixA=ubyuER6wTSeyX#R&1*hRo<) z_8R(fvb7Y19Tas3Et%hMIZ^PPH~Y-_c3*pG!~82Pv*6YW1mCT+f$vuifVGuMx8Djt zrNV%G+1l`W&Ii6)@zMU=2|AGDz_}l+(=@PxvnuL73UUo^NjEoNuhV`=~|k^oMT0M4i@*XEsm$jWzGt-5_LP?fDaqwhc|+WsCTS9*4r 
zL`X=4ghWV4gc^y}TPW8^;t@=u8qgoH#$NQ8t$sFGOCp|r0znFb~Jvy@pk#f zM(IOWjyfMzH)dX>TU>If8%OuOt=mGHKU;{a4|Ix7yJ=W323S zI(P@)&GpBViA6Cjq{7)~BD^FDnWT`7hOuj=mnzkC=)TliuFin&3;natz*DZ6o_D3o zbwPKfA7LtAV@L zeK@KL!E1YsyuA;1~&eK{xa5I!TM>epCY8Bd~-b`8U{lQh8PSnn1&JH z7$J@k;TXLd215*n7z{C(hS9KhXe4@0 znw!0Gb8f~T41|T*GZ++RFc1)CBpMRta3m(o!J#NTnD{QoZ7`JZS;UixQ&>!dEMm}k z3LD(r#JO)$C~{Xkrdt;RDAC)Cc{jC<7>(pfWc%F>?Ns1+^Z`F#bn5_f-j<# on11v3%`lV-nT1;qk5XazO$;_t{qo22P)zlkT_bwwXMSe$Z~xe}!vFvP diff --git a/hsa/gfx942/pa/pa_fp16_noquant_gqa8_1tg_4w.co b/hsa/gfx942/pa/pa_fp16_noquant_gqa8_1tg_4w.co index 7603a30737c7f15bb4ef4a4cee465071b4be5811..51474003211876b60841ed20a12ee6685392e455 100755 GIT binary patch delta 4028 zcmeH~O-vg{6vt;512)9$jze5zMaH7l+D61IPW;ixp$-tD3S%#X5JDSmjIps>gI(KH z6$jg~njTQq7R@ESs^`ZKIs!ZramaH=9eNg#I_Qq1Wm!j-3D4k&4B#(cHI>&;zZX+LG_v^8I&w z??QjpyV0$dWAqw@(zL5?jnbkw=;P?umN>{06kh zeqQc_ofP#5%^Kcs9w_Wf*+Pofc537CBOkttumlk z;p*SY`@mPrK01&epo4iEc;`nhL<0`olvV8`sjYq?AM~-s&^cHVe67Wib0nrC07svK|?*135Aih+R13S8?}$qoihHf-A@TM$fkY}+M!bkroj zc!ma6oPZUp4XX{S9jhHHr|*he^^Ukr&&Tb0XIxHTgd{LZ61Yqf7?%<#xDGb~hD}%* zL&C*iCEYt|`42I~&5WN`USs5ZE5FvpG2}CGE+|RI3<1S(>7-9#8_yUH=$q*rd z2nj^E8{&S3*LH2+r$K}VK;#%3PM{Mf(1jClBwqfYsq#>Fv(i}7RNR{KV%uUv;Xgj7 zhmH7I&*`Bfe#Ue9ZxOQxB5uQoO&IY>jM$v;(sUe9gI^9_)KpA#`HOZ5@zE)+<+q^L zQ~Xkmrx<Yua-&$>6(COVaQBxLiFK~*>D>wQT1>CDqOl@L>ohF!Rd7H4&Gwx zPNwE&q8TwA&c)}#vr#dd5_9ozPRyidvtlGXCuZl<=~O0%dqiFj&m`t%aC=moj!w5vaN{pp4VmO%;Q&};cNyQS$Xtq0>iF7BEZ_b{=k=LS;oa|U)F7FiYo{mI&+G8(9 zdR!jf6LH17yf-$@^Ui3GH{uLOA~8?Yj_yz<#0Ip56;d}T^8i38lN!OcPmL)0oGgvN z(zQ3wk*GSBLlZ_mVh2B^!9JkgH$TM(~rMMidQ_ICy1m{8P!LwLmTS)Yhxm zcv4k#1ILxBNa^X91sR7Ig#Ob#AQ_&DCPm)uxuBZPGhWqnd-_!KeBa-F2m1%qXrITe zniu#Uv@yIwk1^xb;I0-$P{unt&-MNh$gYSNnB}ng*-Y^BI7#l ze#isWDvuP%TYhvWWJZ30ZBmzD`x2TI+=h5gnPGfP(Ob>vuHe?G?q}$M;Gx?tBhyGP U?HxyxBfYv$#+9!3@H5;00$TMVa{vGU delta 1891 zcmd6o%S#(k6vpRHG%rnZZxkOTsEGI|HjR&rNl=MZOTkA+iRMAyP)VxAv`Z<(h89W@ zT0In86x!0Ql)_LbMbNsfi~a#Fvgoo4H!TuGduQfcsZ!`t2f}yGx!=9>%eio-@QG}` 
zCyw0k=w+FZsAbtvFGt6h&(G(WWIT(g5;b$erBn~SW-c`z*KF9#h5o(I%9pYuxITMA z%~X0>Vg`+3qs;SvZ-fYGZv6jwkS8;Bs@ptPD85Cq@p8lsR=Jhg~k7-c-(!F4CrW2nCl4(Ii771zbwSceI3HKb$I z+7a>x`~i4J&Oue<9P5uJkJs1qRJKc59L!=>Ki6yFbMnNTd(7+RoHLzKLqM5cPzQnm zY3}x&yK^@K!LCap>=@`0VX!MC!qCu>cW8J}RKww*2nRa@EAdTPb;DNN?-ci_QYgk1 zr?{3?3Xz23T*q5lr?9|Naj82Lz9f1a_$A|mxtQo-;_u*hEE|Tei8{EIq)yR+>k9>Qox&SD1oh{;t**ykW`PnHv*AVzg#m03#op`voW(LpZJ;ee*mJ{zEc1I diff --git a/hsa/gfx942/pa/pa_fp16_noquant_gqa8_1tg_4w_mtp_msk0.co b/hsa/gfx942/pa/pa_fp16_noquant_gqa8_1tg_4w_mtp_msk0.co index c5c8e532495c7cbc04e83ca3a921979454a127c8..3d4c113a331e146926841e7907edf155b9f62102 100755 GIT binary patch literal 65080 zcmeHQ33yahmcDrjNytVvn2-)wVmc`nLpC7VMXU-**g}!;A&m^(MUhlekt{`}5<&=7 zkFr8DG$Sz}A|N7)t&ZBkK?i6|A};L$jsd$roKI&EX`7~L8x?KErRJP>?@eBkN@Z=E zhL`WV`OiIfdH20~%Q@%XbFL#NFIVvK(fSek=S$8KA4Vf+8GZOFCGtifFM|Za|1c6l z{DF;awY0bgwz1AQ2C{m{=Er2LF3=W9vm8k{&0{+l0wGZ}Pj2ao;)gWPQQi*6Dqdmb z(R5%ULksIh3}ys7o&eeYwBzmh`7Qc`tq9MrzsIAaA*2A>;qeCghsTnxCk`0@x}kX# zWpm`$J^;loAa%ie zvaPk49xYAlF~ldOAy!tDt@S#>0)DV?4+raWgfSi(4fhD=^AV~%9IVd~sy#Fs?h!8G zBTV*ius%na;i1tmk8pq1Vm?Hzhokj5#5o=s4fhadF<93E;1#|F{C{;bYR&^CznT`s{Z zL+#-8p>}X$FdfWa(s>Qu8B7N!g2nm#lAZJ+^ne4?oKE@# zdT7?G9C9S7#9CQxF;tmK%1sqELvd-XsmOprz}sjQctO$#K^5$%U@R*wsqlNg2Opc9 z^a($lRBo&_6qgxohBcMeRjhNV3{D3%!}pR6%kuwT2%xhT}MwR1@h~pHO3lKv9%ILjOVT8>T`|B zT2)$E(ZqFos)2X?k7bd*poMcg1}jbli~!4n$m}3m#>e_R#Uc+B`FK$FkykXL`PEy- zK0e78XiUiCW&j{N{|0)_WS|Y;B6S|E4+=-w`6n{FODCt3x&z2Nd_#x}PLA7II!gyf z`M7;Hy4}!L@NbiP@bzz#hT!Mlrb5OD|F&dAfB!ZuVt{{JK4Q4gCIV;y8v#YYeTb1l zTQeYu5Zc-Rw1Dp*MhR_~0ZFuf8~!kxiJ;mn_^I4}ehF?kAGaV-Bt*iE#9!44v_PCK zzT}=3KjLcf9T8@Cv~6j1FPPPYeYh9oNtl&4zsc97X&UDqC!LEXE>0B3jdO;D{SWw_pkEUg2KtHIHM>e<2BKNkqbebj&+127EO)bC{-DHP^*-p*9k(x+x zjAjh*ehEB*xXD(A+ZaB;aJPgb-6N%AhPp!~jBrOtINCj0!m;kL5<UL51 zjL6Fxh+cIs)KxFkn-ift(dSd*vrsTQ1tH#tY(8~34Bk12NfeLi8CF1V;rVBMBK*Y+ zK`={nq`x?KUvR6KkAukG&2Tw;<@5gJeTYK7od~gK64lVom!gmdW0}v!5*R3o zBS%VOp~nx84}{|f!tq7&TFV^w?2W|jc4rG^_Lqc6yPF(p5q$jv+~oNd9E+R0&_aCt 
z0RIb6@Vg3iMh1?AM{i@xPo6q_hg*oVLr-qEP$8ix6x&5%iQO%zArBo3lMo$lVT#@D zAOx@7RuY}yjE;_Xk}>h_0D4_CW5$?W@Y`(E=f`h_-*1B+u2Vt2*_j^`e-B(UsUEx( zL)o=+k&t*H2hK4(TofZBoY=;t4~u_TLcPkZ*N5QCCUI1lS;(2$#IL{f`$R{J;^@)N zaPn+R1o?GKG-+-bO%AqLN5L~XL7#x*9!12~#1XZ24LRDP5%l(KxXu&F##4y}0!btd z67o8T6Gzn9lQiMt)R_^^WKFb~GIO-^DzuBp_i-nZY3+&CaFz7>SgCEfK(0c65sQ7? zYA916CSsj{eo*0^Ho)nsHmX2TG<@k%d9RE%)$6wOR@kfcJ>QT%M`077|8I3SeA zMAPz^7+M|^N6W{mX!+Q9KxjW!4G8VWP634WvB`kYJ~o|}$7ay-SPd<26C0#@FS#fl z-$e1{EfimAr1hqCM9fD& zc}glD^9;q9gA`+*hkEc#{wR;ui#tO3<6k@#S_|z|&>!N21Lvf1#^t+2%*Q`@PAX4$ zhNAi)#YxXYJs4lyVp?y?5z0?`@my#6ST2FI?@-trqK3unPD_mkAd_%6DJ*%#b8R@=y?woop;`#IHI>e7ZRsQHYCGa5pQs&xIOcIha7}%S?5X5*yXB)d{Sd8xq-gsy#hSmq36FT} znP5~_;QEd;yF`@9=<{E8CJ4k{9PhX?m#8IDUMEt_YNI%d_(=S0EyWy>Vs0D7*+h`a z=ivE`w?IEMKNsE;*#EpXit~xDl+V*rTp&`^w^3Y(=QQ31?@}1&_*yznJSXiv^jxxj zK{5N66mz#yoU@JMyay=e?WU-I7|tEeZG62;RD--YV;t&WaP8q+ovQY6r!FiSlR8c{ zChX>7zci2!(|jHw;l6Qpt?KR|ZEny%Q6=k$781KjIxwOcm4>sk%C8qf-81*`(B z0<;0z0ILD30c!wj0M`Jn0kr7O+E?q&*{!Bra;mIIcHeZx9p7j#HM8KpyZtYe(Ap zN2Je5wC&GGpO@&Gzassybey+Nz;^@MpLhpx*BN+#;obnW0s{Z!yA7T9Ncg2QA%s{& zidGUWiLixj2fy?uuEcU2$`3&@j}sIv)I1;Mcz=+#l@fcA4( zfwo4S2io6J1$0%^$3O=LSn~jvAnAqAmd)=2!#tny62J4tLZ7 zT^n^7=t##}pw~uy26VKe4(Ph5D?pENtOI&o)R#cVIO>6}kNOJeSjT#x*GF9iI?iDS z+8*^S(BmBrpdC@)C*ZaD@Au(1G8%)^3JZhNOs3$pl9J%G($e6xRjY#2%FBb(Dl3C) z7j9{7hIfSc;qfp;=O2!T;l2I(*#;5wu0K)!#=lV9^eM&7U(kAQoTv4+d`=y|3dMJPboh21+91T3a$6_*OdS4cNCj_PD}kgH{!I^-`ii&dWS|* z{tKZL4@XcuI-1t|?{8_nV`C}5W!&koT$g5Rvj+65xg#Z=OOe%4j{sf4i;NM`tBi4N z$?B3$?UVaoxwHLRV}(u`L(620EGA>1Jdd(OHj^=Oxr_n2IP&K(8Dk!kF;MPDejbxC z^i0OcXEMejCS#ypfHKBnCSxpNGR9ISV=QAb2I>u1Z#k1Oe$Hf!6->rh$z%-FD^SKL zVlqZClQGOp#<-iw7^rt(y)q_aR4^IC!ek69lQB>)K^eowWQ-anW7IMkqmIcKsJCFf zdM0DonT&yY%y7vVs0$5L#z4KvlZ=6S)NsfcsD}+j#*pi8IL_gaF;E8_ij0A};}0QY zpl&)483T3L!N?e0b>5yb2I{~a_sB5F7^q8Qf7gSIf%bs@${1}-#yHMo474GjjPVMS zFlVKT-kCS#xt0_(lYWQ^Z48RHC-G5&|i7-*xw zdhatCi zfi@7VcZJCqUosivD<)%HWikfZNU+|wOvd=$lZ>I*F(et|daz^2WegwUYCZXl6Lelt 
zN@;%1E_Mx+>$yDx?Hd-jPtdL*w`ZVz1NmszKv|yKGtj<)d{1@_xjh5z8#but$*zGq zDz|5#eFN)xvTNWtxjh5z8@<^zaD3dJf%Xlo=gF>t>5~)+cVIv^(kU_EZnK>G%M`#sq;upYN(pnU`Dd9rI@J#NoH`v%tYWY@rY+@68<4XmfwHP{@e zm${Sc&8~6&yRLQ({C0DD2HH0)a9^NZgRd>XJlZ!9(XN5_5w~ZceFOQP>>BtD=k^S= zZ`h!oC%XoI!?`^J?HgFnlU)PH$?X|v-{{S*f#c)$476|bX4k;+aeD^ZH+r*c;P|*b z1MM4%U1Q*LCyHI8|8pmbUE_zJI~k~51K%;+o`Lp_W_V|LvTI;HZqGpb2G;Xr*T8z* zo`LoatmnzDf%UjO1MM4F&y!sP>v4Mq+Bfi9@5!!#^|(C)?HgFnlU)PraeD^ZH?W>3 zy9U8UT){WyeIoc$1wzp-9<80uFTAkfJRuKmzina zkf}90GqcS1WM-RZMClXM0?Uxj?FR^qQl{zsI5GxD>t zBtA?@wYfMSJJG z{U>%lSKpUS+X8Dc@m(vQH^;nOuf%tqeBK)Ka=i}cy_U%Ad-6F&yr1QBiI|tqA!1%G zV=MPL8Sok&ov`&vGCf_b@Yf%o|yUuT#1`w1WVe#f<4crOz+ zpFWs(ybapR*XCi~lkFP$-fY&W7kaZ*W4SjQHR^%hY|~io$tI20=URGxyMG65`2G4* z&+i~C^KMW4bIbv;4HZ`p^;($`u=D%1RPuTkvJw+Fuu_`UHaXJNT~T_WZ^+3k_< z&0de?-t6@FUGrw2$8v9Wc`Wy4kH>OPc6fZ>obaLZ!2EkT6j{**$C0ly!@MUuEb_hC zU-5hB&F+fj-t4XTJ@jU0#d1&fRpomL-y!9DDftk1@*U`S$12~;znc$$^1b}w*YpqM zd+AMI!|$AYjUDDa*`Sf{&GwAt-fYhJJ@aO3#&T~qW-Rw+TgGxvHf8)ifJZYV@Fki| z$hGu7Kp$i+*t2wyvLMZZlnp5dQZA&~kmf*|3uzvt`H=D;Er6tlv=9=vtxX2MpveoD z2$OS`*pW9Gd_5+Eug7HY^_UF49+Sb>V>0-9Oa@<%$>8gO@@jLTFZkB1a3K9$3DPSi z`p_z*izNDmN~DV&=%=|~9~6Q<%;2vXq7Du+2Y|o$CbZcG1(}^eLGh$<*O?K`U@hGV z_#7b{Sep&#%Z~M(U~6S{JAQ}8vNI5Kvsp#)pu`WioDFOiz_%3epdbdTo!7OGy<4B} z(Wf4L@6|Hj`+3Y~zq?O<_j-zN{os7-ZHjMwr*Hpt;IG5iuHd|*;#;ry)_0Flq4?JK z7Ner4Z@n{+`qpa|-}=9sZ#}-J`s*8yzF)PUQ-ATgzfr9JhN9iA_|A9to&TZy9uGrV< zzO>itzOt{^UA5bF-`X9z?+4_MUkg6@Ya!J^S_i2f(t1dCNDfFt@z+P4qV{w2-M=9D z@z?zg>5CFw{|(ZYBpQM+0Zrowc=hjBf$bb7pNJX|CYA8pKCJ%!D#`y}6e&7Myp(r| z6gLvJl;0#$+)SoO`7I*FdvQD%F8~qOq@y0wmyWLZ+bjO|N^C4yY_L&XV`ItJjNPR8 z+n@e>`P-x1(O-Z0fyf<-KfT!6AGu>VeCV$Qxud^6`u&wV2D8?EFmi`ipMMW>$FRl# z7zVjR9*=Jza)*3g0N(+8&ANO&SNHh=tjE`^dz&Y~dVI~gr?u*Qz5wg-HS3<%s`GgR ztjE`^%h!`(x#EN08Ix0qCD?mR&i=*{l*=9Hga5M32Y;q)jV`WJM0sEF!5`2P zd_$KB$~P1?hUw>e(t&6TeE@X-qvd1nhbAiG_c%L96{tO zKAxU^JpbAKImjB+uFF&Q)u#K@|4{pCi{a9~+$-8B7Lf=kZ`M*Q 
z69o(GziowNgH!{l7E&FgdPsICFC(a(>x^@gd><=M`&ZeZ-i@#?H|}xX2iQ4D#HV%P^+VcVg8IJ8HMe07nOkA8#Vn3EJ^+9}5Vj$+(<6vuzCC~POR zQ$c&g39qh{^6_s_OgKqV-A-}R?2pX2WS(>NE<=v!pR z`Par<-jZVcH6>EqKAGZ6hxFyO<0jXIU%KHUUI zlP=z|O_$))FnF})TKIh>Czp$b<-VtbQz9^bhkV9>NJjr zb(xOzyOstu`YyBI2QdknAZ>%R9n$@fc0hUn(oRSZLfQpsH>5p~9)h$N(!-E$y=$qo zZ$RTM7|%>4#Wrh7M0$cmZ=Z~GyhQJqigbcR?@U8lEz!GXAU#Q<_xuFuDH6T+4y2PD z(q06OV4RA9dDnn zaW9*n-N#}T>}RnG9%HuU$62g`Cs?e4r&z3lr&+9mXZvGYPG7w`IK8SWIK8?$IDO5U z;Pka?gVWco3r=6ZJ~-Xs2v%Yj3_W(iVUT&mn?DLebpDv4Ya+#sw^H1cLUHpnid$}@ zc<)S#jXH`=ITW|grMP1O#hnEdcil;G&s`Mv{_LZ$V{qJkH-9eS{vT6(Y$C9J| z3dN_UQGEI~iqFoZ*sP=YTn@!Ub1A;Cfa2i-ibwCHc)VXp2VB+p~py`$V$wRAK?d3IY8Hk=H?-IHJy;qzMQcTk5$cWG^p3f_cxL6EhyjT1m%0ax1fB#6V&(Q zF|6f#3wn!XE#F(v)1HF0&@aZc#&Pn!1&f}fdj=LiL(zPYBF47H@$tO{%Z^Zf#f#^p zwGG1=yMgO{TBi49F}*LF>3z9O@0-K)zIjaV%VT< zTgLRhvc_$qW39! zA0huxy-&X02*1DFhsxW!Bdn*yjyWyqdu>ebJI?gJSD4=S8q@n;XL{e8Oz%6v^uD*5 z-uDjE`%W>v?_H+%{hsN4XPDmiKTPj?pXq)7%k;jpOz%6#^uF^<@B5hPeHWPCcaiCR zmzdu73Df&7GrjLKruSW8df%5!@B51BeOH;@_bt==zW1c}#SgFE7wJXsi}a%RDY0{U z*Lw#WJID8H+@FHR<* z|CISZkMob$!QM;q7^;qMyUzc~=l!w{&~=?S|0kdK%Z2rys3+q5AHHAsyx%-n|A~B0 z^MCSrzeNX5!FCaz_p+7G`+3{bRzC0NZEsr~AD{R0w#V&o#V+G|pECc~srM=Kf5W5q zDf55Br}ruIfBn<@y2o!*=KmDEPtp6XZM_fY|4{DX^L|R~#Qw)lM7>X$|C6ozRObK4 zRbi3+dhqv?>wP|7yIOJnkH?O@J)mps$lVZ25_K7j1IFhB5e23b%6$QmFZTh&yf@!J z)cvBLq&nc3XDG%Tq!{}=#keCB$G>=~b# zY2J{jH9Ir2%=ctwn`bDoGh6Gx*C+9d{RCrtPFO9Fwrm>Pavk#jM)@}(AFdtFz1JiE zYsyz4U#pe)>yiHr<=>3_tSpHS0Up{Dzw6n*1NrX#WGKHCxX838-u$REcKKNDL;fw4 zFCPoI(zGZ3nDXW4x()e>lrKM5aJgwuoY=Gf`;kAnXaC@u)1G*1&;B1k{?wlR??iq| z&;B1oep=7|cOidT&;EBKe@4&#_aOhap8Y?B{Gasfe=qW9_U!**X6>%6|vmgd?ME_C zey`aa$6~gOQ;i9`x!5lap1CwBW=kB6*-~5_l%^}TD|>1{%$A=nkz%$eduot-F=mUR zYbv^?qHA9Jx~AMWwi#q?Z$7bDem%wFKMdz{BhPK)JDNbubT>IB#i`dUC9R*52H?e> z37#yA0d@`S5he;t>{|H7>L6u7nguBvQVyhCNV6f$fixG=JV^5)B)Y|cbdAGB^s4>(ppdOZpWxKzhp2;t%mJ`x!6uvo4hk|mgM#8oGbw#USg&OIii zy(f+`{%<<@7vY&!p5aRAH*PM(-*1KVTRvBU^h$|7vEizX#?9>aw4v1>-Xr*Dj-VJ!fy6Kz!SD(z4`TUZhF^T|u0*n# 
zl|vv|>F-^ZXW0(OE3H^%vfi%JEyz~ow3#c67^b%~+0qPWsfTxm5J%gPLu zRR)W-vbeO&R5i8AS~#_=tfu@{;M{F0v`HCN@wAMU$nSk5J6 z0^`Rm?#jQF@fXkR%Fkf@diajY|7J1%oNo2?jIW|ZN0Yl4f6K}amPF9g2_WeOUHQ_F z4Cl4{U%T?b?g@VZq}GpK5nkHB_-Fo|b0}iySFC>q7Td`GV4Vm20e_X8OUUDlf1+Fc zgPh;3|6`1AfdwM+zuz+c6j?`9<;6)Al{OPeDq34nwYHojl~hzGnT=Iu!u})QW+h41$|9r9NRn)( zTANhcSYBEPY*U%F5+2#2B6wg)QekCzxv2swKwmcFNOXp-x z_H<6>rlm15c~*J`C#T;wjgzUUNMh(|ErrvodD!9&&HfF)}qbou5!@MkkrlNoJ(;(}cR5 zOq-sO%z8*m&EaR9He<##Vpyi1W-PUttZCCNMnkbBWxAoFa&@(_!e%I0ZOkyF*h&oP zYYgQ!i=n(~RdUXN8zikNB8Kd{^tuIe>5GQn9fL-^J>f1S0`Gwn49%l>0Y$&AS*b04 zt9Ud1{o*)>)yMA_9xIUL`F9Pw|I+QQi&&zpx#wKHdz`l7o|K4#_v5LH$ zkDudBQ0^;h&%du6`S+D|)9vXd5aXAg7w?}xw;Z*+A;a$ZsldnU!oU9=?}I4XK>Uzm z$A7FW`eW+=6 zgKT3OQi0nVY)KKCXyaee@xh12CDCXj8$)Gv^FBfq_~3C zX13|AeMW0jyRn~{gt&Pv&aLWgpVnj+S4=|0Pi_0}gt#Admv=g~ywnbuLjV~9$PiQ+Lqx_1k#Uj8ct@2X zfD8d-2$~E|IDjgM?ora=GU+fzI$Z62VaS&Lv>=tX^c&No2I)-Md9x~Vljcg9DzuKL zE6dm!B>hqeYBq7oh#L9%GQ4Yt(RobnPcuwX{arFhKgQdU^s!qs<;jHk)}-dsL=qPQ zNt-@v=3aRqeN2|hcUC{j?yHn`x3sc}Q~y_^vgtgu8lx7i8Yh)%B)3yM+?X#bY!*q) zN3Uud#HKGUwIsVXl;pTllAvz$w=cI+jrlTH<_=PbU9$B3*>fEGQLz-eu-lk7g*brl z;-#kid^CP**NX9onG|C)k~-g=wdKinFB-`6kw|L$03_u_MLH(#KX^!XNcl&qnPIRg{{{?_mMJQaYK{iNa# zR$es(4*j9;a9P3G0)`%5X*57^!9b3ZIY+^6g2Pd8l_{-%?v~{9G+VV@^_(0Z8tu_ z7vfLkEjT0#R`{wx&gd~Y#M9db-!$wqz};%2cbc2Wb8=8f!dHu=ww9SBi?7I!nLBgu-RF1q*~gV%R3HTf8AAyB3npht5YHnRdA?;blNOCYSr!R}|4}4@ zjD&KmEA+Z!2X9PZDCpuJip4pppxeKWSSAoiqS-OlgDwy`OgSUwpnLZ#&BH^Y!(7tT%)#hI)9u{{G>)==H=7<6k$l zj$vYf;@Xb_@e(LY1T-zon_GMnA?~Lv)n(Q?OY)j>drf&&-QC+CNw(Nm*S$&7wmy|? 
zDRr!~RFzmtOUvu(9=DfU%1YMTD;?!fQdVXMSxKGEQd<7g)m62tELD#r*I25{A6)da z&i>kRMwO|$y7^Lt*fXgw>PmW57{hbqEe`zN{_DE zP(qs!Ejc#W${(ss@6pA_Xcud1sx}1d;bL*JXg?>rY!Bo8R2u3YE)YFT^mDSy_E7Jq z(opwsndo7PpOam-hgp6q4RH_m=PVUnH2OK)WxF`fPo<&mVs4S>VzHmIUABwM{Ztz2 zE-vuhEGzw-?Xq38`l&S3U0mktqRr3QF5AThKb3mzB7b5%TvuAPT6}lMWY$86cs${Q z?s6?CpLxeTgKc2jU>ktPK0d4a^${MfURAZmcM+Ng(1PkB>>6wXErV@f?*JNjWR=BH zYV}=(2L{lD>MA@u*ajXQYy(dWpn?61>wK4>bpZ8Mm*APfHgIsT4ICao1N(EjEZ>ln%Y$v;)xkD!Y#$F+rXFoX`uVCsc&PFhZZj_7MJXJ5Md|m z-^l5BP^h13z1ks%l2_Yn*V#(y%2!vH*EmWlDjUknO3(=S9gf(aqa#8X3i=c*Rh6r2 zLZ0m*$12Bzq7NolTN+9#sw|F@^|kgjym49v9|uQ~ZL^eA*ivVg z)YPtBXMxkNUTeuJNp-9)$yi?kw@^uS-I~<=UnSe@mGu@!xvvvly(G^h+soI&fK+v| zd*I>)1^qm^Zne!`Qd+yN#CJlh>4iN#|?nxI})Dyxvk@USY3= z9^>aWis5s8xxKEkwq~ao+p`h4`ahP%2E!;^@EE)}4KN0n2buFAMlMHvktRYOiJMRu z?4xY!h}KsgNgWJ40-it|D|LJbNa99z;FoeIkrMBeLK3|pAyd8ZdAyRu zkPwPn$;iZZ$V-h+EwL60KnQS=(moRSHNm!*9!nVfFJYjM-kKge!!xXrMPwEw>`P3>EF ze%dun?H-=r2Kiw8TyPx>@$s@eVS>AZ9HjNsW zdC9YF*cUH(u8jnR0R9i46mki4Mu(1uTW@RIcOO4^qgP6BLQ7t+R6|jgDx9*k%;}Z% za10Gg(-B=>X{OWbA_TA94iY!j9T%76CgYR5qu6yZj300Hz<0CNT%5E6zQ4_8xK4|U zt?uIZq%CmGs2)5JBl)%SkccEHA3kGrv@FNOxUr7M9F=rGMRTIpY>vQ(jc!boRmz{U zQ(S-g{lvw|^0;yCX!2BB4EaS{9BFMEM-H^v$G|;0#XJ@JJ%-5bx)BXd19`s9Aeo(c zaGj@-ttWMhC88ru6on4rv=JMf$%bh8`Z+P~6hoYxI%k~w64Z++4)W^AtWMoJxJqX8 zL|V66BA1}On598pJ(TIOthjih9@^E@vL()#&qEnPZsI+RU^GF@b{RUesRofes9ogSHCrRQH9;QUJhod2c(=U>*#`NznW`Z4ax zX=6N<)5mPBr02gS!1-4MIREzpod4$D&c6-LejaZD;s$Gs9PQG5k#n!{6=&gmya~VEF5Y8NT=^!u)`^nAkC%1V?;?^ZglBU5u1^3fbuX#yCXb>TI>@E&XvdP=wwRI5&&B@m~0r(Iew=$YU=?B943#y=9^8}PDq?yGhnRfw3nwERpk5-hhdAYt zbF`lc#U2@tlO8)q%cnlcP=A2o^k+d2`j@bj>CHUEo#enM-GK|$fdJq^$qc(y_e3N z({DgLe_p>4@tt?{n-DKt&^IE!`>y^D#ETd8PQ>@$*Si3*&y&jFng*9S)5xh#+gq=N zAlm=J(D5F_`VU@%TYTc_a2%|_^_^t($Y_(X`@iZmOr;vJzmuvwvXRhZlgu!ugJCWS zqVhZ=!+e=xK?lRRM55*M@bgW!K|AcY5S|m*{`?Mx3rH|ME;2G)C^Iy7FkFP6X|e;J zrO?mG4XmH|nT%W5XUX{i!@M6dEZD(t-aQQG-^;M*K8EJ|;j_bMo80J;^pWG4b{=V?{++x~am6pXt?(Citx% z?X+!v-Ln>OEubCH4p;|R2j~EF0ImaE2UrhS54aw1J)q5W*16Vn&S^KDch;HSaXL&F 
zoa;>QI_pgro$F2S)AJm8xk*mHd0Ch(c!hKA_f|-DNOh1Lkk&z}hqNBjbvG}QuwSpe z+9d19?vu8+z#a%b>%_eMFPJ|^^N#m0f1c*+KfwGu^gKt8!FvPhA3K4#=QP~F@Y?{i z1A_ec>rGw1k?4!3BM7m{4DBS29y?@)^(3AiH^>Y(k_39(C^Mwz``u~Q-X)2+Y+z5? z#-5#eNs?S^A-{I)Imm~&?2xyQJrDVjt~$uqjeQ65p)LpH9b+#*KFqZa^6SRF3;76F zJ>=`hUW9ym*iRup-n9wxo5ubF z^6{=l$TyDt9P$%gcR>D*v6mp9;BrFVIrdA)Pj{hv&J*S{Dx2fakw+d1MT+TLpynBK0@OujpkVM`3dJ>!_( z(ThxP?*t~_H|eFQXP_Ss{P`rs{coS7?H&8DNk;j@e`WHY{*B?IA2EFV?@aIbr%dmO ze=_+~|6fOq1jN7=4v69;utGJDU;|jDf%D9bD!EFpHw=r(#HU^G6P_K&H7&Y9+uyGs1 z&TR}Fm!OT|;5J4*w=o*Hjj@s27&vZ0y+&?hIJu31ox@UmF9*P5x{Q97hd> zje+B^!Ppqe@f-GYC~OQI0}jTLttaz zSQ^_K4mJkz0e!VGI=GGTBDXP+LqHqjWo~2qhT9mgavS3{Zet+FfO^Ncjqy8fW1QeN z#z}5tAP0eZuX7vY58TE$&25Z7avK9V3ebM%+jQ1ThbZ{WM%AFqLWg3myH1NHpz z8mK4u4CFUZ&mXUWdV6f~@EtDr4CFT)py!X*z<0RdGmzgvJ%79g_EYc~$Zzz0_e8vbq1Nn_scxL(IHBe9R z8OU#-o;3T>s3-Ug(JvUUF&E~_427gUeneCIJg;G^7p8Fy z9V6Nsf;{bWw?Wxv!&bv)(*8VYQt=)(_}wV^4m?)!9C+-H-|(G7vN=5@JJFM!9WkmI z{PVK2t(&urR(E!ebxU@hHFKbU+#MFEG{Isr&POg7QR^8;w-H7Ugd- z`86od$)R$XlJ3;K&E!*1o|i}EbtwPKenRK-!I#30z7gO{Q3q&yt8BdRdGK+}%$LD9 z*!hQn&N~dnJVS533*CJdy3ZpB-vcG@k3IxSUSIeD_^#JwIC;jkpL2H2=fC6Zn!|tB z*)?zfzEjNAUyjqZflnqrYnAinc&r>(;`PQd`I6d z{4EncpFWUhyaVbhee>|xA9s!NK%6y>3j=Z0SRRO@#&KXEZW_z|ang8wuB7|7`+LxV z@2|i1d=G+~cc<>Xp6|hWl>e>gdvHCmI8F@KIz&1MwI`(XZxE_ z{#noV8&Uqxp6%a(@-KR}4+{f2b^q$wz6<5v{ouYoc%R7r_$m0_bHl(c7^vStcROLI zeHxYaMj&5Cr5(~YA4au5f$zJ#jLmX6Uqxoq{CE2(_T}A!?+1L}1hTWRT+C0#?{Z|#8Z8LPc7|7~jkwD;xL@6+F( z_hq2*8ouY0K6ZHQkAp^eAnqB<198syJ`2P(V|gHs8OsB4%UJG@Q^xlLtZ0S=Yl&tM zawR(mY6okmf^L0I3MlLP%yviy#SZZ3e6hnz3k^ zG$VhR6J;}Ct;Y;l>oEh?ddz^e9y4IA#|&8OF$30m%z(8XXs@=G2E*E#l`hOby&ChY zX#SZsm@lLG=V~!u;lg#A`^{kyxP}?lYewk9!>prVz4$idY{SB=?y#^V(!A&Nh*n@r zTL7OXWHYa`8P~F-z8kn!uIu|AnyXGj%+1zBh6kuT+IBXyRf4spfCnVGEXCdDHTLtC zosqrdj}T)3R!Eco3+us&YHj_0%es57x!CVIdt8sNSFFdM&)4yHU#H()Pg`$4u=Vy1 zZM}WhdjDathDZ3b;9R7(-d)>d;yo$+)BX9SJ3#N7u-<;fEhF0NV9r~_t62fDhHmlB0lS8f zIj{y?%w6NW^$xJDwiFj9%H)N%W{6WWr?}X9_2TT?yU%Is$ixw~zA#nRK2&-9F9}WYT#8 
zb^AC^kV)qW)a~OuK_;CiP`B?wIm{D$Z^a6kuH%4vLc~(Q-+NzUA>dlShEG_0fQ_Fq zZ2W?u)2qcw=pHNK>xz%i0C5r;AWp&tzzu*K0XG6}0^9`H2-paC2jCrmPCzH13(y59 zd>ndKk>Kj1n1~*4iNRk`V*yZD3)vSG`iyJ zYH>k(kFVR;xS&d#660ySamfvH!7QG}j1gKqjTG>;9;$d6nvFTUZA^`~^!(dny!OS$ z9E!NESAvb%*SN8LwJ`@0S9Tya<~NzQMB7cogVo}vurqUQTmJlPZ2ffuzMx!1bZHSCM8f4nKRbUu%4W z&Cc#ID~39DQ}_0@m>4X!75ci0m>7MDY0+2z1G*-R?)yh$Oa52K!cxY(xSU-+*fw^2j?IK6ppS{BR&*`e;a*%F*0>0nhAmen9LHY;u#UB7;k!f)iE6s@X-biDDnu;`G-pQu<1w z?_wYyNh}ZK8;RwCd?K+tkS`?qJO=WC#BzTzw^6ReL-8LE<-a`!ijl=ad3q(oU()eM zx0omkLPKSF^k_Fh`O`dB$}c5Z$gjHE-R>o)+a7?WbHOHvyW)WqA1})jCb<2@Na-L4 z+weT|b`;8mp28i4s89B^ZGrX02H1mIE|$nQEq00r;@c#K5HiX;hoWIlqukecD?+ad zVh?H8zfs0GJl$i`h-=ha?3$st{sU?IKNBlwGAtpNy<*M3ELAvV$qH#1q~)+|RuAP~ zD4&k#@=7zEULf4izj+6Vo9d2>OLCL(NnVHnfY;hEe!LZ9Hf=Q*Ct=K{&1Q3wZgH{I zT^ygZg_xP1L`Wpb=+wcYc@K$5lJeo6kB*k*m>4(K@tC8M?x(25&Y2#WVZ8*g7h)C% zd3D79oK(y(_jAIv&bSv8&P!^sbH1L~Idb$@XW$jIvV&n6iJ`|!;dG$XJ9k@KFF`7FhuELX5hWfeV6 zI>K=3afbR%hST3*IP(m{K>Hqm|PjNY8T3<^=$t~ zD4)@@{oN?Pu4nuAqWt=v?YE#jwP*W3MtOSA_V=KCR?qhDLwRP;_V=RvhMw*J1m)lD z+5SG1&*|Cz{V2cjAuXmsXXoUGq4T5FVoF_pU&_Al)HrU?)@Jq1 zkG6YEDV)b{4lZ+|uT1l`M`@h1NCrF`GlCu@(ZQ3Q#>Cshj8z-CjT>u%E&*G^NC>mJio*DjOZ^&``C*KX5H*S)4>SBojd^DH0*Zrn!SH>;N!0U@J zkbVql52X7b?S=FcNc$k&59zvFmealSny48jK448T9>48z z9>47g9>49WzRVeAtX&(PQCAn9v2I;>#`^W)85=f)XKdOOo^i(=;TbMhxE9ZC@bTOZ zf;}r=^Hvn1``ZjX(->~Oj^VachIh_lxcvr(cgTA&oR!t&onoGfJntBI?(0T0I1S|aHUsdE zc~Y@+8rgbMw;19lk|v5m2XWenjm~64w0!-X7X7@?@+nNCykLDJ5olua<5Opy`o&X=cfaXmHRlMEqqm|M@D&|{hW{oxhjI`-FoOG zgxoxS>b4h7Mm7Mhd{7EQ#C@HxpYhQi8IQ{zJ4eeao@8h}z!3L$!v4j_F}6X_j-IW(DVIZsk19ZJeiB3H4i`{wmJXlyRP>g7Y+1&ePn^ zd73KD)6{UD#s>9~r?GRM#=&`-dd|}{aGqu(=V=-_Pvg{h8dYrZ;mFfy>zx@-qpf!y zn%Li3oJL~jV@C%cJJw%rztq95!$4c8`+8@Mr(t^$^y=HI@iZDwL&$%Wr&0R-;y6Xb z90=t9i+Wm&)Kiq7>EJxgi=3x^EAgePxCv@)12Tu%}LJFyv})= zKX9JrH0Npl$a$JKIZyMyoToX=oG?zF}^CjnLzVgS@X#OEE7hntzSaim|vg{~Fi_&dU5&aP2_y(2e~r?wX2~PWU&fz*jnc1X#j{M$pMQZ>NOr>sKtv? 
z@8LaM_KOrejpkpY@ifT0X#O>)I_p1RsAl*FkCf3G{6pl;&TRAxfOX z2>}iVk4JLuCiDP14&$6n#36PZ^}>sEEmJhq$2Co{oU4g@jBzpJpJW(+fZ;^W(IoIS zP?Px@s6?obT+0;B%_Q-)PgD8YCp};LG@WxYGx?gQWWMGJeP@tc!Pqp2fnw5NxiS__ zGaPF^75y8l%l(X5V#BpZf8O6CX31KVzs2Orm?d_Uzs=;zm?d>6{|l2VW0q)Zyt?)V z!8Kl59Gb59O2dVJ!8!DKUc*)|Eb})=Hm8SVCwj88BStktJjv{A>*j2u)t#MV-IASW z&D8eiY2OIX3f zqcNV;?c=?rqcNV;?PH859gXp%ZXbQnbTr13x_ylCq@yvO)a_%8CmoIPq;4N$Jn3kR zCw2Q6<4H$jJgM8q7*9GH<4N8A{V0bRPk3)>dj@EG255UJ(fBWoJpPMwuxt1*{)^`O zrE!Xy@7I;-`=#w;cKlS^B)0DEo9w@Z&*-KQZ67z}hR`nuIU&t|L*s;StTbHwk~B`J ziv!R&A;tqBFRP=xta==cysVD$vg&a(^0GS0%c{rG$jj;|FRLD>BQL9?vA5LiLnypX z-Fu9iQSun|DF0he&Ia#6-3LsrV)PUuvp=4YnG+Xnx#D{?)ck`q$Ka%zP!aiADZSU#NgaQ&wHiXkUNU|&ryIZXFA zH7qCXACV@I!5#-Q6fx#`9EHKgPZ((21bm<1{*6X>$C@DJK+1)b2Pq#?0i?N*=0Pfi zG#}CeNJWqqLNY^I1StqUuePTx>~)LZ@60Rh`(5rd!oCV7NI8&lA>~2Jhg1M*E~I&o z3L(vhv;a~Oq=k^okQPD0?{{YYawkRDb6_UyIWQCU9GD4v4$Oo-2WG;a12bXIftj%9 zfDYm@9IV5=7Ho9ih3D5}-cIv}H(}mE^KCB7*Sl~J(EaAH2;9%pZ7z<`hlg25!M?WJ z&{rcY%<2vcOCrsCPLDVUj8Y5W^Mq{XbvDD^zf|81ejHpEf73KZ3EP|aU?Rgqe9z+N z`JTnUqI$SD@j<>f@u9B0iRqrj&vW^&y5#S|J+0ltwe&k~EyeF|CH*d+UXA%xH2=&R z%$L#pbG4YS==*ov%D>~M`}G~S^6&Tn|BeT>FST|L50iVC;B#OzBtk~wKlro8Fbrov z;yI4s_$eo|NfP?7PGCq92VTszt{OOKlDLSWzCv$`*#ve3-c0-it-ZE(vnk> zQxf&pSJl?6F1MH1EDmdprMkSNuC~NswKz)bw(7c)QcF!q-8!4C*6sj}(lwUVl{KrO zyu75WeAT+uC6zT5wIvm`_7Y1~RY`4KiOpVHQCU@9cYU3`^!lo*`s(XIa(j8HgB~PS z%*tAoGP|s_JhkG6($ra*DVe3SDzZ|tD#}t)(#lh_O4BT*r4^aw(|q0Tk-;soVRvy- zRHiNh`-=V(L1kdSzK=YrCM&}@@>zlWvB5TU^=@#URc2jb)@`OTus!+%PF_)(1-r&n- z&Nc~YT6(U?rl*L;QZr{~3hAsHZV=gAU(Zsr(nWVtO}RohEj7(|iu`<^Lf$O#F;feC zjisf{7N0GBR&EYIMOtow&`rxo^+_}GebTP(fNqMYc0-2v@M$LBMTK@nchd55e2wL2 ziZ38NElphNbkN|9rRV2~Pn>S@eQ)Wrb7u>M%yge_PDWQYS9B-6AUB_stT4~AR65G- z>9cK?k_ubu?2?+=wd*W3j*``DEmr1K~wvy_)HL3ltki4dhl;quFHZ3e< zuQ%}<9|YoE2X_;}+l~H-3`gr2QNU2xGeU}W!y$=k!fp~ck1L|xgy#z7$HJb%=KtaK z){6SVZW5Tp^+ma;&&6Z{WUy^fF6=CUiCj^Xi~b9|8p?we^@Uw0P}p_68TuJS9#IDg zKbL41< z5(*75+Aba2!Io6o8bkWSw2qB!V!D`6t#K2RCQV6W8)DkV7=Dn{MgkOiXXe~l#+dk% 
z{n1G_=iKjn=icw0ojY^q4viZJ&Kj+;@YZ@7p&G-WwU{0-4VE2^F_dzRfu$U38)1_n zyR6!#O7ym4N^KLnG|z-X9SaaIl6%5!VpW)mt1#{s2BdM|$%#T0T3_wFxG=6jeyi)4nX>kS$y2%$XJzGAe~(dp4b%7LL&#Iyr;Q3gNslU$8m0 zAxe7P;Y30YW6dW8fF5~l8LYU65k6-o2jdpenTJf``%YvL;_ThbNtu^x8k?qs6DuwY zp4_1|$kv~Dhr0HC^)4!Lq_98H2Y#}!U)a*LQF5&G(6LVY>BMY%ql4Lw7H8YHJD4Wf z?sPECvTbrOuc5^aITdZCIj**iIec4Lj;BrG?*RTz;J*U=U6^n0iI&=XqZ{pg(K7qH zQH8%7_Hl=-q)&IbDsPWTun4C91Pqc6oX6PGOg z33IP8F=UXs3r?A0%UCnHWN1+sTUS8eM$+%=K+Q6gj8qJ(ncrL>)NC@^4$t@EKe0#o zeQ1zw+-OA7NJVu4zI2Zke`ZsqF>2 zCmoqcmr7;aeAJh?CFMW3{oHPO<1Swj*vx0CO=>k*^w3e}${UC47dSkvH0w*pGW57fiNd%~anxJ)Q@6_9uE=-Oih-p)GoR z6Yv{+dfX5A{Zhq=Y0)|eR6mx`2>5VVqbkmBz%6_9_yNGbYS80KVJI^-v2!Vo$M1%O zHXD3PLA)REi0WuraloBj8l*UCVa(KIM2}xon)u5}Ri*{Mmc1BOl>1mm*C1i_5sj-j zHvw;YOwdn&*PPJve*?TZRb&OZ5BP{EkH6&l8v+CGEdjBseuJ8+tpj@8suavj9T?Q( zxqx3A*5k#1U%98@>iA_4NM6?hisJ?Rj}bjy1Gw>m9^V4^UA5*~a2McX*EL9ST97in znd-Qr541x07T}|)87PEaz^BIb{4;=$tN)!A90I&K_Opg5&KD4HP3ZAUWB`-m zXu$n>H&cGr>~ZT0=0F7Q0FEDcaRHG?J;#^e<%P9&f_7jt3F6v&gJJ)gi7M_ zb)+}WAJR}Sr*VB=jS#BTIt%hd8W#%$$V9ndh)jtK2VF3R+r@{$<$d)Uw_ck*@A8wP z@SNM9;t~Zk{4pcA@aqj zGLyXEMjc|O+)ly^FX<33<2FNHki@o%OdeV(pDJ0!JSSqUa7@?3h`bZ|7;;m>DP6~u z#?Q~vj@%X~QF0)k5;g0nJL8`VIR7Ed}+A)@{Rp+Z`d diff --git a/hsa/gfx942/pa/pa_fp16_pertokenFp8_gqa16_1tg_4w_mtp_msk0.co b/hsa/gfx942/pa/pa_fp16_pertokenFp8_gqa16_1tg_4w_mtp_msk0.co index 28f7002b67a4a118ee2f3fbd6a08c72c099f7d05..4d59e8f3174c3009f9ec89879f6669687c9d652d 100755 GIT binary patch delta 10234 zcmeHNe@q+q6~FHn8{1&R*^n%u?LugjB`H77#=oF$W^9KvYc)~UwPZ``5o5q&)uM?gE$SpXg*2NcZPL<)Ev-sZ7WVy~ zzlS-tbyCGw*c}fyLjatL5zPiTJTS5?!Wt;? 
zm?V^<)7JnIC1%Ug*VMgos_;V*km&Mxg=j?GmmXJ-%4w9SnOCKmDrq7X`}Y&MMzFc+ zzw6`TAn?s-XDovj(Bto& zn&bYUc7jjE^SpRoK<8@iw$jRtmT2K6z(t{@;1W=CKZa%8jglas`L@z+pbERpP=HpF zTt(KlA*x6B7Bld#(bJ%44&&v8lN8{F4DS?s;*Osg9ER9h$69V@*}Hf3>jzUus0$O0 z$Eeh4W-yf?{P^BM?xRb;U6@ZKD%^;7V}59; zLR>fusU~DV6QV9eU5L67btCF5t`0U8*9Kk1Owe6y3W^3iLJasmF<>__U@r?+-vFH> z*reMYbm^Qyx6XwOKzEMZi4AXJk%5O48B9e$exdC;#g-CJ>xrlJ#MAn{ zt8OXf)OnDjn~|d*M2`MWkZ-w1Tgd(tX8iUQfw%gPJso#7%L4B!o;!K$TqrWeKe|aN z`i}mOQTo5|ojgeSsqShe*I%XOChwbJi@%Jg(63GW`%Dr4f5%}1`5}LeIGn4_fjE4x z4#nY-f8%}^uZ5j} z?Oawj<$V69nh2cpQh8#*=Qu&?-VK|Zt zVfg&Rx{X^uPo!`tN-6&udL3Gx@;~7lULF8A{^ ztOh0c#|rKw_8dvVa9JczmE$2V_*#M zAKc9T3%zjV4RGDS-94uk7S!MZVDG<~$?xxK^DMvTL^RGDg#N#RGk|;VZ(8=nFY+la z7e3CxxV+E^#z9rX%|r^vrj+o{l*ki)t4cUl0*?Wqay!WWYrfD@4mYhm*9MLQ&?=nR z4b~}h;Z?rwAA&dbbdIAh)|Fb_cd)*+=uYgJo`@?0QPx7K#FlPNrAubf? zyV-|sWXglL=sYVAaw`;v|Aa&F74#yTHy;25R3PL=D9)@G%vxxM8+YAEq;OD5bwRXD zUKgAbo-Ki+V)o7EmjGN4Ojd$(g5E-`$@a$pTnk*>`ULI`z-@mwGEcyS*5}&^Q^Fnjo&|M-Hz|j{70anoPA!EmOIrE)?n7@9*|T+k1mu z?P0&SC*ti5w)c9wqv0N}uf5CLb1)i>boU|&UuXON16}*kxZm61-*<4o_dr)5;tfQ) zz3ripH`3#cc1HpSLjIn*o^D@VDD+gg1|d)SeZAsEWx!(J$5=aje&dEfgU@KOF*cti zU}x-s4u&!Ljdq`@-RBF~{9Cz~Kn08YDJV)sX*)$`>7*4o74_>sFF_*n4T6{dB0WOJPk zZn)!ZEWlmOs=5*B4zR;IDwF08sw~`U>$p delta 5219 zcmd5=e@qis9KZMIZ(6(p2IdfzWJDQ*wiH@S#ZoBN*(i(5;nay^XvM14%=jZn3oFbl zFd=7hWV#LAvMuN~;Y_YG&5kTC?hleV;~$&DNivr(Ln6^^I%TqVckfzCjFix1UIO3G z`@HY_zVChC``q*VmLSD8Kt|5PRIyFu z#9G&SC=imJI!=LlTn)ItE!O&Qn2<&UInqGh?a2eFAz@?yoDTJ0}sxd`N?bnS{kL4xQsd-vBG*0bhA1xegG_8)P2n z?o!b9!%&u;nuG%7;F#E0kkzBq1A3-Ue-&s(cs^Y*I8IRQY2Q&b>6f10Mz(S95aDla z8whS+K^qsuc*pB)RPv^8n6ddo!2kBV4(_2hK$6v9@}aktBx`sZB*|Ldwv*&$FlkTD z_mn3WcpS+FPeroHBly39{a?lY8?pZuy-t$Lxh-=9w(*8N$GJ?dLbz@N~#1UWkpFX7XqW* zgeFl$Fx7kO9CKo!94Kk8`Prm zl^#}etpii{!OVpi_-M0kNz45p$JatqNC-VTrz+$+bM$Oadt8W`oHDEzSa#^D5;U(wKX?0Ma}rM(Pc`2 zX8MKwI>Y-UobI+LqSMZGcMKfN2Xv1I1ej}+qFkT$97B4^($c;yK*olTfDXWT|E`s@ z?)e}LYN>ls746n0Rz-1DMwP~n7Jv)Defgin)d#6(?gjF?KnUb?-wSelktBHqQex#* 
zrevDWY5-fk2I%5d*_f6^tC6JvWo+jTkSUMnMZ~MDG8^736$HkN{oD-b)Z^gdvmmeo z>KbmbD6Tp(x{7lZ7aKSUj5F30{aTwyWk+nV3$W)dfG(xGaLP+D5lZ{~mgwtMg=+jX@{x3&A6lelh@=~*G>1vLZ%T85q;K<0{4p<}r}F~PqR+ugiofN#*3tEF}ZZES%Sc%^}EY=LEv*h%;C*0+Px# diff --git a/hsa/gfx942/pa/pa_fp16_pertokenFp8_gqa16_1tg_4w_mtp_msk1.co b/hsa/gfx942/pa/pa_fp16_pertokenFp8_gqa16_1tg_4w_mtp_msk1.co index 89458b5a3d867bdae347037a254175b91e26815a..b5c49bd732b6d6dd446e8e953b554071c780aa62 100755 GIT binary patch delta 12418 zcmeI24{#LK8NlD}-Q_O1#N5T8m{JK4ZR!Mae|GQo^ki=DE+MUGfkuTIyoCJWz=NDg zs7Pt{2uW<2NP{OmrbPz_#!{Q{59fHP(j2wa+KyOYptUX3gzA*3h#7?z@eh5w`@Ow9 zsz+F-j*BMr4pok z_Cbj(>NSH4gDhUDUZYub89l@*fRq#4<>8v>8&I#;sd}V@ZU)V&FpD&l)jy?S{3hnQ zng6}Rcn{NdZM0bJmy~)K&xp&0;&>ig$vBeLtlO+x)fJ7eD0)9WT4XAkw^01aoc6Z1 z&epD1#^YlqA9F_eKBs(th`+1i9~)fpZC4bs2bg%2<#h)bQ~X}`%J@51gc(b6aq$hz zOxb58Iy@5b6^!l1r4>&uR+4u&H`VoPtf_T23) zjfHNW(sLfSvn*gPoX2e~^>W$UZCNar{oR%ZM!Mib{2|{OgkrH+nTTakmV0{2{8;wO za&K>00LuYc?(Z*C242M?5gQ=l(vM3&E(5p>;L>NV3XA4BVZYfH4wxNb#ejui!0lkb z5-?z?#9VpIi{MZ0IbL>vpa8ber-lxwnBf#W%^C>oNDJzuH=DxJC zN1q-w^rqLMfwXMD6&z><2U@{_)`m-NK93#9VF$Xf0|o5B8x>ovl>K@C!IIIk4SGqo z6g+JOPg}v$)}@!+QsgTNVn^3vN7rFTe?6QmpI(?Jy~Aew_8Eb9>2JFlce5u1-e=r* zxa|qwRAwkyas5%9>^#e-hUpjKJUU8~jYCz2`232(cw_Zz(ya46Y5ce;f5#0y|L;Kb zz?1wr0&&om1A*w{aUd3dG?@bNV13R7Vyx!K%o*o<({eD*>-GM?`!$rdohCTUt0WiD_1mwpfUoLhf|LAqs%y6rz+kXbcye%s%3zKTh{5>X;$Bl~z8}9P#*aN-C=KC#jAZkkn+9ERI%|L?-xq{x>n5q))!}%ib;?Gd^h`B*QkD-G&YCK(W~LgY2gWlA!2QLOC;>Y^Oe)&NEXn$Ulq&CH_m(8^TBJ`6 z4=@8JcE}W;yJ%DD;8tddlBQ(yNr%*6bOw8%1}6lknbg63%;@Ex1HSm*SpV1m1gL32 zi(cAUsip*rN;AvHBuEK{nAE_v?84l56O-5Bx@l77gx^&rMM{93IQUOSbz(M`3pp`Z zJ41TrtxQVrjnc`K5{yn`Kg#-+fd4n8=BTA*@~;9MEByh#(@v^i1ac`LebZCIO6lBodSfcq#5U+s-YsmQF{OWrecJpj`7DL}iQ@kO%O5yzfu^dL>fsJ7e+cyjz@t!} z{rEz=EDl3GIY@ELx@E&viOR#QMRGXi*yh;EE2|^ztCzQSgrZGd;nhv6T0@=fp{{UK zSEwVpsx#Ewv^vzeCK_$;=)xM!_cSeEv3fbKZwdubl>PY0#2UyVP;lu7cL41|p66h4M9@l&W=7*V51)gX0O1Luy@d?w*{5lyw6 z4SZjK=7P#4lD$5 zVx1W#OMSq@%V|CcoD0$X$H3>`L-WUhH%#=XJpMCK@$wP6!Cv6rCusf~;JYRmFQdc2 
z&-5yS_H`Wig3TIIgp>FicVR@fiw1ZP8VEn48+;7>;hi+kS3<#v2A-mMDe#lSG@lOq z;QopJ$2vDaMb)cx1D*_^Qv|Kj2V6L%Aw{?ac-^09eh2W1HwXt8HbKR9<>(QqL?{A$ z?FShqOZNe{UGx<60AD|-2wI~Dfp1$x^KI($XGFVIUug|?LW8wsbb}{>ADKn-p97Co z(R@Gffr|$IJ#Z=Pp&R@eD)#$n{wDCK>RZjwap2*_wEhRc_p7ee^z)QsZ$=bdsVHPZ z*a{VkRo`k2rU8F8Mi1~6;Ems>xdZsQqcp!6cwKTL$K%&Q#rmxiD-il7@Wnf5emC%E zpP=~);8nvk?*eZ670vOHDf#&`qLu@6#U^MFKBEYlC$<6~+)VQw!0Xj1YWhC|UU-V` z|2g0d?=tlB_e-c4R&Sox(JR1P^opSIKLWQD(fkDPx{C~Y8~C|0O<%qL=b)m#LaR^) z(dpy@9~hiX^DBT4@ie~%_^trWD@Z@L(!5%yjBi9kx6>6jLW9NX&DLgI1Kby(^%nr& zzJ}&^0k7+(c^mLEQa@dBFI4RR2hG<5AKpvz2Y_21rujp_Pd!TWM}XJ;ROQ<9w;L*U zCFur9;L~2D`3UfHYH(;C`YrH!H8?bW1o+cqR6jSN+vLQFHK8aRfF&(n#|=K|u;>)5 zuO1dnuzz9uCC=|hk)~y>k&x5F3n02`gdm8v8k;5xP8*>PUK5=HN2uV?bX{%-p&pl} z5a0>5a}K!&Ngmm5w~Iu%RvXUluseZb)7&6B0$Rrod!QDm-R|&6NjCBJ< zAsDD3o!KM@Ph_Wq4Cmn8#2$y>*18Y_nyUm&*XiK2-hv)7JEx5!_Bfp`V!P92C%V|f ziaipZKJVwrJUD-i3+!99L}Hr96(l2jgj(VWzn$EfyF;SO zMGW!;I5KBXO)W7(v}?wToMw;c4v;ZK(M@^~gLdL7u~rPAhb(;J;n&$+Y_vFW<0xC5 z4{uvhYocS6b(70sYhurY%VXBWyZF)y%H$O;iV`o3v0qNyImVXc!+HK#Nupy6tImRT z2afYc4<()-W9Jl*RVn&L;@vUURWb^z)F>R^FE>M%C)jK934AJX{|VO34!)V#bAp{) Z(0E#1^2qR4|Bbz-;2^*|^-2c}`ZuzFFlGP% delta 7264 zcmd6s4Nw%<9l&=VcPwyx>;b7_Nkn2QV-fFTZ*MP>^gfY{H-=Pd(E1f>4RL5B4Ks;} z^$r5)IA)5j&C!HTNwtk*7(;uedrh63TBb>+OrnmRQiGKkJ8>o^gCAH*W9-}A|J&m^ zDdNaX!;HTF{{A2D|Nig&-`&Fg&n$y~wq!;^;VQ(k-n3nrv(ez$UGq0Ztc><^1V)C} zpIJPZRWV z8}4a@xMVcJd>kH^=gR4*jID?-L~e_`F*jTOan6YRrCF4_^Rm&vi2M{1(7+fT3glm! 
zisW4WVU~%n;pelvI@3$lisYx^IHeF01~(q9IQ6JCe!zN?%^FsDVfv8~M(!v$Cf64B zf3v8dJ^2E2`0dK&OtMRCPqu;l#>RGe?bvB;g6t4LMe0W_#m%?*41qY`i=xq-%wAt&px2ryH;7sWi(2j%>MP-l_1P zP%4MpD#}+ZGTu|xVp6B)C#hDvb3sXZ-SgwENrs7P|Ju#1(^m#>;I_-q^WdrQqU+OG zh7#`-%D{(;(@Qsxw6dv;N+UEruddmv$?&(hb((E z?i$j)CrbxU2EORHpl3l!ddDEj)dwmFQ$6uUj@_KJGWDgn%PQX8GdsJ zqifSx6x>0*Y|MKQMU^can&L-~dbd;;mvR@|l4^BG8=dMJ#ADc#S+Dcw(_RcXrpy(pVgjtrtci}t>z zzWAv>O?>RR^Jq59$6m&R`l_P-kaxHtc~}07*zkG8r`x3pTZK)2m7Nt)-^&<=W17jU zrS{A)*}%7a2>^x}z|X(0dlIewTE{V~mJ_0n3&6hGOLIx*5vFWDtqAA}rmKSHOMzE< zX}$t@kB8>#fhU4A-vs=~!>SNTDW3yH#*-Q}{Mm^W zRp+z)i<# zz8rYxX~JRu9|J|kCq$u-jllO@qWLqxhi~eFR^S7=U^u-9yt+yg4Bn;pAIBv08XAhd z;82!FI~)Wq%%u4rfDf0@ydQYwO%s0rJkqd$b~p`+W|8Lq1YYtTnvVca=ruPM@E_p) zwX}UYQcXCfX|={v{bvQm*b}rvF>uq06)}C^Cy6pAE0?N@WdgSM^qlcr6^3&iXGt4`;jIXCH)!j<}Ebu0p3+k z^Vfm*4bky`54`6bOZVSVQ1pIEI}8A?yGrxF0XJQz`8nV_Z%XJh!m~6%U6{WAW1#3N zFcjLV)6&#|V}kl2(3tvPF7T1Lw0#lq?fM}w?B@dCvxv4|n5NCoG2S4pxE~xYhG`xG z-n^3LHNc}kp!xTKmp9UU1MtC!emaayp8`e3AnmXPc=!O#w*kM}Li63gqdRH-3*bX9 z>D=hQ{h*Lur5)nH``)1W+rTT|qWOEkyZUK<3i#FcDZYqVs(wE~84USVHz~zg?C@2; zFun`h>%TCLefM+J5pjBgOygtqKWtpfHK}3hmM?|MXB%)g) z5p1rIM5yEo5z+6qfoONy)L;>HX1mQ!uq|XvXSdr(5;!#y!r>F`5W-=zlbP+V0GZD2 zwh3eok3e$AwoXC=4ieWMiiiOc&F&(rb2vRNLPfU`Lh_R}J6$0|v>DMHe#uBJi@z`a}{}2>2r;V<8kIm2lash=ofKh=EHqN+o(kGKXjjl7w9zH(7=&VC)R86iGsK z*!)mdQGhcMX=$ReT!sV`SGI_jK3gGOw2Nh?vA~773F8bU+bdQ*fG_# z6z&}OW8I--ENc{1WRV*waU$j)MPh#0W&P?ZJE@O@IS_Ib6=f1QBlg}XawG4_7`lcY Z%;G=PZJS{F$FHHHtSG=oO=0Y%i82JE$iFF^|>i37B?D4_&HO1d>>&C(cqWt%oZ zsn=Kpp$N61^U{~9hA{VJE z?R;m>nai2^=FFbmmWj13qE0#)>LUrFw|cd%nOr-&`g~F{gYu&xRjv{Ubc(3yTg!TY zQ5-1OH3awct3f9547Shj!g3W{Q3nTTINXs$hWmGfU4*su!Q;U^5o@~MP##~yYCN4D z;aUT%A?*YsyGVb1)oezDGse4c+gNMtKXvS5XEGK`^Ol^^NP{?8f-rF`s9vrmTk!)Awp&^R{$qq#x9=50xpCf?{IbcI*M`%arKj zqf@XO8Np%n2&x08umh*D1LN3%vx)dbhnn3wv=1W(uaOcuGc=}*#+1>RGFqi#`+Lr1BkJg1QN zF%ge8BDC`;J*0?uz=2c_>4k(a2IR}FpQ>z7{~ADh%|>PXydPSYe|s*clYS}Y3+N6| z-vhe*qmB?sxvMm0_K2AO5+OC^>qGwL@Dmat?Jg7{slWc;5mM}YG9qLma8dW{PsPQ8 
z5+(hM?0f8spg`#^75bNvyudE83!tzh-|eQVANTbtqw7^iEXkYwWnUiW;AeSdxn+ zQ(Pn-=aOkoOeJG8aX#%zr=qTS{H@szq-U`=9dhqkBcu^RhQc(|9UZ~U`!1}hsmHKk>sRy}T z?K^P$PNA}W0P7Dan|bmc+nOSiwLrwlx@J0og!`qrBGThD*K>NQ>xBOph({**ILG*X zM>NsXc~leqJ_d_!cOa-~`xrlL9_>KyRq}cR+!}Qnv{P||g2hqa>IW3;4Ks!|%ykck2xhCjh%#@h3bX_&Jxu)2hX*6Z!EE9Dz&@LinE6Bu}4wW8+ zlJjCep!rtN!{qj454WO+JqQ`}2kfzzKzfo%*txsssuNKI2jlbke$V%Ozn|~-a=!7e zOiE&m(N_Ou$S}=03&t`SzOiujUX%ywPeY`d41rvQYW{N74=5(Rs;wc|bx%)@l3n!t zK{k%5CU6z1q(Zw0iM{_t>}ISLhv&mFX4sNo)wCMwXgLGH*DTgU8&fTvht2{#a-tAN z*mXS2U4^Ub(nrq3;5B44X?@a*c7i++_8H+n==;KpJp6_K#H8?(qPLRmo|w zz-*;QIuQyg7(l2;!RrY1FD|2A#%P*$~soZ*(*AIGC4}B0LqvQ&Bt> z#b-oaAfg;Zl#7URXA+YSO^WEZxm0RihGj}3s+Ol}d8(EtYNdF+FBeaqS@U_Jn5kHlS_mo1AuD6g zE>LzC-35`pIk&go1x5#N4~>QdN5f@})(Q0*bu_e4S}W9Tdwt$oe4nquq5dV9U@;u9 z=O^g!$VlA-j(lyF8)U-7G}F~;MtFso9%_=orw5!yYLWfq4mo*EG2PlpFJ?j(91o#< zt;u8;UI{r&u7_&3uAqv^0u=$SA>ZwExYVefl;BHYCkzDfM7RkycjJ|ClQGe)M!zTN G%l-k%-IgK% diff --git a/hsa/gfx942/pa/pa_fp16_pertokenFp8_gqa8_1tg_4w_mtp_msk0.co b/hsa/gfx942/pa/pa_fp16_pertokenFp8_gqa8_1tg_4w_mtp_msk0.co index 20da3b15c0dab0025e35cec9f52adc214bfb1e2a..b1a5ac1dbf2cfc4cce7f67648fc0b0bde39b717e 100755 GIT binary patch delta 8865 zcmeHNeQZ-z6uwbfs(Qq^!@akYy%) zWo`r)0>g`WL}Nr<7=I8&AAd}OdrF8OQIbhWm8e1Mbh-&NSU#;sYN`Ob`C_VUR@U|egPL`}q7hxTZuOJp{YCLeq%EBId=y>EUJ0*D zzrRSoH}S6n-BNkcp*$^h79xi7WSxcC=vm5)e#nbL1K*%8fGBC0HBX);GOQO%BfhPa z3B6$$=7w}I5A_eI~ytHOD^(tu}u;zdx^AcpjV@^ltz{ z)1q#ni|8S&qg2lTf7x;cvIg+hpb@-1XaUCuXM=YJTlIT|@g;Rj02m+FgOp%XwI}`H z3 zzB=mA*Wdu~o#W{g1223P2lx^_foTZ%?!LVgIzKPUZp-c#G@Z2IrMrZ7fOAoBXLDg8 zxv-F2SV%4`o1~dw?{F@b;avD}E>_@NtdwrsqL>QxREz?Auk{jS@nd5lS+kI=SxD9_ zo9{-_*5IVo;-sy`Nn01?56{=CSSv-6aPJCY^5e+frGogI=TUw}xxLX8Ks)=6pqh;i zekeaj%82*#J|L|I0twqg(8yLb=Hp1sj2;I%;dme8H|x=ilIlJ#%OM4etnc*Zsj z$|PmSWpeqq8D1tIO|vp#O1kL$&u(-HVyUd|^6&RQ0fcX}x`owyu)fQ`+rJA4*C1MP zp@%>0%b_y8+M>hr_wbF670~b+|H9)3rFb&A*6_6zXVi4i-6`;?p1F{BtV#TlDJAYR zrTO*-*bjg)^c>CmM&@Z^kCr(}Sv+%AOEon{FjD z+C0KC_nFi*E1A-^^Z8$obqQkG$V{q0rDResQX-Q=hkc9D(7|JT>dN~WXH-1Xeq04# 
zx=lEJ*KQbY&zYiQvZ9`>%UQ{#VYoOF-@OtW#2RU zG?_h3gAFH4pN^qVE~NOrgXurkncR3H6sk|lCgmN#nYsDI@9eW@WqRr2hx*s)Sk=!H zl<7wH45m!etxVp-Ws<_%9MVSlOf8d-rbC&0TC00F>*p>(EESn1jpQh4(p7%>g+me> z$>aiynGucu8@f*O+DnIWsH-&UdAx^rey^J@JZTgCCUf0f#~8)qJpg8<1(-{ube@X! zo5Y?=@D?I}x-}jb)d6Cc4=*no5A2%54Qrr*b-Rn4Mb1S9#j$wH)^KZp2_~W~!R_Hd zTRf161`~l+W_w#86l@8!?O>RAYXW9dYeLF*TTrskjqo%EDL!eKBq6z zwU422?PVH zxR2nC4y>?~Y?Z{|n9?DAT@5(8d6F|$HxmEL>y>yD!B^VSN7}Mg${9XkKCxd$$oL(k zL(Ov&22txK_)tWN_Y&Mc#s3w8H})z0(-Wovb6-~C@1)1qfTmxRhEwE1?dO&F#{^Fu zSK{Xg9-HF-6~RXrDre$5g8Q_MO2d!DaAL6%-zGMxY*0hq5|G^R&n^5lW#L2nDd*`W z*gt)q&OMV0ceB;*Y7h%<3C99%=c);{+C@-jdF5@@>suwGZjVn^y}k-rT~%H#tCbZ? ziCXGf>LjYG!d)e+UY8rItT?jESzUS?c1{Fwdkj}qrI$FAcs%lj-IX3W9(RdX9s{Ft zJSEj7m2$W8QrWlE>8%2Rr`LLd&51D57#y5c3y)BYo;$6DMY;0Q!x*_SE%fHfOUZzd zTc*S9i)3lJXyiJ?w!9SGG;$|(uq=nX1(GsxW4LLQr=Pq8QlsWR)WZc_V;(f*$~mi< x$93dEuW5kfa6o?ZL(QGWc#hnDn}d2-Mn$dMGCf>Djn3yfMA`TdcTx|R{0_ET&}IMt delta 4001 zcmd5a3udqWWS!1*=T+cvQtvO;WVp_~c%G#q=koxo!3piyAvC+z|O&Nm{ zwdEfuV`?X_)8Q#zFGs-kIkoLbp#bz56sLqPWbpX$3sDt(wq)jbs2jds$Y&{Gsbxtz zJ+_f7q+?29g}9ulT!40DnNg=@7>#TF=tQ=exi*Y;F<$1{2sSzBB%6n_xCRKsEnF!S zVv8YLXw%>*v<)P#A6qM~J7z_nSqD+fY8I|Lpb7Qb?wW>Vj$PM34A8-xZ&1Yk{ZrMJ zgmeI0x*XgDq+^e zaXHO!VyVT=ms!00GK-%tw*>eUeZ>l2c`o0XE9xU?H61=&eKc2ba?6l6~oRD}x);(}JqZo?wHGdro&PAavNO6|B(#A2Kr?s_0nV&%#^c?K*`!-3g$fK%e;T zx}c6h%f$-~hnZ;Ce=LZ`pbe+BlFu>u^){vOjVJ1;h0VH zKTtz~HZs8}N{AUU2Qp_Rbe(o-W@@5n{-M+?2G4n6c-15o&RGh)sN|mRK+fO9sh!^j zVAFwz87b^F%%c?AuFk3j&6co)zmxQbdCt3U`DHe{Z?+Wge;AkK%~K^L@qfh0KL*kRqslbh-S`I?14ixi{WTCx8( znKu!9C$SI4C)#EAESaZQ!~t}~mMbLlAg6N$Dx4KgbQ&&}Z+?cTeKT20j5%fXVE0FA zJeT0HDY$ZJ)Pu1#YX0+tU%En#2ME5lEagPgvmr8(KV`vsf`i61HRULE>p@S08sA3v zExF3=oX(~i1N4B4Q$)5n|2wI%9-P=Uc>;h0!SlDO@m_-WPT?P?{601R48fNiR^#X7 zVH8)iGFPT1{QI z%S~NhsFJ$1-t@8>zlWfnT3>*;u0X&`T~D37EGlPq)_Z9Pm!~exs12mi8s`cc&E=)8 zI}i#H)a~@9-9U&EJT*=l%;R^{+C3qU4{UvTy|4AnuI;GGRxW%{2+uHlmQYm$D@;`1 zJSTJ(LBEOW^(-fxm&XG{zXP0bACIk6@6_%Tb}fbrg&!TzC^R~t%}8FjF{{w&fR$GI 
zk|PgX{z00bXG*_X^}>Az^xFo>D|V2)YD*2mMF%Vp8jE3`k%GBOfWyVGgc;Tg=ZoP= c<8G7kE$K)HHww8Wc+nj)(Is4Pz(NTA1t~gn$^ZZW diff --git a/hsa/gfx942/pa/pa_fp16_pertokenFp8_gqa8_1tg_4w_mtp_msk1.co b/hsa/gfx942/pa/pa_fp16_pertokenFp8_gqa8_1tg_4w_mtp_msk1.co index dae61631e409032845a586f71407eea954446fb3..63d077d4d934119ed5bbf905d706bb00151c245e 100755 GIT binary patch delta 9500 zcmeHNeN0=|6~FKKldt$QtgviJNJjAo@ZR$$)XBhrvv#PWh%k~7IgYVoTE}*QY$&aG z4h2H1tRsDj9Iai}ty7XoQ3AYb&9(@$DcZCxk=iPnT9ryfirW4WsY%x?Kr+~U&-dDS z={~eJDM-{3KIi?;Iq#f%&pr1Y{O++2Ar8FU00njhr9Dp)xNJ7ECH{@!G^0NPZ zKW>l+e>0)U+#gdpa;JiHr-oCK6$Haqldb~UzP<$4T{O?l7O9K2JyCzMDjp8^hWajE z=dP&hh)>1uIq~}r{I+t73MY4LO(A)S;1Z-mc8O4PPm>1j<24aN%Xerh2yQ@IP=aR( zxphLSgTqSdxnF4C;gVW;s)!--xiQ_f{Of{_Q8^6=Zbbiwg2qA5i+YbfS?nq1wi*6h zdUDNh`Z;iZ%F_)fg zzN4d5bo@LC%rF2mq!vgmkXj+NLTV~vBIY7{#8TvpSc}{dF@Psf0NYUjJ5T_-SmOQ% zpz%h`n#PDlKSqlJMnK#NF$WeLxlQUlt@@q^ZM^ zB;Xe}z9(kF@E;54L1aQ&KU}kSlpotXDrYa<9!$ecK>V}G2fys{co2Kl|5F~y%>KbA zOe>%~k``DVSO2-H%Y&PoIl=$3%&{^`Bw-tpm;LqjP+BOHZ9|?PdmaVp4`jpXZKIGs z=o#@G1nCM@D9m(8`O4MWC2?b=@9-@x6S8IRl&H2s zB`cS$>PG?p;?C#fLME1V)2j1pmF)a{u1iuWS8JEVB}^-%Ja#_`%VXirsxA+)Q?h^m zeQ~o|))GnVFI1M@njcCFWrDjTqf*)>sb&1}XFgUe`_4LjORnIiM^2M4pxB{IK+8FM zT%G^a8_#NpPB}mJl3GSql=DA1S}yy{k4Rs&Mql_>tk2mRT_gF;ueeF}C*ezv)ij>erGmpxB`oUV1(GMq?fk5QAC?t&_c^HK7w;*#S_HA>biI z(W^{w2g&mh9z*oz;!<2-&mlQ}DLJD%a(+m=!?Vx z`$GQSxW6wF==1j^_VxOMftbH{e4O@~Qa)oL#{Xy*^wMq#Z8G6?BB% zlsnu-QA~(-2bn-H7M<7!`L80#}KZLO7WKwe$TkT34|N_rTP;Ye=0x?Nd@PTfhXfBEBq6} zht6gpk$D^8U9U;;_YqEIBFhRtLO9!UGK-1K-;tnMFP*wYgsVT4>gS1SDsb+1Qrw8} z&fiP%^$4H-i-_mu+l&NvK9CwXF#b0wUWf4Avr_!K2tR#Iink+N`;WVE=s$o2&7{a= zrBQ^B7fA6RBAlw1;)4jkXp-VTLipukDSk}2WZ`N&w^1s14H>wrr1%+xud9*bzd`t? 
zO;Vf|PGFE~Fp5rW;E`Q`YI>Op2q!>_F(}Y^D99XcNbRg7Mpz%?1JS^qP}I-ZyzU%Z zk65a{fyS)U-Go`jfpsaT+l$er`m9-lw+5m1nkEXdbR&foXcuLJC@XlTDIZp*89RfO zT~yZ2VRvBEZEwJ=*Pe@=!BH|4oxNETg>@O4wj(o}w^8scIJVI&h21eu29J($)wqxb zQ{%%y*eGAtT|jKWTPvjH{u@IY<8dK{0P zX~?n;HxAA2_BN*8tR@`7*C{fgNgbews(k#sOXyOQ6yeOr&#!a3)F;CA40@i;=u!+# zICc2>hiQy^T5AvvycDETG@;K!M~ssC)Fe%~boi_cI$<>WaB6{uUh(=HtxYiu;ndHe z^*C3XS+8nE>NZV$Bb8!^k~|Ea6kDA7g1@(gZbM6NkIDaVhLz@3~9mjGzgP4IR_Cot@i1RGnB;9w>H)!6?f?7tTKzaseS%h-Iqn_Z$WXAATd?AQ@u zeE=r&K2k?@u9XG!D*lc1KPcZeT~k=spbqz_8pCe^IUxg1$bb_v;DiiPLd%7OoI*my zLP90dq6R5|Hlm|YD0DW{3Kt2RGvH(mI9UTu)*vK{vP#=g*UlI|!*p4N;uoLu{sN;} z{HwA&WShv_7ec{J1$;0kR!SvhhPv^?4pxo)1wAz@#~Z=`)JuO~sv734D2M!md_kt1 z$sCKaN9}!jV*U!k2W{8pQiu%AQbG;#+o~nFLi<39<7?_ zIm}0U9Lg-EJyV}=#c9azvBxB zC0UN|1*5 zQ25H6dat;s5gOwCZec^$W%hL@Zol zFb6M3TqQ-$TL)UJlzbERKkAU5ry2lg!rZuC$%B}8 zyr$$IWA2-_&gYo-kW<3b&WJJ~WHk6TCRm)8qz>F{RPsxhmz+@YtC;&aB_G1P^?;J! z#JpC>I9h)J-ou7?s4U$C=Dj~jO5|Ux zuwt&htX!xB^Ryq8d?n^Rzbbh(=Ka@|d?S?N>p<-trJ)W_@Q*3^F3eXglaz?D0dr@X zlJCd7O{3)PnCAv)rQrlNbYv*`Y0RC^D*0EK8yF=wiEpP=4Cugbvn12+1h=WLh2&)! 
z{3RIV_d@RK^RPMKc3DB)##cAjtv4Gz_NZEfRb!F8oTye;DN)TPGf}NBGR^2_jD(ii zELdf%r4Fo`N^K^hGH#Iu#K^{S#znBvMq-#4H%ZN8F?)!EwcJKjM;Y-o8?7X=$!(1+ z7EQwBv63t;b{9#_><-w;JhR0VS&gxhjLo(pHx6fZ+ahX_gY3gx<|6r+J*6Z|i?N)< zuvpxY$Tky6jj>o6aou8J`xdu@l-T0%kUd+<93&r$rmGT!qQy63) NDayg6+$|5x`3FVZ*pL7K diff --git a/hsa/gfx942/pa/pa_fp16_pertokenFp8_gqa8_2tg_4w.co b/hsa/gfx942/pa/pa_fp16_pertokenFp8_gqa8_2tg_4w.co index ef2de5c5f282842e7fe758214852fe096bc131b3..0156dee7341a907ced8b9cd33a8efeeed7ae5142 100755 GIT binary patch delta 3361 zcmd^CUr1Y57(e$KO|*@c9&$4vKD*t)i355v7w7<<^0g+6TUhWqZ#xm#1Zje)j> zUC90YzVH0G-#NeY9ll)ukjj5bRf{M5Cuxf6)~r@H()r=lgGtedN{@<6$)o@(Wvcou zB|jn<;!3uP=#g<%&=d-g_Cz_RN+$Y5{s24{bs!DKlLz8X%4+%d`EY@XHC(AxW<@G$ zN+!|*p0CAKw4HKFEwpD(#cVc(r}e+$yLz3zZ@A}Db21joM6+LS;Z0pPbzgb^qP%~D z_aQt{>#Mj)A?b4MehLrvYa5IeFsZsLSy;w(7XNIxg}-Dzr?&9h>_>Zg3*0+K z&X}*~>ZJq5@d}Ya->kFf*7`nld%X_bSx+0z7PrlQD?;1b29z&aE9Y|@y0*^I{#-8| z$l1~Ix7HyVS<#Y$Y9ATxm2cz%9Fre9$`*p$zWngfy21k(#{{YE2QZd7En_gpgk&7a zF=LdtZAI;OtaAwE{eF`TxJ~APfXNQrF7sf}b)8ZN3Qc+vb6y2J|;yeN&EITU%ON3yF5G)ab9aFYJ++iWwKnOPo z;RPW&l{K?v`%v|`k4BPr|0X5EnPG`!ERl>QlCk5vp;#{{b`lip2gL>iobOnXZXa0G zh$fl{@$@p{(eo(oQbzp4fm9Fa+>8)H(#Pf>6g8N?O`zRSqdb44H49TI-h5an!UdrG zj%W%`J3V&ep87B8QB}G-q!;baNqVdug!K6F_y2c#EVn!d>G59c-pZH0dbn7WQ>1T^ zdzU+fia%8>75i3zFL2A;0xI53B5Ll1WEe;_=sK4})hW8p$e#Io9r-aJnZW(bmRTN2ImgHtBS_qOIPD zD;$Z$JW&U}K$&o`UnhR%CsiPUy$Wj4-B-%}FA%@(<#K;8=No zFY%u}t&rn?ww{+6twSMFd#ep{k0({FN$``Z)pTmPv!~}6iiannao*|ncvb3ZbE(wr zX;T}wW(>OrX6P%m_~Jlw%~kTJ0F>SsaIcBp=ZCOqu!|OgcyzF<`ukw1>m``8@ekF`eYgMs delta 1688 zcmb_cT}TvB6uvXN?z*ef+_49LlJ%!p*H~?}b$7KvHZ|AH#ddWC6C(@S1`!mah#>qy zr9eo#KA1rf1W8W=V-N(5K|v2f20aym_8u%jAR=yeX71G#Q3D5NzVDoS?mgc*!#zW< znT4l}ZSX>*k!6^mY06f>1}{#X?i&<<@lb?@$zVv+DD@W^en3$688$`m&pr!Yl0#(u z=1%N4Ob|^sNHN{WVten6sD^PCAABAfVG<6RBk2g5NjfXWCmgb{F2=2#Vf`7X=U65h z;F2i8H?xaLWuDJv&=R!e(mKh;GK>NiwgAy@_+@G`9eohqF)QeP)^o*myvsB=z)z^_ zi3o&ICbgbufiSLN6v7S-TOsUXQclh-m2l;fi}OgOT&1MP$4GoTiN8$Zui$uIluGzo 
z$;AhyQoc@FzlYBUAmzMe-sZP^BmkodpYw~2_gTkO>xfx>YZ_JW0z;5M1PMftKm-Z8 zATJi=!-4`>P>p^l{?RyrWs55tCsvsPT$w5^?0GMSd{bY5%Jad(@?L zpEATV;VWS%D=e}Rk>tF6KCfmCpV9<`asp8s>i95=jMyT z8>6b$h0v(@NDKq@y=hDhJ-~ce92bXyx~zN>r`dykB8oU>l z>bDl2X^c$=G8t}kL5s*T;aRh5h-hV!A<`Kguc`J`gWhY`Z}eXERMmMYt!S*Lw5qOx z(uT_Ol-7A0P`}{DuLs?U^{_IBe#Xk}wEZ%V@>+d%+RZf{bz(E0G(R7rS!~jOAwu-M z)o&whb#w)t`kC8OfPCIeTr@v1$jOvrv5&#}>k dY*ZBYvw*FZ4V@Gjy84P#5P!f#5Mg5eDHEE3f4XVwUjr2!_=gn#7&36{l=$xMT`@%*LiYna zxcB*fzwi6*zTfwG-*@j;Kcg;uNtxuH&{3MAIt>e^Mw)qkp(Q0-fp+Q0Xex!FS`l@7 zvt|bb7lWFvBlz368r-#XTt>u`r0umx1Ap#O2;Qb-uFebu>iST10j;J}q zig~EnYQyuu?YTk8c+EgU86=cJLK##k+iu2!n6aanv0lvBvr@5h@2+auMiUYqY&Ga> zQNB;raF4~PEYO)rX&4mWZ~9(kg8A3t*={B(Yv=8r`6Bnkyg|Npw@||Muil3Yg$G?7 zva(O<%xn`e|D`%=w5_Aau#Z4>G_~OB`1#NOcXi}lk4AMco;d^j`tDpwsgPrHfwuzB zfYR^iV;@ibTKv(sNI&$=^uxCIj5wC|ZY`FFxt)B`b7u#|ppCvp3L;yi5#sNWa_J|s z{cz#626QsQ$R1BC*UB}qu4HO@JdqL7v79s=n@R}Tl#r8RIU$pt$_nw=w2+-ir&F06 z_K2T~jZaLE8XR*GMR|y zRHaGu`bRn6SUllw6x-r%FVFFDujuFe;uyzy5^jIo6N|@1KH)?kQ8siTVw5k2v<&oi zmnzhP>rd$x4Id+R@MNW3J22LQt6TgJ694mKmHvKWUm8{gd9(Lv($TwRK$zHn*kZq| z*l8os&*G{ZZbtBJt-?p+El1gTY}bymnrEj9=$-zvk*=;bkc^Eck^<-BeY)s5%;}yT&AWg>qN%M6+v0CbDYwL$p4~ z>sk<^mguH-1)|9)5A`I-56fZ@xug5gYLwZb=e=|stw()UWTxa*5vo#SlaIpP;|L_0*9L|69Z$Rof zFhnm0{44-I)q0Yf8w;*l4`#| zR73_6e=`kav0?u})C{W4hv!3a5UEHQmHZUdq?``pk`ZZH3$RIcRwzbeCrZ$?F^wWz zC%c}O5}XMv%xh2z8cx$uNvs1Sk9%8#(l7V`fDLrFbV;(D45^~qkpP52klBuOLD-kY zAcUbT_Cgp2nQEg=tTEP!7NbL~H8zN{e-H7$M*Ihe|1kFFJYo&!6)l`ktmRt8?OB|k zVKUXX)p>qrOk`kigSF|U;_{zt--Pa7QWsYzS5{n2kO+>ooNw+-dmdD*nCcCIc&&`Rz7mzRgU+1yUd{U@N zDGd)eC@Ci3((uA+sB0pu{&wWQp|TskNuxqRs0V+Zxj<2B}ZVYD99ms5bCHQAIqih0Vrw_9VedNoKz8`Aq-F1#S2YLq1pj5v;`qWP%36#jF zW8Mpe`ZA*bVJXy`iT>>#|BFQbZMe|iP4vEAnM8BeHey)b6VOHUjn{LoGI^c-W%Uf* zeAGICNvR4iqCM;7KGrMiMe+2>fIfw0gH0}h8NWGtYy5`8+1fy9y}yN0XN#Xw|D`5M z+nf$c{SEFMZEQ!&11UCPMAL(I1G&dl4y_Cd268J5M4Nf33L@)J&ejgmSYBF#Xk|#? 
z=`7@5V#|xBhAI#Y!y-CrZsIs zk}iv4CJk-bhecbC4^j{>59^CiaeNS6n))P&t6T9w6w&TeUxXqG+hsAHnK=WSA}&Jr z0tPbQ_y6ZI|2g0JPyQ6YA~t^@>ZF0_AW0CtHLG>;wV2KcpU2D{))v*S@ z&V=x4BQ%n3!Y6l-;d;YtJ_Dz%d+=AQ**Y}Z_klMp2su9g^)B2t_Y!}o&s*yA4tlo3 zc;oYi>jaS38(Rr@-PUke-w^x;KSCOXFDis?jmYubi3JmycgCV1ZesZ|r2Fu?$&yv6i zmQ>XHDCn+zr7+A;#k1XXX@ogc9PMr@-A3c65u$P%jitsF94Sx}3XT=1NuuHeL21i5 z4?r;*b+|xq%cbz)VTT)Wx57t893I3y3LhJDs4q^Ez=Z;EA#@{jBlIBjAaq*WMVHkp zx~)FZV-1LE0HZj77jXdRaR4tV0Z74v4Zy~TE?bZ2w)Ki0+i4U4S~=QIC2-fPD8O7z zwaVPSU*y15GnR7QtUbL6g7y8Qlizx7ApB(77Thx0bHa0ZH^b}!B=T;@qmU~*X6d&(|L-*Edis${lSy;y z&wGnyB};}DnKzg-p!|L9QhDey;tR|&vjECF@(Y7+k`Mc$m8mmV%*2lT_NW%i!@OqX zsqq!d*e0$T>v0kFI7YSdL_CUq!ITd!7C}NY!JmIs(h4=>cD>ANr&kx6;RH)bfq>WN z^&NM2q|$RqKFeled2uc_%d@#On-^nwHk+Bvv0QA9&CO>r>1-bL;9ia;r{|KWoo5q# zd_Kue&k1Q(NN3qtD#fOAY$lr)rc->bBbVhmQmKoxCy{4{=kls?TnI+uzEFbW{Vl>N z&L0f>!dy^@_##5W=L_)u2p5QPoDk+c@IAr-cjHFM1ei&@C=3lMn!u;eRQn&pe)m(= zeyt8Qfe-ifKZ*Olm8#3hC4zhJ7(V%u}84Vr+)I)(CILycYdnY*Nq> z4_S1b%{07#%yuk$I}O*6sh^T44S$S>>u@6~#2s9i)`Q0XV^+?=?f5ZR1SNs^FzDe%jC#2C6hsddK7>#9wuc_}638&@+}(4_g{XlCF2C>l&hMP>_kEZ1 zZG5AqKT_sYN2HUcsE}^aTuV>K7MpITc%Uu~q3Ti?42r46ZE744%=M_ghG5@0Bi_o5 zko||HIH|gzN=bm1T{}o)=iq~=g{rq7eIH6wach<>-U>9-t1d8MTxF4wc2G{~EbT8t z;;C|Uhs~on*G+%QORqT_wKSo(R_HKQtCbvX#>iuF8xR%Xhs6a3z31Oi+vq{XbIEZ! 
zs!J8(5eR#!y+ZsVgnbJ3LKs!>3WP({UOnp+8d#U$VBJC^>k*3l%Y@%g_}2-481uQH z(7?3{4lX1#a_5D@eSALvd-YTLzxG{G0I+wPb}~{~6|d{-E}N50X}!E=n3eB=$Fh=y zC!%;FiYKD@q9`vG<-?)^SX4{#V*Iml65}6Rs;T)-)hZVewLDSF6SX{6DA*;(*$uA%~h49?~Q!)3nP(hYM*Lfa&7*(vUv6$}`?$RKDxOl{)YGln_2>k+nRg7d){Aa6?jk*D-gKMy6X%ToT5o}o6SpVw6W zVO>X=re$eikg?EN4y}w^=`ciZ$F02fjI{?>hraNYOL1sy4!GPdH(I5tky|vTOu&1U zJSzg_MKpo+bEWZ05{HeYaXX1W`-gvq#Pi8get^VZ#IVt^Z*NVpX$IuTh|;#&v2oBC zn8I-~29|Ekw}ic6kc{0JPma3%ZEc$AaeFn>-{vP~v!}CNGu@s5itsnl8$qPAEV56y zELl!XjXOFp;j?J11Q4xYw~kEKI*7Kg8zwI9d$s5{6AzE30nyKEgNqF9To1B|RrJ(NF|CT{Pg4~T1m9hV`FG5(Q5uYrm|lt?B7k6 BnqB|^ diff --git a/hsa/gfx942/pa/pa_fp16_pertokenInt8_gqa16_1tg_4w_mtp_msk0.co b/hsa/gfx942/pa/pa_fp16_pertokenInt8_gqa16_1tg_4w_mtp_msk0.co index ced740d521624e58d5837339cd5dbe332afae096..bd8d6a9888eddb1f26571985c344db480a779b6c 100755 GIT binary patch delta 10248 zcmeHNeN0=|6~Fg_F<%Bh$4whT5<*x+BR-5juu0v_A3)NLWy!XtbRzMHv13|;9pf-k zMLfqfsIAZjdP9#2h}izf=t7zgOLG=69HC{)+Mv-Y($Gda)@2fZP>rNrlw@1jeb4t{ z9@{#pVzZI*B)@as@7{Cox#yjC&w0;h=I5Cg?__4sHFdQJLQ`sA#x`{E-M(EOS^@au z8lgZQyM~xZICewq8tf3$qJG2!0$nMWhFgk9W9`K^#MDcMnUThrLP3!0_U{Fm-Ee2& ze;g!{tbymD1w(GR?Aw%$zJ;w)dd<$7LR;v(4eZ zxybx6-2^}5&ky+X-|@MGc`T`926D2|6lAp7`-%}VBfsMwE%-#*x zH(ykAMPCQw6UIXzYSnf{Lxf*!>SDe=`?K(z@+PF^V0MB6w`mU2NLYLawx8o|{GBMNj@qhO~6bbn}i9RU;Q;;GPwqZY|)oef5L;PCTQ@Q6_v zus#1&@DY}iAAn1bu&n%B91V5K>p9xpDQ|%Ek_qTOFr5G(5D4U%F>mJhU@*^uc?-vf zLU~rqTRFbFJCC2Z1pzZwz>KK{Qwye6Os$xjvPxX$tTLA+OXad=sa?DR2Z;hN5(Qo+ z3LK{4+AAP4y3DdYE{n|MvdS!20lac-PEK(%jTQWEY8;aiFnsfHehOpC%cd`6c15MZ z)M&jV6zu>#QBHq~7?2_cq=*5jhBa4ThYhI32CT#eG++bn=5?h+`ltO179PpFi)c<$ zVrhz4nj)5_4zIbUz$B~2hOWhi-iHnS6<7G$mYy{F9E#iRa~yB&Z+pz|N>@1E=iGO4 z+nN0Kv2gX5YIxhp|D%`wFKj1v((w4nF$v?|kHdm@xK=(!T+}(R*?t! 
zi}+z#GWcNy_QP-fV_o^-VAA=a`BvdOk3vz-7x#vY-#5MhqHjpLqI=)K{Apvi@id4| zBIcFzZ-o~vQz;4Ge7u)K62@CM1}DQ?zE+>g>66F@ni)Sidilm|DppAzHoR1vg8H|I z_s<3)Q~c3;%)1R|pG5w(4*_2=@`ogJ`G#23AIOOB8p$8^56qXq@N`owd?hsWdl%|t z;aBBQ*aDM zloFWuH~Pr<2XH~jHb9sjIh7B`A-(xtJi`B3w$37aQuyNoWh7sLMjQH_<8hQwEMm(R*LUdBN}Si`wm-kp0mxy!s?VNSX}c zmNYVY7P=7aDT?I@21R^RSr2&v-c#h~;Pu3R{P!*_C?kuLa4v)1yy7Fuy2u%hC;W3E zmn}?hrQiP~o+9WJ>nue`E?4wsBJ7HP!mqQ!zj(>-ia19|4p$t=kKEEBE!zI;PeM`7 zDTTyfQ7KLg+9S^z&ZAieYmS0TU^MQ}$4{uh@! zvSI{0LnvRRf%#NY!I4RDAEAGJyyW57YeZfsiMT`co!-dK^dmW~R+p*DR0YN5Zcpnm zhtKA1^t)OcTO77FkInCD^xJ&imNuKcvDMag!t3?;{J4kx)y8AZt;cY?!`9?Ddg7R^ zxz*{hIXymGquXuswAs8qkF(kBXe)2?*~{JT_LfqNyyCF?`HKRl&Tv#!*<^QAY;#uG zD|C95-mY^RR0e00N~LyG80_jsyWOdGJj?tL=Fwb13II|t%P)q%;7{57SqilG;~2+1 zP4LgI#3Y5*LV_2)Bp@+bO7Lm380V@t@VqqemWUgyu?FyqRRa47exOC%|7C*jJ1WMT z2(DVSfY>^v;0M3tI?!>U*NA{=B#v=h7k?=Qb0@`kFTu4VVtkO`1$V{xS%T01eg)3W ze~C0Kt}^L3!ApE%Q~!$K1G~lehXj9s#Q1H3_pItak8wP|6x3f53w%lhru)SBV}j>j z;i`(_M+V=9*t*RoZN^j8$ ztXgLkSiPl!uodd6Wcc=-NSe);r17gvc6$FP%#F}TPDQ!FY8hHQiwel$2DKTP1qf@4ISe9d!j&uO^}VV!FtH|c%f z^PKm*=k0UObK7&f|IkHG>9m0g&jJX5J8h>n8+!NcoV6~%P||f8TZ)dg0WwO_HOAUh zj@QAz7-^0~Fgl;U~5#>&%qhOjD$h z3jg4GnTqJGO_31x_pfSVf2%pk_fPBrfsL6TMsEA-X}S^&-}VP-x=OSmn)ZsenWn43 zusc1^SDK#ho1bp>m8DyKQv6aJ{}PU0j^kILc)iP4sxR@)*SmdX`uV=w+faUr8g{>@ zNciu+?W5>Q{*^To(^&7+jH+R6Q$!t9L?~q_vWbcn#o(eCToi+gVx*!9QBihOloJ(I zEG0(!2~DDk`)vEoPX5fa3Yd+)A%iPsaK#L+m_ZdE9SSSiK63`^o_Wy|fQbsmM9V|I zCV%>w8`A9bBlzM?v)fhPv^&8FMb%56dx~JnNZ{PF!5CR+pNm+ymqHdy^O9hp-rc5} z4(@~^A`=Tku8&>MQIVC(rpUq;-34!1V3D&tiL-+nU>@WRkK$HGpe$u#3a+IPb^akBy&UB zuA}^zJmER3O3XU$&C$Iz4{*^Ss28#r5R>bp9%M)_)p~B=U7!%QdFd?Il^r@9mGTdk;3J1%$naGm9K&4lhj$BK|ssVrvvAO-Ui@RCi7Dy>a4>A50^+1 zWJXM0SD3^yj#plS=_IHdmC~a_ce8}kK6n8=cP;{2hSPJ@WD%)e7enJB*0Qr5o^I$_TA z>!FB98xi6!S#lw+)?pzofxV;d8u2e-XBY{wwgz@)fz`sajWC+D5VuU?(2i4x>)JAL zxp-z2uUdR?O^-FHC$W_dvBPr`V|A@MYwxMC+sHRV~Bz(ns^si~&CC30Lm zYEl%!XGu;#x{J3m$wP@o40r+K(KjV0ep`-l(~3AGIcqTP8Sw=qV$j5uryh()^KZrj zs`otDK~YTv-yp|ZFBIetWJ22e%qBPb2y0bAC{sXmMGMfq|( 
zmRC(_hNOV_?J|ycjflU7am^P1&Huo46L(x_luvLU<14?6z5gIXv{IOwq8G{WER45N zay%R3Q%}b8$L61fJ6cB!uwmS_PoBt)@ndyzJS^f+L;3KF8dB&Igd5Y)dy^c3#B2~~ z$cqndZ6|CFSS>}Af7R;O{WS&FqC)K2tyY4X%Pa*LH9IVJ;#w1~BOYL}IN|{nWzIxi zlEG>(CsSC;iySB*AkJ(lw28jtmdA@IuoN!9sI|;u!!9a~B(|B0NI+qM#fk$8Y);}9 z7MMwZ-CRgeo6|vxC^eUtlWfi+CxIP~Qc{oIRz_Tel7OO$cxgoi73J()qXT{BavfdJ z5B&ztco^pE$Rn+m<`y4@b{%;{ouIkxV!IDNvifN5oY*FhxVto`>xK?3ex+B>aEnos zAq#1YEu^oJ+trO`Aq(%@&7JFpc4HK8geZQ4m#gE9N1$Cx+LO4YN1y`^?B=!~fhEwi Pmpdog*1eqWD4h8(o)SLH diff --git a/hsa/gfx942/pa/pa_fp16_pertokenInt8_gqa16_1tg_4w_mtp_msk1.co b/hsa/gfx942/pa/pa_fp16_pertokenInt8_gqa16_1tg_4w_mtp_msk1.co index 352b18f7bcca660fe2e2c722b02c6b4099a2a912..dfdb2a80e9012701d8026ec5794d424d06352878 100755 GIT binary patch delta 12380 zcmeI24R9O96@c$fw&WifpA_4qhRKh^kD;-1`Z?)DWkz@UNt)6&Od(@vh~hs*2~I3* z1zeiR2_w51kJ~yNMG26O88VP5ZfhbM(N3@eO=+3NkY+M%h5?ecVJ6TPGHswCbxHT` z-k!WEr5z^J@(h}>_uaQ|ci+By+I@H0)RWdDQ`T}hd{cy>=|IVj@`cQ*B|EO~k?oZB zIS3`i>eny>5eoZYQ%Td~-0bI^(XGS%nHBj#+%LH_jsMLi9+>k-nAKg&7fxf2U zzxSsM(6I%{YQ0-rmcVcpRIaU7%uFNg%r0ZRm8MO}gtDRP1Lb5@S=I8J#V4D4VzIui z(k0u=w7z!r*1rBd}Gl0{V>7^s(! 
zEU9ujT^~epP?d*<>a~G4F_ef55K$?iQbJ`Al|fVnDwoE^%I3IK$;E?}&bVg4&0xT- zV8Cr)z#TGu=?$pz$Hl4}<5E>19;}j(0chsPovPuYj10V|q`)iyvj^i>nv_7jRhF%T zdCNdaK44Dg`>BzgoBxev$LgOi zkyT5<(>Cz54LohTnh|&( zbKl{%XHwVEPYc=Pw|1nHI4d;P#~PmhAe~o(ae3PRUH=!Ue8N1KU%nm5 z_xXqX_fdI|NqPCj;Viducj>GzKVwtrQYCfkIPJ+ct!^}{_RQutT3NM!Z|>v+^lK%^ zC^kF4z0t&!T$_D+`XM@dJeHt8+xjp49`Fgb{&cDQ>pFegk5v{{jiTT7_m|U4>Fo3) zwqn@($4j#l-mO{f%IBp%#l<%X5E@qen$$WWp-nbbc;HCCUbhCo0#liyt|g&Z|L*(}2zBA!%kB zz-Omk+@tNdv-$usHZ@_R`RvUnbX#Wg+pO*WsT|#3mOc2}QA_sN`!#0fh=8{Lji%3b z{-W9W`%81FH|aG@W~_|1%HG9#67VQnSS;p367XDcthu>~_vx~#+}L9JL5uw5vVsri zL>izwaW>bC+?gwwXY9E}61~$RuYb3Y1MFhwP!5n|SJK55kQ41{KtBEVxfGBn11Oq+ zAl2UpiXAV`5I58NW zPbLPzT+>6$9?NY1K7cZU(~V5fs-*#39h0JSXMe@$naXT_xAB{T2j}Jd`A71ED#e>iu zpQkuw+i_oAW(8wnWv8>5YvvX;F6-{uyuPb9nrKVLH@9u-iuUzHlkv7>v^TM-FWS+z zIoh`+k?847B8`qa+tzQ`ydKqeMLWCNx2%tD*c|JL#(H|AZQb3`p1x?JwURQ9tt7d2&(Ofgl6(~So_3Ny0(`4Jl2Q2t@WU5*^l5$m$f6U5t|lCS2HpEe{!8G; z(j-3$ymOf3uLDm#Nb=Lb*X^6>e+2&q6^)OP4L$@OdWz)bS}0hk=^V*xfX{z{J?%|9Yr+?Ks(h2Y$e&2}WfA_~~;7(u6MofA$o~R|CIAJ7iR-liQ$T54E=8dpC2RnPMyot-X&zkq!11;Ky7f|0(c53&{@w zw_Y^xE5J{$R0ZsRB)$q2CAVsVF@raOC)Seu4Dd$XRfhgQfQRlT_05c`VxcDF8C)IT z1{JNkuZ)ha1fI;yXn<<31%7gj655yyU4>c!82@>wXsyQ;`WT0SA6i24-vQ^ENd5=lr!O+-FTjs~fz&^TU2!%{ zR+uygSg3VhCHZB*54V#1bHLNlybU!|r^JX1!E!f9CC;snzBs3W6w zsDXZDG}BON8j|^Ngz9c<@9K`af)O`}UOvc!$b~~bEDF9L7M;8yxSWW#RfZkhQW@Y2ak+& z1*{tpMBJHEb~v!iyYO&MZxGw#6eC6#A+Hy7U6P1(UA!0f=HfiqG?$CV_P9JEw%z43 zMn)dioGqj2dmKFOjpvM!-JBD5>~@H_W4GIdJ9c|~*fi7(kKy)3jAlUrPtF|<;rqoq z-B^Klhh(h5d!2@c=)=?JLyi#4nU8p}2feNkw#OriPEhb5Uty6qdhiH5?g51azF(ej zNN*;0N4*XsYJ!{x_a^G`p@xSL@!~GLJ`s1}mE3qZZzzNf7C0B4k>C{Yj8G4F&Vt8f z=n7Iu28#vBfoCU#yx5sO6q%slbBS2N=iv+mfybWk1vx_@EMU__E{sPOU2c5qq!5nL zwNc4o_)7Fh*kaLVM2#5q7+pk+*-6d_9z){8cuJBWVpmC$*G(xj>&g72+Vm8Ytj@Hv zrsfL#xYlpWJj|LrJ9e^}ldOrWI0cVkr)-&eo5@pwAKOaoncHn9Z#g{0rR{tyQ)v|%Qh(;U3T`rwMopwR%q)^a=I@+chs>)!Djcl}PmNM9Pckesr zLsc*$(rPLB-u>SD-h1Es?%m~c&Y!iWerYXDh9Y5xrkhL$OXo8bcd&kA(ne{YK^Q4c 
zzoyYptcsQTHDxEq3pW@I;Bn$SGtyk7O`85c#oj{63Er^&J?E?FAlev2a>u(;xy{a3+$HQ28O`*p~^yfH47FOU1n zo8p0Te|&r=@=sBBnzk0@*ct`?_r+bBID8zf|dglv$I zO-qPJ33*XMB1&kf<`msbD1h0JGtE_WZ+7V68AbEa@@$Z<4brtix;B(qf~CCxg)n?FOqjP?rz*gH%n9JvSM{- zC)**^ECa>arZYQAmEKk8$tp*F!NorMFyA{C%hdIbf_kU$nk2Z$Hgxym=huUlLmAzw z22UK&H>8lFoGh!p`C&SzmIL=s*h=fBi}I+H?6}^-+%(ZjvT5o{<(;ecD+`X$M+$D8<8$dcrQ;{RwWOR|;j zr8Ae_VN%N6I;Oz)j{02LJI}u7S~{oJ2KUfY50O2z#H#jC-zRj6R-z}EBlmn{s(-_& z@1R4|m7XD{w~Ah(tbUt0UT{0<^U|!c<0x~~q`YyC>136b5|hKKo!8WJpZaOzGS63; z<}tUp%>xaWAKE;j52P4F^S~r`H!MlDprq%q;{eI3TOjMl`NS6k+=%@ z^}8JD@mT8oQqumLVBZra`9|PvkL9@*e>*7pH|G_q)B)VFf#hAlhu4w(r@*7FB!3b3 z)w?SC8-4vO_3~TfgaMea_XNrR0K9o8$^QgA^(@KX10Fd*@~gmioS0aD1aE@k&=GRN zZQw1(NM5Sl3M|$CYm!$1PrpL)`M`(I0LSs`L2>vTIl%+`BC838vI%(fhJiHUOTf3h zOY$|q2mea)(5UDr@ z6XNqo{wv^o9m!t--t8p$5b%M!0-pnZb-60w`XlijP`tcS6O0vH2EO1il8*x4ww~ml z01rMxaUVl7Ak!LngTkc-;piUj}?{j^ry5NAX$8 z+D$6H1`}d0k^BkZTlz@;4DepP=0@eV03Xw9Ztx`Vs(0xL^+)RcpcuC4-~HhkKfx1T z)C7#FZs1}w$zKBAJWeLs2Rub#dx-xUC?aK8p~n~o-ann>e+FLjFv;HszUZ!m{sBDk zMbdr@mqPTA3X@grH1Yq|3;Jdo_CE%`Kq2j4 z0N(ieM2_P3fFgEwqJpBbz`I6B{wna1DRXBRZprTlQE&;EI)WiLP=^o>VbSIE zV$sdJuqcGF2Zz_qW9oBBSPVD=AUb)M>MW^7b~+sxJ3?U`&gpV`z!;^5UATg-AW#?j zU~hsGhjWTv2c}XGJ9D|5VN8P|?APT;dPMBa?ZvBec|v&kF44%x<@e$>yM2Nox;=)s zkO!yccDivAZeGBC-GT@EMHSWzl4$v!kQ=AT2g7(t?x1AM3p?@hd8deD@a_=Koezfa z>UhCz*!m>HHsCO9!yfF{;|wNoLLT0Q_rN0<8GC$T!yv$81HmEU7+%4P6ZV8fybQ0) zgX`l(L7>83QNp)}*AF3*iS^zPk1Oc+8`YMa4opSAh%*+v#y$&zgi9<)P8?YX7`rHh z0zQb0JmAQp%VUUqAe6)wg3B=WiTGX;gGS-Suu*uQ(@UxpLcA1T&&vF()+8`3 XOs0OJX*uJtWLnj++mbo4(6sP>r)4=D diff --git a/hsa/gfx942/pa/pa_fp16_pertokenInt8_gqa16_2tg_4w.co b/hsa/gfx942/pa/pa_fp16_pertokenInt8_gqa16_2tg_4w.co index bf09eb56f6e8e58c86d7bae4a20356babb1b6ff7..852461e4bb127604a2c5e71de0def2a8d9ad4098 100755 GIT binary patch delta 3403 zcmd^C-A~(A6u)*zNPrLZ$NtufR1lTBL0d-nGLQN8ELK~7a32TeA z7$RV-nl{n<)Tedb!zxW0RF*0 zd+xdRx%YR^&B@y5#P%12N$eZyCkY}@v0`c_)6tdZQ(_%ZFAXWRNg&Y6RI|6Lc7QQ( 
zQf+Gp?i*)7T>%dFi{+SBn_v@Tt2U5ChJPLiI|*y+gWm@Wgm8GhR(-FA75Foi2yfLw z18FB1$wm6AbtoHG9ozcm}p10&rZIa7j=<#S(d!|#j%;uq!loATU6&o&sXJ)!%G z0Mcr069K=rm=0+Rg1_KGNTV5S($Y_sYXq?k-=yEK4i?yV>)5)vdbVDAy6&n@q`)T| zG+5m@0M<4Rf;$_fmLV$urIH2A6|MR!c@})S!IDGyAUT}3gQah*uadwD78TTdWVGvN z^G=qUa~-D(40~YCef(fy2aTgbL}>?&rB2Bh%2TIh9L-ZOmLhLkty)nxoMkY=CzK~aPH*96*cHOlh`GNI)Un`{R0?VUmqodL>ySzmb686wJO zm8Q%-3G-hv#GoD@Qm*|8$&e;D%8;Hv{@)pL!S`fjh*a~^uKwA#b{2|qnhY$k@3F6d zVp_jg99Twto?T++LGiBS=nhpp?( zY$6^+zgzNEr2-ho`0JmgltOX8R@oh)SLW-X@eCz;JWj^RwAfvVRC1EfaOp@co{UWM zTsFn!;*lJeNl$0FSR~11XVU3ZCWky?7bBBX$w}1Da}#`YW|Esq3Mo!VWw=Np!KJcX zI+GHn5`5N`&BR=Z#2eG?$Z~;?<&;K?;PpqDj)@rWZWfNk++H8!i+Ke<;}<3v#>2b) zF;65G6MVb_en2$Bn_+`k2^wC7Yo?P}>+379?p?b!eOM7h5o z+iRkT^xfBYOXj2z1aOtEb@d`rr$a^ao+9@Wva3ZdB+o8&29I`tL?p^5IL6o8uThWJ zt5IKXFQ#r!&oPZM%!_dQtP|Zg$@N-z{hZ5$H^n*y@166l+^|3@>^5PgkI-;1?9*C7 z8eaRmwMoN!VIQr9iT|twPQsP&VaSY8Roct~92^|$px5w3*En^mA>`N$*Tx!5nq?Jk bjrqv&aaeV}iws5J;Q21oW<>S=SoZ!4Cq{U9 delta 1607 zcmb_c&rcIU6rS0(+b**1EFM6E6bSM|v201D4JoAZs|bxL1Y-eB1zNIch@uxE0SgTf z)0hYu4!szas~3{(!HZcAp40;n{{XJusuvYBINP12KnP&MCcE!@Z@zi&dvA9q^BJyw zfLy#i(7{0HGc9p;MjBkY`Y0{}{W2`lT}X^YgGRhej{|B`e%&`L*td@*OQ{fDpEzj8 zbr;B19h4gz8R8$k5jVoh@{{x71RO0%^LkptnrS($+AAKjOf{@k>KOMq96O(npYRzx zDs(enGD=z~hD(bUypFhwMyqsh9kC+OwuY1+C<);fo-TN;R9~1d#kXStgo1E)J9ZnP zKFtOZ3Td_%p?h$*lCPDkc!ylgOL7fgFK78Zlz)fv`zilE;R{~5N@$j=1)p3aw8-1f zNPYlzE1#Nkc=DkPP;iTJ*pzEUmrZ>?xp~!^FspyebLs?mp=w1$DoUiHL@G+migFQA z4MdcOh-%CxCU2P(VfV1oS^S-`Yl)~@k*XD`T9K$#7H@pC;IS*~foUdJ`oHFJM1P8_ zN-vF$8I&^RBc(&q*N}B5p#m_Iy?>h%XEy<<{!|lKW&^W-&#?RArc-sl%$8Yp=2WeF z|D5-w_ZCn)E%U1XIl;5u1@A0SR}`l|Xg={JX1a@5xT0{qI`_D#<0)TCd2sie6$-d3 zY=;QP!glQg)h^>#tH2v#Azlq5*6DFb4hg@7m+;mI8@B?~qf?D%eim3kZIcnvZ6S?& zYYxWCX}t6h|GL2+J;?XaczTe4_I+zB8fI)DMNcPVsg-~N6FDpn9RZI?Tt5PE%g{n& ztKL&MRru;i9n3|KF97|<#M;Am79K@mmbdR(tiszoA#DjQXZhXAQW2=$KwK?=GG-5upAE-lT?u4joI z9;bu0j+9#Re!uz6%+9y_?Kl2;W;18#(SOjTZ0oi*3c@COthAa^TE>h~Rtu!hB(b0( zTZY&uM5xP_!Mc2#SShvuK(@L4ibdW?>Tl1tF{u{-qteL%sFgcXxZ>wKqDHvB_J5xb 
z4a3mdm^yoY+0taV5^Q_asF| z_lgYe#8!hZE~Vv(-`LxL2Td-P={|>Ji~#aC*YM z69}%_5g|>m6JvCCY@DCb{}Zwm;Kd0II60vQFHPJ8PE8D|4+(SYcdQ3sZcYtSfUtnMX8`v14SNAd?buOi#Mmg}uC7WG#wHQ(+*xVH*ev4i?n-Ik zM<`j8dDVQa$HjI9}4gNg|oRkpB6N}v`8aAq1 z!X|ZF*sQ)4JAkho-%inSJYT z_n3L#Is+6VqWi@}x)u@1&m*+2Wt4CF<6AWQ^rc||PX^LAKo4K?~5md#-rmZGurN#tAC!=G*ZTfyfXeqW2egjP9*1^(>VD#)MM z7XPVairM4KgyNe{oe(3k9L@BeV@GD^Wz<88 zr7EM4&$#GPW?-6ax*@3+TgoIpsC{@gEr@9&OR7;tzN9)MNh;LY_y?3aa)N*9(hmin ziFserQVh@VW9e?X;D6zoPCd_0K0+(0)$8~-#f>bO~#w#g-eE=-W z8nB93vik`xlln5k%iE*hUYU!E)Htb^_c70F#`dq^l&fI{%P{qpdP_}RLnJz|J2>c# z`QzaMe}B+B6!pf#{vsy+~?ojH?SMm2fe+)U3+(X`vyW$ zZzwwG^+zJ!=#V!y7!CDBfO%i;?4S}aV^>IyLaKp^A{8ql+_^6_d|%$5N6E^>9mESGEvSpH%@zmo9!1$-^x zU;cA``wfJz+mg?_2%j-yz~NJZ6x4jqJF<;!V|;oXG)=B|9cm+-O1?B=#yD}@Y{ zfZoG0L&krQbV%fSgSipHr~2kQh$ut&$b$Ay5kB*G3H*3CM)=XAKT;Anl^wqX?>6Uq zdYW{o`y{`^n}oNF=krs9uUgRlBf=lGAv&jB1 zcAGo~W@UeD8yzmWTdP%WYqK?P0^WN%?Eb!Z5GgA*aCfYMf1zS(?yEJhp_EJ1-F%U&lf)D~_^W1ySS$7w*jckcF zS>E@3&ii%V)2BP9=Ng?jOB{gcKRl2Nz}b$m^Ko5p(~J(2l9ossmy@ck zGI2U;4y+aGC|?P;#Z<~|H|Dqjt(QxgOQ}qb+eUe)%v~g`+#$N0GcnIVAgp7mU~fXl ze^CwT85RvnL?(e#`;aU{t?w-pj+b48#aRir>C!pCbz9GJt+sR1vc|ah7C3#TVHFU! 
zRK~>^);CwixhGeCBjl#sfYI&dgW|oOm5j9o4&Li&Wvs1|T*X-1B-zGT*TTWMMpvZD z=!sMty^$KDKO+0D!2Zu-|JB(4dE{>nMykw9BGu+lq{h57axaDQ128!EJ-VRt2P1%K z5ne8@(6pY{$3=Y?Ene5e#9csX$c7uT;f8FuA)DM#05#Nr8d`uFYLt`Ou?%74FWx+ZT5)9W5-3Z*pX@qQgH8JNULWPRhB#9FPRJRcLKNi@eA-fdV0 z#J@FhvAGB7`x?3%_5txW6nKW4?JN2h4G$m-J8L4(2!pe-E8gqg8(&CM}qC}ehIwA|qrFNC91q4b1V z5ouRSk9HhFdJihy_v)oK&HVx^9?+A>ijK~4O(3NUM;}b$?l-(SuHC0}v*7)hDD9pw zVS`r9gp|Z6C%Awin~Dzxex3b>Z@5o z@kbWT|9};{wXIQ$!iYvg;b11rh$lYVrWtFPLjISCQRw*A1H*^tLhT7tB6(V=Jt`$S zuc6x6pZu`x`#S|rJY|?fPE5Zrt{J3E;q~atN&Zbwc>9Up#DGllc10+6 zSET3hdNIby{qy#~;8qFh$Lr*xc!le=Oy#dvK}N^=Yn`=D?g)HLy8IPI`<}x&uqc(K z9-KR(#;0QZ!U$ZsH|oKOWorL<*xw#d<9>|qs*;uH(CT8mP&y)DImSCz54p;UV(LL^ zg&Kbe`zPiocXYBU4+iK#`fY+p64if0w)G(M+VBMc;soEQ#uFG%kMREx*yF*2{FTQ8Z>fdd(I)SF(6RA__Kx*zDC8=r9;`a+n}S63 zI=w`7u}wtvI^9HFvScw)gH1lHde}uI8CxH6VwClSq_C(|*%^{`xd~6fBQ&2RW*09a zUAjF%qPl`UKhEZIxrfxCk2ttPZc@%&A0oYboIVz;U$cDv#?3q1`S{4q#91)2U%#~_p{Jq!0+3=W&Ba(UPc~q z_<5Vu@qe(ewwRn-(#|_v&}%8GSI)PnT#iOsO2F_wn%yYKK3!ROT`GvaZvO)3Hh~cU diff --git a/hsa/gfx942/pa/pa_fp16_pertokenInt8_gqa8_1tg_4w_mtp_msk1.co b/hsa/gfx942/pa/pa_fp16_pertokenInt8_gqa8_1tg_4w_mtp_msk1.co index eba6f4ad436662d26d6f6bb38177c094f3d57b7e..38f8f6889637add40ce9e18dc2cc84280050202c 100755 GIT binary patch delta 9482 zcmeHNeQZ?NkF z7|L_fgi(~Rpx4TvQVG;5{ZN#`UNzi_#?gw@vb9o6_XmH_ilIqk)zqaL)e2-}_q}&- zl9%p7YoSI;k?=Y9ch0%z+Nv4i-GuW z8dFdtjUgf`fi027V7;8CU4jMxgf6yI3-}f+e_T%Eq7?v}l@enhh7=nAdqb`oZZ!Pw z`M5zCzCW&!rX>`60XYb6(QsR< zwn_>GI1$yNwIz%BSBoyd{22k)siy(t4ix{kz&GeVTI*xQmjNo3w+pTbHzKx@YhPvMbpJ=WZa| zlAQu=>Mv;)Xmfu_3uNcYfc=BAy#OS4?9iJKZxZ;OJN0J7n+3k9Nneima)EDd){Bmx zrho|rU_#W4s2S06M9UE^D`X<3LR-XK=!ldTx*}o#+i?K9Z~%|u0QRu(_6MM;i0!^zOKh+Dzl=44c(WVlf>YEUw2#dYgsd!b&6QGlP;LyAS$jSixokV9G(2~6C?k|f!;t%z?uS8Um2x=Ka1ikW?gQ=t zkjbOCtpk(X(eJeI?^nMf2A<8kEH|_EAWv_KC@B=!$D@;6>-vSjvU$_t_xp4?iaj-h zoLRSlPjA`()hbyEm=`rs8sVV7_$sgo;m|PUo}dRm=MUaM&=;({CQj{sUm!r?iGQe&B*^?Ak(~bfjPz_em36 z>s*o_d1;N9oov1hAO$m&$G80123h-!WnDG=m8%blH?eF!_4#d|b`%-9uMK5{f*~!_ 
z6X?(H#jlmB)abR+A-;9)fxDt_N^2<;(Z{2QIQq@xnZ#;%GVA+Dx~I?e+qJ?MI{*qK z0#;)xeji2rESBeEoP6W_{MKAdfYMk_K1N>B9vN7b+FT8bSca*fE9g>VWi-~)8R`wh z+Y*tUw(d}%FBV8d+7f}@cz0hQ*wz#1+Z&I^dJ{+^_;_1qS5GIBhXNg;_Pw2fuAXo# z5RUZ*+M>}wtS=DnjfK0Sp}xw#-e6@k`b77G2ze|NOo)Thu-(;8J34|P%ert?&|-Je z&Y(T)qFvz*nr1>4SCDB72E)!!8Fv!uS?L3A4sf^Q^*QN^3*+g}oH5$F+RdNydJ_b4jW+|UMFGwHV0vSR$^wWXUM#kdM8n^xM;yKD|oh8 zJa%knVf@51(wMh%SS>`M%Hblcx2g`ajIExyV=Q&^llfglmtnj#*0uVrWXV=5>mlwK zmxZj3sj9M)iMaS0}PHX!$m~ zlW5pzCt0Y?YA5m79G-c0AvBxUUzLI$=-?P_b?W;b*iFSXsaHL)G7mqp;@Z>|4|I}Y zTAMP|EetPdQ|)M|#meM47zZh{7v7h8yABrT;d4dF;?xyEbv?ek@Ufz_D`oP+ihOeM zrMINoz0gr?l*$Si8>Nf*q5@Cpg=_K%_*&{OUg)GQHl_?dSd(9bD~-7Sg%7UDPh!wb Ii4#EZFH_wQB>(^b delta 4601 zcmd6q4NMeA6o6;;{y_frxKlt=)CBTbvyNUWHqCDtaSNmHBB1|uXbLFnx6%pFS%*G7`| z67$}C@6DSxZ|2GD@D<&)S9Q^Tx2KGOuvpU*y_PB8-jj92Zv^shm`K1XV~B-LQWKRi zu&Cu&y5s=>l?&(TC7w_0`_$4W&jKJra>73goyg$9!zZEwxX}_h9%_Y$Qyb#tsDMhX zG6V)JtK*TL$$&ZH7RDKao{x`3jd7D`PUk><39-z`Bs$92nUO!SFrl}#DJY(O90Kua zHVgK2>xD@V)TSnCa3ER<655HS5vSf_6fYZZ!4=~YH|vKd0Xk{=995dHtr?GN555A1 zJ}aySL7SyL*hX~T(vG(7-GF?TPK%Q(C!oJEp({Akv6nSf!1xwC;G6n9WCNQD53s&wUt9<6qU~ zA^Swl*8^{Ov&D7knQ|tkCF&+t6kODzy3K<%Yi3)60JO;ee%7=uUs1TGF|jsPN#!3F zWk~CZnOnYsh~rMkNM{nE616%@%4}Ous>F`A!c&DiLGTYvdvHew)=w0kEIa{%(@?yz zO_YNzwU-N%E*&!sg_#W4DK?j9MqC|L6!eOLU0r(S)K;-z?|H`G^j~*L^?FU^_c2PB zOz6U=u^e53xYTU1?!Y}wq)ris5BBLj(@c2z<>-zNypL%k-Un`UNQWk0V^K}xw>3#K z;qLfVw|w0pa!p^$sVsGya1lmy$nL-CiGHw*g&22^p_X$2v8B~5pTm;fL8>vne&tMC zP^t#)5RYEnA!*{hE>0iuT)R=R9;S(-XBFQi*Y@=fMKSs5V$P+Fn*Tvxtkj58?|GTZ zO{mxS90G9->8R3S3zcOGz_FnQBeEp|&;>33sH3Jp-@MvNMdCbO0 zPcr;VR)q4f#riC16NBg!zlmh z10)6!v9dIPiGZ3X5k9quE00P82<%k*Zzlf91!|s0_^~Wm@rSG>#1Ok^!fwKS`$Dd= zq6`h7Z;zTEBK|jbDi3&Sl?#L4)Ak-pB#GBQE{_eM`=x~j0NM#3uUGSK!g~*^`J05d zEsFCl;oe5I|NGL=k25NNS#21ShX#<`spel1?z^n!w+J79OU-{Iy!nip-z7ZuyuAOg z@jfvO#j8s7#te^K*l!kh1@c|DZZH-M1`YC{8=&~K2Hknu&rfs#Rpw-Vk!zAmZZb;8?B z>NvfGkNH#8h6}`yoUP_p2ybwz`NxE7-D+-Qa0*Nm7$cvtG@&oB+rp1Q`jrf3utE76 z6qr5mq~Gnx1r7DjJlk-P=UuL_T1r$d-%Tm!;2f0lRyS20oQh@4RJa45dTixMcC8cG=m*+xR 
z+FU%PHqIZK$LEHlIdW;nd|pXD31Q=`ROMZzbRWDsm*!)$I%t+QTWDpQqm0(amhYhZ zv^(=SDQ*$ozTKKjOKdA~&^_DAY_t+~J};!^hJ5XLB~~Xbempc<$YmmTXDnSfUj=hk(5p6< zFWjzzj>Pi)%9SiHP~@Oauv9}wG#w`i_0`bH)D;Nls$mh+TO{0;WYjic^^aN&{nu3$U4BTl!DWsf!(#Gm&+_|ZHrkn z_Cf;-56cWWFUCI+eNm$ciFXe~6Uvf3n7BFSgW*y4WPIYmhdDHO?!D*Eb(tCy7$)AN z_xGLO`OdlDIluFreEoWvSX?2D;;C>SNf14{CF3E|aAxU9MzjL;(U4M`1cGYW)Oef9 z1B{#9YFk5a-?$powHd|z*-FGs;Iyc`O&ds}_QMClPQq^c=lNiX;G3?P)oC@I_x4<>`DfB4yt-5H+GCkxlN;E#qL;uqz8M|uB--mNfh zK3#Q#0Md%Nfq*wc`X*}?npE2r{24bw8ZBd;hCf?w!mHF*#1?#${@_5c#9p?t*4cWt zUV6ehUL{iC+cg^8TssIh*6P5mwVdTrd8avS2ViH%0%pth>I+2{e7VMw;bM@C6rJGW zclIF?*ujE=nvaam>eq`AmYN+pL6@TJ!P(&xb)~y#92F&Y?xL~O85yHRYE;INA~i;c zJ9f}<%RU3ZY&hKLK(<4+BaucYvYoOWjW)WF?UL=0kw)dilO%AU02~ON2%QLB2we#6 zrdGjWatlrqBe+anK?z_O2k-(8;3XWuxJc}N02WqoSh@wLrAKgCdQkvqZ0zg98c2@`*3qfNcXeb>RB#?v#f)vr2p|NB%mW;-d(c^oeSP&`JhZGw?iiHFi?OK$YpS-3M zy(AXm(PhN_=TY3FjQE=asU6ap^TH^Qd{1u^vKrK1CeU7}QJFu|xx>+F@%`;m8C?L% zFQF?v>h#!)d+Oh$M~!-SNCww`Bt6!TqV%}_`~N#VzGwD7J-#>|uC~3qJzthHq<@}$ zi+v81>vap|{zYWZu@~7nP&SfMIw0-)#VSR+wsa&4rng58Wfv&T61`qG<7S$jZK+IpGMVGDv4W6}O((g0hARlM z0+-89=ea~I&E;pZ*-Wm0JQAe z_j;FQi!^{ut=h&tvd6>9R@n`gt?!(^*wyRtgH$Y@OmU2_w_P*6?WZ);*Xz;DzT=E$ z2EBd&BbHV;MS0+AI7?nJ!y|+4+ErAA$q$k{KImI|n}VMVGPNyO?iL!}9`w;c+|=+a zyc(*5<%kjXN32HNiOTAD#7}EQH~wp~*^PZ0YPAJ#M&a#<&pPm``sIOP6<5vhv&d7h qf2hHzQ7g*W;*gEJGYmf(>Nfr~qWWg<1)z^M7)PI1*-tf=0Dl11uZ^() delta 1709 zcmb_cUr19?7(eH3Zf?5nxs!0KsWrlB=(>u|KP(ZQf0{$4ZOg#Sg2rVCno&d$OYz@B zP;y?Pgr@XRPYKtDAm|zd`4DK2MGqkaLJ(32BH}u`_Z)2^YT)4R_nq(jzVrRQ?{L0} z*Ki>LP4T8cGY4V4Zqii1Ed(ad_Tw)hJ`NC2e~|oVq%1*Q0&qGV#TsqR@ZUHC*%JM9 zKbDRI+7c|Ohm!@(#}SjnjONc;wDTG2X*wgpXBKSWieQO+j;qST zv18eI(6Wl7d>i*^Ri5MR5HBHvOp%r&|{v zuMj@( zm0WzCRK(Xy#e9RbagUr2K(g?be#gEYlK=|Jye&@`S~QL-#$mm(q8m}}0!@%W1qoD; zKm`e^pb8?$Lj+Y5LAC0k@jD*sS=sLEgJ@wbY(K-Sg~RzKl6RP`iv z;{2S`zB!_#y3jJ>edN6WlwZ10rDcNHL*6m(5Kz{je5X!6@D}k@{u5bjkn|i0c-pK5 z?AL$X>q-Tz=wxsE3$A09*Ir8&eKaS|GX4e=QAbvVVzIKk)*7zzx-?i&E`mTzV%_1|)#cF0&Hu;$8 
zYxFR)!B>WF3Z?kt#c6KVf@2}KnYDhF$1@?1nRRlV$IE2tppADR>+{4<<~$0K$VbLFbB h?;=j_jUO*Z>P#=&H15E*fR;;klP#cmUokHPe*xuzpGg1! diff --git a/hsa/gfx950/pa/pa_bf16_noquant_gqa16_1tg_4w.co b/hsa/gfx950/pa/pa_bf16_noquant_gqa16_1tg_4w.co index 15efd8eea58bf07a808be0b85281fbbe538094dc..f5bb51990891ce4fb3515ca911ab84c3fda30d79 100755 GIT binary patch delta 3846 zcmeH~O>7%Q6vt;bb?i8GcgIdlWmO?{(Ryvt;B0HJDQ=pOR0t{ZcWO5Tq1LsrqgDwb zRU;0zv1qX>1dZmxfkP_}RBa_n_l7F_&>Sfw+>kf`>H$THngd9oK{d0pvr%F-rIjtj zAuDHFff4?6pGufULr`=Z9U35Al*4H`I&Wenr zyu!+zZRciZP~P$v`o&_hyb&G#s((70eLpk%-75OkG{pQt$~98{jmtjt*x*N>@7c}X zXV47mYrD^w(0i;6J=~LHcqPU03>u8I_gLHTO!`*Qxjk4RUfYlrUFN@5hT+Q$vx?4H zKkFPV2S2q2ttETVuB=;E*dh&H*mkRSEao5+zsmv_4#0&~#42KS zV|8N{EPXkbr9UTHyg9c;%8>w~G=LZlV4MbULJgqkYi$4=pLcQAyvTLs-JCs70ys$n zn4kf?O;)bW8-B{lw_7dQz9duMyfgX^C^`ElOtJ24EksBmLJAR5h){#rxP`?gAs)sM z{TN~ZLp(wtLJAR5h>${r1Y#q@Vv|o1*NY)KF~pZJM1eqr6e6S$;bw@93@0}|zE6V) zw}B{dA`ZZV1L(s6^ylBaVy^wxaj|AB@2agT^Ly*Y;(wf`ttR}uY1(qa&zYu;gpWNV zVSXFJh~K}Dxu6`}ucF*Z)@EM$%_i%!qTDW7(Jgnc#kO9q;g=5ittdaacQUl;ckurq zLqAr|OFJ3bG|DFbPFs+n`wVdXqnb_We(j-cLjC4fuN2g@QxmY4DmjE;`vGk?Q~wF7 ze?G2HtL9bOu7B|_!1)jDXfDjNl7udYXGLG&wAX(oof*i!njY}UUODZ{2E2jn8LwB$ z3M*jE1w)GzjJ^*gEm=Z5|VssD?H`v<5W zWQg$=Db-?xSVk~8x#1$lmuktHK!u*M^wtnIo^%jVyU~M)GkWRxh%^k&o;sa5`;J#W zv|&nzH%$4EtaliZbaTKP@ao>?p19w>;yoym;LZA4D@in!@LSjD($|vO;!P%W jC*fz;l4vJ&-SRis=dN3yR>Vr?wgAcy+B0;_cqU1We_jja8(4$`1zQ5n^e9yU`ox^@- zW0h=tAPb7D^**hgnq`vl%%Muu%n6sQbm%s7sXS9Ht!6It?|IpL$QClkZBv;#ZKl#L zYPpyw^TNL;LJO%X|NnY0PhzE-(_B}m1z%(3dfo}yvO?(9r`YxAzcx5RDCk+uvzfLKEX_iwgHUL+w<;pnm1qRR9vL19d5wZo7`A>9P z!4G+MFLs$yjimNs-IQKUt48TST6-xClT?+XZoJx2Kdv}D<28=Pal_w({$BK7Lw_Il zyJ)=H<&G<^f_RPVblmU{qJIee5%l*HlG4|T9uotDAqGPXh8WDi@No>4V>EM&76XGJ z215*n7|g&Z;~3=}qk>~p85j&P7-BHQVEv?&b1Qs8EuT=wC)CF;F4&XrZ4Z*xc}uca zuPjMf4aQ^LD%h$lA&UOH5@C*U!$=A~HIzA4_UDt5G-rH5`uBztvrAv~&%qP2OC?#$ z`}dP8`iqXA`H}RWGTlAWkA^KQ*G#1%RHL2ocoWC%Snosqb&xWe)gq3I-mwZj?n2n@ zr?NK)_JKP}qdL~)Qv=bl*oYb(@kafgSTs1|^ZPt%ET}fAqyA`gv{|izH{=Yg4cKNY 
zF5xRs@(i^~o59R_A=aNOnRT=ESf!04{&VOrIt`9#?6Q$^Sr z>=I!|K!wSnFS6!@#^HLqI6{p=Vpz3{6IrCN#M4zgpj8UnJQe4;Nuf3z$ia(@_vJ)5 zzyvs6W0qF%hD%{1Ozq;g+Cs1$R@n?*R5N&SJ>tf*phb%0nGhr*ZL*@l5*JSoY)9I1 MUiX-L^f7Jw1M5q|(f|Me diff --git a/hsa/gfx950/pa/pa_bf16_noquant_gqa8_1tg_4w.co b/hsa/gfx950/pa/pa_bf16_noquant_gqa8_1tg_4w.co old mode 100755 new mode 100644 index 3e97ffe2f21e859ddc90b18e0558ff6e78d3b8f6..03485819916f7f6a1dd3b83e7356108bf0336897 GIT binary patch delta 3952 zcmeH~Pi)&%9LJyCG|AGX_RHE(MNx^u;Mw1(C}AIkgEM$b<0V)F#rn@#KkUnBtuOd_7d5a{Wn0xhl|9r#68BYluO5 zny0+d%e23%^&zTeV;SJJkMtnJ1M@x6sC=mvXRIs#v$ zs7L6c<^8U)O7Ny7XenERR%zYx6J27#y&44H)_TBdtrx7-8jc7DK%>Eca-HkETnvJ* zYC$?u9HXN}9=!7%H%Rg#&P5 z<+1Wu-B{gNIrBi?Wgg7)W+Cr3d-F;FaS}j+1dt*Dq~!ogzMTz#VGAzCQs9}Mf}61x zlmI430FxwuGs>N->yAI=&bPW-W&2K2jm7y^0z@W~ABw9Pv8_n*C1VkF<<`iTD>|&Eo^a|4B|Ua6*$~R8a$1IiYz} zjS~OY%0~X(_o-~sMxfPgT8~`LJ*l|4iv4PFqmB$Ig1%2&iH(N+;KJ1O%!M}uaZJ=q z@4Ed#1DdmS+u}11#dl*fn`aejsCT#~C6UdepEmcy;R!Lzy lr$^Iik*-dlrSu3Lnnd@poI8bVCr9?Io>DjCb9}Am{{l@p>9ha< delta 1913 zcmd6oO-vI(6vtKM0kvulvqQJ$H-r(xegvnJ-V)xP2~(RKV7IIL>Vv7`{hD~ z#s6Li?YQ3m|NUSAr+tQpZ!5K!he&1G?txMzfSb)LO7sj&ddlFLdmHvkbok&cQ#Q6? 
zMhPn$J50*qV{tWj$Zd?wBq124sk2%ZKXM z_Yh*>gWYs5Ap@2ggxt5ZpO7JZQ18B!X>d1Z0`6d@(cO}<{e8mUFZ_3fe~|gTB-`NC zvH@>Jw$XbwYx@(z|3LVMh5sJL2j*JUBDOK8z@P$y3Jhvvgc*j;FxnVKyNy8w1{D}o zU{D+5Jj3uaj5>xhtqt@lC)iSKh zAkdWXDVqdtCVVs}x|n?Z*P&0Vgrx*gLNQoL%m!NYiPo?lO--c7^wd}=6$z$Ov9WL@ z9Msb>y;UENq*CK;dLw+oHLyiX=2!81#!4#0O>0WXp+P?MTde9c&M_9vrM zKA-rO~aU(H8rBa z>?XJGJI_0_^WHxD&b%|3zFt*tSk-GH^tlHuI|CFDt`RJOUMxc-)V!K0mcgU3a+Kg} z03gZeb{ZaEjP15q*(S6Cz?3Q~c~V0HmFI5Amcmte|NT6+3woFPbmC(rslwf9k*w9R z8o~q_u8?q~CGBW$V_VX1v194U=})a&`$54@uXlHI`^9nghvYJNL%81*?ti0uK0Bqh zvqv%(5tks_M%a=r!DMz9k;z`q@IgJdQD=bcv500lm4tedUKQ?}L6SuOEl9CP_20Am z$X`>(c~h2V1VHu`YB0^+A*>yUa}XgkJ|2hFu##_@_sD!rfS3lh?yu z_gv%7X9ZzE4^9u0;LLCq7#Lm*&JMq<+aHeP)vN*_648NR*r?v)cY&c{7g6JP6CS@A zym7_2jsQl`BS@(ZFU{&se}PLMEL=@?(XOnZb@k$|+o&Dsfe|CHT{dF%Y-Yh$=O;JAhy~d!ysfS- z$AWAYWHY80_)O`v&z#QqEa}BQ!GU%090+*)WNiWm)+caaLmUTm!4{LQtHrGAZn5az zXb~LP6vu&Q(JLhz5;)M9z=8U?9pElx?}vfN%DYP<6`w>uHJihLOd^$lL@F+cc0{5R zkywgIbO|Iy8xIoure8B+;J zq~efh)Ez+%97PVij~qDG@^o)%cp$YWtnE^T4{(31e-3hGIb-U;J&e`4LUo$v`H$7P zLN$tJG6BtGT$=w!tj-mxPe##9CZL&&Lo?fF$R(->ZrD(&RYG=p%t*lHyoGEumv<7f zY;e$!(J|Q-MiY|odrQTg9;|_ES^c6xixR63M690whx5})tVVHpU#8c)5UaVG-ILwn zj!CY;d1xuIq>%gFvw@hE54S7%aP$CCQ(A1$#v|R$Rsu`=U&?Xj7rIeAHe-# z*TIX*{_uU?A8w3VRo%&E(_H(*pXP>4GRMk8^1|I4E>xkt;Tp)bo#;(h z=2)4~g_{=lPfTwRHzuPEVkJO8Co1iZV?UU@4ZuK^LUoz;=8#1$eRjuc^pRULC-8=~T&8^5DK~c63Tl zFbw<4RJ+OM+(z5CH#J+mB~4bFgLX98yiVHb-A>a?v(?$eY;9`tI-1SwahSvQMUn&U zEw~FT770=dT2IB|OE5n6Ni3d+@#6Ye{Bew*TN8^rF}^onPy({G0vkS@(Vzz7V;;Gx zplrgp>cv=mJH`!lvAFR5fEKLW5{;w&J8^?=9up4=N#-7m2aiaID0?wJ(jAMxhjFkg z7C(vcb2I9nDS1`W!Oe}s4RZ15 zsOwK`$p177MCugA`%r`ED@Ax=T40#bK^DdjFN=MnD=_X$m2vT_v0%fO2H7C=bCfR< z$zT+p;gUM%i*p@x@*-S3=oIQCU{AnmEvW>pTemg0ZebW(g{+pBV3l^$QU%&x>6TE& zRwb$S%C(YO?r>t&YNZ(ottggIx{8+S7SrW6&>9eWwz}2E~!b4Ewm@0a_9*yT%B|i ZDX$@gxcN|MNDJ4b_;SRW{rqEB{0r`;9IyZY delta 4492 zcmeI0Z%i9y9LJyMTDZb!dw1Xl4hoVmJEnL&+M^}&Px;djXV_fkaE#5?0Rhrvurzbg z#8xZ|FQzHG>7hb~Zkc47IJ`HDS=MDS>I;113o{e9Vd9IqVM26fWJBt6cfY%}#Kf2M 
zg_pFy-}8BX&+~ly^q#+|v~?zBO}VKV>qPzGIE%FB3RuMGX-rIk>T-G-)#lpXMLliU z>~e)9YhYE+A8@HPku=z09ivGdlWL(sie&tIL8%#Usr~Qzq*2`O3On@r4QqwwXkuS* zNQIOeHz-e0O&g>kM~QUAz9g*{h|-j^gqmNHMkp^ezf4T2blO@WIhdUoDF>MvEG5{Y z3Zny{MuAwCC{D1_C+y9MludezyHmWN=9U#+Swhm$(k~@%+2rOO&aw2{=+fowdr{i$ z8cQd^A9amMH>a;FUq6z-@gvR?>HE=6hTVlT_oKTQc8_XzGwjQ%-NUdwIP?DI((}k#UO1cwdvjfea30 zaJmeJ>%urAEriR8V?|?WdeoX;vM8(JZmg8>lU5~K+o9`Qr*2YviRc@ytEBE><=whw zIuCj)6?@FSGMYvxp?!Y0D3@behbej!n=vC#Lro8B@UE@^+E6H@}igM$*ljYR@ zZOV=5Em#`vwkb2+ZXA#OKMd8`#vF!4=WO~g+^zf_&ggIZy1!4@t-MpWA1i$HY++uE zzZ(18gNFlfr!v<-)Ri9vb?e_>KPvCN34h-DPVA{zWx*E@lIb~RWc*$!^+u&KH1)hS zZ$@1bOSZEedGJc93*d3GZBQ4$t29{vJF#-?)090==JN#$S^J12{vt?P@3*SP5nO&a z0Mj&C0g1fj@A9U{Tq|S+B$6)tlqWq_S@`U7G96r2T)%zw!~u2Fs_wnjCk*@X4SEul zgVhcL#9xH`V&MH?|60u)R^3_1H+J6h(ge3j?%0SK$`g>gH{zIE5c?XvVpF8A|Bx6t zE7*SZkV zXl8FVC+)uu1$rECaDx0X;K}jq0tlsmZ+|V9e+2w)UoJlfoL>_^r}8*abZkunm!QDU zA83yG^(}D!go!lg2jGd*x%@iti@)UZTfi?|Rol;c{)CL#Gg*+h`hk;1XGwwVr%3BC z8hyT|fimDVR?RVsR0F^Jhl#p0uO2e?=rx+TAF7D~ zy8@5J`A|z6ibh`Ti@xG%4?J{3L7;q#U{Kx{G^wY>bbY}lBQ_8)-H_lhXizk1n-Bon z*y8b{Xk2eO5EKA=d_LZA`B1yz`n-k+PaqI7Xj_O>q1VvTYSNHsv@C?$4J!D}pCRyK z3ltN0zhL|h65I;7(eF29dK(Q9je#~1Ntos3?>FN!RM;UOsl*M%#+(&qt# zSw_CCrsrW!nrGztDv~zluti25s=`f0aQBg5#OlI)G{RxRpTxH E0Fnpop#T5? 
diff --git a/hsa/gfx950/pa/pa_bf16_noquant_gqa8_1tg_4w_mtp_msk1.co b/hsa/gfx950/pa/pa_bf16_noquant_gqa8_1tg_4w_mtp_msk1.co index 73f01d6438a065af70df6edae0a1ec3514fee02b..7983c9b8e051e77a12cc6eeeb2d38a8c745875f9 100755 GIT binary patch delta 11063 zcmeHN4Qv}l7M|JIiR09DcANf%pYx?@W4j@E*K6;lav@Ee@N-h+hR~Ms)08wm`v__% zDTmPEHMyih(5piC#P)hsTMh|Mr#l*05I;h8K_EZ{@uN!oL_iTjS5XdfIx0$fS2Mdi zv-Y;JPgD(6kuAmV?e}Kpy`9f*-^*%FyliBLE<>a9$_#^jQELbQnqlnMk4QnJ3CC;IfAQzn(k-)alwobtfROPLDfitH73 z&LK&on@cnD?%Ge~`{-lNDa}+@c@RMPF6OD4=tyJ-6JgQ|A`9fEd4IadM~`KH4_?ef z)`0AR#T`8zDAj@|GBh}lsRI+4`QXXS7WejSuAzGw0J)qS zq_a)-yHgSHLMB3Wr=nCWG!>IlbT7<#kfA^>+P-gBby)Dnp`ajuPc!7xwwSi#V7OtRu>CEilykD5&>La zB7ikT0&u63UiV1S=N?V^-P@9S05=o~;5xLW^!gG3+*~4nwKE5xj_(h#`jsPgq+*`=Q_VQ0$MJkPVAuruXr0nc;^p6Q}I|A>U0DPONF;F&JLGhKvddB5jk zs*6%Ho)uN)oXtNpN^m)AIa|o(+2pLe=h($^>{Z*`ttUsas50q)58Bknyc)MZU12Tf z>6vx4klVA))93!DrUK&2ams3Af9I`$jtjD~o) z+z=O*w-V3fb5dW+4RNa65dWBlI8$RZ#GlwvL)=gy@9&?ZCJu>qb?}y9SgyEcH+8{w z#_((P-oDA|a${^#dBGN3tub~joz6GLldAnTjoNZwOx}3Sc72_#|6kEN%zL+%v-IcC z7TcFhk7k#u-%I-zQvY0{?pC(g%3bl#@-^|Bs6i^fBF2y2^HNo3;}!9iI-?^Nov0(; z!IZ13**IOWJur2VT}|~hRHze^jV^WY<*TW3;+m|Um&836@1OMd#D|XO+u@K?t-KBW z4?8jGP`AIn$z4ueljI9Gbw8h0cYL5dhRc=qd{)|Xk+79-93RadI<97pIr&?(0~p$Q z6YZ0r4OwFdun0^201VQ<#dasg<5(U)mD{YT6WC5pn;fPex^ICZdSETZamuwGIP49H zw+6zS`Ulu}Yd;$j0z!W%E(XN-ra*ujV8woJV}E~K81TuD!1{-O2_4dK5<9_SjxMXf zu4k?ILX3N!x8e;L=hs^C%P{`*6;@ot`0tu^MKY}&*zniWCUj%`NX#6nD>q==_G>G? 
z3FDq#E3WS#r~-`}3UTEB9z21+$~Zzq%wrh0?I(yK$1%SBek=ZGjMv>^#UH~sKWxRH z)Gj3yQTnunUc?iQ<~K)2Uc>l_eY#?Py@&C?KWIX_@-fEy6f1rdT~yA6C2$%k(jZpVr1sormV|8T^PdGAZ#ea)&K7R-LfDw$R|79Kj2aK=N z4`StQyD)xKg1Tl}9>s=(HY@%F#!p17`12TV^jh&(Fy6e-iocCB%Kde^1 z^FgK>A45$u3gd?YO-@qcQ>c+fWxXG{B|qXeLD}htVP*oKGMx~NAAQWB+~|jwDBtv$M0ayM#yg(WU`Yu)*?3DZA2EmNWse zQKJc4>J|wpSQx_)tf4*-d;|+aA|}MGqOp=D8iLUjUw>2*(5wNGl!kg|=G#e+pTX42fXxi-UGGU(k!L_MnF=G(MDLjRHappx-<`p zB64P2k;WH*eYsU`*P|#Z1mjeaKpGqVJy2eZsw@AyKe-om&QFvY{S#RT!`ZQXv{Wvl zy=bAjhz{N(Z!Im6cerx$9h)Q%x=ZM>oct6m(qp%v7Wv_es zvKi2u?^k~M`_KC zgETlugM&0UJq-z`5yELK#%V0k)8HTt4$|Nt4NgyE0H<*fr|}9-_Nof{Z=~+8T5QjQ zV)5gi(z40kEJgL`f4>)~ZR1+OuG#eLX1`YUJLpg#`o~+ez=FTJH`0%iXwJ0 zlP#N6f2=!rU%?rFul!qijXJo#j!t*~zZkVW$6Sn#UZ=4b>r^&**^wX93LRDVRjop5 ze_!0L_B@m)v&vI?eqwqpF16vcSXm$^UMf-R+S_ev#jbqW!O$jkX$J1)tS?2|Cli_M zdG)WCkItbTmFl76I|}n=5oVCw`axqJZ1mcxD3uut@rVPTiu<|zSc&vlAwCT82H)Rz z3CTleidE*zc6T0BOgkD=H>a1Q;=s!>Z>lET#>XPw9JGSv{>+ntRTMpWGm}l-{7@)B zPSL47i)+6GqpS#``SdkL*@`850v_DSll7Q(bFcB-{;hBH>zgMUw(mA3;Q zTxsRofDg4ybA9~h!7#GOcw-{wKH$Xzgc#Cj+Nt6fto&8t-)ZG<1NWw^{FHJ1c52ed zz_gx&fT5>Nq$^(lpM2TMF9P@OwDKQ-5ACt?E5M77&8$C$*TFDw#2Ro1_>F^B?$8Tn zr@CIV@^avp-nDY$)znUn-3Cs^uZ4j2Z>#|U;FBky8MX3ffsc?Y0CO+!%y(A*eZX5UTlrywr>WtqR>PYRu;&jeKS>6t(iPMA z5%6Kg%0B~scFJ_sm9K#JBTXUo$LcsS7&p(1`UQC6UR^PH4)|H)-kCh1eNUi^sZsd; zKn%CBKUaW12h4n(cVX+pIKX(HSLmgvFU>bZ<5XAjiH@#Kf=>#WYAgaO9~FoLABdRL z7d6#DBuIS2VN-1oe1t|Mlg5QGP|+X64rzU4Uo_+g?2|-O_3_hcEIRGL6IGN*Z@34O z26@vr93c(@A2U0v7XqY*bj)8*^!j)VR6+1dp!%htNN6lhu37MlWH>?O&8mq}(wPv8 zlWP`2aWb+H4v~Ctzgm6Ldg0>{fs9cfAienoi4@-N7f2T(Ur&7f(TM2~lp09F!FrOg zKNd07xQ~o13PJ<5X+u*Wnd;5 nA&)WdN8L&@=5)xT?2V#j^c9bCL6e886lV;};%a3@3|0RF?(RvG diff --git a/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa16_1tg_4w_mtp_msk0.co b/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa16_1tg_4w_mtp_msk0.co index a558cf55dc074da0526e1ce0f326b52cbb98ce6e..bdac6f8f4da66b239610880f5c718bb51ca19da0 100755 GIT binary patch delta 10069 zcmeHN4{Q`w8h`I~x81h8bY@$yR8C5dkctI%xBIWU!u~0LHJ8eHrsx9I(q@Hfmurp0 
z!=3H2ZMddNVRAQ3^t9)B(p-8`GN)6q=@1ihhF}m*6Jh|loXKfoUC@Bv(ncNlnvJ-Tk_!YZovw)c8pTMrTY*QBVjAk zeeiQVeR(I+07kRt72}byr_o9naZk(#j!yYd zJl2d7u^<>Y?caxhAM|UK$q*VeJ{1f5xcJt)`NTF~Reby1^Alg=d0Y#e{uZ3;;_-M*0P_LDClWP5%m)ddOxA=jA0qs~K#jKXV+aJW0s%~e zmJT)L6 zo^$nG*nm!KKsPp^7aQNnZyo!@G686uSGRH>#hyBry8laroo{nERjs5ZSX0vR2+z*9aS&38LCjjzfw_Q8C za(PzZ;NM~q?)Wkj^glEeP|#P}%rL*)a2UIxh+`Ma-wVJE59Wg#{_bAOlH0#bs>Bsp z%)fx&hq7b-MftZcjcDODr{_(};n8{UqI~)Fv@W@Pd}*j}s6xUD9N`}*x*6H?B(jJ9 zkt!ArEKEoCcg>jtp%5bbk=*Ch@$=(VLVIz>Zh3!@D&be*IFy%O&HXFAGAYB)tzy?@ zlX7nvoP@$%S26+q9dm&N_`LAOYgQ$E5spEz_KI0}?yExb%d@FG^h>i3PQbdRD@m33 zCX4uE^YjsabBVH(M}ts_u7SCK(N}D@prs3xmMw4y!p+LL2Vu1_A71vAw$*6wdF z=)Z|8{t3U!7R)yQ1*Zu4;fiM#DpeK8iI!}vX(~;;g{p>W%4c z0!`3yRta&IKiIcGYTbo6Tm-i8ydq;bRy|by^?(*0??XFI*&D*)dW^;7d zz3q;N+uJ)m9k+{rgf*nf!6ZDti|$5=S)(PO_cJ|y3&UR|^(2{Ar?DmAgI{Ja!mXe< zE>qAoFU4ycG)9;i4AFrv*33W}^9zc{BYJ!b#l4!1Gh;g_9-Vaonh8M(_}g0$=dDi8G*H- zwNwDF8=|bk?U<2mc99hXva{XkW%KOKF1B5x&CXCq(BYsf*4X=$(rx%uiB(vDFbOVdxKQ@v=hlHFG`EAfCC*~zqyq%RiJh{fb9JJl14)M+pmA|Cj4w+y z4rB2xc}AU6muzO54fd8iX2>4W{@(4qJ%`lef5Zh>z@{DD*^R9_LU;zSBuKFhkdX6gqSz)fBjurYi0=dNjd9%k4qzb_D%-Q`>9iVQhFG2J#Nf{2OL~>&SrGP6y z4wDP5@l<_+hT5qeryj{I)bWKHI5TNpabKMo79^vt1#lSd01(Lp}T**H^+evnEdx`#U9UBR5IH!~I zW4&l|Cv|M|H_WX?gP?WS!Y*#Y*G7_+U~0nGL6RGspeiOFeg6y>p?<#Gfw?td)E!CEICt8qyf|x4aEeq@KHFycKl1Zt) z04Y_P##I#8$tgQz+)0_AdzTQC(&40ZI4K=YN+%>`MoC#vQZ|%Sf#4XeCKN;w(y8&A z=a^m3$>B2ehB}x8S7BS3_0eIQ<_{2WqAIoa{~;gFsHga<~^C7<-J09@(BfF zF0V{ixN9DEa0JmN{CyJ)unfV1D9X_DVwxhu^U%-nrC8&4 z9CEBBI6~gZt=W(Cc1OU`PH-*EpQWj?*=Cd?$NCukT9q8OK>E5D9AF04A6&qXdeTHt zmsj$md@^R_wY~-L^`*?w&40nCfF#2**A${NPmnfMcv|L55N7yxe$X#P$4+9TBXm`ybIBD zG>Sr0o~Y7hwBPr0d@&@F^r5fkbdp?SemZ+3I+AqtKG4EzG8U0U4Vd6SbSz=h8o@0M zW4KbRgtdA^lMcW-yTNimEgAaa|0WE}eh7vYl;Nhvbt6?|O_L@(PcdLkudrZe*;4wG zomNFS?7Wvp_x+*IEF*^DsF7G;r*m%EhIN{Hi9y z(>Py#vDGJlan_om-)xhK>_8C=0=A0J(VQ9~@P!S#>xH78*e+iQtV$ zJO$%f-TVPO{D@~xB{8`^jPcqFLQ_fXcSPbgjH|K*r8U%BiaVNT3=sV)2~|zVR8Tfy zds0+FowXS6n(+cc2?-_9^i{|dDJcFPJfJx^y@Mb+CA=XL7tN5P8TQApebsM~_Mc+> 
z?uAIamv6SBjM8V6v>y)`d^xhz7crii7KvZMxO+@cLjQ&_u9;y!it%6f0sj5N&mG(` zQXd)Mk%aFNuS}#7ae|qWsLY7OlQBMYBht?@j8~lq;bQ!3+;MrvfP9QMH$(;~#`xVe zk@$HYhbm$e-|a|=u0#0aZb;gJO1dJU$J$|_)o8F2KF^!md~cY_EoQ9dmy}AVp(L*y zqXx?=lcbsqA=P4+42%YI2rVm>oFTt?QV6rL6syJ(VOT34*_dYxxhSzq7N)W?t2EA5 zUWQeanhmSwRppZ7ywx@fM)OR@khKUM6L1DYr_I^UIBr zicrbG9!}V1x0X?px+Xf)4;!E_fj;bqxpDZr)kM;l{Lm^j`$$^d8Ey`dbj7shBuO9O zoA~yxO-ElsOZTI|Fh?y-_-H*{5rCFN_Xpy_aBmmo85w;T;n2OEz7&9k aQ01f5`;fFX(iKR?ooN*9exSPpaM{1i`8VkR diff --git a/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa16_1tg_4w_mtp_msk1.co b/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa16_1tg_4w_mtp_msk1.co index 99742d0057f3cf42d3d999c8a1d5130b84f68124..a8e960242360d38a0de9b71a499064e75acf0177 100755 GIT binary patch delta 12243 zcmeI24R9036@c$fk|q3uB_jwp6wGg3*T6}-lTK7*^pD$Q2#}P4VuQiPbqK^t;uM;? zK4EN!Fq9fZk*A%uFk})B^&}+9Ag7_0n;}!Tb&`fcaVa!T#|aJ)qP8$~%#XWw_x9wa zP14Db%AP?p*1r4p=e>LTZugzMEq~1F9?WVfWV#3U6xE1%l>F&}k}@jSG%9qGI6=9l zW|5`wG$R9Cj_j9#YK#}4K2xK#gE9oeB;~;=G%P5Tntwva$k*sOGe3L4NCRCpEtIP$ zz@--S=}}pl8_8e_X=`i&-cQu4LTBHc+DHMA@V3QA@b4qYT6iEnOj6hTY4Gov^)(`g`EY; z9jG*+y9?*V4jWIkviLHqRw@islRqkq~7VQ&E1t4E(p)1sNvyUDx7fZ zw${6-Uv|2faJ`>tsP|IMC*0c^%1t%NC2k{?Ufq-Rr7mN*Y>}y9jmsQfyJ%v=5b9@K zN)HdAzQzZova#N{QYxG4jit0W?56m)-A_ys(T-n1!l2fw9L{?Km-=IzO=^7JT zsa*-leiJy*1P(NT15Ks3-FzN7kV6i%BL@n|feU3@O@#d!|G{EQ^S5b5$x`sN2|R5A zPn*hayQP#n$B!HxM2;>&j{ZjVOVhJ6#E+Pi+dd`kZvU(+VK;9?+mVJrqJy% z?c7ru$#=$2jM6W{cVv*pj^Taii;puCG%GV@6YA@eU+!b2YoRQHcyq zk zJ64q3PsdunF|h4lE2d*p_q{FeU+LZZi*+$$ zZE~Erd_>+f>JxxCN&t`tkF+TsOy_ceGJt`cSot1-7R#=tw*Nz!V`=+O?!7Sz(oi`R zj~U{{0rZ*}DSj?nY)AVTPUc5{P@-gD^aSP9+!?E{HYTK)#96@e=oM*X7H}%|;j5Jy zqaXnY8R=88zn@TSN$0~X@BK@EMtA6AeZMp6V^;<^J#&CZoBu}5oqd1Q?EAgic=_+? 
zZ1mzklr8>invwz3<)+q(v5*0LKkcdaOVf*WarQ5?Av@0I(Pq8)D}Bm^qkp+YIe_BB zbgmfj7uvN0irB;Nl^P2~RHZiqip!|bHvT~E-f(!LrU-NGEpi=u@{8#nBxcNv&=LuA_W zo$-Pwy-H3}()pO<#2`6~DGQ=>;4&R&kJEj3ehm2He`5V#{|}%f1+%r{p+Y4gSZ+?O z9%CUPXs6?wW-!Y}#fzA<43|t53rGB_GA0rNA=#aKCvh zB?Mj5nHy=}5^#cQ&X;}@;27~w=#6$v{T?uy;?X@lK6x|p;*1wAHYTLV#CbvHU1VOc zJ-)eysnnyVAHdQlODU+yP{Qd8>N?bC0>{TQuHGE3kx&QJ<6{(OO?$RY zigur7jEvP9J#(6=E#!suti7^ggQdz@Vc{LDqk^vzSfQ$tWvv@5LWOl*MMafk!&f2) z>3r#kgh8zP4rqfSTQ2pKF;hBqlKR{wi+XBbAIWC|H?K~$m85yV5BNwf052$(1yQY4 z+Ucnzt0GxgB7Y;rw+yrjc$Yq@t^UTV1^6H4k^DIDv$rUI3wXyurM)`;XQ0CD zS1Rx=ya&ANK9VPacdR1$HQ?q7l4rhb*8sOS?l0O8zPMHhU&|2UtmAOzkI=zHG zdg|pj(KxFA5!wkAh9Ox{JNyWEXb;H`0O#B!{{`?%mq|m90(V_aaasSbp`!I>szTD< z1Kh8d1=S$sxFK4qb4mU;XkU1XK_3F|{Sw)Jkdg0SPjSkmstPn3Nz_w)3&;-Rf$tJX zJ_UGVkmR%RIQNr$0Ul?iG8wAkg-{`^BRd3u7u1k^De$_jBwr1@a2v_1fS-ICIG+DU zq2d|ESE@mEz*`gKKu-Zr?kD+P;5`RP{!`$E?Iai5prTQU4pngs`1Rvthfd(VZ;|}B zz_~LdKMB00pX5uVEQT@IP@*BXP!+#}F8|xP&55Xka&9xa@Li@^Cx+SYFOVCv-620ji`3ulo=5GBnPi^Yl?2UT?z zE9(V{OtXP#^$B(?S^}6_95z2ti{P|i(G$Q{S}g$?Q?z)Zospwh+FVh8*ah#>fG@TA9g114B%LB#aje+|Ug;)oT5I4bL+&4c~_#1n_8rH`TzO zY7kT-#c5;l2u_~Ira2t~*6eh8v9rBa56t44wH}8R8|3r_vBl1y8WSF?`p|ed)gF)F zz&bo$8y>;q_hDCgSrlP=jeNA{J?0rElpB5QeP+Qpc=rjJqKDsS9Qd-s6umIw@~kOZ zbUsyk-4xv@UBYqta9;E1 zsep_l?TmP){){$Xf0G_t!Ogx5O_)Xu6}A6iD9>YOls(v?+`%?K5zN#4lF4MyopMy>e$?K2Aw)9A%ozW69LqcXq$P5XY)r1^4Aps{O;e_U@PVq^?0gSpF z?KQFoVr`TEmNo@1&kX6BAzd@1YsTrudV8zV zog1ZCvDVq8#`+&7I}!{NR{uKFJI1dHUEYVSf*hVdSU2zP_?6*M^%$x!`ama5H6;!_l>TzDn@K^rRf_V&fN%gYV&!A4Hrx3mXg z!rh3zxRy_*cN6aUOHy%qV8Wsk->&uP}9r2{Q=X9DVU z;O==Y?JLkY|1kP%pU|3SdVXq?#-nLodf?6I@W-sz96vdkEi%>ZFr}hD@)7Bp!D`V* z+Ya?~#7ozrjGTCX6+6*Xe_<%qFH`f!w_nCujs9nqYP>K!j@7^o^KjKT_#)1s?EFw9 zp;UyljKfT~Om~J{x$xC^YX`e*%#E{W65Ac$wTHc|T8y?kO0&d!53obiklTm5qC@C{tk0g}hkS4nH7~MoI*=iG2Uu45<{6{aNWe>b*Dxct#_n@MJ zczF*B7P2kzUB5w1rU!kPCJm+88BFnwKS0aV;@kG398=f)m6W`F#BnjV;f5A=)DUWUpV`2>JR6+ljqe>>Gg3&a{#N)udr)jP|`I#8} z97nxogMN86g&9fXAtqxMJ>k(NnD$DVF9KdtP4i{I!%mtv0q^$GyajmgVnqn0CSQb! 
zt1qOOB6R{UZKQb@@T%|7{7vA7W}5E>UOuv-O8j?VV(`zj!4cs5cGLVl;H?{Iej51d zO*B6bJh+wSSMeo|Z(@4x3vdGrKHEtfd;y&4p?Riy6iiI_FKJ!`e9u9ePXS(WWMuuZ z&McUaj?)G_@ZnoDuLl0vC7Le;?mtQMrNA$qB^>s@0Ve9NkO^%J0>Aukny&|5eP0)J z056HCf^PI0@a|fgcWeD;VluMniMPO@oum1$fk%pI{yX3n^}857E^`d{_WJ_=1$d~h zl6G_!CXR_T|2yzjt>$`!A>bL`r1ftDFVbqR>l=|Gnwb8TszR#&%rLRwdD>A4aDN-k zX98zlqPY$D@Bq!91m3-OB**a=z(mO_BPST2gG;af*ahT>I zg@O9SHYm}iYn-(+XdXOTI!hT1>Qb~*541j>jLfnFz_(m(^UPjzoRg*{S(^Y z1aRpl&9(0_P0ZzcG`|G;!TS>W7vZ_8psYZPKMWJ?kLeTY^3u|j!Ng3TLi0S}C$&SM z>MH+=fd_b6e>(8C`81!Crp7lhrM2|LTrfDXnC3p<{uMM|0(^Bd&7TFH(?auR;C-R> z^u#)txO$f68-Z8+isoB^4@PLd6Zq=aY5r5-N8Zr5zW@7RV&Wd!AO^ha5Y68OUh)T; zzYo0qJ(>>!A3R0zdCVf^lMKpW$hYBSGcRC+FZ)h;0@l~QQ;vRm18oV}Jys^z(6~0Z z$}UQ75CxBwP>a`61Joi)E+X12HX_=sJ|c>eNJQReA)@3Gi0Bq15UsWv#aT#+Y_(Vj zw)pgLR;z_1fm0)qZ7z`q7dQt(t-RMm9E(=s%gKA$}cRONeCbsIiku@HU=k@D6=R zyx<`Ug40c62v!?e9q;sz67wE`>|`QE4f7yS@L;F!cMDx1f6yVQE>7CNzLW6gWnaaTs99$$SJvX7jB-!aC&{DY%YsL zs*O(v$7k`qRG+PBr3?nKf@@musPb8#HwZ5CHXzie?0nj5>;lAJ5v3b=%pkQ3I=X! yx2}OR+9+j!%iE|J4V;NK-bV9L`v=k8c+6Vxk*3rNaZPtnaaJF|XAG((%lsF#+DMxK diff --git a/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa16_2tg_4w.co b/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa16_2tg_4w.co index 605998b54a77be1f2593d78da78f4c65c0fced4a..a46abd077135fff438b88048c3d1c0d2b54f0910 100755 GIT binary patch delta 3337 zcmd^CO>7%Q6rNq%vE$VCu9c*XfvQ-+Uen;LP5eXIgp!7YkSL@fp?2B^8rRmA24X=F zDHN|`B~}hCB$^8cB-#oIAtcgjD^!8x1#zHH#E(3g+$ZOoXe73d#ME zSjqc+`{vE;oB8IA-!6Z~oc)?v5m<3*_2F5?i_D86XFf6jHl0$t-?Sr&gXEdwKl(N3 zP5VjvTeX-ckLKVAh9ezWXt;Styo(9=Z#^C^F}dyM-RAfdt>krfz*BDAzzU40cCv{k z%gb|le8lw!UUxZM1ACwRC_J0Xy_TK3bPfOH*vBmdYk@)OYLM520eP6lmps1e`3`@@eZgGAZ}T5C^_8S|ypp%rDz&Qjd5>8X4t-kT z(ORV$ovpN>bCv3j;Q&I_ss|OdfNj1Yq0cK4J6z~vM+yQee;;_3MFF&AAobBvu)SFb zOI)$Dn=gq{b1~B0Qrdui+$dAsfWF+gj-v%GrQ=wEJHS)}D5hKvypB+Dc-R+2vCBc7 zkBs;P;DXLaM|~mSA)Swn`HUAIWl<0U2m%U#0$>O*1Q>92$U#?F7F?nnaz$k$fW0Ju z2@*h>1aMFfK#A2i0FNXGJw3AE*)4}Wy$}G*9F|iLJg7i`Z&+(>7BH%g)*o2qQRk9o zFIhF8ZBd&?C$y@+zC!Rs2%ZSR6CwE9BSaSn5eFfXAjB?X&V0=}G~8%|b?9n4$11RA 
zcp@23B;$!>ydhb=73%}V`a!X$L9u6KysdXxegFRBHsu6Mgt)UALHDC_n=;}S8&cn- z$6k?BNPWrwiJ=DfuL)FdHEQz*uMaJ&mmVtGlwUSV8XN+~Z>z0zr&C1H4=XB7os{`6 zDbiq`AL@rY?~xSQ(g`W@>YxAb6uHuMkEDo{JZ^jN=Z!^8Pm_T~>0N0MX`F3I8(0Ru zAeE&Bq?OelyE*IKK44WiU~NoUJw0H?ayO3`b$zJl7`s3!za~;VZ6j!uoxuCxpG&`- zCZH*Tzx`F6H7IGPHC{Wvy3mR*j&p1@im!~%39;m~7(blJMskm3BC&*+$i#9YgBXgWLdf|!W)QX1`w zP@0G(35`UOF=~CPzaQZdPX|82MezK-JbT`aukH&wsZbX<>`%p4-{Y{H5}oa2`P+GX zCKczYS`?@7!bA&J(v8@j_BIk0Kqo=HEu;{cwX>}ll8#YFkd0KNhDBC&oNqtu^G6@zOV0aau~&7eS4pao zr(Fct-}@qR)26(G>!C0m%8oECN@-D-e1W zY=qFK;57(a=}i+;EEX^p(ac!ILZ(!V=3gcGO(g#Y$#28?tVb+hD@8MVRxD(z#Px@` zK1FSs?&-Ge`?p03Hm(3mf^;h56Yx3`M`c4;CqLJX$OF`btTf>WDUOih2q`W~%8p6# zn3NNfaz_i}hsI6xTeh@zVjkoug$P@Yu;mC_4zrbp%4*_KYyNV51K4i;U;Ef&J}KX% zwubvGN}8*{t;a22eYZBkGDQtV|4(({quW5xdZ-Pg>FekJ9^v#8?n6zVrqAu#7?G9f zYezhfJ(no?hi+7^9mn>tXUsE9$$irIYN$K#5rY?}PlKAf-`3Gw#k2&kuNgpu zMMi%%@IX}L&*s)x^@GtqxGeOAOGmwa2y8AC_V=4h`K~fM?-05|9lX$C6Uwb2!PQ|e zw_Ev;i!b9l%LSp+$rqwo`Z$^i=m!ndLIe2-2_#3+K((HUji;0NPjhTMkHi~y<(H6n zj}V*hB=MIm*f_uAY(}v`0_7uOrX497m=cGzWtV6uB1c36w2ir|%B)n6aI>o?VC7v7 z&9q&p(oEiEC#KD6+ltnlbD_Ij3HsGG3nC19(rz(E$d>W$m)d!ww)8dp6an6tMKm}I zqB`t)h>Kt7QNWQ%JS=2O0SBisCi3p9MpJ=oWDP>2Hl)vs>Vo{qMKbdu5p*XJ?&uDX ljY~z5UNT#N^AB71p6~zw diff --git a/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa8_1tg_4w_mtp_msk0.co b/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa8_1tg_4w_mtp_msk0.co index 8eb0bb26b199a2b8b8c2fb6ee53f7f71960c4dfe..f638a1b9250dcdeaf286382f068159c1944f614e 100755 GIT binary patch delta 8622 zcmeHNeQZ-z6u83902csYK1L$yHY(kKXWz4Za7>qSVFjSez#H6pQ z-7*!S@P>~dCPK)ViBZ^7gJfwJgBk<;A;EwX35#e*FbHu5+~($eulKRN;YS2Umy{-b z=iJ{p_ucdEdFSSrdtcuX@q4Gl(E@15+iluPY5@1^!xE?xZ2}1it7;Q%@PtrC(^(Gy zcsa3O8jELRyH_X!TrU6uqJ)yicZ8_@_Z_K)M1J0X-;W9tzMOzEaa&%B@o-Wk>y=a* zEFkR6Jm}68z0e$>HYxs~E-Pe;%9SNY3b*=v+nSnB-=GF$#l#?Yz0F-m(A7eXN?g>w zEIB+wPyy%^ogrk@i%?7bn&l@{Or5fTpn|H*+59ZwVO>b7(PM=))FIV*s#QIlaf3DK z()s{FwQD}lEDw8kYP_1zOz%vpQv0?3ldQJ*YoIsgtp)M74Q=rV*7q71ldcPrD)3=U z4L*wLz|q(Ya4hzcaz}hDzd9d)u`wkG#f_4dpckBod0}<19M%NQp#7|I6$D1m#!23X 
zm1fEIV4+tPnpdO_TfMrFt!PGg2=&ue5o1HBueOGjwLxtiE9-*VH3T_k1kP`aF9Q&& zuGX87ZesPC8oe3mW>&AQ)mxBmVfDH?JvZ>{5SUN^CM3;Bnvt|1X+hGcu=`Dl`F^v) z<+mt2elCEOIDl0+fYmsFwIuQ215lRwP0Dh=Sy|z?D4#(ApeIMu$p&tE7X`RiJdQ~* z$W$%g0;!K@`^n9+wzxDbj=RK>cnd(uD0r~R(BWk0a58i_8M-yxRFDri84FP|yeJuq zQ8J3T=hg}4B5g7T0Ta*<5F~nUbU1A~oHiX!n{Mq?WNjJB+H#b&6)0;}ekPPHmymjh zlko5=V&eNqPEket%L~cBr1TDd2r!p&zvnWc{l@~DicARWhq9D*Fo7pK#N?Uba2zcH z+}|M%KkV|DinZ$hlt)@({a~t$)1W*W%&0uB{F&P25wlK{@~Cn|#JQ)2BXPDwDkI)q z-lZV^v#2dz*@5&{Z@ae@#BVacyE|dB7gj2i-N6(uDiFh)1XXzL4QBA^bS|i5t}%?I zG+PRvaxlx5?!225H9z7+_e;v|krQ2jM5sS(|^pY(`3-S>vpJ#wehkm=k#3zEa?1c4frr7D5y>9lcG zlGOdA270-I4Dy?LDY~CDdf1%w^@`%iQn#Fx+!`lJ!?IK*N>FrcPSWDq%UL{0Z2sX4A@PwW8Tj}kAccel=@4Vi=r+U>MGlV~n*dNx>H z8ylN!z6Fgor`zgoboxA2k8h*ZYHzZ68tv;F8-4C3BXy9_(?``r8Yu;X>k_x+uX0Y2 zg2)jbNyx`AzGD)ehw&@V3jOC|d{&VV_h7u(f)sLstqdFFlRB)xc*oia&PZL2@sq2B z_(qI(SQ00`ymc$aPw(OpJpOiUXn1wpz-nz6zv>g>QHRiI3 z9DQR3!(=UbP!!Mget&h?v`n<%NuLJuj4 zE*MbKhg3viMgZ3`s_}jm;mY7^`?#9ct5N$RwqI1!^=x|tx6xX#Uro5v@kbIN4c(8L zYJU3hHxkiw`k zd%pXfbMHAh_kAz-p8lRme9H9N=xX+4$^M9eN`4Z=Qmi}%*yKpiD32*?rj0+8$BLaf zPmNviJZS}NOEPVHlBKA|z8;K%wjy3Hudi5!q|;y<&_gwC z17+x0x*{LGs@KAUx-mGF!@>RnEq#3q9-v+H^@rG);K$5jsAsnV3cbYIfY4$EOp#HA zS5ewP*5$A@;JP~v@H4|tCwp^YmvJKRuH-eV6Jr!SYWfBS%-0IH=(@!t)bJH=6D3|- z+AYQizKSvI5eCuNGk=1}!Ga!f{MhrXsS!+%9}BRifMlCkQ?q27S<@~sy;|qsOLQ)N zjgI3>bsjz$zm3GdK;n0h_!n_}K`mcWP|vR^Xy8i=Hu2;AczueRUi~hU#zGHIv5n~E z#Xr!2{hDr36J^9fRZM(?k_$4Eg3P2KGbzZNET{?>omVTjx zhi718>0tA7lTnePI+K52siO1N45Tgl^(h*0T?L14orht!)?Yj|e+@zHoLohgW)$Wu z-#i%;rDkl5c~5vZQ{oj>x47{b#$Dc{-Y!bKgC4CrMQ7XEFKZGExP!j-$)U5Qg$Yj| z+O_$Vn%QoTil|3e1dw)H2Z(h1=K=}Qu)Qf9*swqr$!R9NBXboAy1o5E)-IySorP-u zRyv(_uF62{9y#r7JA&*Xi@LdWBCUuInfX%$)Rv8)(tST?RR#X`{v4R-xZ(fe(foA) zwsRw><#joImM-A*_YdE&1}9>o)D1-vR8FQO4%2XI-RH1$-vFxax}1Gi!?8pTIE>aF zU&Kf{nymniWcc1%G&(a2*KYq^ztX&UcqSOyX9E12}IrH~ol)>USj(Y#lB_H8%!f$$vmk*N~>h{LW__ZWHw=$FW5Z+dTL1c!njyNXeh?Hjw;qNx3 zs3ef<)==TCnfzW7e}0L4YFE7b32%!j49S0pI4&MYJ0#Ric-QVszL)UlAB&e8qoG`V 
znepdHy!Y)){(_ueLk--^bPSUTgWqQIR~7z~O#Vm04?Grshwy#+%uf7Hc>6?45~cqA zO&r7K%n7eaLpnEg5mF{hVeq6Ig%iiiHwvM|0**%N>z!0MxF;0eUG8(+)6hYny|SW0 zf$q8#>#*0P*xCjK);m24taaBC==3?1*m9321)aWn91xK*+Z$XGP8Q+xr6TG*DYn+G z|6?Zd`f_`*1;)7aIQK9*3~pnyLWDJ2Vdz3!75{^FuEL^qj|j$ zwSuxdo=6!>e8>v8JVn}ZRv4GYH;MlyD^wNZ@qIGJuYyCxpke{J=yn=}Av`oF7dmp! zjXxB+ib0WJSpnF*)a5l#s9FIyqh*b}x0Za_-erU+=4ng5Fth@!r*Gs7o7VT**LI5&Y}ZK&8&JA5N~@GU@{l_+YFJRK z3gx=Ni2^EH=mBm8+aGJIG=Fq$;j%TIP#h+t6+xvCRi$h}3R1N34_m4MYtleC_ug}y z+^`J+EskU?`F_9i`_AJ#=eyszpMB4bf9kItGxS$7*J_`29}w(7`HT<=tZErzVUVFi zErYdMIqH^c06^u!c1D@6Ci-1knNnH-;A2!!P8-VQ|K1R7f)(Xo-5=_P?Q2pNwOcCn zWH=L+J1xk_R6>^mz6P>Axbg4?lLSwk1`JQuVqPSm6|M-5OrBgSJ71>!v?Xm;c2Dq=!ErM zbi{VEWJ*>!jUxb}Ui+U*6Wx(t*(3J!ipUC7?|9Sohh;t4C&6pOkw%c+y|yQtA^4%S zlG{1J2v%@**aprGJHh$k<>2Mv$1F#)a~0bv0GOMzfONLXaHuN+{xlq6wsj?#hAt22 z9jn^TfGW_VNLojh9>d|T^$}}&L$$5j7jdT9>gC-txSeAwoSVUI9SyQ<>~b{8vZ>3_ z428KWz`s-VH~{Hw+gxsp-7;=yaCtEH$hfi5<;B=5XKf|7Hj~XIbKfL zaQAPqfzRvbITImYd9i~*udYf84;S@hP2KuzP@l;j0@xRox7HU<;tMD7g_HQg*{rOD zaEpDh3Hu^~eNlsb5mlz`)GUQ@F~(DZRSPAh;OfgzqjO7T%7+Z3>A0b3f91R6LqY12nL6Ogdj)ta!umKV#^2ctaaP z$Q$pIQtSHk!sVOYS-cM@U%$TlyAF}tv0Z(g5HYG-h_tn8353WV4-S#Z&k8$4uBMhy zh-~#|^yP2e%w**-sn0}y9k~Z&|E}xF*7sq2B+?r>0f_kPtT`DCS$>- zMk8%YB}7ggm=I=W3p+%vrj|;Gyn1+|{6Eu~tQ;mZHF6rYsZo(mNsXdUv>Ecf%c71` zgVNNEW<9#rdxpV={rv8;7p96~REI`>rk4hV&+`AP%z9BZQ5rlpq+>2sNc&!>(0%D2 zrSH?GZ~d3nH%pr)^}^}pYRYt_q)<~PlyaSsB5&?k3Te|ii?H8N(9Z+>kJ5lN@xJPf zJf}^S7Gc9D^C{CiB?X%@^&CM(Aa&7J;Os?D9=lYW1_!?;A!rcn!g@51XvP&Fu@8x2yEq&<2mI^3$eaA_s< zIhmAAobkXl%&bjxdEpxSkWGGx2LOG$IBt9P$O>_f7g`yP6Cd-!&PqPK&lhZMX=B?r zwy=E27i!_#!@h8PtIx-^vEdeOZ%a#isI3b94!Tg!s?d7rlaorJ%mTFZwCAN2w}{({eDq9|L`;C{mQ?OIS7!ySZwW4o4bCH$rP z)VIJ?dw}p!kuqfb!$h%8YY*rl{A@dIpvr#2|ENRDzeM;u7WFegc<&)?{g>5xCa@)= zRlG(TG^iuxlz$=oZ_ec?1&k5?iIZCXeZp^1Ez2oCB)pJ3o#PbnZ=$$j*Dl@Xgzx`{ zwtlfv%><&V#W|&u@RP6SCiQ)>QfSk62@ZmBoe~|D)F)iOs_y%%Bfp+u^;hR@#>mQek0uH}u zxmIzKG#FX0<xr$pRHIyNm))~g?em2 z+{D6Oq0XNG9eek+b?oN+wXu1)nLuAGzLCOUI7lJK`zQ>C{S?;5I0`q%SpoyGIAz%c 
zOPjGFpWLq?kIee&>PTgl3xp{Q`RH&gABa;n%;$QG^E_cWKNq6V$L2b#WoccGW$EPn z@jBYf&k9_CwhQK_#D#Km=3==n{47VO#3kloT`1Qv7p80J57biI{h|5cVu4yRS|Ait zU=5yrfKB8KO0aYu{<;KRs{mJ*gM5s-Be2mQ63_b~FJA_klttVbfa^K6aX<5B7TH(D*)gSgk|L+X7Rtx@HOTy2ebFbG!_Q|_EH(wQJ!$4u3TV?kI`T)A1T Xo|-oz6oRWtjuUM)- delta 4677 zcmd5=eN0nV6uu0JM(}q9)x~_SOQuaLnKt9njnoqnp_5wMGpXo zoIOu1;#t_fRW7}uSpd*VNl7CUDKviaL@pPmr!5?hYl97|J+V?)B5@X=L2yMZr=~Js zj<$B|ysk;I0HOfTYokAj!drz6n#nRg$5MHJ;V4Apw1O*C0 zsvuas2n)T~SS6rZcO>rmv)15o%#B%s}y0d~#tPo`-omef(YC3T@e7rHn=*f&Rd9q?y zPj-ye6Y}4I{a?cVyRiQrN=^s$E`ws(&kPZ{lVM01gNEafMiwM~fp*%z=KNM6KN}vwwT%c-Hh+e%6 zu0gBQVYWKVR)^W@5Zf~aXSs8GqJ*ISj)I$5|B35I7-8{DWqHCfxnS+6;7ylUB+gSa zVbJPPa^=|r)mwvYegIlR|4$WdOI8peI_KwSD8^BxYyHg96+-Z`QuYOkkg8f&7Hs#6 zMJQ>vpSG6)|1Cv_zoZlCt@dN~R^T5K9&PEN7TVV91%paBCVWvWxz3lC8?>Gkc9-?2 z6&I~-oMpuZVP*MI%6ITTZjGvoLsX5}4gkFAFYD6!la@5d7NygMc-Zf46BF5vC!&rXFs z6t{M_QgEGkv|#IVbn}4VJr$**CMn?v_q$50$9#PHv)4hLN|<<5emYe4kH^9~h2Xl;rFLb`YT3p14LVcM z1@o0eWxfqUy<=DPBPt$+FfrVtuD<**(n2q4m$ZfcWpR4%q2i37OPU<}i!P3refHwm zgI(%%mmaqJ#R8GWI)(1GvyYvj{$DBBs9xwUA+}L}ixv*Y!J+;gDR{}6J0XeB&x15&u<<+d;K2+dThEwSi@Uy|mUY(}+-7}) z+g58boAhjhjb+$6v)f%~WjAu~!xW$NIIID#UE)(>ohWF3%7xQ1k69TW#G9dLP z#xaAi@oS7%pASPJTMPaQO!k0q8jxDL^T&WiRxgiL@hLv2< zMx2cdLjoI3Va7%_ygH-9N)~Fg6@;r+SWMPzEOrn+jKM&1FpQIsU|2?6v$$jvV+j$? 
zLptLqG!Sk!Jt+`VSU~tN#YIFl87#!tWO6z=LxP#_Duy3Xo>)HD33Id_e14#FrO63d zEjdrF()@LCJc8{bH2(mN6DIMP$o+u7Q36--RV6TC8U9Gq63;i4KubLN8iBvj1d{pB zOHfL3?FF*<2PG(EjZ37%Q6rNrG)~W4Xs|l?uLa-uxZIyK6khK%GQQa7l)Q*3mA_XZCZCo1@0tsm< zRUo{Ml|(sIu+j9ur3Frva%rT^Ap*%;&QMAbH!f5{2=$aoAe9y{Z+B-!T@?^g(pF+6 z@Au7{zc=&En|+oqQd{ek%+Xun?_B0z_25NvS^e|MECWxLzSE% z^=~!foKZw`+63?f%#Jh^9^Mi6Q^CN$kB4hiseRL-cl#Gin>Z$9~k3tcmaYL|?T{Y*P;UCM^9MO8IuJ693Wr*)A|SCVw5!l9LKf>{~`(Xx(a8x1+z zJC%&cRI^9eT29?a}{P>!?6l8K}n4u>b(&>i%>P4_J@Fn zG@i-$IpCbeb2)z)cv$0OV}5<&*Ju<12SR`xAO{!*3*{k+T25 zO-h0@!xG6@A{k30V<&e*u_P$=5-2tdiXAWGlYOho#=#}4k>-{Da@`~-x@T=!nvvtQg>Qne5nS`t4HN09oXWmz3s1->Y*h>J+Q zty~l?(2x6KRZjM8SgG3zJ7DmhZnfs_!KXkE)=^ z<`PJ^II+*1(8(Y%L#nDV+KFV6J)0F;i-+Z%+J2aM=g3NbBGH3pPE8eOUgd=tZ_;Q_ z!lZ(bH0g`+L6Z*l9m66sf$gbjdeejFQ(Z1oqb&xnr-Zc+8N8k1U7cj-J6Rk_3ocWr zB#Wohu|1>`a^UrJj5RZ4@;^cr@!RQk%#1QFb8&J8M@9u7GDJlvOIAza`B8yJS-d_v ZKyPI6Er5@6IFcFI!{m)NmtnAH{sjA!PvHOn delta 1694 zcmb_cUr19?7(e%JI;XCC?hO35R+`Ki%jVqfoDG>8uBGVQCKwp8)OC}AVg(U|DJlj* zbEk*?P!SYK^k;AvNI^H>i_F&&>TM4_NH2*)qzTfv<&Nus< zianrWW*W9m9~ne90?Y`2KzbPB))l;VOozFvKRK%Fj$;T?FCRsEX@9I!CZKf>3G1{4j2KMEnVcimnFs<~* zB_YWaQ)nJ)i4s7P2$hsc*f0FG7Y?$@$gkl z6Tfm5pAW#A>7sgrUmoBA^b1VUCpt8!nGiK&YH?OIE?xn$AeIPXi6E8;VkJR#EQrH` z>ad`C=}`Qmaf0l1E_^HfK^?@Eu|z3Ll(IxAiG5V|!P5(z2_{c{PgGCfg4G*~99K9J{I>r=CBaEc307i|H79NevKa3O9*9xjsiY+*ij{p~qASzL4q>05o8{gs7GroWu{ zpKOXx-|qDJQ(Y2S|I|rL1NOHAGRMf$!!*)4AVtO+af4<|^eQ}MHjR%#>?h+R?w_o8 zaWxTYjrd>E3~G?u24s#OQ9!hHib6X9cYNP;Ma;?bq>%89V!V_=O#$2@ec?D zxLl?4%M6NR(?BMx0iuNf$GVBBU=Nyu1?Xik4>^N+Em_1eCOYmpS^>{j0Ds&=2W)L~ujW`v#)F z%j8YL4|mfX z#VaFppx5-o!a4k??H9adv)E21B5xm^&*jc%7d9IBvL#IYptV11?XS=t#xs5Qnl4dD zdB5)#3fJw{ea4266?i}vB3NgY@9oc1R0BW5zR@d{#1|c+qc|WADEl4LCYeE>S6TE$ zbq}gn`_X2#IT&>z)NI;OvF7SpEQqL573pX}qGJUPJ@=*SG>u$nSwq7{MXqb95EGf= z>EmoEF77ENj`x?gVIC8wn%gj!NvaqxFjFc{6qqwq(}n!12fE=I)&;#hQ9nQOLNAsMG&wFgXyyifHIFKR^ zOcMtlP#uu{opr!2=H2$OJZBH*J@#>M03wI&R2_Fe4Gx|+x9UuwRKD&1ik2U@EZ2HU z=9*wG*A~$WwdPP~iC_s4EFpp=M6lO~h}(e(0f-0!5g{#Rwq+l>+jX!H%)ZN%3}=QV 
zl(B>|mQcoOly#ah2^c#8jGY3;?#U~I`*(NAPMVN#W3$0Hi}H2KhHD%~^?+V@EI);m z!r<2$6U;v*Pp6q^%^kcYttiZGD`xr9l~N5Z0PQ_yF5T$#kkyOIpr%92{G0UX*6$9b z;okZ5_+S{a&-FmxTcJo$|5p3ccgE zX>s&*3;1$iT=!)s?-ON9O7td3S=dXeq`d^S(h_6|ex=lBmkwx#;E%sI=QWXx8+F_v zcJ%nhQ zPW*7fBA26N2?Dz{>_KaHwCZmr`mNzsy`rf!*(q^Z9T)}t}&bjaCa(Q>r@JzOikOZEc$?hd&zabNZ zbQ`jRkTjmHcGYV&t_DqZ`L$YCKr`{}41a*(lMLTO@nTe~5w~ix7}ILS?b^&4njfLr z>XWu#`p{7gk;J4>U!s?-dME5Z?zotDjM(zyMWgxC=xW|5A~8}DBPB6Xl1VB^Nhy?6 zn38HX6Vp?sMT+Ni{d2`PTxBF;Y$e84Vr(VKRxdPYHn{4|wSC3EO8?V5e$$^G_F1LL z5`)rvRa$zwZoGR>VI+^x1@rf$XypG*;9>n=O`y`=w&BNr?4E@Fs{2a2w7W2xH@e?7 z8oeIfjq)GhT2#do`s?T|;ZH{gg@!S`H@2dL;)hx-{8ICVUUf_-ZSP942wys@FoBuQ zD&y5O7XQe6@JU(;u`VJsMxdkXlpIh7nu3a&8OZi4nf}I1$e+zb`h%gMU&%(4Cgn&d zlQ|MrYT+JUO}{+bFhUM=Hg09*&t<0KK=)PdvE)kTe=9Hadzt_CBK#)if7P=PA7=h_ z9f;PSw|20N3yT8wGXIK0^T1SnYM4R^;#VHuO;W{OL~o!)_sV7Jx9$}S8ADK-JfDb! z;^Oe8-FITt)QxY@}r-Ek47 z4ysOee29QIHD~z<^rcjOv^yE54_=UR*pe=THz_v{d`@60rAoXl_D|v4#;_Hod=hd3 z`qGMfY|v_SEMZAagpKJn@FwkXa@S)swvhG-t?l6L+3M6fEbQPMflECe=LGY<@~ diff --git a/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa8_2tg_4w_uhp.co b/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa8_2tg_4w_uhp.co index 40a8c3555b5a5fc8b928069e48cb1f3163449d81..1432f24c95ee6cbf2e8433d4251bdec5dd41fc27 100755 GIT binary patch delta 3251 zcmd^CO=ufO6rNq#lB`(Pj)+~N6t@;ZTDgiXsnTkjtc}|GFR5@O$Atu3gtgLoD^e1$ z1EwWfS=c5A(bqe@|D*Qsgu8g@Xide*3$2p$<{ z#ZA>bXueZrbRjFvQXzGa zjvunTw{Qu)>)b>4op$GV`t>gcE=tmTe&L55^uV5={%VxFjdCBCL6q%!%W{tb>aDIm z3f*&c9x*qBydjJc;uWkz^r!1P^euCT+Cf*~r-w$$!iR3bUFs2f)T8e67KH&nRv`GP zau{q>y1{0p{&La>K)vn)CC%5qSQNlF6@g9`N9f5S3s!&jouPpbtTfQ(IVBt}rH8u95AirAMb#hTvCOoNsUmYu$EhMSL)CpCrfm8y15ipPJ$?||^y_?b z(!*lT>O7V51TYWid}_+mc<~$!{MZ3MMiwKBF@Q0E(dP`xe&>M9Izw{68Ic z69>){2j0;gP-3lhz$M6j*RagG5^})x26h0i9B-%Yxc>v};Cx%N&4P_;t$UwV#yeLu zTe(d;W?j}6!J1b0wpIv8gn&c{NQ8h-Mu;dDg2O`aScnsiHN&QTupYGGec+EgpcH&& zAdw6b$smyoHY97cVk20wF|61GR_wH_-Vc9mQ}?}5N`V%l#M2E4^FS(3C?px zxgehdYWdi&4LO*9ji6S`(HuZ1F?|(n9Bos-9Aa&1Vt}WV-}lNIz786{K5O}D2aBR# 
zTc$=WlIXt#i`BSE)Q!RCB3Np6<&XXUcbs@mK38!vWAFQIZ&}mBWPDlpNH_(wU)oo+ z@vE3G39G^q&}!=Ov5)Cze%0VgoPE7;~CsXM@XC8NuSqk!5Qj-j8dPMWR%|7)sv%q99)>q<}YMIJjavay4EQ%=UECiPkp82{>rIA0YLZetPuCHktwC>7M1PG-BVD$3=#Os40lkS<0PS=j zON5;V;!38#zW021gpqP%7JXid23RJY;G6}gu@0t0InDYqaQILrzGYd(5?9YItSYa$ zJZ7fHkZVmLR$40zb`cavxVJox^Y5ppjQE}KhFQT6txpxl;g*zWTng}fBa?_r4LskR z#73TPN#bRmzsV#DEG2TGrBrrUoN|%HEo=TJ@LvJ{b?~C|?}TNwwJ2qi!$0YV8{C=UrG zl2GL&RHZgCIc%~>+9p>vj(=lLCMSZl0;Clntw7Q$f!uXtV2RyY_P_RGhuRdsLtmEP zWs{W2;tgXm zyQMBM*yRd(ol>x(%j5MpMX5q86GPr$FjOuU;W_5mOph^oYF&Ge~7JYFKr zd^mu79$UkrK+lU}2(L*;@sF?x-wj*wqh1crhFt=E8sW2GYXDb{nO19S$JRbkn1bS` t+VxEs*qW{{$3)dvn)LgNV`m8Rigskx|UM1_s;L) zIO{s8%4S>Qq}=y=pZC7+`}Oy}_rBk|e7oK?U;EhHwH8&sb*yBsUwq6Nv6$o87>ZkVqK72f3=|a2wYSA-`5kPuyI*S zetdnM8mKxGrn~aRY_t}-V{(EthYL@8<;I z-?j5++W9^{mx>RJUh((>3%UZuX2fP)fjQznv{3wbK^WTPntUgeHl0A(%j{WLmF_F5 zoWqpCKQ3Yy&)BYthnX+Tx5t2QE{fj6a<*zX zeGkjpwi6m_v(*sV(PrBTh3Nob{}^}?fJh`#$YY)-d^B1Z#C(wOu~=aU^C7}_bQEfe z_aMMy19(h>mH9TJTC%$~T{p z!I)93-r5f@$X{-)F%WxF`9O=7gQw-{(HjPAlT9wzF_wkKSIQ5@qfi{X`L4LB_SJ`} zfBtj8H%$E%Mq$Ikc-lXno7gteKkfJYtrW^D8shmYrMZ7(YO7VAXbC{!k|E*2v@ZbU zrUU!5y(^V_1y2553xwxCO(gwaO0Jeg7g<YJxwzE z<@X~d;c8Jlv){5NHiS&b?03?jrk?6wEPRrk@LSsFbLIY3*bkNZY?%6Ydga7*_>n{E zM=(dZyc7;Wq3CWR#sAfsVJSW>;_)+wa^HbYL+OVGXc6*0&!o^iv;?JNXj-W z-HoXvJoU6cnxCBZKQJlgjp!&;ns>m|zwIkF9<44=y7t3P2tQDkz6fiK>F~y{1fEBS zmPptxi&D^u4nd{*=kQ!E`au8}27iKboZD56b<`Q{>6}#3)hwjm!=Xm0&CB( z8X>(xvD%7oDE$78Yf}A9fD{9x8+MT<0n!sej#m7STyo^a2`HdcYtq2Hfi`fM zDgFYb|9CLHm#`yLUhjyQWR4wPEVXtc8*;m)qur=wEz9k8dh6>NT#e;*F4pJt)v=A7 zlWVMZI^7K}uFm~zU0tKE;k#llER+H#P>#@~w{uWs7LA&~138#kUxe@o|dZm^1$t#ZM1Htbf+=2_0A=CNK1W;zPe7 zEk21OSDR)6mfB?8L2=HJj4!8n&83<3yaLR1$77>HVB%Y-QCdijED7H# z<8?8Yo12+X72@dxye>ysH&-0+KreF7VD>h|#7N?u=rtK>X%vM-cS$Gf}qt_ bO8U?ybnC42BSH%Xr89&c8I(TkLu>v8aM9ur delta 5268 zcmd5=drTBZ7~dK0xZ~iCUBs$Y6fvJ8mug`Sg$>6+h`iJq%o;~XhQmrEwPFgtq+vW?#^&7F@oiiy2#9{8}R3(Wx%4Uyu%jV%6&5utkvI!wK(NQcN5W(;kc#8^dx7G*!Sl?Ds z&u=I?Azq!_1AJR!_Xh7()X;PR7`|6gN7IF}^wYFQmJKw$8VtLmQ@xqd^SxQotT#K_ 
z>Q(ID!1ixq`&HO}4YD_9cr(pQyjfJ4~HeE>H{+^ej3&bm>w;=DD z5mPuERUb6e>4KErAFQI(q8MBhgNtHtQH)ZQ9Tml)qFkt`g-T$wl~9t%5uG0xJR-Iw z=L2jEpE&nIz7K{3+{G{&H20ZupJ6BcY;0Q*DKFO$X-uch7ae- zPFHne~Keg7GD+$BtAWB>O>)R2m0VzCP(eh6uCTEV!$HPU;ar50I*j;G$ldkpcq1v*l)pNBhj- z<}&p@+&IF- zv+-IcmY7r~vMb=ZG53w~uW%@Um{?E(kHmu#>B0N(;&_=@Jw-rUPA9MI%Trg$Cr0_I z#K+cXbx2LcQaQpReF)E+FuobfTMvgf%4#>BC+EdW{DHVBf~y_UFfCnJ-VV1WTe&iu zopTnKtzX9#uS+X-u51a-nb!7;XwU?BkfpRU;;_``Tm$8~s~C*jE@S(>qwN2{xTg)E z{Eu9JphFB9s_#Y&t<5u56Mb)0$%^(Oj9fY#hSr za*&ES&Ph~+5`*+S)`@M??U@dKQJfQf7700>umV&;A$tc2A@VmOrUGF< z%~;AF@e&yZZRn;P2+V}aRIH35XIW}(E?OP!2!&Fn(he5v=nUidCmnGFMCjYy@4aKG zwH*hW%ZxXZdtZLH+9cJJ-WdWMgB`s&%?`VBMh(|pMK3_*!_`+o)(#ilsc z{vTcGECm;50a9MvuLRc!wNQVZv+Mvhhb-j?)sc=YuK41P`gP2Vy8o_E?`GO2cUIc^ zNm&HLxp29pQZHfa88JJD4V82Ec6R7@Rh-dJRJbeVT_1b$+WXqtez2l*|FHhHJIb80 zzE4}<=kWI|{d}>kZyMua4={QMD{%)Hw|+l6R{v;BJHuyht`wNeGj6VAvX8?<5nsXJ z3NF3);!-~SSNuVJjrX&%VY6yn$q-`nUf;{*iSEc_zKAbX9jVsmjeYfs^;JE&htZ3J zkwqx?mq<@84Kn|u9(~LCtFlewT3C%o#|FLVt-*2V?}Jt7_+V${`rL)7sTl}exKN2w zxoC0M>IiyaFv6x*C)n=QanyS_`Y?;4sK+Wv8?%fTKeT#rgiqbx;O$-#8JAkxP}O}F z_v4$H3ukd({zkJ*ujX$y%ih)eVn(|VMbe+5KSU^%O4Y`&95dza?%Ft(%V%@@w1UttXu{%w}V;VN_!J<5K0n9evb&l1i)_OF})01;`7v-WQ z>0B3DpEK?Ef&;zaKrcAZyZEx3FJT7;u>)o7Km|MSI?GnC!~T+gW7$;gOGTP#DR|lo zp7w&Ly|-U>OHs5UfgRn59X%gAdO>@(W6JsxZ3>t7+m{61<)3yt?q**UcwaK_FzuN` zGnrl4Q!}b8=kfnBOur82IioZ?uy zY}3`*$IowMvh8kKepSWHJu$zL5Y)r!U@X7{Io5n(m69GNfz*Z{ET#Q79eFOy|%;J za7Xm{WB9;sFC%5QAGd88$(z~Q{15G8y4+d*%|2Il`lp>PHWJ3W|J~d-8-KxU{7SJg zaFU6znc-5#qlKo~IlxvvzgUcl9N>3_vE~|mXBc;dF2QXR|G6fZodzYK>_}r(XDIReKP%!bBN@M@kt@tRN zGh>qE0IPh?93WC^jM>btV~n9knav}<9vLSF*vPmHUW$s0*PmoOrEH{BAG2tk(efO# zxiDT0;OwAz20MEp&Ivwb4E}BA#5b=5zW(>D|Ld;-c4m++(w?}+P6`fJ<=2lh`(R?=$WVPx~MpHnVKN1v3Kt73c)jDI}M(o6CbKXnm%*Z6E!MYk}YM>ewo7o+% zk^X_e%n44`v#Q5R1o#PTiZPD7!P?2nNZ#D!WkLT~W8$ybw9AM-%5LS1$Pl~UWu#BD z!((nW4^^0-F8oWd`6;&xR0Aa(#6U%G0;CN115kc){=$8xI0XI4k&1V`8`e~3PP#ar z6~)Y17uQ)YDR&6+^41mpws5Oo3JIZBsZ9}-w&j8#uJ9|Z;D!sw%x*X5 z)|!3{qd?)YN*7AimEz~741`>$d6p$;*0=ek 
zT9OO3_$ji4?*dO;qUaXjM;n~&mjiEE;N+GI@xVtHUCi*5N8dVnkUIC&rN z^c_yV1^8lnBwG1P;Hxh2=yN95utoDvSS{$+i^PCf-Ve~@x}{FzWO z>#wxJ5+vXrpC!=BC~)NrMV7D-xcglvUj%%Ib?PTyw~VpIccEjS=b;p`HAQ?hYmKA5hP+U* z^D##Snh5+A+gDW4)xd>cINOWBU;o(28-O4Eqm$3KxaLBhXPp(_gAO}iaPlR+jj&i~dD2u1_IkF+}Z)4YoGr{h!#AK`ZRDk}h=fBThw0=zuml zc{A{d_FPasw*g=Kl(T&s@Kbw$lli|NDiV_{fht-BTzTKA=n>$DhMfFoz*pLHNA+w2 zzSf>Q%C((PvBRD`T9E~Q?4Qm727z~d?&N;}-tV>qs^6 z_5|cOy^Js&8GG2WN5hvd`{P8LLVMrTwG2;?$kiE5CQIME=h z0Wx25qi6?$wsL96PbWsKtOeSN5RDoP`C`so)5t795O2CRRGr+J0 zVz447iGP%ENYY?Nm3V?&zLFHlJcs=WqAeVxcSl%`(*|Kb3Fxq@5)I+TATcfGkCBmM zfkqOT@g%uXmMo2nbi!gG8n3aKM8=3EM5>~ZUR^vu!zwP*8I6Z!;;OhRsz_&eIrBjk z_cYsCnR&dLyS9uxJ#~3AhpRbx!zMmcGoBO5-hrp0cf6S!#&fclJVlZ6fy{O{SCyGl z!VR?S-tR z*laf`W7KdHN(p?{9_+&!u8wWKBIBFD%`Kg3??R~KB10aWz)dLYgNj>CQSW%<8s4cDlHFfyt?CzUA z4~mQtmKsa;y#2kId2jx6Z`ZRoHsm;+baWLn%A6x5FUFjd@pBehiZ`!m)MV;ro_S3b zk?pRVi~(>vaa> zn|K$kb~$Iv#{$5N3TmAw;})lS%6aj~VVim&@AK>t-J73vc8pRF&ib`llYegEj{LUN z%hbS`@@J@2Z&_O^4t8p1TOwwoznZL_UB|?B&FoB#M_YKln!Y<8ZRPnIy^ZsHt==Bw z`3>~l3j6X#zrD0EV0SeJ?Lwo`|0(GI4d}lP`fo)2bIKe2IZreOaw-~wIf2IU{b+uQ zx?8a)BW>?}p^@UNlfn5kTXWIgmf~77QlpviR5N8J_qQuxdLgmj%K-6a3!K&HAubf{xXFIQb>(m6bk zC||J1WYeLs>S&IG_!U3WI!AFz%o=^uCm{rP8kAIkzrL& zR^j~dcuJRIRlNNBs*c@`nuT-B zV#&3qcj#e@MMK51oKAN2Ij2%B%FN`+4w0@pIH6o0x>UM4S7z$_I@t!HQW?lhcAeih zL+gAB-BQ(_H~89{H`0}3w@+W?XsB|gu1S=yYDS-3eD``-Ikbwsdhmh6=7vl~l#|(o z*KfpAx*WKAA`Y^83X}Dh`leJpn(Ga^*`{8KXC(Lix05?Aj?#2t())LF^knInMYL}& zQ>f_mvz`4il+IqR=FlRuY(ji`m!_j^-ZMV6lRaOw5tccZr*)OmN4cs(TRQwhe@HG# zv>AS~4Ka#FYw2WcIojDL>29tzbuC>netvo@W~y?$vqM+o%73O&4OB7@riSZPlta;b z*WxK%5nMDid1TQP&rmn5>Cl>fL?4@S>GT!Q#o8aw(2KeI-O}`O(H32xtCui&tM!8U z7p>J@qes3JPn>UYnlesTGBwvr%;I@zc6z_2qj>shZR8Gpu;UB(u zikX$hhEAz?Y6|p6j>6Qway4B%duC5>@jQqee37Pn2OS)$0yn2o6jVd~bmHt4rbW9| z!c6huqu*TGA1OL?HJ;Md!G-kPJhG5hIE+FXxlOmwn)q|(*nRJu`u}nIlj!Iyt^ESi zpHHvQyu(c2l#9w77w5DMuQ5HWcH%>(lhZ2f>{zaGT+@$z`cD&|C|kfj!rbB$&!X0O z8qT8l5I3wI$(x(#UBL2;+m|@CfZbIrNZUN3B-d|i*do<$ao2lY4fVb)qE~cD4L->u 
zZS~gIZ&jpa>g)7^L|qA+72~MOyUlR;ufs4nswY=J^rnC28YV|AzD9EMAjnbCb?Lsk z^*Hn&@{?RYDN+p?3ViUbiTWdW9UOl>Mh>_G{N!!jmp7;vJW(@6eJ;J40` zd>8P_-;+EBeCsehL4TBQ2RNd0&9{Vj2|oaSCIi0%ypF~5 zK>YW?u{;Mm%((`Ech4dDN5C@{k^B?jj(ZaNC-ACLvj1mTig^m@U=0B{D()wFF7UC{ zB%cqwavjMR1JB<=@)f{iEm6`@29AI1CV3dR7$@E78@ySBpgG__l6Z5 zLkx>}NuIFF8W)kU+M+v*V~C;$=Z=Qq)rqoV_4T`~zCo|mHxj_}dIced6Y`3R3l#Lq zA*+X9v^oU+xP$Bp;TVz}zzKUJA-oJp@MC=>uO*74L?RGE3P6Zhbh8xpVFd#rFW!B> zJAijz30k7aQV_30R=jx2Wq%M4L#g9@@}*bgi&#ls5|B q){Vlqck^tCwTLsLT%zMK_6erjmiRz#8=Z;l<)|HUCRQ(Jm;4Vahb=Pz diff --git a/hsa/gfx950/pa/pa_bf16_pertokenInt8_gqa16_2tg_4w.co b/hsa/gfx950/pa/pa_bf16_pertokenInt8_gqa16_2tg_4w.co index 53cd19632346640c1b82592b925a87c67ff48955..5bee0fa3fe14f3d4b5725975bdaa77f3faaa722a 100755 GIT binary patch delta 3320 zcmd^CO>7%Q6rNqjj-AxrU8@PTX%RJ*y)I3%HgVRZ+bTa!0&SW&Q9(_JV<(O+BLWE} zs6P^3$4Y`h6*QVd;RKWe5=c$aN>x-LZw?%(y;a;Aq6Kq`^J%T*`L8WpxsEa%elB);jWG65_1X-g=Auwx{*gb|FU@V<5+62Mh##vBT zDUSQ)YMeKkK*0VUGLS|c|2zdI6|Z7?4xF=CfG|TrW!WU^YSzWqGjjy?2-fUa+hq@zIbspU<{RY_9DHmlven zxxkU8{5BfLMybj+8q1#4Fq&mgX*il?C#Z@Sgyfsvw*klv4!V6HbknEtp&>VqIIr<& z)a^&yukq1Qw?6S}H1MGSdp1A z7JwY8Z2*jv@i9j;Jkyo&Gd(B(v~skaTHrnz1$d^e+GavV#oBB&$-gu#tM&Oh^}KmW zJqs?V6;Ex2;II%J7J|b{tdC`U=XfePy%!4UL6(xEVptqx^s};ywpbJEZ5{$eaSo z_%lE0YOw#BK($t*I)5M=Eqq?~TIBb(^D4Rn^zRvS{$Xc`tld>anHmZ6UoymETp!BG zmd7MR$^y!eoJFy5BZ}3%^EOH|@cw7n_Jj(C?P^X)pjJF#h&u zWlpEKpHy}SxYfmG_|X_k2ZQj-u?0R9o)$W0lBq!Y`D7p@3Su&p4h!M*j35M4fp9W7 zl}x6^lplUZxnW}5D&Kt>Pk~{Vj`d(~ceVWqY;S+I+HMqr^?=>s--7+Wovro{WBZ*c zo%G)8y{s|X3QD+UCLNlG(}YUc9VO?R$ZnLJ@Zc*eha=DRf!V3))NEW3dqqNn?GZx7 z-gZm_!JYuI_V@PyILx%bJrkeO>kfE*qRoaiT4&+bgt+=K1H-X^4R8Db4xWyQEN&6G z5^!X^2@X%%;H{XGxNdN8D<*OzZ2VWEbOQUj8pujZad>*N!`YQHmfV#vaD#z|C-=cy plP()Et?6SOaSy$jfWi1t+ihbP!Th}dF2!B8=(O>MuZWcbe*qBaTQUFu delta 1644 zcmb_cO-vI(6rNrBLzTL-U;;Ks13}m-!nU;ALMo{sQbS`3!APM7mi}Zzh@vqO4hS?r zj4=@$4wb~HoJ>3}2XD*4qb6uP5KdgZp%-I;;B0r60wI72lWf2DH}8G#?evjQCrnhMMq-&m1}=BOIEj`p5KgXap+BkKlAGVS!24S9(v 
zrRHXhXa!m>=&h2wbr@O9Z5hhH;d6>wM`ML=Wv*x>Z#**?f-p?&WCm|T*rj3^!fq8i zA&gQxRn{7*+FC1dR$g*gT~ao_gT&t<@evY#563fpshVk$I3^%Dm}V*S6z2zEr|O9z zCy(Bj01U6wwfXXul5M){k7Zghrwq!LVM-YRFBG*1OGvSV6iZ04SyFCHO2DLim{emn zF@DRqh-p`rxzb;>T}?#TvV<*5*s_?dJnZ>wM4?M7p)opF`oHFJNPqJAYOORpVo-84 zfJ^)MAKm87lmfs+_W5f_o!$hX>qHZnH-#p*PqF(g|B1SvHd zjnhioGb~T~XZ({uS(Mqfu;JL3xYAx?q88+>MbXN-&3iI$*Ij}1pV!O60(06gBvH4~T7!n5(Lw&rLuh&Hv@6|;w&lAz< zbnjYQuY1t)KrMdW(Mwh|5UaB=TIEj+T8McqWaI#%wVYN7kuARKy8%&8PFsd(A?{^$ zwq&VuK_te@UM1iey+1C3Yzg6FnzWlHTU;XBI!gjM*&;NND5294w2)|`gFR>sgRd9a Ql1-Km>Im`qh*R7D0Ttku8UO$Q diff --git a/hsa/gfx950/pa/pa_bf16_pertokenInt8_gqa8_1tg_4w_mtp_msk0.co b/hsa/gfx950/pa/pa_bf16_pertokenInt8_gqa8_1tg_4w_mtp_msk0.co index 5e266b49f578951e50293dfe47747314926a39e0..d0a690ef02e74980ae953a499ee337f0d996c360 100755 GIT binary patch delta 8637 zcmeHNeN0nV6uX|zdD|0Ym zK5-c`Zo(3E8OAc(2qp7-f(y#IDw!6ty{V8~oEU|s67LnB+=iqA{Wwn|_lbN6#*gv1 zDtZ7wrakf8yuvoeorQV{SKnMwUDT2)q(b*%yk6F1U^BE zHoVkHT3WLma?i3HMVrB)@)&a#w2dP_Wf`0rN50AmURJg$t9V(}s;q|eR3@-o$=nWr zr@UOPMY@*PD=O4Fr0aOSvQn)_x}Mjos?@^7odnRL0JKQzkklcmM^cYurj&ANrDm5- zYIW(Qc9#&qS{%SS9Kd=Uzy=yV`T%74F0HK4rIQuA^s-eb0QAez?c@X39zg*fAg8ey z403~|hY9A|LKpo_tj{0QM*5wk*WV0KGBO@*GE_JjDx3@zPKK&lxE1sxPR0t93mZd5pJDx5YIPMd1OY-DXU%Gw%~wNjL|G8Z?PyhcJ( z1Wv-^V?^-tNY4@@{^fxbA5ym4?8#q5MYz%|<3d`NLEfc5q9c`I@AUOtksY zA|QNg$+pK$kJ(tO{trE(1M3HutDOV%aO)90F5Ql7dL&SDgdPLt^JMCw39p~0NQu|+ zj^kP2Pa=K(k`AQra8GW5+3`c~LdjyW(Sk*5O=WnZe`^PM+f;i8z9Jb-ars z^D~82Kb&g~S6#R{hPYzoEL;`Ba#XnA)i^_EbsX_|8n))$E>epn zu6JNA(BnL!#}c%iA#8w0Dm}iPEA;s8nl!cGrq|C?L@bJW(Cc{c=29uQ{pYK5r6?L9 z&x}>a!%}X;cp#>*?kO2My^{T7$KHpGVen* zJF0=vbPQ;!30zie=L<><7(OW?0r?chM`qv}jKBLrsC^d3V^@aac8t6ANTGwq3h=;W zP7sLHQjFi)7>cjQ`0z`icpb+3asmgy;%Fnr-##EBc>WeVus1jwg!wRjuwmMP*Y;xE zJH!4E#*cq0D0iD<81L!13kk~U!2B`baCB&-XR$-?kD(4rg|n;p2g}?;fI7<#tx;zWpS9wZp3U2w29q51-m+^JECB_g1VCm_9EZK|CjJ2 zvhiuqrVx+(@!ur|)a;d|a53vngYtMWUjtceR~ob`45fj*8nOd&khe`?$I{@_gdv-a ZUkX8I7cZ|G*^5hI`rlg$i)#P? 
delta 3965 zcmd52db<7~cNhe^(SOmisx0n=J zA6LKctA~uDT-j;&Tq+EjuQ3;PgB8axAQxnz1zBi87Fv)cR!}i1$U_QRPYNoDEozA+FwFWWQay@2S$Qy< zJe`GBYoXOzXtfqnEgJD2L)JVNH|Gqo75AW8!c+C+3D1by>PH7&k}F7CpkLTpe40fi zYX_?`rrJdS+G5{7b?r;m3_Mi;nWtz(B8$dvoJNF#u;(++79if$b%>k0h=0h_={W?%Nr?M5;JDB9^@vx`OhVM#-H+SK^)dcJ$&t1A z7>>Mp)4=NZb`g*DJ_~W=jaC?H{m*rBltDQ=n<{5avQ~gz-Iv8`1xKp?irY#&zN^Pj z5n|M|i`55itC!O*v?DkgJZY%XJyL}+%`TlV&{V-Fu4zgCH%~vTN9R8psR&IkT?f#R zYZUtr%IV{Dfu#Q?JhBo#3X4)V6k*irwZiB}XE&TfH7$L3qH{X_W7QlkF~ELwb@Mso zKd8=Gs1_6r4&ux&zF)>{&&AhzQ+59ran$0vQ{_x$C{qNlS|K@?<+Ka!$H0}7NmW1HR|WElX-Ngrhy7Q7j{hb_&IZkR{00vzc9U0B z`8U4U4iJU(;o-}2{)M|h($`M%F!##+uqd@dQOJP=twM_RsDHy@T=Ms8Y}_54dNh#? zi!a_;4o73;g({GAadA8aUeAD>@cUYrP0Vd#IC{EOj)i0BNa_l~S^znw&@FfiVCyQI zv);}-U4iCC5o z5@W6rbbqB{lDC55$L8VkiP8unTeSSOl>fl3#fvCjl~0_|oUu)GV0_+$trYj~oI@tA z92&vFZCbpJ@<&(8r*~y^pMrm?ATVV{RD0o1NcT;?H9{)JS>rQC-rzqa{ zxfZ`5=Qo0yN$tQTIw3fs#eY%o@3i1 z70wbIKCgn7mMSY)y`?^i@-~;kEp+>4UEb#=gpick>Wlf3Tga;npF?8A&`lea9Cp6c zMNymAy-{&(j#=01RtR>V!$t}AjfF+j<*j92u)BJzBk*o(J<_M{70T@JJ*Fv17_h^2 zCMxA-qHH^KDd8X|t3yUDTU11*kB-^^(-R@Q;7MfCPvS6SO7X#_4MGYpM zrEW4fG>& zMmwX-TX1==R%YZf`G;{W8IVHk_Ztc;;F8+E?oSwiJ=bO|O6Pz`y?P2vNTw`;ky;8_ zX*m_B&~5I^2pcQU3uBe$%H-DaM9ow_FOTK z`VHt_0* z1H3jeAG|(tpJhjWYDw!70H&raAe(m?9_kN+_eR20Yk!<->#qabPrBAnzy$^sN$SY5 z&akzAS=gFgUT+(q!;UOdKY!o?YFD)iPF+B4tJ-AQ-e1)r%Z~o46y&E|!1uXp3jo>H zR;L^3Zdq?@bJiifPS)GoogSonWWA%qsSNxm1>DF0HP{P@oH2;ZtFmD>r=X>$d`Mcj_?VvG z_wzs=Z3N0+r+(nS4w1{TU;UjBF{*or6midi5XpE@h@AMPv_s_RmN^t6N4y{EYoGo( zmzTpNnF~J|z6Io~^@I835Yo4Ww}-cZd>JLIn|e&5SGbD4%L<=eJtSG{(|W3mlD0NV zib2KPXrxqplZm?iX6fVGp8IWb)c1dK^l!7Xf0!IiQo`8kdHkt)YI1asv1F5@Nb7NZ z(UYUp@UTHRn;PT2x{}`grRsGbDBdXQo|PR%-gq;8mfw4RIyt)AShC5{_{O;sBD?#> z_>U(`J4B9dnJXdEl^v@U&*$=Tn2_YC&!|m~f;uHR8sGM!q1d~ukQzRCc+3dT=!NFT zhbd&)*2Y7J7uLY6PI#?dFYOuprTG7tWk<1%(ieNb&`}YW^v1p=y5Brj`ag5}*Uz>7 zVdfM!@HZ|{v!>k@rJ6Mf&pL;t?@sNV3z<`wl|ODM>01JNtaL!KexrJ$$eB~x%J-g| z&YE^rlx)@%In`A3Tb`5}rY(Y*IVkM-dI#^jP}*T~bjzFx6WSt~FM8qio6qI)a=eiA 
z=>hZ)`OIfaA!+T@uyL+rPF5QX9sXefR4ppzP9xwvX-qHC$6nAKp12V*^Px?2*Td_O zxD^STO@5OH0C~xH#0nP6 zO|#uhu#;Wi+1V55b_x5TlUHAfnn2$MwO?|arzj>c@h%~%T!1-vPs?jD-`%3+H(_pW z(DIeFTe(Q^idodq>Mpk%Ck=UxJCeZfC zG?TSK%!fB>`A*EOYLA7&r!jAy)z1OU&pceHs*e9UE_i2XMgdaa$2|9xmVbo#iMO@< zBT|HU|JUP!xP=s`4L4z) zuF>)b32)W%0nBqDEq@&Im5a4}j~ahy>1b`i^SA-spyh`!KYfdqzl-^a4lU>8PaG&S z0OXShydWQrh*!yu}Vi-A5r}= zmZ+>RMpS<^LDU3CdZ6PRY36N85Xv+KNHZqL$Q5{bWQJ*s;w~7D_7XM7kl`3V7a%m` zFZ32`WH4pD367{tw9r{1Nb0g|%#SN@2|sDZaeSOilnrvkHZ~L`#_n!f^3WBd*|$#uc>zP2cM)%)?8E$UMAUVXeIW7%|N|ZJIY8 z;KZ-u&@ZsI72;J*a5I&$h>tbFJP-|vLQ@+4eV4#>sVB1V(YZ_nWKOpAK4?j&6%VGWgO k5l54-fm*mqq?%zvc}JUCDT6EB-3)8Wk6{?56k{Ox6-`IjO#lD@ delta 4679 zcmd5=3rrM87@paC$$NqGf%QQzs62e^?LC$w2g<>r)~lkWcv!7m6}%XXlxV7r=^+J5 zj7^WMW;Ml>qOsIaV!PV2De2Ky+9o8W#-udb#F#emK@C-C2u1De?aUoZ3}_)~HzEH& z-+c4GW`1_|nl5VEn>GG)D&5(fx!0EnLQglAfR@J)35}#C$zxzp%G4~00RWL>*J&j@ z7te1~%IeT60IpNAVjv5p@#7crIdHLIa(jFotXxnPFPkM&jos;xERE;2R61NPuAsOW zes_E#@GmAX;ah zrF=CBM}s3xo9N_X_-Lf5gibDzWGS8OmZY0b-U=UO#IfGYII}k^&gxwnm+cMlH)8&a zm|uqZ+Ymp&;mu4a^kyX#d6y=v@s6}0`v5%3IIfQ9!ci}vi^W|j52%tKbqzsXojQ0^ zRUbS6JSI=RYFFPUOXMh$C#adoFZ?bnxdGxm)@*k=>bUw}`DqP_XSk2Psvnt1CZqm{}rQ8$s%?T^DjlN#;>jmzl`f6ISfM&lJRhD>iKAcBI@ggg9Xr*wK`)jL z@qCXQ%0wH4TBl1BK%um&#CccsI`am&I}2L&e#LPBeM>Pq{eV+*21P zP!$Df-LrQF>w{8-NMo&!RW{b5z$qGbgxX@lTLo&PK=m9*C&ER6!x?zRMEfK)(w+s7 z7NR%6g-B!~LE{@Sxk_H+o0Db>)f*s98Erz{2DmSswN{wSR?bsVxx?z&Vf0v-N{?-a z*;(Cnt zeXhh8V!Y$qvGqrsOgzy)&VdEvqdzGPt-^TkRVBU-U#vl`|rUMC31>n z7F1z8eMU$L|#$)A@g!w1r{?mbSxhCO>(|AGaQl+6zj1O{3 z{4)lo<+nDYw=*XSrD#*7wcKH~Z$&rDI+u+hjC zU=xVJ5!K8xMCFXxM9pUimqiajoe^UNP76j^hOuGQpAf5wqD& zlAVb$5tVfmk?dF}M{+h9vq?NAGZXf~8A%W(Teg(5l(L!24+Wej=|+we-c*oH3S=s> zka)~&ept1Hxn`TmA$(a1tvs8QBm7+o_fS>w!tTv*MHGIARMSG+X2_A@C0e*KW_XPj z3~tCJV)7_i1cWK4Cd*(7<qQQh=SZOT;vk4ktTlPSN<-Cv(;xs0Pr;%M^h$g+5_$G?@qK}yPV0h|-Cc3HhzxSS_l%U3B0}`9` ze&6}efA0Cu`Obg(_u~7^jb-M7z$OkZK0BfMkoBR+SxpAOVNhxhT6RQ`+iSHcqI3%=uSb`NFyKkS|;6i(--Zr1R3%?aj5^ZB#+{1u*Ec-%8$`+`B*vZsT= 
zmq(mid^UI~-4!xTHy{toxWVJQ-dp%n?h~el-{#+KO;+T0e6p|Xm;Kt4zA>B1p)adE zx>enTzN)sN+tpLvb9#MiDuhtI?nPxirk3SAv9+q^)XOz zyjfCYuAJV@S2FUZa&~uHWfjJ88K%ApW4VI{W=h;414m2TVMeWoQ19)~8HCEIR3HpI zY;Z*h2*3q{XEK2Z@QA@jM+4@>udyf$0fYesKmjlU7y%5qyNY3VcTsSQ#fV!fngL`< z056dMULgUDsm%HZ;FXJE@1COIO%x;EeGmYw9JbR4JbV@cJZ=BW<|Ia~)@HYh1dZ?L_vr+2(imtGvBZeP50Yar0wru zSQX9;PbA}sWIU0KAKM7UlAzciD7GII8!lq8Z(-%B&+Mwi5*;31Lfm^C)eTCBe>jiy zjGj4BJcKm=Gi!_?2KTr5vk_u6rjM4`mT{<;*D5;XzxmDCD-Sz6HsYH4FWJ##T^-sF z!sC%0+R}DNkDI^!ztiI*@i9n`J5tIKytFo}8!0k0E1#ENK)T;Prw=UvpONR~8Kk>d zO^Iud{9@I_zM7qdV1fxNm`8cB;Ka5C?cBikO{3RI&Dct+r_BV-x)XR3ezuHHiv{Qi z!JmGwPneYS(;}~vU!3vdZ-+TnlJMQ(DWNw$E=G^#@;!x}xt`vb7|Zn*;$pmTOcbSj zPdq0b&E*QQd<4J81n|;GvnnfO3Ib1<=tNiLMteK4Z{FT$x5_~$`rtnQ=ZOCjT>=>v zC-%2qHpu`nL#=8swi$7xe5M_S$Abz_qpT){=ZVyO-#&2=ojf|8Kl!Q{?H{02+7YER z+P{<1K`BmYvS$!s#oL9Cb3J(Bz$AOkgI5o9w@{(3b68TMix)XOt%xn1WcxdLyrM*T zsusl~cyXi+pHI7Rd)ntBzEBuT=@?JTX7cBP+e>^qTIhngEM7@RePy!#a@N{`vIn0{ qKaJZn9WH8GHpeQNAS;gIQ<*)ktuI=>lYa*A4Y*$*)+^M?pg#Z$c2wX1 delta 1666 zcmb_cUr19?7(e%HZaTf)bM?VLtw|zo3$>}+-5=H%{>joMTe@jt)KbSqgvts;40WjZ z5Tu@OO#ySw{ zN@EbAo-|%ZsE9}S5B1tqTJ7wlTo&&c|?R^HM~%cDRc#8W~%CB#!ge1?#n5E2L>7a`=y zBqbM33h4gIQh||y<)ky7GUX{#o-*YLQ)%4PmlOEG{44ukb2y|gX}wY@i;ftOmjRf}{9iTk)9d_o&51gHqs1TWD}nsqr`WwHoTz(Jdh*grMd{wOoSs7KwD+m^ zDv-ZvX5`j6B2Rf|y;DH`$w(2ObnGqSaZ^G)iNR=%J-v7;LZ$&gfvQUx@bCMIE+S)ZK4) z*lj}0Ei?!Nj%akiC0OwiQ-+ga-Gm<4uh92Mp&)~Lu;|LR7gIa1X4@;M{rLg^8fxF^ z&h}T_^q?|G>@EAw7Sl4!1tQ&R>QNIB3N>WS9M~V}eMk34Vdt#J-P8<*qjv^}Z`m9z zE>*0z*;LWd;!wrr#uim2~#9}x|vw1U>Pv89#- z5d?~m*I2Yd2tmt8$Z$YK9FPzaNP9s&*e%JC7Q#P(k$MEt14zZ8HM6s`QDQZvl`X^} zD|xCnu5dL2BvT}y}bx}jUg znMhM9%GzJvu1k6qWVH#%kMm9}NkG*cd9tLp_DW*#2UTj1%Hqy++0g&uALY3rS7q z^TG+PE&`vjF1`jpArf)8fN$NU+0m#&BDSR2@wmfHY`135&pYVEFG1iU0l0`tL?xnb zqHdz1bvWy?da{yL&bqDMEDa#R0+?q39Ag12XaOj`)&{@}ITvrsNqkSv&D(P{fah2M z$5{X;>B{waWA|D4POGKc_bgZ6x--TOs9F1aoKknT79wI05rc>rM5IG(+(M;EhzAHn zKYuk4|_i~DgSYvwi@xn=4s0jKV+UZB7W)t z5eqvY#@`I|!2zh=7|>YmrfV}V{btkkL0RsUuJ~7RuzPi*SRt<+`WtIk9j@IRZTdC* 
zf5_3F@2Cr&-5hP2Ws`rYEy&S6)3uGaE2|soN4~spT>J3Xt|Zd6OBc|~l%7P0eUr6Y zn16xkpLgqvnt7SE8=w7a=+awu{LYjByIrH*PfM+sq*f<*^Vp_~8lP(=G=ma5Xc?{AeeZP|d7b##X~cW&?;Kr)NgjVUI4&iIF{rJT>!D6hn6vZoZeq^TX r0Win(#os0!2XJ++7n)qmIK zlP2q&uB6vllB{>0OB(f8u>LC6k74~NAsPL<`#CW%7-BHQV2HsC425H;93#jv+6@eb z7z{BOVlV@vnq$;(j9QN2F)$cnFvMVp!A40H=l1di4SYc(U*Jn#T(PAhmZvGpQf}&f zj_#>g%QqlTwO6oMTOn@~eXSO0PO)Vq1N)8D4((tmB}t3MZ=Lt4>CE)Wcc`!V&q$vt z^2ZOJq_hLQyz?;~%f6Y}5t6;4IL8W2S8}5oGmYCjn73hk1oih}W;ScCVqOf8O&D^O zLct)FTS8Dic;DNqPPHj&K$}WTsMe_+NMtWHElAe*26ne4%>qT z3m?OH3zQ;5EmB3`c)S$juf50P=KQfp^hA9V)~}w}KZx;;xM5s7+N~E8`~62D!}yT> zmkT5@vPE`)8?GEaK~GG;2Rx1w@LTj*I2aB|Guo}Gnek@75)fufw_likMG7%Q6vt;LPV6Lgc4wQqsZ~s5s`a`AW4rYSksF+krb0>Wwd*!1P|?PD4Ue}@=A_T4G5^)L=0uD%(vZ^W-WOK+30)$hKK?;`&i7Lv$G=Q1iokelf6ct;D zLo9j!Z{E!8yqVuVd*A&}nAJPXO%a}-xOMo7%meLHqzX8_%@Ao-&AoaXJXfz9QN3;G z)_HBDt9nZBuU@U!X}t+7dvwETOoXKU_=&im5uE>gKU`w6juflk)=(2&&P+7Q;x;HU ze&t0N>TJ0@JB{$s8AQ)B%tL&c z|FCnU6#0OU@I_n1rfl**Lzx5Ll@a*9+zoyx?*l94TJN|3K&{4tVnt|urw{@6$`LqT z7=f{Z2;Tcqh(jQN8=4wpR25sV6=D&t7(a$eQlz_>IJU3!h|J?8ruK-;f}dUI~8Kc@wdpaCRl0HmOi`A)mlj&FjgZ9N&|1eEi(_Zhk7Y%WAZAtDM9QHV%~xcvy_1|c3L z5Q79_m_Y2;AR-D8QHY2_L>k2H4CMx&BDR-6bP|Y%2t+}Hh$uuvA)>7iw=+Dqng4j+FA0&t!`7nezz>MAczuFpKe5LT}gdW~oAc zjkNC(lkz~=P2Glno&U$FyY`2|dv;T|VZ;so0XHFa54L@?`EF%-QyB>OZC&bnwOTBw zpGkzTjXB^N!XJ$}_yweUjTiqVbnCheua0rh@5k$7vtl4T9zUuiW delta 1907 zcmd6o&r2IY6vt=Q-_ay9QT$QFiipNyZ8X_62`X9@3R)W_Y9jrWsv)Ts)6!EYXr;Xr zq1H#iL!mA7C~aX8N)c?m)<3|5ha7tB!AnaCqMhA+BUK7L>O%Ow_vSmZpS*#$nRUAJ zjxM{EouL;O=H@vup9NIHtegtTGeVb@le*)Y3Rt<+zxU;frPKhf&zw*zCCj;1P$xGk zBLDYB2-1eS|DOkoG+q;OSjS4mw~zu+4mu!Tanlz4lG0ubcO510z`h3uc^bTOmMGhM za9{B$+rNdB!t2~BaIkBX=ramM2Yr6#WzEzk&X7N)PoiVQSxMSW`A*$kOlMiGDrW_d}ey(JeXFyrJZg*# z;|by1Dz;KmiE7R;5%1!<4eMdlp9hC?M!1UWa$s!0u&V+RVWzYOART__Zquf{KCL}A z9iP-n zv~&4)@Jp8pz}H9(+>SDv>_BP&(oxNo!G>h;W@~d{E2=4(0mw$Xl;uGPPIM_BuR?mF OtEkrME?F0co&N)V{Ks+t diff --git a/hsa/gfx950/pa/pa_fp16_noquant_gqa8_1tg_4w_mtp_msk0.co b/hsa/gfx950/pa/pa_fp16_noquant_gqa8_1tg_4w_mtp_msk0.co index 
2a2ffd0f0ddab10b52fbd98199de9a74a44d96fc..add65e7fbaeac8daa36302524e5c2dfcc3b73cff 100755 GIT binary patch literal 65080 zcmeHQ3s_V~mM$&`2!gzHkwMcj475s+mr61%=|;d8w%}(R<2ute(9lF4ZPOqkc3(S< z<2r66XpB)~jER{fn`AI+vg#Nh<}rC_HfUx(vtP1njFWL3Cu0(mNz9}7oLg0Jfu`j( z8AmVQSNv6{PSve@`##P&Rp*o=Cofm<@zMGb`sYi|5FeID(6aoYD>Sok1ky4{Ap8#{ zqliC{v8|R?_riA88OK1zhh%zSq()Gjv<6jRn zkEUdf{MrWqc@fm90CfwpXXoEZi1S%vMX|Zsn6%Pltu&Qa-?w>xlF?dP{XR+E@NAN? z$W~)4HyDeGOx4v-Sxv@b!)j}p%>*^Y#a5shsx8JM)8nP(RfWd#{YjO^3ezJCf0a~K zQc`WQk^M>4W%Z`O|46Deme-gJE6Xa2pP{UV!kUs2leLjHdDLPo=8Xb9Gpgc%+h4f6>1Wi94I)OtADRfjmoL!;py;;cMA#C#7&yXp{^dT2D>WfG-LAr8L+#*+p?2`}AUfDrP~CY6S_aX++a)+S)D8{}wS%LB=wM$~*Cl8lMEh=+ z;FY0v@cK|YI6jyT<}c~G2JZ}}1DPOr{N15;aC)d6yg!%@Dr}a{N8!w1I*?z3b3^Um z{7^f%IG7G*7w9^#!KJ};AioBehuXncL+#+(fppMQZR(#+@@T>0e16GJ_z-%)foV=B zd;&c*>r)OnoK$M9s<9ZVO{Ep4N}Hjitj<(yKq265B8cP#Nh1VRu#>@9URGM^_k1rp zHaX!Fb||UBSZ63HH`)xVtE?+o=TaS<4r+$)B^#FMrx`(iw5CnB7!4(sl<9`bs#P^c zIDF|UV}>EcR%%FJZGcC}P*J@yIp;S?7He6p(Pru#MAsUnNzqsVJ-}e=lvIhfi8r3?f};PbUQO>!j7jT2qO&3Pz0Qt>x-- zt;t$lR#n-|b$hCTcmI!d5x$^>b2|pBP6Z4H%Y(@5AX-L8ex72G28w(fDEmk&9?|mZ zE#n@a>$I-J752HX$XG)?J6XU@NZ8>^!IPqA_n-k=Ocy*?IM5{unABE+>00? 
zw6_3~aG|{&KnwT|Vx-W136Mnjx8o19i3qAqf}hIm=NIpW^KlCTMM5OpK>Ss0kQa!v z)tB7U>PK9yz9T~Ij`q!M?gg`&u@CoxJPEV%<~RGgG|l7PtA7z-W} z>k8Zjv@Sni6blNR>M^v=4Cn_ud1O=TEOKwFOQ&hpke#h=(%cGU(M`5XnC;Zu8=;91 z$7;p`?Uz6kh?{I-xRv1p40lO5(mhf-W{5jP!fy%l~{rW1EZR4?*4+ z&kL@DCOTRa$BlEglS5MbIa}t6b6s<{%*8r5?|VpoyoS8_j+ify%|ss`pcl+=sXIjB zGa|2RBzo1oz^h)UHzz=Sg3qVKXQ5zr3PPL@*>v(yD7xw-X`ebb=b%`BD_}U@Y_5SONn@ zapXv8EcEzcae;9BKsdfgUTdA>p1py%-R^9m-2RdfVRw^*t%9$AfSWwuieqt;7g~vr zAK-rh3Vv6BXJp_=c=R^3{^Y4ccesUEJM`ps3zZUzLWx}zme}2b8p_bIFd5O|7831l z2O)UvwvecJXH-<2lZ=gX2hi)H89UbOg5PF?K0j^?{C?~8aGeVB&CdMjxO?E5Nqq3i z31QdHMMlL5IdG0)VWJow?!-1OeQ4an66#fMy?zwFY!XI?nuVO1&HVaHzfV+@D2^H9 z3?t9BhLc~nMv<1*G2}q2bu>Jq6ZP>p?$JbSOBhjS*N`Kv8bNQ*hU+|uY&e-vAdm#o zC?V%ToHU}|o}>vAr_Ky_CTpU^l$m3kSD;;ZzK=VBOzTLffvcp~$4G4}1abxX3t#Nx zRzsZ{>+!n$xt^L z>Jmbd%<7P2^Q4ee^W>0pvvm9dFUMcx<@k4cIsTG9jz3x~Q;&9*O&aYgn>>0$nRNWS zyc~a-m*fA;%kh8O*YP*P@i)QoH^cGoEhAye&-@ES6d=?`hXF!; zbQG;684K`*<7N$J?dnD8kEg6yfDLitzFrMfCX` z?SSL&gyZjqi~@FT5D#IoO$ z$3hVOpFukPW@0(;0Hp^#4|KSpR};5>Y0j&wu&pAP{?TyyMGVqLxTyok%gOo#HIwBhj<96mvw1x$P8Z6G5t< zgXcHF0{zhPTzF4l|MS`@&L_T7IZsP*fk;u`PH`cg(*zs5OJST7>gYJ}oV54QbIJMz z#q3{F%-uq9&Q^-^9-x@Fi=zHvICnU=2@Nh$4f5iQ@u-8rwTEwYs@lh$y0Ca`>Uh=I z(3?yA(m*~;^Ld1X`NrC{s=I@t92jSZMyUJ8r=nZt?r_IweAz?IB&et zD5m{%Nsz^NnSIs2E)%Sfsv+4R)j+C+v>MW_KV2f=xZZrdQA{A)Pg*_#dBFFK9rM;d zV*ad@xBVIO=cIh?Uoro&bey-2!*>JPAAbjN=V^F=;obnW0s{TSyNzAd;2u)?}|Vi7LX?`k!Rws2!dl3*;tUmtlH@?#xqA-^{AOUOq%8X(^g`4!}29P1#zF7gWG zV;y$L+ate)`~-&s@{Y*wD@!o#@OrwZp*PkeT!(S+F{FLIRFDTy|=P2LiuPOcB z?Y2@6kB{wN&P)H;*`|i+h0+> zgCi;Zg%FB|!YLjZL;3#uTgrEI9HqC8KNXtm(rjtbfPOW1q@;5xvO4M!peuNhF~a+l zF|I9HUDBz2a{nuLwqIL}&?RGNnT(OeWDJz&QI^PNGDa?!F+dkb`Wz->%wsYJ%Kb>s zV={)G$r$-e##qE;4Acuy##qc`j3rFQSjuFKWlY9Ey#e`_Ga2LOOvYHjWQ;;4W1wDv zGDb0zF-n+>VP-PM-Au+ny#x8mnT%1%WDE%X%YMG2t$7GCpCS#!9 zf_x23#;`LP1NE5Uk}*&h8mNqcdXpy^1NErkkTFmX8;Xn}*WYlQ!y#j!4mcDU19itA zLdHPdbRaSY>ac^6F}mx#y=4s4fjjS!VURIUm&X3C2N?tH0sWOR+L?@TjL8^iLqHkh 
z6((c6#$=4wnT+u!lQGc7fPBZ9jPW*;G2US^#z`h)pbY}~-eoez@0pBon#mad!(JgXyZV>i%iD&gvl6}n2hlm zlQGZ+f_#^mjPWItF}`9l#uX-Gpp69izGX7T_nu@7#f~A#7}tXxLoQ?Z5LerYZ=9g> zic(7Rb9S?9pj^-G8ED_IzxXy32_pC`Kp z>ZshFf%XmL^JLe+adLYG+Bf>LYvA~}Jp=6<$mhwff#c)$476|bW!J#*aeD^ZH;_-U zYbbUN#jbJv$Q%Q;Ysl>xXy3qZzbCr}@^O0x+BcBTlU)P(xIF{y8_4I$u7P~qo`Loa zG%M`#sq;kdNCl(7u6up6nXP$L$$t-#|W3b`9j?_6)RdAfIB_U~`~e z=1#6RyT-Ziy4yAI+s*A6Xy35FeSvljzP13%Xx~6Yy9VAz+@68<4WxUrYv4DW+cVI< zVFNx-b`AW7b9)BbH;~VhT?5C-?HOp_=*zBw;jzM*!F*)Uh8nM>NDM>Mw0acefXVH#J{KBBG8$JI7_ zBh;sDDx`lz={F-iD@&ro zlypbJA1Hky(zCNAdNtDjxQ|Hl`LLG4ifbcaEk!k;<)h-Ue9c3rjhU`x&<=I}VX*TK z!!gg$*V=`iYZiLWBk;8ca=SmSA&}ek`8t5k>va{IJhio-wRg?ue{AoX!@pqfnz#SN z&gbf{X4AI7noNAx%ID3oEY~aXT_>Nn#)n3? z(!cKA|2m|9)4P9g4CqMsu6O?qq`UXQ^8KN&i5$o}h0gEXbs!fEcHKcwIbpchG|J@- zZ)+Lla!CKyFm_*0(D_?lMP|9$T16_;{By5S?9aCczYqAm@g`?sy?k9FmOa_+k?zf2 zkM-W{^!Q!#W}nA;Z+3aC_hygBdQWzEeBT`Rq4U7}dpQ(Y(FVtnuQS84Cp#?Cz1d&! zd+5#XiuK;?t@u6kW@p8EPxe*idkNnm<$Edl5P0$(=zPa2-^;(74}kK${NdO159E94 zO<%+BoP3QPmOa^^k?zg*jP>4Z&iFm^W^2ZJZ#HJE_hwthdQUcG{62t3GbHdOnnK96 z^gcizWG&dUbda(j&4QE-DF;$6q}h<>K$;6_9;Eq@@*pjMq=&Q+61S~Q0l%Or3zrB} za+cVUHU)e=rhu=<6!7(!0=^zoz}I66_w)rWOOY@5)~s+~{<%`j7fShq zD=}XzoIMTH9^oSO)mhJ$2 zj*#`N&3g1@M}8;RS{ZNW@6c3!8e(p?s3;zg=wX&Kfh`adBnt3=AO@?Q*R_wmTc7XI zryhOp)iU4vdCX_Or%!%QKE=0waK806#kao8xBoiu*WqhdaNbeztyg^Od&a0xeCzv) zQPJDC-Wf=J>$Qq+{ol>E9^X^_^^Hg0ue#5vzj*!MC^mdU(e74!=X?6j|4{z&b>KH& z2Y&Nw0M`K41J(ns1zZc*0N4Py4sacy9ncQw0CWKIIq$yw?dx=x>}zzN+3R(e?Q3;k z+8cCV+1Kf=*zLM+?GD}d1M2c)6+>!VIl_c{9RpO^gj z>;H!N3sSz}8_Zvn@(_dx@-&WsSO0z$*v?_{iKqc#QVGB9!|Ly^lKlTgk)o5tNoAKv zaRX6H<&7f6O(an&Zx$)ui{rs~0f@LJ9rc*2>FA2Tz2a}L#Kw}v1{>WyHkN$N*iDMR z{i(l~zdgzw{q>h0h}@z0(~E8WkvoRNhyGfSJNoOR-(R_7Fl*fhBX@Z9`S&1q3~LO4 zVURoI@%RQJcgW`j@EyR{tjpJP^_(9-KE7t%+dKjC@ipt7)~fUQ0_5Xs);+CN=ko^0 z$JeaO*OOtr;)CB6lT(Q$*mq3M{>BoN%N^*0|FX;nf2M4WF0NBVd0+9tA08ike8v^~ z$MD4UQS2XA_o?fz95R@Ag@chpZe;rhKKne*pyHQLed93RAYK>6FTao2QlJ`V-VtX zQGE41%N;k8ul`HmtB>{%?yLW}Z(}##%?`zPQ;PY6`y>p=7sXd^@9A4P+%X1v_OJME 
z(ijYK-_8E`UiNp+t8-2Bb>?ewH5qQG{+R#hxvT#8uk=S|Lz~Y)_Ouwdji|f-DCsdb z)p`F7(VvtIaZai~b5IyFL-FT46^h@C4#l5S`i9~@0LnM?pU=T=bsPf1qo;paJ@?E_4Z{v7lF{7*sT^nV-YY;(ij%zI(^_w~Hz z@^JZsy4c>%0q&X79?zPY4Pt+NNI73A_)aU=8a9Kxitp`kT)sYAe3KU26!hhnV)bE>GE4o9Xpb22>LRHe^#;YUCn!dDP>lH<#n|^KPWWI^=niP7 zg7%0LUoDi%ac@wJKS5F5L2>f$C?>u~G3kTC(B0538QLSJzG{`q>2FY+eu82~2gTce zM^W=0#mo<^p?L4`y~**uP}UC*&pIAu{jjor81_%^|9aPS_j!bb`A)EFRd)wz&E}wVo!Q<5u?3nSw!nRW_W?EoHUn-2+zPl2a2w$LfcFD# z2iy+$0N?|FI{VbxkoG`&7}BkGEtU2S zXu1XCnW?1MX3YthpD5+GO~HJel;1uT^YK!CM;hkUQhw(Q%ukl`yMKcDL@B@L4$LPx zq`e55z&I5R^R&%?I6u1$5Mvf>m&Kl=^RrDN#by#E#S+*iQryns{q7K>CHhVhE8%XD z;vP0XyO+f(*vDcOJjQIxkF!_>Pq0`8PqA19PqSDB&-TZ*oW5#RaC&uhaC%KmaQf=i z!Rc$(1gEcE8=SswU2wX?5v;^67<%l2Lm=~rH-8k0==?E7*CdJ?Zl$;}h2o}Z6gS^S z@!pvfn{*VLb0}_`OL6-GiaQD@?!1%Y?z<@N`PoOIN8z}8Z~k1ueLtr7*d&UN-%9a` z6pBwxqxke~6rY_*u|-Gmxg3fI=TdxO0mVZF6p!3V@#tL?TYvU>XfEvk(xTCbnzA#V3*c<(2SsIzOxkyeeMw`U9a_DN*J$%F!k6$1JZBIiMzG@{;~qzMzJ z&J1@ZYof%InPZ$+pj~*rk2`^QUc(o@*vGAgIyD5Y7klVk;Y<7WAq-#MN6e&3z9O@0-K)zIjaV%VT< zTgLRh3zja?<-+?pPA`>cQd`Ooaud)Oz*QWz0b<@J{!~fYMI_w z$Mn8>ruQ{4z0c0{zHus-h%zExcg25Wc=SHSe`EOcKE;1yaQ+*L-bds3^|_`<(fbs= zkC1<;-X~vggx_E8L*;GV5%MXqV@^r>UOUtKjxoLO6{h#S#`M0|ncnv%)BBDyz3*+N z_r1gPzLQMvdza~bzh`>iX{Pu657YbJXL{fNGQIB%)BDacz3&{;`#xrR-+89@U0{0O zMW*+C!t}mNOz-=Q>3x@(-uETb`@Uj&-xa3!earN|?>*^#al@k{OP_$sP8GUb6g_sY2K-5oTms37Zdj)bmP?`Vh(gT(G zzv0mXmHEHn(*u?GLH*MMmH9ua_vLP(IA^P(2P*S_u} zpECdFasKf-*n3GHL)GzZ_xV5hykFLSx~>!F|K#(2xv>5d^+cTi!}klH_nQanKauWf z{!c#dw`l)K*e=5JUbgaiKW}^5%IE#O?QM(W zA}q3B5B`2~z0c=sR~yd%@z{~K2Xv1exeH=RqAr7R!1$aXqQG=Qxi28nsDI;4L?={F-iD@&q7fQOER?|S#|K)QP$8Om=3E;1bnH$N(kT|SokkbVoL z%f|w)G#v>)rgZtaZbfECpV7Pj-AKQ!cmEF|{U^Qq--Gm-z59O{>32Lz@Yzygv?(## zlsKo7A6*0Uqq7f3oKwYCq1Y-ETgA0$t59MLpE%Vzo~}2(k^M~Z9lgt!u07?trrZw^ zbxFDZA(rKShgkMB--tSld+UO0P+$1Gw=SsYnqr%xYf`&_qH9ulH|rM^vwumk z-)lC@6D|>A!r^{qhpI6ao<)j_cVso@ty2Ez zTFl#|e5(WVwGJ22tM=)GMr|Sbc&9#plsY)b8~}S3Y{WU>pdhm|C@79J?L0l=5X@li z06ap-de&w=$&Z)#oiIPlc%?lhU@ZZmdrTZsQ9R7{o;bqxp7>4Y-V=w|9utSV_Lz|N 
zo;bqjzv-f1fM;5HhO4CCxTOexzZKGN`CKXH3#I(Qm6$J<@-I|jzNG)ZaSQv6pBvC` z+`@k21MD{*(*7dKGkl#q!vxn9t#9*(_Xz%(!zl(+Akhp*Fucs_gIKwQ;TPY#t1#Qb z>LHM<^!F|+vtlRYl~t}ZS#MYA7G$gP^0HN_sY%I6$tv~K@~X;Glht4`+RT;43X`F_ z%3w1aZ3e5QqS{botTa^DSS(dm8*mh@G?tcCmO{PBP;4r!DK(T;mQ)!^s;mZMdAXsg z+F-F(m6Vm6s;5?4i>8*B*H+vLl)FtuHmRU0nU+zQJiWNclu~kAQOdL#$uo+km1HDm zloTf?rJ>f5a)cMgX!fP8D{q()_K4m&{xW-ggnmZ$9wP} z;Pf8-A7yk4ED(|Z{g%-aVS$VM?CPX;=-+iic2-RTdW3HPA-8mr zq)Z9W!-+o;BPd`KP{YSdZN|LNq#YUTvB-u=L zHi_F_?oqpRio=u&W%d^vOOXXSJtWMtC9NuqoRz@~w$VuT0X&Ie8r0B9{ zFou-rGdi=WX`R{WGk9mf+nLRo#`{f~l`)I=kb9esWm9w0`3a?FbY)YzvKi_8G=Z0A z)23%6vmVk?bNCsj&6qKb7?$a$8Ov-YYua>+(NJPZnQo}8T2*7Lv>8fQ88Zwiwo*g- zYD0z1VyLKInVd7=21zT6i6Q$gy>7u=`l8`?$Dk2!Pq+(-z$YdO3uy0<^zjqu}jFH## z@pHTh>V0MH`S+D0|Gu(rdOY0(V*Jwc;{EgImZO$8WZ1Jj73g?f`1il_eGo|-h#yky z{Ey7CziE)iws;Qs$KNyJWm>jSkggGC`SlB9^h`$M?Ju$S)FswFj0+-+zsKRp0-z%; znbmXP?SIV)40(HwOMuR=ot$;=7p%kv76Qcykofrd{qu96@v*ZO672nomiYPeI(}SU UHbcf!`>?Mm$D=(O^eB@52d;ry-~a#s delta 4514 zcmeHLT}&KR6rPz~c450eGXsiH1XD3CB*odk*##pSg&wL@>Mvs# zT_run__v`K?N)T$wushpL^STOGU-KhjPWw*+t}oz31bnmvwLYudVzh2MhPyjh|BJjmcd1`jfLRmKpOF@j~B$1>hfW$++_2N}F3gXQw(W%7dvR zvQ)aO@_uG-xwN~bm5KNNUyaJ9bI)pQFl*H~rBoxnox-8|Y*}G*wA9pfN!uW{d~vBI z8QD;h6G};fx~<>7s+4NXmbo%@;6jYZ($}X>v&_4NQuDdp#;hsCK^iTNHfHCe@k^^# zjE7CQ7@KG*{N|h`OSXH#fS-?8Qrp`gDJv?{HhJgnL$X8SJ}I+i$$Wh)+@`x zubhcHgQ-a=XYS>4_03v&MOUsci_R>SGPhXt&Qd9N6;2K0Z99#I`9id~#4@fRV*c)T zy2WmfSM*1^qn%=;(-rXv(MYh<>+=d?G$?w+E?*?l6%Z@XBwdVOwvBO$GE+0pYq`k3 zRdvi%+c|=0wh-|2>v~*!VK-A_d-V8D;5Yj9cmVL5P73GWaMeMedQ*ocz{f)y$k;sq zcy7BM?*jb8Mm>HM@E@894*5?)!1yN#Xf_G>kEitb5a2D^h?(F;z|-Ljq&jZ_-nfbX zk(S>~&236>9Xj-%)erO);DV+s6PyRUXE+0?j&?dUQ+~tN7iOR|$cGzw0|(P;e%fA_rU&^3C+3O=8kpsv~);`+P|#0`Wp+>k)V z6@oQ&giz-p1dqE84;I%liwyx_xI8|QkQ-`=2#k`^Tw;TdaBG4g;s)yKiCbG=3$8m@ z6C|$3OESCaLSzKD;PH|im`pNzgb+#RuJ;mY9#?&ULNq6!6U$eaupJezR9V8XG>MI(V_ZIZU*3lNGM@)uKJYodS@v_}w!wD~>nw1$ zTgaVe7tp*c@(K8|k|*_-D1q Nav9Jov@!xs{R>E*53T?J diff --git a/hsa/gfx950/pa/pa_fp16_noquant_gqa8_1tg_4w_mtp_msk1.co 
b/hsa/gfx950/pa/pa_fp16_noquant_gqa8_1tg_4w_mtp_msk1.co index a1af49e2ca562bf36f081503bbbc5c37846fdb5a..73bb3859163bdab6fe96bf64fa76757aae67af5e 100755 GIT binary patch literal 74264 zcmeHw3v^V~_4mn5fRGnS2pDLqii zFrh5VClM*8NKsK+wQ7S^|6nmmt+pc263{N_Xa65XwA51TtJR`)zP-<5a&Iyj@~Wlr zX04Oox%=#M?wLDt@7?Ej_Swf>xUfhH2{DEd_7_Udkr19oF!FrMWF}oa0%bWQ9RA0U zC^8btv98eTjvc%)f#INwe<&8`q=Ih$I^vl?Aca3}2%N;f4Rdx65cr?}OSX1{V$=LRE zsh%>B<&Gv+>EG>kFK^l zOKtwE@ZbQNP+f&b2HU`6gKgl+0W@%6NuB=^v<{%Y>JmIV*ai*_wt*u9Xy8Cz*Cpr} zKz-FEczLi5ygJwhjt``P1o?nKMiyrHuY^x^6-*nCE}8u2qEl* z{Tn%*2ni2RtyeqbaOxUI?RvYVu6#{(d5zOjQQ1&lW=E9d(tpHM_*vo{hlO|FJAS6h`5K$Kb^ofN{V)$eaf;ayjaYG!gPh+=Rnm zA7$G{w7&9i`l0EekjI37C>K-!72v=Ae&&8xfQ#fm*%dOfx_2_?T>{m*8H6{7MiDQ3 zIKi_7((8{b#22#7=Yty3$PQXBbYutZLfFWTWE6}T*^!1ga%6`Qan#6;62w@kLk2Jc zHUr9l4EwL64%(c=ohRNTMw5cgy5V8(xcBmi>4Z8ZpW}6Pmbs-;+kj0YCBS|SC`I0&Nj5af zn8)wlL(C~UlGUl(2;Wldl5;u|DmVHXW%|1?n{9A8dzFj!lx@gTiHk@R&*=0}FO;FQ z5c;x!_a!`BmPe1KePQR1O$mqdhr{`0@~gIazPa0o&*v+Us$9R2;$1%SY?~B1a+Hre z*M@!Zk>}e;NEqP%07_w(L1%RMXt?#ZwSDu6LpS@RBp0;g^GP)nWvRj?OUqq8Ne{=+ zurwXf?UQD@d~QPU+U+C>Q#}a@DIPLD#W#vw7sL4RHZOcP+sq{?JK_7=VutIqq{QYa zNle)a*Np1HBPW_)J1>b!kqY56#>UEWT$~5%c+D{>4^T8G`^@GjeAwv5#Mq?5IlIL5 zr{7OPf-H|4=ZPgxx5bg4wk43(wsGWOn_~>zqf^XNvEO5e+^!qZ;4+XG+6x$(3q|m2aFA=Sedp$mw&&c`ie}xRMZ`j?C)Rt%s{*HczB= zt0i(7+KXEj;?qN!9?MEfChDPGJuO@6idzxln+9dmux#nliPNBL8ZBGqin~3;Hyz5R zW7)E06Q@JjbSTqBr`q(( zTRDBqwn}>b+k%{bWsviKJIMKO?d|;A;rw^O`FFti@2(`Vx1YO)cx93R7zUUK7!H^O z7zGIBiLroCo|wSO6BAi^ViGH#kj%;_qyR$w33@=NKVc>y)Ss9J2=ymsvGR#ItbC$@ zm3PQZRPPsFhA(Yr_{$v(Uv6gj%5H|gYGL^6eSpwz$Ab)i@d(409%J~+CmFun%J7wE z8UE@J!(Sh1lJS1>@cYSQUYfjZ>C%*Ga8JSgG}UKb8WkQw2p)S%!c*+>4tQ=UK8g==AH@f`kK%*e zNAbPhN3-ER%7Odn8*m>P=zSC)pR z_#pRDe31JnzSsMx1KJjih3bD#(J9A>4{86BR(NGR z&U*YqT0Z+JhB*fre&adNgZ?Gm&h!k2nLPJJki&JGw8|@o!#m{ixpVr4#L>RX=g;dm zB3`(l--P(iJNnIt7cc4?5#N1Re<$LlOL`aLd++JpfY|3rWpGVH%Ul`cbf^8T*TN7T ze`4tT3&Z-qy#}}V#50jNSb^(1$>x>OCS&)1)ft#dHDZ4!Re5D2p~ohfVO|Hrd=f(C z1xAL2GQ*+{hI5HT%jedrF@C1WPI#6= 
zKPNY^e&S~`Ze^b(@4E~OzQ?d=C&PL7GMs-O!-e-VG(P~J9X{LSMz5>~dogng%B!9L6kd5FY@Cb^8scSIN~Dk3sX6)yZtuXZ%SZ~a)O zeaq|Kb%5&t9e@tNI>0(WC!iB>J>Yu4dcbDIm6n!ED^UI>`D9B zb5k!%l6xKG*Nr_7`7pNw@{X|=AV1Pw2l=|O??67>?S#B@>_x~&xYt8|{n&RQALXux zeErx8^?YK`2_bS$Zs0^G33X)H$#5&*ndDi z(cK97#<8D5exmzM$lp2kGUSuoF37vaeh&G`Za3uJW51Y+*XFlx!gpk~MrM|lMrM|m zM`o^B6Pa0A8JW3uZDeM3b!293ZDhmZ9qp~~jFA8MQVgQ!?U!QkxBcq5CK->te`fM+ ze`UD+1BQ3~o#`FD!1Q+flgaP?7sKX|U()t=jrb*P@3o6eZ}(^>-xJNSC63|VaZK;n zC8oD;0+a8b^vjrMp&t+a@f5`aZ=a&=9e=+`M)@OuX7V5XmEmI_Fnr?gOz*_UOz+8m zGWpa0V%QpTnzr}Mh|{#a-+apSo*m8P&qp&n6vyy|aZK;GpEJE96PUbh(&?BYuVH7a z0mfIwqv@DSv(<4N0b_+AHbz{pHpbOutJ5)cNYVck%?)dxD0SHwMs8!|aT^2ed9)=8 zxQ$UHYz!ESqkJB>G3Ij{1MPm4FXT3cncEm8+{ReSZ44Y2ppCJN+ZfBajj@8;7%RDr zf#U|$yPew@-{CgKDsE$}<~9b7E6~O$<2FVGw=rzo#<+vq7&z`gy((^F)NmWa&TR|_ zw=r;Bf;NVe+ZgrS#%SO+#wKoK;J5|#8o7<(;x-13V}{Dcz_Cz&Z44YY1+p=495oa+ z29Co9V`C`CZ`jYFurY89I2aoP$Btiyje%p+{@55eh8>8Fp&s-0v@viD?Ej4nfsKJ< zX>4ye*ciwM^wq}b;5Nof+{Qo-0d0(zxsCBFZezU4ZH(8rje#5k>K*4c#&5Wdagy5@ zr?`!Q90cmU&TWj}aU0_dw=sUtZ4BfnQ14A{WBf0-G0t%t<2<)9ki$S5;{vxa-r+XJ zMQ&rf%WVwgI8g5rw=v%1Hpcth#`uui7|4O3-pAa=_y@N!KIJyXWo~02M}m5va~tD} zKsJWPV^AAoIPe%s8zY2x+fRJvfibU4Q)ZY)#cQBlFZc}PH|+2`L0&`2XCS|Ua^yA8 zmKS^m@*5}*#A_(|4CFVQpcjbOz%i=eGmzgvy+FJM_EYc~$ZzzILF8 zuz!NjKz^e)UIY6l_zdJXP*3AEG+smFHHMGP(O+Ie$!8$Hf$#o6yawtCJ_Gp;)C!YOPw*MYZ=hZv zUIX<6pMm@a>S??Np92jtcQV{~jSK%$^BVZ>7JLTs8+Q0zAg>{O3-B2E4MgNM@OLEm z4CFUZ9*Eb#cevm)kl%2EULalr-{FGKKz;-D0`VHyPr+v(ztJ17f&CMF2J#!d@fz4a z!Dk@9(HpOU{S$l!@*5hj(f_#knE*FZhNXCS|UdVzQi)DwIL@*DWB55#Mrp5QZ(-$1=UyawtC zJ_Gp;)Cg4n8r19 zjA(BN@wU(14rN;m+YDPs`wOH=#e3M{ccbJx@L0)n;Bg>+!+#FR?(&k{WN&V6)Tn0g z&&$oVZOJv-Jh^$ct+@rZ?6KyldUEvSwd*9(0cFU)z*KXm?vDou%9BxUG*bCGl)uH~ z*P%QwkIG?6x>NTylTSf;K>?N5q5Mw=2%XOdUkV5MMu0Cx9iaWKvhl*_!Ot->Uk2k~ z=N|?-?=Tee488d-boW{4K93-L50tz=`Vc62ec=b-zg}11+f7ScZ0ukrEP=1YvtSt9xH7N{GIO$^{Afj$3xij9eum- zw@mna`aquXPN=W+&BNnB+%?LBan?9448~Psc`%L|$AQ7PX)F)KN#pgon(p83??ETN zzy93wJqT{zow~pDd=J*6{I5OVgBwu(HzrrUXB$xdLC^MKDL|+0qn_<=Liyi&w!azW 
zpY&|M5#|5v+5Vj<|Ey>GurQ!g_phGqyHV~t0Pg#P_lfL}pMw8AHw^58f%+YEw-bii zr%`Ed1oLH7+97@OVO0AQ_`l05*eqA_Rb)2Jf47ffU*0|Ve!%xlFgpv&m41nM9Ei6^ zc`&{n%Y*Us_+AUf&trKoULMPX@$pz5h=<4L&G8U64=mo7gRvEza2}JjOBqiWqd!tie^Z#mS_ec zSJU$V*B~2#&oV*EgOm@c08$~OB1m%~&4W}7X+ER{kQPE(1j!6(F(kpQ&46`5GZrtG zW)v=Wp=<`M^_T%`J!ZgKj~TGmV+O4Cm;q}&X24pH8L-v^?bX)OP*_{D%8mJF)?j`$ z%|E*q^JO&ud@bfH+_+BjfH@)x*D%9+%_x0jgl!b87vGMYZA6646A_U@n)jX=(F$y7 z3*a+^Y~gjb;97Rn_W;+*b^YH%bJZD$x!Ib`@F11P+RufzLQIeZz=M)pmged68vA+6 z&&odXdx$XrE2PQ)h4tV>wYL7hW!=5cToQ1dJ+8;sE7s%B=j-^puhZ|Yr>(ai*m`@X zw%)#Lz5g&+!z27za4u3?Z?CPl?;f{8TW{Z6+=`y-?Y|Sj*4w9!=<*p;a`$+z3IBBT zOSglKhV!&a|8(?AM>)>d;yo$+(*yaXJ3;TOu-<;_QHXVdxYV<(eQ zu9(lxrt{gyPF{y{#e8-)ozFgYatg{7^V!)jpWVHEoF~Yp^91Vlah@QX&J(EH$9aNm zI!~Z(ALj|O={$kD|2R*OO?eM>`#4XKP3H;J?c+Q_Hk~I>w~zA#*>s*j-9F9}WYc*9 zb^AC^kWJ?a)a~OuK{lNyP`B?!Im{D$YvoFruH%4vLc~(Q-+NzUA>dlShL2c$fK8t; zZ2XL&%csRk=pHNK%ZiWC0C5r;AWp(Yz>R>L05<_{2HXtT2-paCC*YlcEnd7p=*=rW7lTWKU|HbPhEGKF1uW&&s}cQ7yXLA&;T(QHbUA2X)~lo zNOwYVL2^SHOne3$LpFSbu^KMYxDA^=!Th^4-}o8kFVQ>%-GV%e8xb^~Lo&>jz(a)S z0pa0*?~);n?~qL6J;*Xc4@sfNUYX%GqNm5(WrlZ=ne=#v%+J4`qA>&8FKMAz%1n1~*4iNRk`V*yZD3)vSG`iyJ zYH>k(kFVR;xS&d#660ySaoGcN!7QG}j1gKqjWqDJ9;$d6nvFTUZA^`~^#0ppy!OS$ z9E!NESA&h&*SN8LwJ`@0S9Tya=2w}wMB7cogVo}vur!1bZHSCM8f4nKRbUu%3r z&93e-D~39DQ}_0@m>4X!75ci0m>7MDY0+2z1G*-R?)yh$Oa52K!cxY(xRPBy*fw^2j?IK6ppS{BR&*`e;a*%F*0{-9MAmen9LHY;u#UB7;k!f)iE6s@X-biDDnu;`G-pQu<1w z?_w|?Nh}ZM8;Rw?d?K+tm@g#yJO=ZD#PUEfw^6ReLkS!Y<-a`!ijl=ad1e*EpV9G0 zx0omk!oy{G^k@%3`7=CL%FiTO$gjE9)9xc@+8%_ZbD<`PyW)kEm?+B=CU^qHNa-Ml z+VDK{b{5OU-r}9bs89B`ZH4v42H1mIE|JJrEq00*;@c#L5i-g*hoWIlqukecD?+ad zVh?H8zfs0GJl$i`h-=ha?3$st{sU?IKNBlwGAtpNePYeOELFH<$p&dTq!qAiRuAPq zD4&k#_DM5cJ|NuCzj-G~nCeMLNb!*IDL#k+fY;hEe!LB1Hf=MPq+raZEoO6yZb^yF zQ<9jnm6(~HL`XEr>eRuac`u1dkqY6SkByb(xHu2i@tR{&9-yej&Y2#aWxEWq7vh$L z_;e)joK)N}_jAIv&bSv8&P!^sbH1F|Idbe5XW!;dG$XJ9k@KFBYm2t?_Xycoo=qYOu$U>M%XFyak{ zQD+&({2?!9H|WPgeZ=@zmeS*dqYTHNV3^p+aN-*blg=`n{D-A6El@8R>LX5hWi>rc 
zIm&SA35NPkhST3*IP)yS)IY3_*$4H~pgv;8D-L>`b(G=k6AW`Y8GhpphK92YbN}Fo z8OZP2)6bYv%^2(Gx`$~jC}m8kdr*EolPhCN?Lzt2nOqrD>Ry!Vm|PjNYB$QK^=$w9 zD4)@@{XHnZp=bN|q5Q_4?YE#jy=VJBKzU}*_V=QER?qhDM|pP7_V=Ourk?Hp5ar+O z+5Uc%&*|Cz11P`wVJ)UqDD?+ejP~ecC)lO1XXoUGq4T5FVoF_kU&_Al)HrU?)@Jq1 zkG6YEDV)b{4lQ$`uT1mx$7r1UT{?!lXE5AzBg2+VhI_La?)xUg{Wl+jAq2#~&bYeb zUk}#Toz59uE7865GOkS?Kju1HSQb1RvqBysv7wV)#^gI9j5b?DmdWO7hInhu5O3`s zza=I_XF++d;stPz-Cjj`ySI2_byY4`(D#j_imHk{e9DP_a4(s_kE^RcZ(^_{R2~m zd#@?WeZOh8d!H%C{X^3?-1|)i_XDO}ch+qyBAP>2y6%BEZMz`d3u!l`??c)H={`s; zkbVGZFQoe+?Su3~Nc$l@0O^L?R?xlkny<&0-pMq+_bwggr_lWF8JJI@`8_vcek#qk zWMW=V^Lw)~Kb_|HeG~IDX@39Bm``=n{rJETAc^2p121^$O9L-%8pjP@$dt=QUoPfL z(=0RGMG|OywcRqqJ$%ob7CDj1_mU)v`(%duxi8IwJbv2&9>47oK447|9>47g z9>48L9>4ABzRVeAty>qFRaY08wSIkM)`ks{SsOP-W^LXanRVx#ky&ncq!!O@@bTOZ zfjui<_f`y|=j#l;(->~Mf#LRahIh?kxZ@^X7_?@+nNCykLDKT<}ya<5Opy`o&X=cf~nmHRlMEqqP5S4MfT{hW{oxh9I~-G2BK zgxoxF`a3V4if#a0^^g>Si2FKWKNDlUG9H&bex8%bchA73XPQV+E`AJn?*~pGfGQ+$MhWR9f z`o9zy85YV6i#iz2B@!*4hvU`Bc4&tk7t0LicQ9N)m>-xntu(s>iuhwhtcAWv)GW&NB2GM_#v2w)ck8S|C%2DJ+}n$?^)i9f6o}X zQa{F1IcqwS7?Q zU@e6c(t1eskTyW->pCcHKY_ljgVOwK*z>1oC&PL7GSvKQv~^HXgj|*DpwPbt$0@?E zM%&Ay?|XS5Pow$QD0mvpzh+qa*G&6_`PYcOOkzf>_cCdMy-jeOiu0hz!H|cf2tFv{IhrKC z25K^21C8MCAg<$q#wWy}(7jaS#+ zAh^a$i$l{DUun4TFF1!j&u7@?gJu2($?o!!++=TVZq%q|h$oqwYul1*w0Uy#Y+G{+ zY}wlWJnfs{S)uzoJ}Zu%oU*R#_wZ%Kp&!h51)&`}bu`A)(UZy8cO&!R!Wd6F8sq8c z$?H&_mq&fOAOu9Gj>dRWe^!j~q@yvOj-I?0_2GAdoaAnl(-=?c_A$njj>dRWw~sNN zbTr13x_!L2bTr13x_ylCq@yvO)a|1WnvTYJQn!yWo^&+Ele&G3@uZ_Mp49DQj3*t9 z@uY4aV?60-j3;&b7~@GtV?3$be*onW;|cFAZO;I0&j4*NB^v*wk;i{=4R#G5#(&X# zzcfx!^ZmLyeZRDQ%ubwco5a@LeU<&U@EP3{rtRZ~+z|TZASa~xZ)ltlj+KUsUy{ZN zb#VY1C&YLF?`SMnGQDF1-Tl{^V-wbH5ksAu~cCnUFPoDk#8G){<( zOY^?Vu;6x{9z&xaTryOyDFI6S$%xBGXji()QPen83F#r!j%G z{k6&68YgtRQ}Z*^;%$R}*;Tn7P{|1?V>z|Lm^fGrCoCUMd$@kpeZ`OyBCxNfj~t=< zn;KS-_76xC$Y76y8HyP5JdVO(<0lL>ZUVkfaQ{Xlykkv}@*w3yDu7f7sR+_sNb?{S zLz)k10i=bH7C|yYS_~-!KCiZ?E$nrR-|x(;?fYHfGQz$JCP;aZ@*x#KDuh%7X)dIB 
zkcuJAhqM6FLP(1snISEPgx~MX!WAxxu;;)`*mGbe>^U$K_8gcAdk)NmJqKpOo&z&s z&jB68V>ncYdo9@Mz6&pGz`TRzk8H-gljhspn6G!^9-s%z5mC6Gr^j3prH_oTje>n` zx1+B{M1;)~5s^Zg_nsMX2pFXnz!wPF!s~2-y??2`2mCm=F8-!zj1smt@u6gfhxwkx zFYrB!e@^vqZ{kCIZ{ovUdlS<=i(laKpLfaMg?n1NhimC~+**p?-zxfDKC=e%t7-n( zwU{rX`R8jfU(xsPxRrm$&-Cj%Zsp(cLH->NYF}#Y9v&w5Fu~`*7D$AQ#DDN-i(?qc zfFyDp!SQ2W9>I?*IR5<$UoB=^c{wb&qkpgSV}9s|g_SjH%N^fHHZ3YhUbwIzIU^%A zEj2A!e`8f`&6;wD#cp-lYOK}embzMt(`I#A9QNuuOR2TSQn%i2uXQ*C*)O7*BV4`;$Xey!uerRWTYpY*`O90C z+JyXp%M5peaxOp8t$iz(r@gAyuj2CXV`}*ZE>G&# ze-D>mJge6KKV07Nrdr;@<;{Ol%OB=)+k0yH(_Fr4GR^o44|Dm#D7E}$E>B=zoL~R9 zTz=8#7tze`xm+KsZs%<-|2$4Df1k^2|mgAY+D2h{SYMtdIwQOTe-NtH?x~67*s?AzwBm6(goeq-fs4cTPtt8c1-r%I# z*6PYq5SLduYT*%ERt67Pl3H3@U0q%S3ec9*x*E_1wONiKsj{vvud<}4&C250jC51F z$Y!Kv@@!guR*uNdzG;@oX3Q!Q+1WQ`h^#5!uUk|o+D*&LDG&;U=|Uk3#9W%5f0I9( zJ=-Lt8JYPao0%pWOV6I2Eu^z+C1G;IV+D%#F!)KWM7ZutS-N`7(^EXzQ zExv%vj0|z9GeLtlmRVRJK5?eW|Gj0-&Yvw5vNQd>e@15DY(sST2#zl zZ{jyT2*kS%?k0k_8~qa*jq7J(dPX=*hh>i@KxT+|o19LmMDQ>unv y!!%m?7ZnYKcn;Bj@%#A>$VA_j^$&fG)fb;%l!^0-V_Prv^+`;#vRmi6*Z=>YG2q+) delta 5245 zcmeI0YfKbZ6vyWd3%ko>A0vuXz?E19igagYp9C9Nb}6j}AE6et*4+w(RjdfaCYW?7 zg~k|dmrm_qODb)RA^l*jV`H0`E+$lK+{C0wQ_}Q7OxqaC2a?)ofI{!ioIA@H6F;&a zI?3jo`#a~}`@d)B&TD8~H*!iBEun4t!LmaUD7% z+%E%%I_4o>lo$!OiB&KaRWKeFCZuuT(H(_Kw66H^;|YDJwJ2y(#wS(>)9JC@Y!Wi) z5>z2QO}n#%{ibZ8GiO?uPj?B!=4?7XEgYsDbo>D}%Y-w!e8FVdf+*=_iwz0AR%1RX z0F20CYsHFtt->eP-v*-w;SJ`m%*jMFH)Cv?5{@jpC^&M5Rw2_s>`m&*H&r{R*ulbq zSU>pj!U17ZW2t0W?w})`<`c1*_6Cb}D_WRo-)gZoChT^LwJBj6E!LON!up(wwvrrs zTWJp0R+i&vlk>Mh{&vWJ0rGd?d~$9s91oULiRDzuau|@qfE)(oFtVH|mh&!_^B$J- zzAT3UISj~Q_Mr9hdKoNdy?F?C_yBhp#vMKiKNro6oi_}`^nJS6P@2?Ja6BWjgf&Z) z3@r#_YYXU`Ncwdxs9A!Nk&0n8a~lhU>J2*6fw?~XC-%y}Q)$w*YYj*msi;a%k%IB_ z(yJ3#Qu(@jY0`trdL-*w^3r95=lWv@=A@!$%2v>$<Rm3d$>6O@-IbE~Koq1B)wqXmBrVp3vQ=?q;<$v+UEKZYx zVT&%6Hs;~k{#e(n)G$zyOM45Y*H3rqQf6bum;CCr`jPj_BNk;tRvPezEKU>bUm2w6 zxwF}+ajKyLDd)>u=G3_G%RBSN{p8McOW$4taZB__mVCTGx21y-YO(H>CW# zH=o*>cwA2$L3o7EQkv9Cu=4u@_NO483A`2TKl2NR67DF($-{bF7-aIqrWMFSvtDuk 
z3e;V~xmtLK%N=TI-RBDJV?$1RYskCL;dIzttzH-JYIcS~%^p{&@G*KqwB;c~Z$(ITAl|y`$H{P9{2Us&?YV40QvQ9E%yMwQzAPNHChXSs)sr>03Qyhsj{;Z zxM7!;Hv_*=ujO(HDLpl@eKE)L_dtg>6TE9dd;oY*anxQ>;I=Lm$&OMOJvABB@^f+% zzga2D)ZiBhFM<{2J~Yr3=&6eZZ&2wfQH3k1PM3 z8XN-N6!}4Avhyhf>=Rmko=lJ-J8JN2;1l;$Bs=54NB-3ETfn=OlOXy3{{(?c540Wr z0Df#n%cp^lWy+2k4AR)84eEys5}3tlOd8$=$_0bqEUFxLP(BcdpRGaz5qrSRQ|+Nw zTH5!r{s2eZfWrZp)soRzk6)!uR?T%g)dasw9n8lOX>7#nB@1PJezGJs z;IqRVyqybx%em{++&XpnoZUl;!m+$3nM*`)4zHh3r<2T#$HmJJMG}SQJU&t&&g)i3 ztMwD{oZmx+;JrM_JLq%fV8kBD& s>~m#+hBc@-ExJI7U2D)rW3^7%D;?_SM)617EAs*b!nAnQh6)k&AHx-5>i_@% diff --git a/hsa/gfx950/pa/pa_fp16_pertokenFp8_gqa16_1tg_4w_mtp_msk0.co b/hsa/gfx950/pa/pa_fp16_pertokenFp8_gqa16_1tg_4w_mtp_msk0.co index fb5a3e6fbebe714574ed447a112c5e78cf2a4930..3258dd48c71fd9e93c9aa0067375ceaa5daf5499 100755 GIT binary patch delta 10061 zcmeHNe{2)i9e?k|jvZ_#caC8+>)MbZ7SaT4$G?)g%)ba_ou-s^ZP-F-8yvZiG+vjj zSr_FF^kDetx}MI?pT&D8XM{hQL+}&z zJgc7P@VQ>NW%LOn!cyGmiWxkqU1Hk-z1>lvCv2P# zKM+Hyfe0A77}|qC2n?!JlOZ%>e0HEc$ne|lV^fa?9sKtDmZxswaZD?mzlp~(4-uLk zVA=>B8erNXHy;At_d-7hfam$DFy_OAPo=6Nn2!)Xovw;vK1%qZp(=IZZUn+ufiR{K zOe2^^F^ysxveYNSmUW4U#hHj&+zC~I2dM%-q6%!K3T)%x(koyKCc?H2iHI$fh}t4p z0et1yoJ8?3hZTIrki$$Al;+y5A%Ur~a@WfS6;sNf%o@^4AK0%D{aI>2mKu3QcJVc((JY+*A#?oF>L4-Z0P&3 zp}&`qTEAs2<$i^7cKZs)Tl(9c@wU4==38ez@nKMdgQw3e68g z8{V%e`y#CnU)+=q9t>^=j2g?QQ+?% zg@>hv%_|K=oqRse!sOG(B<78F1J;R=_HHqkpoY7p%jZ&17(4&2aQTr}Z%6*64*_2@ z@~<{>zr(p2iTYFK+^$jhQNQ(%^-wypJrln28v32nR;zTSI|R913-YFdkN;9vk4xRU z^0`;w*xzd7?)fYi^Z#HjqL?o;7Fbwrco4gxfJ2v-z88TT9x4PkeBt}HRck*>E5sGG zkbecg4lNA%A4^RykEr1_pZm?pqesf&$I?r$XH?1O(@P`$q#ac@-~j({$?d>iAc1}K zy;c)<@QzGiKWokH2gMNBPvkwLmYmb04z8m#XSagiN9F!eI0~irZ{+`lUcU4iyk-~f zoH5IDW_Sj2_uk0G_qVM@7T*iP8)xlu|KH&;C|vl5m3#5?V)Dzmj|$K)&osj^SkrVP ztq|YT0{*dbeZb#hlJ}LPeNe960Q3KvFSk0->J_Kj;C=|V$OpE%f#RDyJbgPiD45(JW|9NX!vNxXR%J4TF($X8 z6hnqE3gzShIGF#>{?6%ZcA0qvR+n)#1&t`FS14R0YJtHLSeY#)E%k3h`TzQI$U1A)8FCk3^)UwJDg5; z+!g3>Ki<*N>5s1!UV>Gm!okd0chS`-v8vPr-ukN^{|3b!za_OKxuZs9P2iIs<}kuF 
zP`o;%GTBpr;-Pw#;j$f3+TjRgA(i<7#g8QP_*ROysW#5_?V$L?q6<(>2uxtd8#!43 z@cPfv0nEt44gdxz{#vgdAEvlrk^GAkpZT+1{#A+x&g=0Lq{(58m_;T%LkHCVSa0fg zD8BMLdi)~AXK$!X_B2g#>mvETQGDtJNUlG3WX7Ao@H6@aZqWgA&!LuAU5q5)otw+-kQ2vMAWM30{)I5GpROi9!B0gQiHDbH?b z)@pggeNUnx&h2ajU5`Hz@7mtzYIIT7yV2vNtTWme(^$VJps{YRQ)B&ES(nR~71$W_ zQ2{)!ChPXOnkeduc|4fq$jUB{FPeijCEP778tRTXH&Owt8W~51d1BFs#=4x@&xiAm1M=8UC6IB$5`?Bg61Vwyt<${$BK2`SeO>F@pf@1Zn9b}7Oi75 z#Kn`{Xx%dHhMW7utKG=_TEM? zUVDPUEBL>G{olm?o3Q^@1iUt|{?8Zb0z~Vxs7fEP{{h@e*V?^WL^U zAeEJ-u2>b{!YP_%T(>O19VW!2v^Xg(PD+cD(h5l#QBr1W{x2I!rzCo=9pE1?q3LITLzu=hU@EpV_1euD`rrJ!If!o%iK=_9AApn z0oy6t8iFI_E!^r82shbUZA}E%z-UTSWz(%F)jtwsTx*na*Z}GGcYu@3@VfSS{HW(G z1U0stALWzvRVwGfq;y8L`96FBG_qJ;F@e;~>%*^%tjb*V!F1or0ZgD~MgmDwQ=wKR zlXlPoT&pIDbgl~NJEWg>U6}5pRhr7<4wGK?x z2h$g#@1rf6WlawQ9A67bAwl%$oUD+W%&`kyGgFAI_mKwPl(9%5D!>%=&rBlhz-}#XXI%?G2vkxJ zA}ZRgj;)GjR2gM5JCX}71J|{GVpkuep1Bvu=LA8J(fJ_2@kNs4)tFLUu@ZzF&b2R(^6XqPwkR40Ts^YnC-@d&t-9NAwnn;kzpb#I|UzbZCJY=k5hK zzHpKjoypN!JRdJ=@zOHbKJC5{eiPbykrvC1Fee$DXOFFhT{G5V{Q?^8IrK!WhR&2t z{Hc+@>hRGuL2MGK*b2v3gvW6|K6w(-QS`5;6MK2rAnupWj6+ms(n9+1VMs!~l|FeG z9>_A7J^4m+f!kA4ZFX1d-6ma)+gfcj8Fl6wt2y7i-Q;#}x0shx=fEQV#HegM?ZH`K zqZX`!82&AqUx@ju?fl6*a*2mc1(CKs!tv-ep{XFgutoD0%x7f_pfTKQ#~o*<3|J=( zs1CadU=!wbA_#YG$NbQg8xTrJD2SzfVOIpu{73PCoQ}yI1kob#d!l*K4F#Iwe+K)j ze~I@04D+9UAI26XwPFvvJ4RlmRBp zpWPKbK@sLt9MSw0o`*`pi9hN{iLOQbvp$V90j2b9gnqpqwl?YWWG1Qe93nXYRE|#D{7l!6y$w99(hS^fP9s8oVl7qphlcvy@SW1vXBR{h~U&mts zmRKYYL#aJq3Sn_rF+{29FfN8BJz h6@4G^FzBJPn_waI*3h-Q?b=EA@K#w%-)(}K{{ctHH!uJI diff --git a/hsa/gfx950/pa/pa_fp16_pertokenFp8_gqa16_1tg_4w_mtp_msk1.co b/hsa/gfx950/pa/pa_fp16_pertokenFp8_gqa16_1tg_4w_mtp_msk1.co index abad7b93d9934dad31942f538e6a5104ffac889e..d347a96e0c9e12e09368916c90ce6afc8eebd3ba 100755 GIT binary patch delta 12246 zcmeI24R9036@c$nk|it~OU4lDGWk?X(_;3NBQclw`=jP)a=siI~W3s6{DlC=*ifK%2JE7(%8@Lx`t=mhjuXySFDV zZPHGLRPqd(vG(1!Z+G9md$;>^-oZDlyFakT%Tf79o2RXoiW&8D07}NDU9-3_DdH&Y znwdhE(c?%3q#W8V57$J`LA}wS>5&q;iCZ;c3TddQe?mj~E9||K|9gjW8{0HKTBP+$ 
zN+XPC#${cRoQKL;S8^uuS`Cl2MCC^cKb8jyEroLzia(yQx~XZ+%9b|={rZbsNk@7kv%k;(5R-mO z=t#GN%zUdu{?5gFlk-dJkz`{|_1T%zeI?9a`bIPF^tBXqr*BP+O=Xx{w~ClpI$&Dc z>Suo5=SQ*DFluWJGM)W_jfe>_9coE*lrm^q*IMVd#g@*tw=MIR#Foz<-F6kXvn^n6 zUBzu}wMyCEYFng~ovpSyR=O2n_~U^m7$z2rm5Nvv6}hdgG>GM(BDc4fhOit`PPkBVq$YTn2F&#AOJVAzTKm6%o-oBNDVaA|b0Qq8hLe40sR>SPTX%mDoFPK%qY( z7S4$T3j>i*VGtXDXO7*e7%ocKz$c9vW(JrXY`B1ATWNumwsxe=ZN~I~sXe`x=}s&5 z+rfc$aG)I=Xs^5D=JVKr9Cn}wJ5az5oTb`or|i%BHH; z#d)?*4AalUd32N}>w79p^8B&_xqjx`(v+)hY5cgUf2U1t|L;Kb!IS(M0x|B$fj|uK zI1r0&45vUmR-1Ey7^^%ndD7MPv=WSS+xF09OvwXl21iIW zL-MAa-~Us1|Hm$@->tbYo67}G0L~XCZJPyFT6#IN`|rpfPP_m1EjMx@4OL@tza=$4 zh<_)_r*;)cJ$N4@*?jItB~Au%Co0DkOR*ss}=&8Jz}{9k*X?KLOkM{MTg&hsZsh~x3*zshiT-_MzSf2=7LILuZcdHcr& z(m%&*8Nm9Y%u+E7GJx-9Kl7eAp~93}@HT5HNDbK760`IRbH;_azuMv)KyzX?7mSqc z8q3*J6KAqx&C*kYnFQe5MZ+ioNX0$i%dBUk5RlRPm4Fnl45xs^+kegkEk)UmDX zVl_?4=93Po!N4SRv=S!-7unRY1MI-v*8!jZJJ$d8KLJ`=&}fu)lxrzLO-W|?7zQap z51Z;9j~3>}FEM2ut{x|q4|%ULEK&mO#P~m0&57AuE|kPz?Ih`y_cJNM*Gh&{N-!`E z-N^cufd4n8<)|g4%3A>rNx#RxX@}Jtfm{m6z=V{r3VU(V%kQ_RmCz)KLH<2-Vz4pw zbTeu+XZzKFk`a_oMH{VZ8W6wC#@JNtHl(F0*?hVo87L`Eo$5qJb1Jq1)frO$&8Wbf zGX4hbwSGf6Ord zDyJkY(gl1dVCbhny)ZNN7Ukec=5voKni=ji%_kGSB%`ZH(}5od(Odu?s89t-uhsO- zjLoSdRd_)CHZ%Kf>u53X!+u)75qP|s=Bf)BGgDnVl;iQ&L4(+up%o0%0X*_en#X~2 z%W3{2;PW4)`7Ypfw|Vpxg-h6?!zbtl`=Nn<56#~IzBA)AmGlF@)K2T42EJeu&CdgO z;QpzuK7tD2Il943;Lq%!`3N->%uM&oG%o>uzL(|`fFC=gb8Y--P*L%gUZDy+8Ni?l zdSw8(a6w0^a3ApMKhpdm;AQ7%zJgW9H#6JRgFus zZ}c?qXKQHwEb!fu<}1Cy4yagLN;h~3_=zbr{~7RD1w)*~r}PgRCh3YK@Noxe-Us}O796^Veha)- z3l5#10KWGO-OmF?tb#0VCJN_KNsD)Iga34Nb2M(C9oC7Rycp|%9WH=Y^CHA-k zuik|i(p@F!x^5Sz_ZIe%*|{AYvB&N95Zm1zC(*?oR_&4S^!Xr9=D`IkJz$SJ1H>$@IA}uPamqC8EJQ!$gDUox}*<>mknM zeLg)rLP27WhjWrS^C2IZ24BUK9`KY&4<4Q)4)h2Pt(nxk+#~vkuY#Ozn@_AHju$*W zGH0)gBNO$y!^E#%kBivj6?oE_SJVR&_du-lhO3CK;NW=)93;2|;!J@PiJ5|+hm{~^ z6haw=$}pLzFY`$FIFXp<^MuLBKB0}n|p{l=9fc#F2N=)PDi%F#c5l delta 7288 zcmd6s4Nw%<9l&=V2g^5mKpL?mA~BU()ce@m+l|EDCz6pdq*8;{FHi|_Xe13YiHY?N 
z0_Zq)imuI(giJ}b4PzKXd!|cMC#RMf(f_O*ar)5A5dv#YwxdzxMZ}zd>kHE zWGYFhh+Pm}h}>z)=FD{ECmAEkSBaw1nU#(PMwF+KfCk3!kfZ$4lBZx6Fu3t(#i@s_(Sz30Z0fMabCZsaFiKm_aiuc1|J!*vt+5xGBkz{1 zU}7C&Ypez2w>Gyb>&708R^786g&MPV$Hwc!JYT`y9ItQY`AThU;rS|UY~%UW>`hNv zah*49L7gwnR_9N1)amw*f&CA`ekIti!S)#?b>57{b-oNwoj=1@H@+RmXPBFwXDxAg z>r-_MUlH}^T}G8zX{|A?*%I5C*b;k=F%q&uLRLt~3JF>Dgan+Bh!c`tdOo1(zQamR-A6MqN^lP8E~X41#{1o{|Uu&xTB(c%_5^+ zMNJlMdVZ2>Mcd~WBvrjI-W+3?u>P+#v3dH+;PKya8M+pqDPMGB8p}}NnL-)(V182J z*725@R*EGpB|k5Dgo#~DY>h41iRGq}wvr|$b_GS3E&Vj{c3Y}cboE7#ezMx4oUGxM z_f~Ww>{$D_;W*i5DDCqN$LPtL840NAf#}+?m)L0di3axn-YUy)SQC}qThc3E%{AI( zS!#Tb?zUZ2+%Cmzlm3|q^i1xlHu;t&G(6D#iEbXl~rqV1WiP4kq97$HwzlU#C zCI1z^>g~~Z?<`H7zTQ!B?@YZW#aZQp`1F$d>u$@T+G@1PkF*&ZGTEWUH;7)8k(Ald z(UzE24$?nu_-38hKgXgU_is#^O8cl?JJ~)?sn~9bZu=~q+b?{`vd5yXA;WvJbfAYi zb5Wjv+h=paH=uJK;VPHhZ*nACFCAIdIF|Svl5k#W@t7 z8)=DYB_T~CSm~zeNLH3D-K7>Zu`8$CItOO59cpktdqMY@YiU1MkLlO`WM9#=4r-ZQ{$)yh z`n&F(ibQlnZ{5i@$LyWlnc4@AgsM_5#e3&pIn&!acT$duS!&;M{N_-Ou1{l8a0m6U z5ziqMR=06ziXT1N-BOlc$X#lQY1JWZbpBg(8*NF`+i3kcHq5Gr!)WDgFHY@ioc1}C zo2{0;jOugPE$Z7nXd`#KuhgWa3_lB}c0PkvC#c=~Q97p{9Ynop`um#p;-~#I@R8>) zpjj**c?A#JtBUqR-r=yJ4F%J&fac49mw0Ht5_p$P^Nqm!0yN(O{OF^a5Q;0G z2SxHzai&SFz&$lIZwJ1570q7-KBt!EzXAT-#ENS3{}vQ2f1w?Y0I&KL%})R?*h=%$ z!1?VoKL;0Mmrd?|3tNt&+!-hP&F*#Bx!B!5a2##j%0-({LV z3w-#NE@%cmUVnBTjQ-=8KBI=FVm~+(Wzi0YfD1Ed{s-X01vKvmUUJLC z9{>;4&8HpCf}&BR`9Fade4pkc!267vn+y04@cv5LJ_%_i9MiBy=kfltf@17R+94mf zXAt%^kqIPtg2f;AOv_$npGTpcvROQNb|FfmiLN`C8zIJ8AwT@Zy6sZv@_V znC2mkhj1x!W3*xyIP`q13uZ}w0laY=&AWhi6w~}o;Jrh1{NDrbI?vMmcMKFgpV1Bj zz^ks&{BOW5H)wty`0iU0`ke4oUCshfC!&4+3voMe_>a;SDtZ5%A)An%4p!3>l}x z%=BqcBoERK+klrJr1=iu*P3X)7kGF#&0hmP^pe5N{_6&X^g8Vj1>XA>&EExH@(#`a z47{VC=4XIkd!OQqm}S~G8I;VBUy74be1sjo?pMqAV0+_N%aI>^f!ab&Il$Cc*KDd^ z=XBdkK@@#X8AMya?k1vJb`a5F^AgeN5Q*rPNCcZJC=n`ogGBVXZ6Mm6HZ534o7rx& z6Ko5b)7kAdk_1kTgm8F8JA`o9>||!U%TK1WyKMrQLl#Kx*w#s?-$CNqgCWsRqS;+! 
zbq=TOB2;vnAtWDJv(ptcMVlGT;gihNf)0{|(=L#>PG^wB#TC{Q4&mjyBqzyDknN_i z93*RY`UG>jfMo7hsgJB#kZq*!f+Ij|1(%)J3U0x)^@=2};P-_{#zHVaD&ex15(}3g z5CfNJmP(XGGKXjjkc3^bn=HfSH+KeCiXLgJl zSxS;jj;rMLk(s5SS#8-S5(C*G5d&H9k<`2a3F59<=XIDTQ+AhTj3EO)ZqYqi=D#yD BOpO2l diff --git a/hsa/gfx950/pa/pa_fp16_pertokenFp8_gqa16_2tg_4w.co b/hsa/gfx950/pa/pa_fp16_pertokenFp8_gqa16_2tg_4w.co index a0e75556a249d8d405e82a5880f10388dbf0ec6d..38d9161b0174d8fea4029de748b047853c7065e2 100755 GIT binary patch delta 3333 zcmd^CTWs4@7(RCD=2DW_(b|zPrV*9IVI_64CiaFqc8S(9qSBR=CZ!u%vqYX4G=idT z(}>fkjVe?@+w;O3BvHlfJHkCgz<=lQV428oUv(Mdo7jS_*_n+w91;<^_TD~TLvf}a* z@Mg-o-D5$6n!AFh2_t0CGCpJA&+c#F75Wom9lpuD*On-YZ+k>fsYC2g9`T&9$TYZE zWxz^x3;3|w4lY&eJCXqa>UB3LsR8PAQ3M}XMKW1TkOM^)RIUY{BS8Sn>S(snkfqKR z!y;Yk?q$lHxTO^7Z7hvO}6ZH00AKewhS8RCyms^d9U--z2O~npgmctmKh|bcElT=k#CODJfJ#%dYTq2P zr^Sjm4b+NqqnB>E-`A^xu2+hvC|CLnU+(AS@=@|z>(FJK_O;>6>0yjGEd&Ylx1~J} z1HdtifBaRS&`n%#CU~99;&cZTQZyNj!o#UaHYSX7@e|oxB)>ZwiSZntjpYSS$e-Z2 zXf7gTqsOz^Jf91}_X*$9uuZ=HJe~o=LpoZ){L_v6R?K(qY~+nh&>-W^aKPOKPtp;1{e=R#>Vl&qVTY;HWg0Gx#24SC;ddjPqZ99bCj&j{ zIAe-s;xYJJsvZ8Cw!^8k$Bx^goOmhCGiJt&zbgv;xUJV=F1RoXJ(;*?4X=LfA;Z(+JH>nl9@5PLtTzC;)%j~t6#*E!6+;V^)_1f1ld){OT@HgG}UUBvt%HvL8^4d1PBbu;-qd9K~Ab`Q!aKD4W2MEt6v9 zA|&?V7qN>eDLy_Q4l^M`R41oZP({mW2)@-}H8e8i;$>*h!u@Bm@sMsEhuE8NWnEli z3mCkLv;|6^^rDF%M}&Prcmw^IpUc2sxz9`-Kg)S98qbcTjc@ihAk@rkZ}vAK3HnIYC|L3E&X5fIHW((TOyZ6 zM+{0_sV1dAn0~e2-wI0rOr-v=RpHZ{z=Zik6UbFJEdD*k?mxLs)P1hHsBddZN_Jm2 z<$meD0i-q6v{d(k@JaWKdlE=PBI8A>V;^y%{#80NBo;Kuu}4iEuT;Mh%`NpB7-jLO z-vHeRzwsNm1S@|q$_J;%Ke;>{_akU=;tl_>(aQJPDtUXbFVw>adn~~!b13NSsjR9r z^C2g1<9n-u!CnVnh8LJ~7zNbh8W3-!A0Y*KWoW?ImHqy_l&JyPW&8cb)IWG2zJmH^ zgZtwh)c^esA=mbtMUot5fEZn7%26#LIca3K99SC3B}bRWP{)kRV{?Fk;JvqOJe{ diff --git a/hsa/gfx950/pa/pa_fp16_pertokenFp8_gqa8_1tg_4w_mtp_msk0.co b/hsa/gfx950/pa/pa_fp16_pertokenFp8_gqa8_1tg_4w_mtp_msk0.co index 4255ff84060e3cc68456858dd4cdd41deebc8fb0..1a77f541c0364e3340c274d53f11d074ff3dcabc 100755 GIT binary patch delta 8790 
zcmeHNdrTBZ7@t}8xZ`o})5;aCQersn)VqT_4n0&J0vckJAO)ip?2j-NZ=Cg z`{wt3v-9nI`_28B*~hJP-xazgiRznoEO|F$0OIE|kw6z}Lo5`6Y7Di(>vEYNDR=-N z$+7(+1w4+})8*1H_5z?oaKfJisUw9Of8P-;g$on@`+BGzR?qWm#j65I(xE0$V!2U^ zBB>X@OSKX{6u%#f-%s$j4Lw!3 z(Me4#bsZu<<%+ltRp>3si0*5AP{);MlOPJ{V&bG(BE!0|)Z^O<8qi0&UR0+aiXIS5 zrpU_xq9(%^F*)_#BL=S_INLiLo!3zDKU!!AEH1q4j>szE0$I)ZCKi{w5XlF zWOb^~s&@NqYL8D0U_A+70|}s%1hAQbFFycnmM>YGywxiD=KXM%ajxmb#G z;l;UFj&re6ylsciX>t33S#Wz$V^Z{{LAwwJ)`VlUl5?ZZKqLwsg3JSh!Qhm{Kq<)h>Xa~2o+=< zO(z2eDq~ZSvLu{sBbGdz-mdPZSmI)fTnJALPoGCDHH! zcWA{4F`jU)HC#c~l}NhWSs{r5B}Y&&QPRQjEZi0BGh{w?y;H zqC<>no?_A%6C=`BLYczAm^4L!u`~*`nESZy6W@iMH6efP87{~+(8&KrKXYV;J#qP^ zitw*y-R(97&PXh}RMyA*G&sthUR6w2_JsVFsa7%r+xi**43nB-C6l`6M#7JS4ZKh` z5|c_&$eGk}N?=l`)su+2Pn_j~cRvq1qvHIv7ZmU~w;(Ww7T8@l*%az-Kf~z`R4S=c z^SQmvhKOf2oo1-i3-5Y8AC*c}FhlwfeR`^zH2O4_JxzfPCrqEtqOW>`T-%Azk99aV zo(P3%i)8}R4&e0MeB$=DStG)|baCD7>(q?m(I{maQcYvZG}X%FQ(PuNyv-qQluy?( zxjGffrHUQtnng#S%_FX&QcC=9Z;<1a1Yc&>B1k;`ezC0t`R|T30MJPA?rJ&S zLU8Xm{|^XW)+YB4O_&nQd0&oS6q*cRp!S9w|B~Rr3v#@N;5)|o|3L78 z*W@$t3&B0H5WG3|Qoj+yr9^p$?Lw1MfqZgaFZFqf6+;9*J|B0oor3*CC)+F)1Dlv! 
zXD--LzO`z{7N>pHsG92_s68`#M0I;sNvPB1kyN)QOHx;5W=d*yRyt8r9qD$WILAHo>@gLb#opJzD0ZI)8luQc8bJelTm#b#QclP_8u(od_InLn6fGq& zKaMqMq1(_wlG-7?lo2WT&1Ls#VO*31o(~OnP74z#pPB8^!sSr|@u8>0KPHeg9gK@E Mn;SCJh$G_0U+<@?cmMzZ delta 4026 zcmd5Nwuk_iD~+eAKRu^H7;p2{fJGQhD4(=!KO|9qcz0VNRUEjZ)Xmc7{EZ% zUUGSFe(yK$y?L{5cjm?wcH|@*b1);OBSozd3rPG1hyeCo*{a_=%_tK3h1uKhy=A^MrKOEtM<@XTUO)j#3y0 zTq&($N*1AA=_b@}9!BGO9-YWAG1rFC9>&XD8^IzIeZ*#?bgmi#X$R+kVst5F^Q=1T zg_eQ%<*~HjyrUNMspa~Sm=SeaAEgZ`R$I!zFhGYgzd|9~w@SpB6my!a0oY0v1BM|)r7GPl5q(cTc38B*jnF0)pVwOnQ$oLFvl3q@wH zu)@p>#b&>d;BO`Tmk575;lG0U8RbGzMuo5Gw zW&g>9S`9t2p^KH{x^8(dP!qC|glr@s8%fBPNT>`a6u=3s#tE%WIJG4Lq$G7IlzRu} z0f;94pX<67t_sB~pQsAgbm&*_XH`#+FJL8N{G2M0qc7Mr)-5ZQ*w7t#H?RrF zmvuezhF+|91`Y)}fqYxaUHv{Y+onqPbptHCE#0Y9xijqm8uIr`Z*K~uFld!@q53cr z>G+QuqUQK-42o|vh1R61O>vT)+>3cP#g&?Kv)8Y#wMvhi^*>crGn1j`-4G*THS|o| zA;tFL(2ctWYa1R-HpGYQ0yadTWlj~yNvS&X&jPN#?aeu@J}0Fh{HR-2R!^G%=(SC7 zK>8uFgq^JkP+e2UuQ1peAq23i>I9s$8W@`pM#sy{=5Bi>x96) z5i`6g`NmZKOodN$Lf~FKJ`@b~!7qjku+6enAZOG?$}tQ7mZ(BKuVVdgVoxLZZXzFy zPqZrPSz@P$!$EY!nkB}wA;-8%#qYCWhr`V`dAxi{xT(2;4>!2NK4){dw8881I{D^O z-orQg!r?|gzZ{){OO?~1F=E?Jq7Y$9NCVjWffmmqcyt=B9xM%DY=f5nJmEV^wYZ<) z8;cT3Bxwy2L(a4bn+XnTlGKEv=4}9d)mprf@awbG6FjvhN(c;q>!yezas0OuV*@y` zXQ}~!9)joW)Z#IM$ENX*Q+~gef12RS4r}pq$}oa6TK2ova49h~0OwUL{td-H*5W@C zeAzVqZGv;@+DhCd_$Q+P?3n86FJc(9YA3v*3>jncQbgyBW94zT3lHY2w+r!?9C#>F z5%htU@T*NNJ6)Bo%B1Rc64hB+vYx8G;5w?hU2dxSf+bX~@Fu5~^B#hFDtvyTy8M1G zRXvr;w1^Vfx!y}%xIC3fMuk6#mOD$SHHZhtUHP`A^YRQ*9p@RU2LGY{{kxqE^h zAK3ZwW?%UAjwV!QEf(L)gQpoGT`bFoE7RzkK;Xpoe8{KK&OXbD=aumw>GVNPypP8g z`evYVx4361TrB=*hpA$X9a>Y#EjVfs+wHK#LT`KI5-dDO^0U*DH(P^v-wt`}4RX`H zL2kYdqj=E{bH$nhn4L<&Gpd8b1+b9mHHzm8;ObOvk@}RhC7q{=S%r8~lo&e13wD?X F!M{)*eG>ox diff --git a/hsa/gfx950/pa/pa_fp16_pertokenFp8_gqa8_1tg_4w_mtp_msk1.co b/hsa/gfx950/pa/pa_fp16_pertokenFp8_gqa8_1tg_4w_mtp_msk1.co index aabe09b4b2ecafc241ab91686e7c7454cafe6ced..489d5330bc8ba5ea51aee682ee4f99939bbdce4e 100755 GIT binary patch delta 9312 
zcmeI2eQXrR6~JfqJ~`Xm*FwV;wZTxw_W|tg?cJ@NBeBo+(YRLRSjIBc2#B$DX(?P= zqFPDhJuYYCAd)z&&_?~BNmBup5HW1j&=&dJ3bhIeQWBwjD2`+#l=4TUKpTn;uHD(a znLU>DLroxO>2i|qz5TtJowxJm%|74m*d_Dme#>YrHJW%XuuJp+?K6QTa21Xr5*AhU z7LLJ6qa3+MZ2QxR^GR9Q|BbY> zaDF1qcsLu9o1BuBs)d}gmWoxFey}?&JyJCzO;y>e)<4vEWX-Pr{vY@4e&e!q!QKEr z)xM{-?^*O6lIAQ?>G{Q#)Hx`nsfg(uv`asvywWF&Q_!V!J8Pjd=&D#oW(f})MbeEf zE8~%V<$71zSU`6XQQ}?$6KHbkqPaKko#Yyx%e~7ze)}k5e6k z%~a=L5DdTV-#`IB7}6wZqspM=vB5QQS7z-xcb19M8Ft;0>~+-7+YaZiqrTowRdx+} zyH(je=s%R=yC8RV59>NZ6zz%H04r~_T?RUVLNClkDsh~5N3OT=x z96(o&rc-r1@C)SNL35GWFjPWq`zh)CvXuCJ`%vDNHRq$|Tz&wcVAS5;V9+=iG!6!h zgF*LbQz72wVAP{v#8EKrL&0d!uA4T@h4ofO0cF&80g7nd&^T-~4jYZbMsL0oQENp} zYeP|c5Jjy$rM$4b(jxjO9E2|~BW_-g;vLF}|FR&-B9-^2GC-MI@qrcz?|)37I}r(E z{zz?!5oL7Mh*>;$Bb!H?fc6(QXTR+9xD$KT|C1ip!v3M8{R<#HGC`Cc7ynt-=^>q* zJgoe>IZaPX^;*8aFqCP7H>2E2*15bRmQri7h zrSwHhwa-~8O;e(`vQQ;CtmRrINu$2^l=k)$rjl1mOlPH4G@Hu$R-iny=}EI1iIQ$w z4O?7Rab~7iCE2XyS|v$o{{l&mt=~iGaecO|(?hG2`0swGZB`{sk@TaL*6M3BxxAVt zq)MW##wy8fQVu=-iKXN_>%=v)MVc5sMj?lG?kzY}%Gr||wj6!+2`B6^E29VOCaShZ z`RTzL(`SA}`l>Yg!oOmD&eG^(N?G-DffA1_Dbz>j9pzddDPxB$rO#ldGh|U*znNYn zK4K~FJHXtE!zW+aSo#`i>cZtJW#G>O6xUuY7D{h9%D7NECXHRmD3iZ@epes3-_<7nb-5CYZ%`2j)AmygE0(OI1%{ zJ9!p4>3(j{Qu!`FbWt292mEk%ZA5sOiEi)hWBcoR*+`6u^+x&yM(E$pFkBxi^m5yJ zd;4R3e(6Q%lc)U9E}q_kN5P^}lQuAQ!pOgd`K4QUHRk3AjqO)sF032yCEMCPbBJ{KvFo8>lS$qLCrY zFZOGS{xyoZeTR-T3G?Rd zpE#@Y!dwNgVfG{4peYf;-!bwA%(qS%`FAis_OX$7Vea~ek#B?A_%_f+X)Sc?4%}g^ z!pMJuIn!w5SU^&trZ{Twyf4gbl(PBR`J$@;W2`9p<057&)uH20_`u z2!3sXT-h1a%ww_v@)`ztTLa7>L*Wer`ERS?u$bhMV8^zH`*v*M!U>_MHe!`&Y+{KT z6slDBvds)>z{Z#`qM|x7%Opu-mg70nIL7GxB7B5U zfo~#ef-m~c5idE0)lb&KkhUDl^0-?#(X5WF4jg7hh789=IdXMetS*LIaCJ%IM3_nH z!!;Gvq&^d2E=p!1%s1)7#frlO4#CTQyY$P1oL=ngd{$%TfKjl>_GYtq$-KwRU6 zL^F`)oxJ3C*U2w3a2J(!${rT3agaMu+AX)TFzO(8lv8ebpL#rr?=X{Y`K)?O?ntw4 z*}=isBD`8M9=R1Ay2(o__|=|~d*lNg8ilO6k$U+o2cw>8yauQ13v1MFkv(A;rKaoU a)-b$}I(o0XPnA>m%V$;D-XJ@8So1G*P}-*e delta 4635 zcmd6reN0nV6u|F&v=k8N13HC`A!Ck}%CH~ry|&sgpe;fqLnm{XFqQGK#u)7k#?81C 
z88F>&Rxiub%w&FS3&U#A=K|Y|WoAaU_=htjx@;_%#mpF&;8)_9gT0S?TOKjMBFWYS z&UwFc?mg$8`%d3|4R&aIZfJr=s>i(7v?-7RqA< z#Bs3_H)4C4TKeTt095M~C5H^8#Op^7M2lf=&h+!47TB2OOID^OQiH=u(W;VBEMK&~4aT7zsedI5OIAZ{{|G?sY3ERtVc@Bvq_*(8;LKUqDiB_q z-4<@e`kL%E^iAb+DEmmKF!IDL7}%`u2#@Z3nNC{{$47To(P>qZTt%l_!jC*efjz_-{?VcegMYvKGH;W zzQYIT<-!{oe^6DMbZuc>izeI`(;9vYCgU&ge|T2c1-uL}8DN+vTb%63h~ zvtzSY5FyB2n@J%uwa=74Yz<3=Sla5^<0=Q?voUSqr8`A^hpXMS1B8c!rwco&>GoI1 zF7DUBA>o==@ttY|(6D2_u)W+BOZ{dSzOFn%1)Bcj_V|6}ZK`VC9Hl)vw9$Q-jrQ1= zl`r&EpNyHdgqLbEV=K;6(de_$7DBLoNDe*KjtKo1Tv}ArP9x*S!RoD}(YDy7oxQd= z<=>^5K@lpn$g}NIh4b%OEJTpIis;Ue5PU~F<6;cwY#D70OT{3qaZ9Uije8WzDm)>K zyjQH9c3p>d_iL$p6#qj8d|$YQ8srgE@5zD8IO^L4!mo#OWBvyna-9;IzHm@g#|%oF ztV)RP&*Qd<5DIChFJbm6Mqgo?(4CLlA40`BCG6DuaKWLkK7K#i6Dt#DuP65TL;MH@ zn}w^#f13Uq!apQdU{Jg($>aNRD}aJrNO*8=wcM9eR)N0D=yQ(UZw*V8APw=M_?0>F zS#e$~HpKhA;)1SCyPds7@o*;%&-Z?x0XM`-{WE3!{sRWB@bIE9rZ+ne`cqc;@R4?o zbSe`+>55Q}x1xRz+Y>O~h~@pU@lBH2gYD!jIf(YA&f^~}hBRey@(D{|lhMjG*ci@U z+t66Y)z(>RS#x8pvyNdIGuPp&Y?Q+aH-n=1?+Ehs<{L6GLx(XB33sxWKJ6J66WC5h^wq9dFwz&rJC1c|3ePt zz)!4l34so@en5yM#rWIgYaLj(WugIqHq6ay)jWuKSDl)FjCs|hc|OOykL)KNc2bf7 zF{7(*V}ivQS?a)zRyDtXdC^fdzl^z`SMwpvJNBsg4a{rAjHC4z;azMQ6_zo_{t z%r9P7^I9m!*MXYbYQqLRz(1tWeOjN7IN>tWOhM7H#na~o>hE;|wc3{;~%vp%acqAH-BAZJY zH^F9(#IP_PlA6V4^%4iRlq0I6g!o#`EQxIKu#w54Nm#rr$foqL7MNDuF1pb9?jo$Lwh{_S*AIKH+i5vh5Y=0@1L%@{AQ0Bh1_mkM!wqvxzzM> o<+e=EQ{;_U{*K70^gQ0=g@x2W9=~3apn&g|q`!c_>4mxf0L~-h5&!@I diff --git a/hsa/gfx950/pa/pa_fp16_pertokenFp8_gqa8_2tg_4w.co b/hsa/gfx950/pa/pa_fp16_pertokenFp8_gqa8_2tg_4w.co index 57867426d696c7e37b0d5d8d1a3efb24f6bab9af..33a1a09cc9b7a54f07089393894605dc9f3dc1cb 100755 GIT binary patch delta 3262 zcmd^CO>7%Q6rNrGCavvVt4WC~qF@DkT|#QR@lV1ws#_PQ^~Q0cq8cd?YFwKI3W?HG ziumz5Rubh2f}_uiYoH}lOKy}j`%RsNhh$UK!Ty$b9%~3&l z2OZmFdV6vlzv1`|-*ng=slmkA&aq zevtjDoXL1NkU3S(W<3Fr1FAeUoko6ezz@g)a)1HA0HDv&A^IJiBIgK-0Y_NW z92g`Hyh0p!l{hdWQS0x3ix>T_UXgR{76Yz5-~j9#j#G8q{~kDa%>0+lMvU@W(_LD6 z%RZ~vDrO~Wo>8U{5aC^4B3MEMONd|z5$v#b4AMFi(FH_AfrvN|(WC8|tvd(n!!{bp 
z@s?YZ1b2odl(B>|mQcoyYy@LTVC-dJtPdDFAmVcOynJojj9K!~gowu%5f9!+X@er- zA15u<+c+Bm13c}-LXtU+HZ|~12YKWv}_>cJgNcq_`tEA>Zp62KH zX{6kizlwfLKk0{6E_W}PsoQd>SNG*fJ}Y?1J1@VV;H~sBgI7|mbdtqCq*~pOH)+L4 z+S=KL>1N#4$I#(04);xRk@#pZdL&;66}$7HNGur3M~d-aym%xS3>QN2eE4uaUyKz3 z_%q5Q=?SI{eIby{jKF3MZRqAp_4cjAZhxWPuIG0)^!-Eq)kF_LVwoMT?;qFgX>{UX zeZ!)v(sq<1DY4PI7lcP;l4D>endJHXb3KW~9(45ZXyNEw1TS2Oyi;}pIqJc|$Yx6rE@Jkj4v dPYvMn{k;uhR_`#bt+qAO(r|-V&Z-s){RyLJQ$qj% delta 1711 zcmb_dZ%9*76u<9nx;b6%KKr09nbTspYw(VvopKmVW9cC9_U@qpB_y~z_Ch=ED{8b#!1o?cXjCU~Qd;wFz zZ`{M@1F&tsso&+dyLkY^B9r-xj`bQwCBv{@dZ!za?gCX1O9Zh*5K9EHiXaab#9={x zET~jD6kjw>ki5gi%`>Z14z7$PN?D?mB}!SWR18_OlR`hJ-L(JJhXd*o+s~uJZ5BV^z-U(Rj&h=l3lP+@YtI#fMdD{9)Zkf?bZE(TVd7 zQ*=aN)Ndt~d@`8?mGOajkcIf}Wi z5)bE#cO^Qwc!xXgbtU3~4v*L4;t~O_gzNOi*16c6w z^=A@Km8b)`?#ix)bT91vB~? zDKcsIBrwPlQX--Fwd=v@sHs@+a{m z_x!%|JKwqYJLh-4)6;8TaL2#mj*GnT{QSoKiWBLDp`bT80F!DmzE|fF#f6yOW{B<@ z*9KjS8NGk;?M9r|o2X#bjb1j8hxR`YB-=Q5@4eqg%3S`5CA8pFfbBxp19au|&8VT76Ti|AcxonY8aRT;x#Y zto?D0Tyj{u*@Dn7NPrk-a2{8FcYMfk*U4dga#gGx{>T}2mb$`Sl{L-@gCd~ssu+D= z?Le2RooKmQ?}@t+s@EN;RCAjS6~pLUHO$A0F+Ncg(FZ@cM|k8$M>J$UDvG9ei;1vM z8X3gpWVoX=I@noWfpJ2TtFORVA*EunD2%H(Ruo>~>TVQJmfZ&tD#hb24+<=MR6CJy ziC~MWolLsCV0%@2Y|N#7_*EWxAOH`b2q*%20lk23+vbAD)>ja1Qo(EU7qkFIX#i;& zzyuB8MKu5=(AoeT;eyApy&yUw1+QZW0)Uyra;kxQ-h%)S7#leQkd#28ulciVcHe&lM_>HLFdEi>D$y*lb{XQMpgqaF5NXZqS3T7sgTL zM9+_!62iYGPpg$^%$<*S%vGfIb4KOj?Q#wBUwgI~%MUs`6!n~9%CtzB|B@YT`q5Dl z#77`IdNx9K{QB4bJ3ET~k4AO~{#hgb_V#Q|&5)h5;bY-vQSA@%c~ItltNa{X;2-*7 z`q{vz7KmkVNRQ=V9w%S!xYf=HWMyO>hlwk>4#V$}TG)+j1D=1si@cH)c)y>#m7Ea+ zp-Cw?mCgC`16f}{mgH<8ACf}(DM|9@e4(s=Z#J8kb6#?abCHN(Qji&)j_WUF^=i#9 zqsvb*M#sk}k2W>(`W?=UE;RW+N&SybHTs7rzc`MNa(CC$v}3qwK!Wl=HSy-=p&xpr}!m1`DS;&m0s(Y1j47}`A-~VIOVfaDK}tJPRRmo z#VAe^pU^>GO1G1%DJOG1j|rcaF`E_rQ?U@EzOHt*5Q`a7P6wTqcl3E$_UL%iLUyIs slB?-%E2GY8W7lTZ^GkckP-eUJGKiv<={o_O$#h$X)g^`UHDfvSH~s5WfB*mh delta 1752 zcmb_dUr19?7{B*U-TZUA=USlIP%_cER=dva-c81se^`k_rx{M2<@_@)BKW6*5JC}y 
zh#=(@_z<{WC7-T=R4}2J9)#>Af?f;?OnewoFCwn9d(YWYL=7Blzu)=3?|k3yJBRaa z_AR}+M9-QS`No`iWRj<}4?&)kB{ufQEf>9 z9A8Q{GRSfGMp#3ammZ%FPScU%Wmd~iFawvt2-q~~Tc;X24ZPg=!$1%y1(whEMA;4w{azz6vYis@`nH@Oq>0}!ASJCSAp+LIUn zAeh89077)4oVCam>{Z#s3UVcDlhycE9DftXcjEXS6wkTk3eF>&IImpE)yX>(C_hCd z%I{>P`Mn{T0)Y+2Vo-7me=_YO`iC+7bVh7NHxs*0J&7fY@K_X&Me$e^uZprGQ6duM zM51cc#ORh$k*pL~eqZ=Tmn0KmwLDhKW3@a|s~D|v27Fbz?EgxqOaIq-9MYc>EYnJZ zBMwTLl2B=Ic_rAiI~}8_DfRnWH+_01ux2^Y2^cf|m-bGvd%+5NQ{ z_fvNx75fO6+!Fmo`Qd)X9CvllXq{K4yypxk{`qw+{;2bYQ_{R5_{VYroc1bm@#84O zsLl$+8}Pf%!S+6kfA$jEB*1?1Fw5X4z5p6J08{0HMICodHgT}pE=u9SNWU2FuL?Va zNZ8eHci06n;u5RH0Y^AI;1nz2OS%ZIcNivL_;3jfc`E9u57K^n0k&_N_S;Lb{oR51 zE7<u_Jn5Lo>G9Z>2(2@+SGq7$`C#iMJDxd>xut2j63e-chZ)-hhTt4vvRJrZ)f=LmmdS!Yu?s8#MQLvZLA+Q14N)lD`y&q%8md diff --git a/hsa/gfx950/pa/pa_fp16_pertokenFp8_gqa8_2tg_4w_uhp.co b/hsa/gfx950/pa/pa_fp16_pertokenFp8_gqa8_2tg_4w_uhp.co index cb9649c5ac2dcc1cce9e1d4ad7f0c5ec1cca8b98..97d2a7d6d00ddb215cfcdf65fa819efb67d65858 100755 GIT binary patch delta 3337 zcmd^CO>7%Q6rNot{!47{T1~)iL5->GHH5^rVtYw9Ra^2y!K8K?L@EWv@!IZE6(v$q zsVc(jSV`1F1+E6Ez3_Wzdq67ca;Tv4h7$*9X%GhlLeN`;gcMaEHKoG5-JJy-Au1&I zN5Ycld-LYKnK$#z8$X}qFJXQ1WZR7`v2^R`>9~y&hz04Rebca+nCnG7P6cT@T?nKX^x60 zy>zn8^2+=ye%ZBwe{|VhBbhztdS;5nxx)Mx>-f5TH}#9&-qhRM(01dT`#H-M3Mm)d zT@=3JY4@22LRN?Zvao{{7T@rEhCgLKq1N$h>^p6P72$QS;4OCwoyu14L5s|wt2GvV zS=)lH)jH5>t-d`KM5tc(pt2flJysIX$2EaYl?Lh25{H(*3ht*-5H0Cw+GxnN9xsgw zOnLtRTS*IB%9(+V$}R9?(p3Ex_%h=frc2C(hGQjWlBx$$TwV|R;R4lj12-~!@$@xz}U0W#n?wSW!qzYl+0QP5%)GD%=;+crHr`4h14$T z(ZkXNQr_S3oz4dHuL;y_HX8E>XU9+A^{p++wE@niWP5l@{$Zn{!fT*^yR4Oaoh!2T zYMC50iK73KD^}wnQPz7Ogj`v(%Rk@#f2T?R&Ic+@Chc9{Z!D@>mW(V4ZwUL4`gQA) zI&uQ!1z}lOKXj$R zDfk7`KBQPgd83Pd{k=Y;iFPt>mU-RmsfA8FF~!ia7(P5T&&By%B#{>j(c)8LG@gtk z#dwjA@Wp&25-UV`F*YrV#bhCj-=q9^d)6)slVk}3?K;}frKcP94-q~0M5ErQ3T^0v zP4-U``!jQm_B_$wJfsVMd7ahzv>jDR0j&>1qgi+@P|t3vKec&D{rOXNdFK#6jE+p_ z3P+|Qd}7EHV|{x}kx%R;Vze)jFvVzc2;q#U7k@kbHhtNRkLP;ud6B`FauKI_*mkIJ za|vfRIs0xF$HW9{Qe|Quv)K;p$vg0A(d!_#P#j+rldKs!`EOA@Kx}42C&`MWH_c 
DZ+u}* delta 1642 zcmb_cPe>F|7=JVFpK*2CH+FClS=mKwmCe?5on2)y%`FMnve4T6zv@ooxKLz41hu)S zbO=&k2a%v85DyC-CPL~kcIr@YM-h0WdybTnu#Fk@U~k%@LuZut!DFF?|XLUf1SM1r-A z{<0~*wK}M|QG>lG4^#D8$>AD|92U0N?gO=f9u~cnT_+=XiHs12u!Gvk z2$vx2RIvlXh>BMs?4@=pShskJtrlIZN33MMVwQh}@Vf~A2I2Q%zBMSGvNnn?Ym-=M zJtt-!;QImCsTeox+V><8z>W>t&Bz5dF0Zq#U`jD34a$;nQn?SFDryprh~kJSj)>y2 zqI_5sk3|KrsD|vt_-ErJroXmaTKa=3Q!gTFIii*$YB{V{j@!4G@Hcul?SJ)RpW5VJ zmzEYD&?GrkwarBOGD!u1$Jy_-F?oEIr+i1Ke4EFWyMG^J^XQU-BxsR z4oe1KT+A-MVS7(J%6x8_BBvaAmjM*p?L&pt@xV{)0efswV3D|f;X8}Z+uPI^!N6c( zJ5a;kHZ?)t&Q%GY){bHrdtk?@wKO*O}J)ml?RZ2_Gs(Vw9P zWT*idYCxuL-qmNZ0lnCOK5Re^8*q)LD`U_<@83ANx8ybwh^Exi47D^vEzRthcTGXi zT8$084jXy{HuT1beEHGNJmD-#+3ialZ~kw4((hKya=b6O@AS5d&O`lj^*3rX+u8o3 zm;NtoXLM3|;Bdc5+`GI;jIJ6L&fbfw_-WIAPn+WZZ$GS}5As*>!_}GKhn3h5zw^(m z^26g9=ZB#irDyIXRN{;46M>%wo&@SkrZ#o`OBf#wvkAAB*m*|tu2U;9B+;{QHST0t{CUeh}=b-Lo^5(HP6w5!jD!y5F`a$HM z{}AvMBY(mqJiag)^?M3Zt48KW{Vfv}P#$SWhOe}S{^;Z;tK8Kbgu=VIDGO$N{Fj}F z4rohPI+qF@`@0qhKfIHQ`P~Is6!Vp?B`~wx@Kx-Fp3k$&4UcDn8=mT1ux!trgi2gd z5Ba_Lb!c|T-z@JrH>yR~^sYA-$BW@+dFxP8mvlb9$w}JJ4iilO8@)2{KAdz&br9w& zN1Si~3X4BZMfks1vn;}Agg@SOC{8=-gOV)=772x)Ws_^3aG*?d&4F$35-k1N#|f3V zryljM6dR-dhh}9ZhdQ9LbURG{yT0ODi^?8Rf(KzIgdZxy&%;_%CcN(}-;bap4@+3D zB87Vy9f7|TVXqlo{U*)}eve<&ZmxjRFHXX@A7ls_lOfzND}AS71PS}gl6eAWPc16h zkSE|Z#da3Hlm3tY(aA}NGBpIp3xvh9KB8nr&Tt^@p9@L0FtS8=_oGybz}mAcMaV2y zj9L(O#XsZMS^ZzU%y&hcBV>jv_BfRrEb^jdzxybm5~tJ?gDHnGF*u|=#-ULy`lj=H z0Lchq%i$sA!Y-;!y1xjJTp)kdPEsL23L?nXg8#)OTVYawN2t^)G%(Jk)f*PdAEff{ z_ow$0c$xOs%OP$t$Br(NM!S&>x!uxVyHRsF%k6gg8tNM>o2u$7S-;C)&o*%`uBpN0 zayM3T_3mBu^-cc9$Hbq*5~=qUlrM~FL(`eCDAJ%Ac<-b*f*z*)rWh?hhXpQ69BYPe*CbJ@1nf7*T^5I{L^9lEMrRg`u4d9M!LY*>eWzLLU9 zchyXTb=7Xl;&Ju1Zmx#a+pc3>7z$+ON|xtlXsVHD3Mlx$2qG{QOK~Ulps5hFN{Ikek=pD$dIR5!!sx znd9hn)LblWJAqbP^p&rLkvdKw-?1K>G;#vDEVt;2c8ifLCy~#huYkFBY2!)cFQ5xz z%r14{Lq@O2qSb6=-O}}wcy#@4m-k7QQ^;rUq19?Vw0bRKmbPIY_4G*{r_fro;f!>Z T$d-P|@*0*C{nEzQ(29Qn!qw8g delta 5244 zcmd5=eQZ-z6o2=wUAu1W-Yc7kfQ*sM2^0GH+K#epY!ig4lT0nia2pJ4a1jQmh(Pz9 
zg@_KG@KBa8bqeAfV{h_?nG}@-LpCIaKNcKuf0)1@3yu+S@-^>!y|?8pgmtcoyrl1( z`@83!b9(Om-S*!0KXgNz*t(1l!`Y5_Uc(p7! zhV#xyC>)Ke3uvlU0ZLsP*h&SHVsKInPKv=vF>+EiloX4Sa-yU%WyfeKp&+p#ool|) zDx8^_4`-oo$l#0_oH2tlW>Cf*Bi(AQ-;}^*r(N{;VK{>cmE{3nQ#k$9bvZiyka`Iz z^O|G5F}JG&6jdpI?y9PzRs=qK9++Sinirxf+>K%tQWix}h05%v*xBHAZ9uBTl3MqN z?&qjLiMlSZqygzw?s|6>6=)L{E%BMUNjM;>SemkntI+Lor&aWT%>R9q5(_ z>M(S;$uRqeo3#O{7K9;NbxMY03g4XQi-;ku-OfaqsTL7KbU+yDiHstmrvYRb1bQGP z7Jjqnd|a$9p}g7G0)l9 zaMVdJj`kfHfbXjYm0}VDf^>b@MuzxOr00j;1}bs;e$WHN3=Qa0i#_R}8+bPlhiSB1 zA6*(nQX6fucq9*W0*~XLC{~DWiAIIWOF$P$tr)HiNL3;PYfe-W?7T*3JD~$&!5Z*< zggnbIh-fO_%TLrx8NPdbWp0%XFV_7oAgrX}u0YCgT`0i1lm|a&Gm=mPe027|Z z$9htuDJK>Od?-&OE$O!I$`SUYqbHXY=nqE zCn<@zT8D|a9QKa8YNWq}t=))-8;fCU5?C$H+yeU|CgS#~JX&!oXDh?S ztntAn^*EOD`4#DRa6dkBQk!u9U-w2gNNN}ECx^v;ZjW&)f8YqDq1nYBJp#9+TiCT$ z8|x@tyKW6zy2e~;H?1pmt+Cl{CU%{RwX$XQ($X>~`!x3{c!Dqa3Tpjj>v0q;5@e~N z3I>(oT@?kFmdsaqZ^- z#lP>lfjchLC&tlY)Yqnz*rA8dvO)g?!vzjf~z-o6A)n+vj1J-0CsNLxxN#vOF@`yL5%Sm8| zBZuU}+H;ADP-5W952fa^=I3z>jSlo#$hWk?a#(KQjmO}89ohUg()_YxkkygRtc&J% zN$q~Tx%JchIjK!H!#gyuYln_FeD7Du@XJt>Arol~PNc7f-`kF&kcs#0pEcCzmWDw>Hq)$ diff --git a/hsa/gfx950/pa/pa_fp16_pertokenInt8_gqa16_1tg_4w_mtp_msk1.co b/hsa/gfx950/pa/pa_fp16_pertokenInt8_gqa16_1tg_4w_mtp_msk1.co index c6d8b1afd2f9ea2378028abda59937564548c311..3fcb1c684e870d62bda372b6231fa80542b17357 100755 GIT binary patch delta 12311 zcmeI24Qv$06@X`VZJ&S6c<=lJaVS3~2`1%d?`QA!(sOjX_XDL#3MwFz@^9lfloW?L zjhi6$aLz8W+{R>S4w_Io3QDZvwhk9@Gy)%_G;LHIC_)XYl0YjW5!wdSk`#ywo!x!2 z=b>p@MI=5;ImtKA-@JMA=Iy(kw|BFn&whN!-c!pin6~ZC4XP6vuS3ux-u$1zO};Lc zoByL}blbyl1|aRoaRs=irvm!DblU=ET1rfzh74r!{~te4yNkJ_=6~DM+L@+Ft!1WJ z(&oW*Hr%c#(+b#H#+{nO`b)TnT3fUSORs2WN()QpE|g!Ky`ib;J8N5yUex|n7-B9N z@0X4DVf;Q*yO!_Mc8s&LLyXqKirf%Ws6EVYj>KtjeRe1|8__!3y-wU?n;|*jm<|xjrR+2SV4cm!Wtj zl;5^Fh+Y{CvhmFkwtcgLI!}b2U{MHl7%kD!TP6Ro&C7y(e0iOteMPV$zOt^e{VE>E zFJP`;#bfzJdYjnH*XwQPW_}r?UJoJh_n}7+ipS&CGL~gsZf~zvu&n5EB2gX2a#)u; 
zJFAU}A7YV=9guOW;8wwH7`I{EhDv5O%O$g$l@g&jT;gsv99RerECvUbf&D-tF00tVNAt4G1~7G~c_OEUs_m+3@5tEN^D?1=M5YaO zXLSD^5I_e6&;bE-EW7FMi#UK@96%oqpo9Z>m*J~}^1t9eSgN=BaGt7r3Xyg|q#Y1x z$MT!*DGHTFaG+y2(DQJh=QpQ%CU+O8leui%J|XdL{$)4wZq|sz`-JNbmpvJu!Stsp zrdJxlCaYUHPMa`{^A@fZcp1arOsNaF$khmhZEhHy0R7q|kbElY)VN{qVeQg-`qC>cxJ|M4H!js8Z^Gug<+ zO-1RyHsfbSt$V<(-g9*n4i^S9-vka2R>v9OVhnmne8_9-ok7Q$3z)m25~g+8R?6+ zvM)Q+Qxs;0O?}`(wg%YAkD(eM&Ch3YDj{q$FFAXg+xYA zN>T%Ca?&+GuqZwIX|{_=w{$Z*NBxGh?q8vR|22?_pJZ#uK{LZa3oKC{>x@faH2AM`xpx~K|hn8 zsj%B}mNhGt(fn+aZwj`Tr-OUhge|?Qi+z?$FFVL~+tRx~ zUenGI$ot8Kgqtv8=?(WT%~yp@WvHcU0s?5yd_F={U8QHHqkkB@=1ECHuRR1lS z_W<9$hUU)z-(=2Yb-n=n=nWBlN$0BW(P>N95DpmcHne^(&3^;@WP;`=fH!v0{B7Xz z?KD3Jd=<8zjX#F}fR5Vd=mFP&NA}RX$Vden8cx%^0{DbiX+9bFp*JjU&VMI#ymiXz zFa#0!0f!-2ogv`oE?dYDz6N~%S(+~azQ{Q9lkHp081vgu_rJ1Gm)668T62k3=cB;+ z8$Jc?z&G_7f~Efh;1il?z5{rlI*sml9y<2Vp!v^%pY+iDm%u}HG(Q5|e#6AafS;Rh z>RZ?UkI<2~*z6!nco%rf3YuR8UTcPmsH2a7N4`tz7qGgD4GpLiSCIoc>djbLh9&}U zO;Q8Ubl_)tY3>Hz_IH}s0pB`A^LYkWZRo(y=#Fo|fO<2wmP0Fmx06Yz-US_P33@;j_!N!i3xEfA(fpgh`(B`V z6Yz=w;N<#03>}TthG03g33&Z`w4v_GDg`WEpAscB5VaJ-F+yFUSGS<* zGrQa_A5o4)i3yj-HNrIyD7a%DV#4i-C_r73pE&FaSPr|wJ~A`bm56Rkl8G_5>T(fT z^pNS?{xI>!EypYq5x*aFJ&H_pJ))mj^9VlTn#Us&e>^^!`0fcB?_W9q7n_!@3u7L z09n2maYbOwV$4q>==Vg3KR#J@gMtsoiio0R!6%8t0!|5Xy?oJ#IZR!@((kgeCM@`f zHQCG$Jw2qDpP29mWMaavc*%7BNQ5{n2_CW{$t{r;VGCr?v^41@E7J}_my)s>_BC5V>x$s ZQGAj)pk;(3Z{k747olTmo?(eW{|3!j{#5_~ delta 7246 zcmd6s4Qv$G5rB91>|5hIn>}NbPzb?}O&yn?``N!k2Kjz*6qmZ`flN~Z4&Yp@Q!&&P z6jAPQZ2oFg)1^5`i0T#;S;T57mwKJb6)jc~+#kNLzIM#bR=70ypLPhERehYVN*#qOydZUvhtcN(P4Hl;D&u58I0QSM}m%F*d|W^hE=%<#mxczsQ=U?*jGdr8w4TKh*RW%rDCl}O>h>~9vfrC*}@ z-mY9lrTHao=~j@hY;H>qS?RJ=UC}xw-aNfMJr+xFTrGWfEY`wt5p8VcxH@gz&2itO z?^f9sM18hJQNPU@4cOdK-TvEPzY^?MgZ&z0pH~_6Z)^QxkOJb!d-C-P5GcdNE$ zW#k`yKT2`6slcpaOJvB_mS$VB(pR%v)7vN`AqOPnfP@^7kV8+1M+u22AsHpKM0bkr zBNV`D%E{V7x;r)SSW(vFXn77u*8%A|AYBJaH`U+Om#ysJ?Fv_Zq2?x&$>D*D@+FH* zbv*fw9-f<^TB-g;oz}Y9V=ZZlO6WhAvsc=a}t@qnerN65;b#i1lOG&)IDZSr4 
zN|&ToK5NXIXe*TdMaH~T;sx2l{Jl(l&q}{cr)pzO^#8k4)IZqh&Dly{vpq6 zw#*(qY`dtaT^7)(s^gBy^ou+_wQfI8*X$nGun*lVRqac&wVj=4hfuThWv8mn?3ku@ zuRsq~rRG;$-RGZWddFs)y53Py?@YcXF|KACx_j~S>tV~Gj%?L}#|{`9GMS>B%q_n8 zS!-G=2kxJ+o$R0Dl;iyIN%fCp3QZTME?+ffPm~UH(VUsg z?7G9BaNrnX}yTaA;Ov}%0(#S~TpwakN6<5CC8q2%<9 z*0fd;+%%B_vT4evDLYr}SLYp}k4(CCj?baX)wa`gDSN+1n$|Ar`7?A?Dbw|w5BgN6a1k&XQYf- zs_*8cIwvwmd*8@R-M(v?&e>WtwVgBhVsIsD-F9?psPTV3l_l9qchkvBA2JDbPB}Bl z_m1{l**mvn&$ZUHRvX+y&p%4`&|vAkkyM9#Dj(pmuiA%mxXqn60=90fhqoWtj zq4)qdtQ;(O{@ay+DGUBn+`6Zsxfzj{wae`~CtrQUp>oX`go_MRa5AA#5I zB>A6#Ctf1?$G~e2ko+p}9Vf=uAHkcTICO-Za0mFhVm2A##EM{35Fhrm_lnO+*F?Io z6+6183tE5=zM>1}qyxaut|s|mgU6}fM@hvom=K*y@?Qhz%Srwg@J<)W2Y~n86L<*t z)n%H1>yN|_LGfm_E|@F041C@*Bp(63Z7s=f0S~@Na;yI37)zbpGLCEU9iSMEk5eE% z2l&7~l7AhzQz5w-G@@FRL${gHY08dcZ9^$_P zike)kFk%b>@0~&NzW^_NjO3pHUvN)C{{$ZU25CQvOCbxS!lD;|rH(Hnc|LGX+q`j~ zS-_*KNL~t@Uq|xAz~i09=9#XRfWrO*azY4rWEaV6fzSI1$)5$@eu(4^z(-%xd1n7N zgQAPo1v8-?z^}hgI@$yLg0b1A{m+2UQ%U>RfLFddo}>6(pr|`HUO`bQ;2pyxe;fGl z6_Wo6_~^e$egSyl9g^2-f5r$*4u!uC$4mGtnowXt-%gicVvIA<1moM~WU$-+Z z$k>}l#H(`)A-sIIZ06(ki+If*uVjiI!SogqaB3cx2Pfg-CG6KD3D_^Hux=1X%NIf( zoF*R(<0W~5K675!g_qB}WE_L{gmCVBFoajfOCHnK>oaWwPSZ9lV84PZ7{>_-yc_R< zAek8p-mqy9;IV<^lyMAE5^=&pSjNi`-2$$Uh=M?cMcIdM577@H;<2@2h{qN5`^{?m zTuw}7zl<}MM01}d$%jiU`CK@%6fk#D3I)6n8F|2wWw&68d>|CZ7LwaE_R9EPl7nX9 z<*-?JugfgFm*;V0uN1@ydwo2vwm0B24MJx7_`M<&aLrnDkK%-UZV9{a38o95?8KYv z^96BTeIXyt((jTaScV_n6j+p;Lh`*5%O^}MKY8j2%c9&SIQgIx(X}TmVlF-j^>WFI zQgl5G`e82Fs9oQNYjkotQEHK!tRfLc?ydeQn%P)Y6H(tU`f+%2ScV`NY5EYVpk+9_b z&U|xb=gfR_#>bymnXNCFcX+m#S>8V_yMXbb$QexrgIO~v?=^TpFnZExQv`n-XTe?3 zH0hV?v1l~GCP6P9$YSE32ckoa&wK0lp$e0KaHGw5FU2M@jTPfc8?vxGBPhMB*kXEd zrih+%{D!`9*c?Nn{U7#B=kqV;X1?4)-`b+g&-(LM{kemmT`1Fb!t@mbl(n`_23>R7 z-E>39480g3Ucn|*essRUFk9$Vc)2B3k=}Aiu5yReq1^AfXp%YbaTS8K>K?FOZ3kDX zwTFg%0Mu$uP*#2B3ndABQkB@@Qj8rb@nG?5-zgUOz=Dp{M?>B`Tk4Uxa_#nVDve<3VugVfp-XnfFo4U0~jR%JVgRXkN{Fz z0CKpo0XU_C-+8paJEMhwa{vc`SB|$+3*0Z`0M9qob0$nGwhrDT|J1yoS}RTJta)C& 
z0Nzw<-o^?6i4c$o0f`Xs_6TtR3(<##h+rWO>1&2{`(WA4;eD8Q|G>)l%s?U;B$7cQ z8Pp|fv|=%=*m11b<5;mL3uyJ&lG63Sj#>VcB|_ZUjG*UHzD*f%ivy`0(u*$@&H!ch zk?(aixPMKcMypYuKS+o#ebDK*$ggi#RD1{M-}C0moz4(hyQ`=&4HD+RWQfJMK9rg6 zdn7|TgE&J5e*b@G$n)a8ks(U!;T`kCuWiq(TAB>aOK(e00JUgdP=}VVJtr+nb3nbS z1p4Dmcl&--R*%^%%vI&e2_u%fdAle={G4r+6pAgRZrV#wFE+6q!@pbFr`Z57LGYJ9 zYSX$&`e|ji4=&GjAX|cCLm}iz%<$nzM(8`6%?0xZv%#<^h}m#HB1H0M1tF9RMzW#F zY&I|E0_c6lgRUiP^4_P(6a-pzv;wKWp1+Uqrys57jY7~0gkApK#Q(dgdjAOFTQj;b zc(Zp!vsfF5lA4)zXdW&TE@5|-oaZoal$_|&GmD2~2l~L&WF|M27R15h)C`5g)D#B? zi5UzH9HgcoJOf fOXFU4VgjvY*nG$ujuC20lsH+49+=1JCy(IoRi4T(a+c=)_3e`-rU}aha8`COmJjVF}*s6M@ z$@9^>0)YN?y4E0GEIC`y|I;`v>604ik7i050na442unni=Cz4X^WhQsAY*-mZ)X1T5;C(QHzGouk?)5`O^P2kA3i|i;IUFB2Jf+TaZWH49*hlzI5*d-QU`ZMrEbnc zsNL?|_I6xuN8Dg7eqPZTGg=$0Gcro$MFyE7ZX+2vfT$~PRYH`?TkQ~i%3DhinWAo1 zRZA}Qv=6OD%Fu8Wn$!l<0hEgJMTSwOo*}5M!x diff --git a/hsa/gfx950/pa/pa_fp16_pertokenInt8_gqa8_1tg_4w_mtp_msk0.co b/hsa/gfx950/pa/pa_fp16_pertokenInt8_gqa8_1tg_4w_mtp_msk0.co index a6ead50fc123d3c482f0b24e95d2c8a0ad868660..134022127389de47b0b060a22a3378c60a4c8555 100755 GIT binary patch delta 8796 zcmeHNdu$X%7@yhhwReT<_3oBO&r(WJ(t7}R%e`n0()LO#1ZyD$3n_BYherfCwVI2N z+oN7dAw=6u$P$5wi2^Fmtci`1Zmq;egajL7_ye)V8u>$o#7DF|>g?VO?G+ypfooEl z^uFKxX1<;8_S&zbhkZu2-?I>on&Q9BA!35Nw9%4amvJJ7(DA1g2gPEx|Iz<@) z0JS6iG8NuN`j@2Ii0TDEyLxc|il`%lJO0`cmB3j=|9(Bx3Y!ZehUE3BWyy3|*w!0R zCQ}S~X)fcNkg=&Hf;Q;?K>d2HzG_+7p*fqIo1blJIdcVlqb-HMsGq;9pV#oG1KrSg z(Vm=%%vp#cj3?tP)S{;uGrF7;hPhIm!3M>1d5|qIW#Fmky&{wQur=Z@7dJ_7<@Uks zD+1x)eDfyt=K{Aqrbp-p}foazp&1hT)VO->N(!x)|G?2-n! 
zAE1y6j&&Qs@oo;B=$-^lc0Xg-B@fM7J_~@MAp?lZR!uY(0Pl4NnB}nwrZ#2=+xx65 z7+?i$s-5&s|__bq2V&Yr&+}LF{L)fkT7X*IcXEl`(UjV(Vh&dMFH8 zf#-AUa{$DbFSpn*wkf!_)?&xluHcm`Ee?zw3a+cOs1xsCfDH#=!`6bf04qoUt4IKA1bF8IFqDUFhKjJ=P#Ja@9>xLSZ;o%L61Z(Y z4sbU+%w-W$s@Z*tL98V#e5P%avsziXnN7%1fRj;tXOqE^WN;)I97zUOuigsb4kx1o zCnJE9u@EPtRQ=vu%Du=O%`rggu|P(^&kaY?#*wsfByHT9yOFgeIBQFB)~a#VYQoZ$ z{G}R!XGjw6T}6z19)-J95&!T&(uY*s6^;XR@aRF*Rqv2OGp$g`i1}{|>272swT{r% zigu~=p-)(0|J7C*F9zyw1KWD9%j9mXS^uR>GL!3w6tazhGU;&OGC6a7oR>-87%P*0 z_i46h@708?luA`1@N!@=kSDWka#cIV+XCAI+kmWN&`$$9rJ53J+E-h2CBI#AE(o(w zhdL9-QD|npbgtyhw68hR)(S3*)q7iI>0IYjC>>eaa;s!&xi6XiuV%YV$&{ZVC=v;P zo$}(2Z24+-QaZ&n_e?qos*okxDGkyor#3cHPoW;mc`3dxFZ~4_iXE0Bud*7{zq()O zzAvMsuKGo4R?;btWfTh8#$3rf)i0FZmsDe|WTIab zy*QYVm9n9ds$G*Rsd`mOg=$J3MDcw`q!U-Kr+p=sLNRA1?3Fqbb!^)Ig)2RISnAoq zYMH#5(np=9j9a#!?rW+4z5jKaS}LItwwROpsato?^wUWBGzPYwuzos%qTj`(+I`8t z*6G}WLP-rZpcgLEy} zTjt}nO!~$`nS8k`XGZY1gshYb)kz2Nf94~%o~orE2hOW+t|?LYM$TV^7x#-22Zo!>2GdyI{7A-zky%Z z(9rB_vZB|ZMLDC?3W4=xHex2KHVaHVmdZ~dJU5CjBK$j#rjDOY_~M1Byr1wL4iyMD zdxNASf0RQt;iraf4pqS_!vFGED&I)>NJ;Xrnf9u;5M+VI-IP%*p8o~4odu%9v8YTN z;o}=raPy;s@Eb>s-%I$8w{H$r!9l{eA4ug-Dop`L#AT*NdP*Ia1&Y5;b@-U@&aPCx zkMQ}U#(zcl1J2azUm|??0SF!+d8waC$LpR{hfPY8(Sj&Bzo$MoF!-x8ZZ{sEJmVJm ze7Iex3|4}T>()1IT+8#pdBe~{ptGE>pwJsEqmcKyDf9+?6jr*u6jscmZf;LSkg%@c z{P`66T#GOil$l+wGUqT|P9V?obPiXgmnZBze~^a8S2}4tyvI%Fz$}f&JZi`%W ztlM2SAFN$n?OC^>r3q0_da7E~CgyA8uYSisiVO zO9s=`AtHW|2PcWs3!zRtG##3BsRl)(zIL-F`S>6xKm(aF9s`B2dR>AyWoSLo`sB z?U-iKE_ZqxZREp5kq|(y3DXd66%+YUg&;rpfk}fg5Jgjh5HiZ<{WxdcUC1`FCE8?p zpYxpeJ@0$o)2BP9_Xc_FJQ=SbD$4tFK7v0P`$XQSxlfB#cXsQBg6B+lEIB#v zV=d&QKX2*XgKX~IUL`z9-)=lb*SDWwGyM-I7es2G5O<4Yvmcczh_4o~g`XG|{{vO* zRW?VAk|OFg`Gc9ZBA)(Yr(&{ZvbkTvTDId`7YrUE^SP%jiNskc_k@z@yus(r{N#Zx z=kMh;@vLSBH8JQSlxxDrF1mMh zYMQ9ddGyeCrmYA{0TcC}7Q|4U%aE+?uQWVkD1|$XP3XQ_B;30APpM?2yDN`#Xuc5f z`2Kl!cw~n_4WfB`QM^X?8)q{&tg(Gr)HsUXh8k|@q_g{qk zOMP2uaIu@cvfs7P=nMvm=P?W z|4nhM2I<$wP5=_*A*P6E_imRRh7Mu}j@S6F)p94|r7=5(OZE9!NN5m6yV87sH-N162z;LNB*y? 
z;l=FUfDN&BS3n4h2$^jG5sO8*@*aU&ys&-wGF+w8<;T$BcYDxk4u^9T`rX*U8F1os zPHzC$?y|Y(LSD`@~g z==gqm315ok536aD+zWsuT7}6c9VyiQ^NxHmEXe=o@%T7wnAv1hh9wkR0R{wD8+kot zgN!(z@@H!vX>Q^l%sIyo=49oRRg~NyM7j!;; z$J8m*N04u#yqb?-7XKKP$Nw-j3Qb~_(GCUEtx!8NSA$HEwNiWmenib^aO}R(2!|}k@+(2$@T3{MqmUqHhX6#6A1c&-0Ev@qcrr7)i<~SX Rxmi4UWGdCTGGFzTDPQlgcuc~ugvYR#*L+-M>2 zep2`(0I?M-tailPC4S{fs{`>4iLb1*IuY-b_^K+aZ1_0}*pUNvL>-7a5OpHzM6@u6 ziQ02KQAdt1>daxIvI7;^fmPUnyRidnIC%LTFb1P`V>s$CMxsvR6664yIa*H1ar?{2 z!56h-m;r-g@3O-bueC#HC~C`4)RsrZ zp&8{mj-hZ6{=1AAzaP0Plo5ZkA;~5cI-@ZlZp}XrrAXxcWddD^NT~CN-w@s`E}r|I zmV5bpJc$kg`G1`@{$HoZl{l;ZO?v2+^Fyq$Uj^yW?L_JE;cpq89{g8ldPL8hRaZxf z40?`h^ItmOo|Mw0tUd62;5Lw)rtL_UZAN@sU{jzCBqve)vbH{c$Mx0X$HlM9j;HgK z<%ec$7U>01ErlHWqN|UunKR~CIu9LjL)c=_tXUZ69hSLbV(ErIR!YtPS4x*!s$FEI zWYch;PEo3)hxD0RCGi8+Q*Yh%qbBWz(yY~;dd{QC=qG`AVa+jZx(g+7sJ2_r?L0SD zBt4+d)FO$mv0ok8@$sW5JKi73=NX>e6jl_y~ zw9)!>pQ{J<>0rB#r@ML~D}LUkIF!zkh##tb?Z9_4;c2Znu+yfYcH6{ncNJ*<@CNDs zy6BQ$VO?ZhG(#i4`bw1IZoXD2j<#oKYH=ji^pxt-ox$R))xA0~wm&hMJE6<$2f)_+ z9+5t>GyM;M<=@jr(e?=ox%Fqq>ZEPi8C)mz@-?F|v2*Wra&*%9Hh?5d`y{dY8xQ2X z!x^15;#X%D$%j}vpZenR@7I~~A358eloE!NOi!UdzsJ8;D$=9ZN?XL5(b-o;-IOp> z$fGa1w(#_k*n#Hha8KITkyKAVTPw`c7drs56#}MXDt{nF{0Np$#yI)N`SXR*^%9i8 za`H9uw0T?GG+{|GG*JvA+*1sjZCrwLx6#^V?t!S@-6g)WRAze%mX1LIMSRtsLig4Y(P@ji@~-=@Y7V*G1mio`B( zOBW%iERet#Cy3xU(jR}hfZW#r!$_dqE)en;!A)Ew6akHEYwH{DXS{4^j4j119c03U z^@U0Z%lJHm^@aR|jd*;74ZBG?e=udPq%>tN>?i(sf?w^93_~AfD;HYAa3OW7 UTsS0AQ-v@r(O890TnY<*2ksu;4gdfE delta 4625 zcmd6q4Qx|Y6oBu2{TuA>buIs4K*P^A2E4wuuWMQO>DDnqg9EoY#uO?r8#6;8O+>QT zF*iachRq9kfHO`=43LJX58c$oE=D7U1%Vh{5R=Blh!M>Qkl-?y_q}&-yGO{XVTm>& z=iKj{d+xdC-ox%2IB!UF7&;4>#M-W+-BB}8egi}TRvkku46>T1j)6@phnC1508lx8 zo>Ar=V&9^bVPzHo1+o+V+2}+D4<0`edEsJP`go`nHs&|wC{Y2GdUXg)Sk~qsBU1pE zODh<6CVDX^3+>GwMh^`F>dwt#28Yoe#>ouc$HIc%(a%CT>{Ad(&#^_YtJ5eB2cSMb zPlp51N|4ZQEX_Fe7PEBDd<{-b$lPii7zXH||ly}5_uBZoG#mMSYm}f?WdUPSM-3NZ z5|>{C=i%L1Nw!v!t(9bJ#o3}^-*Gg5e}+`dXXsJ)!V7^rFqOu?s>@^ciJC9P-V7B> 
z^OqDVnOG*Mn^fgJqet79_0`Q8ZH)uaqWt?s*E)Gc;UYs)L%y2IKP*bWK9Kov@(Lm) z+>nvaBuyn647ORLZE?90>)J|>l&%NyKXvW#b%(Hip!8tr0T90jrOT@&CD=rJIX&z% zF!!K1QUE)o=E}mftD`FKaVfUpu#q{kQu1y-#Y7|jb(hpG&{hACsdmXNL+Ugpqe~E% zUo36kdRv#SQ>1}y-NwK@Cbj%zbjOFGx%#yCfxFsoK*O)IsBZ6fbu&g%-SL}Y^14If zBHt*fOmv%Y5e5y&8NCvi@?a7Rk#G;8mXk55rPZ&T!-U;Is*zYQZKN$OSA%v)z^Ltz zh0^WAoH6aWcA_;#xKO%!LiL?+?Ql1X*%id`yr-!>2$R@re~%O z1!a8F(pdCX?YPcr3-mUj|FBE0R9IPVc2+N<^dKpsYMMwPE>4gJc{ z1ZH+<`PYPp&uRHJ!V_<6`A>v5AJg)iglC;n_MbA|A%^}OZRs8o9{xGyD{E5}H=O9l zT5cu0>N72$NBF|8l3YFiB4XJ2$O10HBUh6X6lD$Jl|O3v2Exz$s^ztWH{aIsAXL^j zfx!n_Llc?MV^Wl)@nyn+nn9Ac65d3`!sVZ;|s+O0RQPt;n6V>Uda1zz-E2DAkF278p^2%IAMG5t=^K=dD zVW-q-UrSZDou}&BWIjA^rz_k2sbFR06~ve4l36-@Jf#jUnw-bGQqcq#&6qDK^NO^tj9yMja(*jcZQaTr)uExsr0H1dBp2AP{^ySSFdEHSCxGR(N+rurqV0c a*TPlIeXn>@mfULbx-5HEi?g4Bi~j~MFy1Nv diff --git a/hsa/gfx950/pa/pa_fp16_pertokenInt8_gqa8_2tg_4w.co b/hsa/gfx950/pa/pa_fp16_pertokenInt8_gqa8_2tg_4w.co index 8717a1915f38e482f9894f9d90be343b808125ef..52673cd1b09f7f4df781cdf103e76947a85495de 100755 GIT binary patch delta 3234 zcmd^CO>7%Q6rLUbCdKxy)dba`w4{-}rfpceiEYxY6;O@#7?h~`oWDv}dILaNm5Au5ucqTVPp<$!!FH^eCi4#fePx87M1R|SNWG)S!E z{l53+XWz^>Z}fa`nOV5Xe9U9#$;GG7s$OI~CbEXf0N8YD`a2CjqF9X^wuxxdI6E{| zW!iqO8L5eQkKyk$2V$@vK9KY={?31%kCd1~$91Mqa$yd zDoP?dojJ;tWO4U&_GnwF2IE+nsn=jEds0Wa$d2eZTx3TXweCl~cl{R-nog&E0pJ0h zOOlTV&g)#3eL>(soevND8XtZGqW}aD0OSFAz#w1{(C<2^1YF$;?-G=tE37mE$kG5_ zq5-@@0~k}8tq;H>DgjSi;XVCI(35}wVCAr#dfwl`Y_UzJq%Xi z%y3jPj!MQ+$+)rYP%H_GodCrKL9vuV++p%80)?kme!1 z@Rl-yw8&E%jIIXzmkG2TYBcAMHojlBsaG~i6}SKzznr!7sMBLR?x}y19(LpI(44{l zNP4Uufb_Wi`~N#Veh_v(J+2*2+d4nln62m;GB7KCBtDNS?bf-sAcGB>IhDcJf{!P_ zv;L)R#{eHzNb22aAsJ53_a#D6bmr7V{>Do;WO+G#QSPy+eoK zw~Vj5$=5^Ott(Ix=@0CEZ78yMnI-*_(Aq`!zKbJMQlvFWO;F~B6m6jv6ty)e%9$lE z{bNv#lbccp8PBk-=HjX{naV`GgYO!j8q6BF0uf{)crZ(9nK-_mC4F)n|1?adV6we`0K;rkQ2+n{ delta 1739 zcmb_dUr19?7(e%JI;XCCt`fRZs}Z_|+Eur!lO>`nO-<(7%))F-9hV_!_J;_<9R7O< zQqD`1kV+5r|>^1o1==PXzHq5U&cdV?iPo 
zRDlIms)yo>#tD+QZMiUqVNc=8c%qaiN_nD`$4ceNy!ND~Kg>?r|LVgYb;+w|wX|@b z2FZbfKvLVu&4>cP6ZQK;7dg7hKNcOT@;7?)-9FcpwsnNnkBNt>9;cqVFlRAu4k__2 z)D5|wxh@0cmu^_88^iXXYs57Olr=_vP$eIDi)b?Mxh&L3dKL$~xTFQ_*MAf!hzG3b zWLNWRwk67G(~?GpCYvCgaTgi#RO0U-xV9DRjhpKvo=$5H`od?S`WDERIMI!kd)891 zyUZ?@2fIU^Vz9F$=nz6dXQ$m^7sQZLEEBsN!C+T~XhYM?2?V_<1HO8af2y0lPw;kk&n>}mNuXw_P(e+{MK?9wwIVfZ+$HD^UxYqgoo1sZ6!sOT(mRTB zXvQa|(w47tXc;ea$PMK{WbunUy)cR$i18jrF~1Rw`ZJ7#i=V_9e>qPJAb%8slW@(c zv}U0JSz5)6Dbj+e2CWH!B=pmti$+_`M(UbXW8b%0SWhEbZmlv_H)-6yZ2@)H%*H73 JzNKCU`~@{(sq6p% diff --git a/op_tests/test_pa_block_id_truncation.py b/op_tests/test_pa_block_id_truncation.py new file mode 100644 index 0000000000..512b9ef2b7 --- /dev/null +++ b/op_tests/test_pa_block_id_truncation.py @@ -0,0 +1,260 @@ +# SPDX-License-Identifier: MIT +# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + +"""Reproduce the aiter ASM paged-attention block_id truncation issue. + +When the block_id loaded from the block_tables tensor crosses 65,535 +(= 2^16), the aiter precompiled ASM `pa_*.co` family on gfx950/gfx942 +reads from the wrong physical KV slot — consistent with a 16-bit +narrowing (`block_id & 0xFFFF`) of the loaded value before it is used +in slot-address arithmetic. + +Strategy: + * Allocate a KV pool with > 65,535 physical blocks (NUM_BLOCKS = 70,000). + * Fill two specific blocks (one below 65,535, one above) with a + distinctive constant, leave everything else at zero. + * Run pa_fwd_asm on a single sequence whose block_tables points at + each chosen block in turn, with context_lens = block_size. + * Because the chosen block is filled with a constant V, the attention + output equals that constant (softmax over a single block's slots + sums to 1, weighted with constant V). + +If the kernel narrows block_id to 16 bits, the high block_id (= 67,000) +wraps to 67,000 - 65,536 = 1,464, an unfilled block that contains zeros, +so the output collapses to ~0 instead of the expected fingerprint. 
+ +Empirical result on gfx950 (MI355X), aiter built 2026-04-20+: + Both the qlen=1 kernel (`pa_bf16_noquant_gqa8_1tg_4w.co`) and the + qlen=4 MTP kernel (`pa_bf16_noquant_gqa8_1tg_4w_mtp_msk1.co`) return + 0.0000 for block_id = 67,000 instead of the expected 0.7500. The wrap + target (1,464) matches `block_id & 0xFFFF`. + + The reproduction requires NUM_KV_HEADS = 8 to match production + per-block stride (32 KB). With NUM_KV_HEADS = 1 (4 KB stride) the bug + does not surface — likely because some tile-level address calculation + in the kernel only narrows block_id when iterating over enough KV + heads. Either way, this file reproduces the production-relevant + configuration. + +Run: + pytest /root/aiter/op_tests/test_pa_block_id_truncation.py -v -s + +Or as a script: + python /root/aiter/op_tests/test_pa_block_id_truncation.py +""" + +import pytest +import torch + +import aiter +from aiter import dtypes + + +# ---------- configuration matching the ATOM Eagle3 draft signature ---------- +# Production layout per TP=8 rank: num_q_heads = num_kv_heads = 8 (full MHA). +# aiter's gqa-rounding selects the gqa8 kernel either way. +# +# Critical: per-block stride must match production for the i32-overflow +# hypothesis to be testable. With NUM_KV_HEADS=8, HEAD_DIM=128, BLOCK_SIZE=16, +# bf16 elem_size=2: +# per_block_stride = 16 × 8 × 128 × 2 = 32,768 bytes +# i32 overflow boundary = 2^31 / 32768 = 65,536 +# Lowering NUM_KV_HEADS would shrink the stride and push the overflow +# boundary far above any practical block_id, masking the bug. +NUM_Q_HEADS = 8 +NUM_KV_HEADS = 8 +HEAD_DIM = 128 +BLOCK_SIZE = 16 + +# Need num_blocks > 65535 to trigger the crossing. +NUM_BLOCKS = 70_000 + +# Block IDs to fingerprint and probe. Layout: +# 1,000 — safely below the boundary (sanity baseline) +# 65,535 — last value that fits in u16 (= 0xFFFF). Should still read +# correctly even if the kernel does `block_id & 0xFFFF`, +# because that operation is a no-op here. 
+# 65,536 — first value that overflows u16 (= 0x10000). If the kernel +# narrows to 16 bits, this wraps to 0 and reads block 0. +# 67,000 — well above the boundary; wraps to 67000 - 65536 = 1,464. +SAFE_BLOCK_ID = 1_000 +EDGE_LAST_SAFE = 65_535 +EDGE_FIRST_BUGGY = 65_536 +BUGGY_BLOCK_ID = 67_000 + +# Distinct fingerprint per block — kept small (< 1.0) to stay well within +# bf16 precision after softmax normalization. +SIG_SAFE = 0.50 +SIG_EDGE_LAST = 0.30 +SIG_EDGE_FIRST = 0.40 +SIG_BUGGY = 0.75 + +_FINGERPRINTS = [ + (SAFE_BLOCK_ID, SIG_SAFE, "below_65535"), + (EDGE_LAST_SAFE, SIG_EDGE_LAST, "edge_65535_last_u16"), + (EDGE_FIRST_BUGGY, SIG_EDGE_FIRST, "edge_65536_first_overflow"), + (BUGGY_BLOCK_ID, SIG_BUGGY, "above_65535"), +] + + +def _build_kv_cache(): + """Allocate a sparse bf16 KV pool with two fingerprinted blocks.""" + dtype = torch.bfloat16 + x = 16 // dtype.itemsize # = 8 for bf16 + assert HEAD_DIM % x == 0 + + # K layout: [num_blocks, num_kv_heads, head_dim/x, block_size, x] + k_cache = torch.zeros( + NUM_BLOCKS, NUM_KV_HEADS, HEAD_DIM // x, BLOCK_SIZE, x, + dtype=dtype, device="cuda", + ) + # V layout: [num_blocks, num_kv_heads, head_dim, block_size] + v_cache = torch.zeros( + NUM_BLOCKS, NUM_KV_HEADS, HEAD_DIM, BLOCK_SIZE, + dtype=dtype, device="cuda", + ) + + for block_id, sig, _label in _FINGERPRINTS: + k_cache[block_id].fill_(sig) + v_cache[block_id].fill_(sig) + + return k_cache, v_cache + + +def _run_pa_fwd_asm(k_cache, v_cache, target_block_id, max_qlen=1): + """Run pa_fwd_asm with a single sequence that contains exactly one block, + that block being `target_block_id`. Returns the attention output value. 
+ + `max_qlen` selects the kernel family: + max_qlen=1 → mtp=0 → pa_bf16_noquant_gqa8_1tg_4w.co (non-MTP decode) + max_qlen=4 → mtp=14→1 → pa_bf16_noquant_gqa8_1tg_4w_mtp_msk1.co (MTP) + """ + NUM_PAGES = 16 + block_tables = torch.full( + (1, NUM_PAGES), target_block_id, dtype=torch.int32, device="cuda" + ) + context_lens = torch.full( + (1,), BLOCK_SIZE * NUM_PAGES, dtype=torch.int32, device="cuda" + ) + cu_seqlens_q = torch.tensor([0, max_qlen], dtype=torch.int32, device="cuda") + + # Query: arbitrary nonzero values — softmax will normalize, V is constant. + query = torch.ones( + max_qlen, NUM_Q_HEADS, HEAD_DIM, dtype=torch.bfloat16, device="cuda" + ) + + out = aiter.pa_fwd_asm( + query, + k_cache, + v_cache, + block_tables, + context_lens, + block_tables.stride(0), + max_qlen=max_qlen, + K_QScale=None, + V_QScale=None, + out_=None, + qo_indptr=cu_seqlens_q, + high_precision=0, + ) + # Output shape: [max_qlen, num_q_heads, head_dim] — all elements should + # equal the fingerprint of target_block_id (because V is constant in + # that block and softmax weights sum to 1). + return out.float().mean().item() + + +@pytest.mark.parametrize( + "block_id,expected_sig,label", + _FINGERPRINTS, +) +@pytest.mark.parametrize( + "max_qlen,kernel_label", + [ + (1, "qlen1_non_MTP_kernel"), + (4, "qlen4_MTP_kernel"), + ], +) +def test_pa_fwd_asm_block_id_no_truncation( + block_id, expected_sig, label, max_qlen, kernel_label +): + """Output for a single-block sequence must match that block's fingerprint + regardless of whether block_id is below or above 65,535. Run for both + qlen=1 (non-MTP decode kernel) and qlen=4 (MTP kernel).""" + k_cache, v_cache = _build_kv_cache() + actual = _run_pa_fwd_asm(k_cache, v_cache, block_id, max_qlen=max_qlen) + + msg = ( + f"[{kernel_label}/{label}] block_id={block_id} max_qlen={max_qlen}: " + f"expected output ≈ {expected_sig}, got {actual:.6f}. 
" + ) + if block_id >= 65_536: + wrap = block_id - 65_536 + msg += ( + f"If the kernel narrows block_id to 16 bits, the high block_id " + f"would wrap to block {wrap} (unfilled, = 0), so output collapses " + f"toward ~0. Observed value of ~0 here is the bug signature." + ) + assert actual == pytest.approx(expected_sig, abs=1e-2), msg + + +if __name__ == "__main__": + # Standalone runner for quick repro without pytest infrastructure. + print( + f"Allocating KV pool: {NUM_BLOCKS} blocks × bf16 " + f"× {NUM_KV_HEADS} kv_head × {HEAD_DIM} head_dim × {BLOCK_SIZE} block_size" + ) + k_cache, v_cache = _build_kv_cache() + print(f" K cache {tuple(k_cache.shape)} = {k_cache.numel() * 2 / 1e9:.2f} GB") + print(f" V cache {tuple(v_cache.shape)} = {v_cache.numel() * 2 / 1e9:.2f} GB") + print() + + for max_qlen, kernel_label in [(1, "qlen1_non_MTP"), (4, "qlen4_MTP")]: + print(f"=== {kernel_label} (max_qlen={max_qlen}) ===") + for block_id, expected, label in _FINGERPRINTS: + actual = _run_pa_fwd_asm(k_cache, v_cache, block_id, max_qlen=max_qlen) + status = "OK" if abs(actual - expected) < 1e-2 else "BUG" + print( + f"[{status}] block_id={block_id:>7d} expected={expected:.4f} " + f"actual={actual:.4f} Δ={actual - expected:+.4f} ({label})" + ) + if status == "BUG" and block_id >= 65_536: + wrap = block_id & 0xFFFF + print( + f" → if block_id is narrowed to 16 bits, " + f"reads block {wrap} instead (unfilled = 0)." + ) + print() + + # # Cross-window test: pages within the same wave have block_ids + # # spanning different 65536 windows. 
+ # CROSS_SIG = 0.25 + # cross_cases = [ + # ([0, 120, 65536, 65700], "mixed_low_high"), + # ([65534, 65535, 65536, 65537], "boundary_span"), + # ([100, 200, 65536, 67000], "wide_span"), + # ([65536, 65536, 65536, 65536], "all_high_same"), + # ] + # print("=== cross_window_in_wave (block_ids in different 65536 windows) ===") + # for blocks, label in cross_cases: + # for bid in blocks: + # k_cache[bid].fill_(CROSS_SIG) + # v_cache[bid].fill_(CROSS_SIG) + # bt = (blocks * 4)[:16] # repeat to fill 16 entries + # block_tables = torch.tensor([bt], dtype=torch.int32, device="cuda") + # context_lens = torch.full( + # (1,), BLOCK_SIZE * 16, dtype=torch.int32, device="cuda" + # ) + # cu_seqlens_q = torch.tensor([0, 1], dtype=torch.int32, device="cuda") + # query = torch.ones( + # 1, NUM_Q_HEADS, HEAD_DIM, dtype=torch.bfloat16, device="cuda" + # ) + # out = aiter.pa_fwd_asm( + # query, k_cache, v_cache, block_tables, context_lens, + # block_tables.stride(0), max_qlen=1, + # K_QScale=None, V_QScale=None, out_=None, + # qo_indptr=cu_seqlens_q, high_precision=0, + # ) + # actual = out.float().mean().item() + # ok = "OK" if abs(actual - CROSS_SIG) < 0.01 else "FAIL" + # print(f" [{ok}] {label:20s} blocks={blocks} actual={actual:.4f} (expect {CROSS_SIG:.4f})") + # print() \ No newline at end of file From f68a80c60432b17ca48ba10f6568969096228bbf Mon Sep 17 00:00:00 2001 From: "Fang.Che" Date: Fri, 15 May 2026 04:55:59 +0000 Subject: [PATCH 2/6] perf(pa): update .co binaries with optimized min-anchor rebase Rebuild all 36 PA kernel .co files from updated SP3 sources that use min-based anchor rebase instead of the previous direct offset approach. Performance (test_pa.py, bf16, batch=128): - ctx_len=128: 12.17-12.22 us (vs 12.51-12.58 in v1, vs 11.73-11.78 baseline) - ctx_len=257: 18.10-18.17 us (vs 18.52-18.82 in v1) - ctx_len=4097: 161.16-162.33 us (vs 160.27-160.49 baseline, <1% delta) Short-context regression reduced from ~7-9% to ~3-5% vs baseline. 
Long-context effectively neutral (<1%). --- hsa/gfx950/pa/pa_bf16_noquant_gqa16_1tg_4w.co | Bin 24144 -> 22808 bytes hsa/gfx950/pa/pa_bf16_noquant_gqa8_1tg_4w.co | Bin 24584 -> 23240 bytes .../pa_bf16_noquant_gqa8_1tg_4w_mtp_msk0.co | Bin 73616 -> 69640 bytes .../pa_bf16_noquant_gqa8_1tg_4w_mtp_msk1.co | Bin 82800 -> 78816 bytes ..._bf16_pertokenFp8_gqa16_1tg_4w_mtp_msk0.co | Bin 165808 -> 163176 bytes ..._bf16_pertokenFp8_gqa16_1tg_4w_mtp_msk1.co | Bin 195656 -> 193032 bytes .../pa/pa_bf16_pertokenFp8_gqa16_2tg_4w.co | Bin 25008 -> 24032 bytes ...a_bf16_pertokenFp8_gqa8_1tg_4w_mtp_msk0.co | Bin 75168 -> 72280 bytes ...a_bf16_pertokenFp8_gqa8_1tg_4w_mtp_msk1.co | Bin 84312 -> 81424 bytes .../pa/pa_bf16_pertokenFp8_gqa8_2tg_4w.co | Bin 21816 -> 20840 bytes .../pa/pa_bf16_pertokenFp8_gqa8_2tg_4w_hp.co | Bin 24008 -> 23032 bytes .../pa/pa_bf16_pertokenFp8_gqa8_2tg_4w_uhp.co | Bin 27768 -> 26792 bytes ...bf16_pertokenInt8_gqa16_1tg_4w_mtp_msk0.co | Bin 178128 -> 175496 bytes ...bf16_pertokenInt8_gqa16_1tg_4w_mtp_msk1.co | Bin 207984 -> 205360 bytes .../pa/pa_bf16_pertokenInt8_gqa16_2tg_4w.co | Bin 26240 -> 25264 bytes ..._bf16_pertokenInt8_gqa8_1tg_4w_mtp_msk0.co | Bin 79568 -> 76680 bytes ..._bf16_pertokenInt8_gqa8_1tg_4w_mtp_msk1.co | Bin 88712 -> 85824 bytes .../pa/pa_bf16_pertokenInt8_gqa8_2tg_4w.co | Bin 22464 -> 21488 bytes hsa/gfx950/pa/pa_fp16_noquant_gqa16_1tg_4w.co | Bin 21856 -> 20520 bytes hsa/gfx950/pa/pa_fp16_noquant_gqa8_1tg_4w.co | Bin 23264 -> 21920 bytes .../pa_fp16_noquant_gqa8_1tg_4w_mtp_msk0.co | Bin 65080 -> 61104 bytes .../pa_fp16_noquant_gqa8_1tg_4w_mtp_msk1.co | Bin 74264 -> 70280 bytes ..._fp16_pertokenFp8_gqa16_1tg_4w_mtp_msk0.co | Bin 164368 -> 161736 bytes ..._fp16_pertokenFp8_gqa16_1tg_4w_mtp_msk1.co | Bin 194216 -> 191592 bytes .../pa/pa_fp16_pertokenFp8_gqa16_2tg_4w.co | Bin 24864 -> 23888 bytes ...a_fp16_pertokenFp8_gqa8_1tg_4w_mtp_msk0.co | Bin 74328 -> 71440 bytes ...a_fp16_pertokenFp8_gqa8_1tg_4w_mtp_msk1.co | Bin 83472 -> 80584 
bytes .../pa/pa_fp16_pertokenFp8_gqa8_2tg_4w.co | Bin 21760 -> 20784 bytes .../pa/pa_fp16_pertokenFp8_gqa8_2tg_4w_hp.co | Bin 23952 -> 22976 bytes .../pa/pa_fp16_pertokenFp8_gqa8_2tg_4w_uhp.co | Bin 27712 -> 26736 bytes ...fp16_pertokenInt8_gqa16_1tg_4w_mtp_msk0.co | Bin 176688 -> 174056 bytes ...fp16_pertokenInt8_gqa16_1tg_4w_mtp_msk1.co | Bin 206544 -> 203920 bytes .../pa/pa_fp16_pertokenInt8_gqa16_2tg_4w.co | Bin 26096 -> 25120 bytes ..._fp16_pertokenInt8_gqa8_1tg_4w_mtp_msk0.co | Bin 78728 -> 75840 bytes ..._fp16_pertokenInt8_gqa8_1tg_4w_mtp_msk1.co | Bin 87872 -> 84984 bytes .../pa/pa_fp16_pertokenInt8_gqa8_2tg_4w.co | Bin 22408 -> 21432 bytes op_tests/test_pa_block_id_truncation.py | 109 ++++++++++++------ 37 files changed, 76 insertions(+), 33 deletions(-) diff --git a/hsa/gfx950/pa/pa_bf16_noquant_gqa16_1tg_4w.co b/hsa/gfx950/pa/pa_bf16_noquant_gqa16_1tg_4w.co index f5bb51990891ce4fb3515ca911ab84c3fda30d79..77f8a99fbae069e2d5cbfd69108d9f2afa2db033 100755 GIT binary patch delta 788 zcmcbxhjGRx#t9lsGeRe7EvxsjX8;2j%^(2bGXg0#APxwE@B@la=>iCsVFNFe0oBI@ zj@OFij2+SDL&?JYcetga?oobi7AYeXGno$6((~@8vs=(Ozx7_0I8b%Mw)9flZ+H68)xf&_9mA7O`BC^ z{z>LKpl8J*MplEVY>o zGc|)T9bGJ7OgCdg7}LeTVsmbkGZT}K{ba)!xyct|1i0XdGHCLv7&D;*&@_4gnr0n% nCmY3@F&zk+oE2-wR1iFQ6_Bb3nfxl&j`Kwb#Jy{PhRFi}o8_v0 delta 2126 zcmbQSiSfc7#t9ls7osL=EvxTvVE_Xd%^(2bGXg0#Anpi<@H@&;=>iCsp@IX-fa+rc za*5Fg)j!#vQBaAn9h;jNvxH>+GfdhCG``_K!!$7M(qz*oWFyodW+T?H`HS!$CZGQb zK57gM|NrwdH0)Pnt*EnM*tO4w$*0bR$*)d>q3x2IKNEu*Lo1kusR7X%tW|Y>HUbU) zwt`IoHX;pywxUgwKZq&mnO;;Y1={H2Bc;w@dQlz3_Vbg{0J1ee?0^6%O(0tn#10CQ znk*n5M5>kqaW5spwoDdKR^mdk$Us?Q^8@i3MxXymRP=F-BpE(dnmkX^fOKCgO@1e- zL7dNJCL2hxA^ToOM}pUL0@waWoUQxWn^^XL*c>VKpM507z6xXc=1^sjpGRi=Yw!yH zZrTsbbzuBs<;wV{0^FWU8wjRH1jQ>;tZ)^hBlZ45zb&1J3~wiTnNlq3?i907&^)) zTZL?MbvANj$Vp5}&51WKH-(p>PWI(+h`a5JGF(3<-Pv?k}^m|Pt$J6SwJk`pGh9<0|VLWOBX#N?s~ cJI)P}5VsdV4dtGE9jII(3L?4&Cd$YF00$}(m;e9( diff --git a/hsa/gfx950/pa/pa_bf16_noquant_gqa8_1tg_4w.co b/hsa/gfx950/pa/pa_bf16_noquant_gqa8_1tg_4w.co 
index 03485819916f7f6a1dd3b83e7356108bf0336897..e6d915f04dbf049ba9c766a3cd48ad8a7e94e1b1 100644 GIT binary patch delta 843 zcmeA;z<6RS;{**RkMN0F%jzFEFn|G!W)Ohz8G#fV5I+cp@E;VT(ghGMg8~nf0oBI@ zj?jFipN=sxqng zmpPL+Nr+93G3R0A-)yJ~REFI|HQvX+oAxs>G=Q<}=1@H%OnkpN(;TOXN~{lmx9oqs zxzS;^#Aby^2gdq2fzTj^Qjid5XSia8DlQJ@vNMEep^7U(#ZREA*M*9Ec%!PffQsiN zf*HqPGP&Ia)B}BqpWi#2c8pnZlTc zE^wx)n-NUZ(Gbpbvvh`wy23?W98D$%#As{|h>>7odf+hmqM*p+x;QB=csdQ9yeQ60 zD4+!5q<|0zy@7jjcAV^F_IOE7n9zE#UfXzOCWg?-N%3}^3Skhn1yDn|CLaeXp8yrz J3KL~y005Wx&XpucmPra0#JQS zKqfKzp!z5KGYTpZwqtV>W0sK2e}+l>fW|lcXP5@2U7BqAglvQw#B9VGHh&TR!sPQ` z!AFgO;s1YrhKBuWtQB=O47>K(F!|KEF!|MKFtlA#^JijEV`v4_Ff|}rgSD#8&qkoZ z-&U|Gz(%AY&{ni*a*v{tp6NxkQlO1KK2quorWe&gY(GCK4Io&?2{vX*X zVvd#uZ|U!*{lHua#@(AQYK`Pnv12o*@o1`;qou-H{JUj;>1J2!*%B#94fA$gDS2B7570?uL~7#2|!hE0To}724*TC=XHE?w^o-7cqu~{Hof{97SZSqAyk;!(EQe5zcK;-14NHd`c(00KD zXgi^RZL)Tx?BwH-lAJK1^NUBL%dM{HI!>|dz2*S1*qs&m?$Fy E0N->ajQ{`u diff --git a/hsa/gfx950/pa/pa_bf16_noquant_gqa8_1tg_4w_mtp_msk0.co b/hsa/gfx950/pa/pa_bf16_noquant_gqa8_1tg_4w_mtp_msk0.co index d8be7074558eb9296b67d3b2373565336e01547b..c3f4d68d84bda58b8449c11a5fa8c3cfefa5fe39 100755 GIT binary patch delta 4530 zcmdUzUuaWT9LLYOi@9luYwum%I$RU!Vw(o-ZvL30Sfoj65GJ(~=eBNcb!;!CV=ait zD9Nl1^kI{1*Y%1{7PNEFr^*2xBDodz$H4|e_RzsPHylHKQRc%!b!O+D{LVG^zUbr4 zg`4v`pL2fq_sx%!U(UHj-8EZs6@$X8_mj}s{LZ_GUA#rDwLYeeo8H52!=i1MwVhxT zp~WUJz)zsv$MFMybr+U8A02~C<2l)L8GTy5jONN}tnK*vYb^!+2)b16-;4BNe?gxC zd%<5&ubo;U->f@^ldHl|P8!D5ZdAD1{hokUh2fmE2Uml%A7uUAIq6kg?Vv?gNdoO`~knDBCp3hQ_JQ^#Ns@ zM%kuOHZg*Qvb%$v>jl-$UBfKDx5>AOM#yhq&7b#X^y_40Ky4R} zY$Cs(>cZ;8h=-I$`klUz8>(2ip;%1QVo|OA@EpE`2WW|)Wlb5McQ-FozWy2CSFCZFYCO*pz32iD|iu{ca7Xa9XlXz3thpTFsH{)XZsta|2mIDZMVO-1wH zbQ5Q6D*h44gV7;O4t-Tz>+l>Ke|SRW337BJ)vJ6BdL6xzgHKA8huezVA}JrB+ACKY zBiHWx+~hLOMi-XX;Q^daz5@p(XuPZyk>|9PKLdQ~ zjFoo+ue@aCn}I7^tvm^QF~p3#IhuiifX!hD`236JRA#&e{K|eSuYDg8{Z(@`2j7SJ zH)@W+Y5WghVDRbMQIdNUfIoMXlUiE@-Zo|BCxLs%to$r+k8S=rgXfWmT5SO?fy03> 
zm|=c=5B%pj6EWi|@b)jQd$4M~=!<-$+v@$-VP^R718fqh!554Sd(FHPK9$cN+761+rNf|n`-+N2LBgeAAO Z;KSR52~oSn+GZ`}r~N;gwJ|^L`XA1+8w>yd delta 8579 zcmeI2UuauZ9LLYOCQXyIY3}Wxb+OYJuO_M8nxsjZvTST^TJjWQ$yTS`rbDsA3_EP6 zjGBbl=s*}X8B5t-S_{L$#-?ZNAEmeW;08XZeX)W{5g$Yl1RpwAFrItwId{2{Mjyoc zas%!8<@5Xf?k~Ate)s-PlGAgB8No2)W^5fFc3#Y^G7yLlx&oW@6o|A=ops<7wpDlxP(i+W0&TtVf+ZJid79B0K9Ue9> zybBhWA_su*(-V1Nn%H}sd7)@4e21>Jo`spRGyXIH<#H3q3NGW@=?J*96k+1&D3eHg zzv_3duWJ-~Tv0+wAM__b>izqWYXF6AC) zzyo|dHVMma>~o90M8fU{`QVCMat^22=V94{ogUFSJZ$&iq8`bYOtYh~+=+diqA!`W z7adOL;Z14`|E zYKXv9iBPz`nFd@94Y(Q_aMc zOO!>weMq{n+Vly$g&HTZ7Rv#>Md+}gO^fixH#eq*Ni@H!*lyU`ofD9)z(X{;KdR4I zR2glWM4z!LM`>-kL!PmIG|MyAZ+OPS7p>IsQ6rjj#PkNywn2F2EqU5PqbJ9VXz`e+ zh`uAya@=aJ7`J}K<5pSUk!ai_@O?aKi8APope~cySpx-sLyKA_u z3+oR*y*gBnc0LowJM#;%EhzPQjM1A!<0koX16e2I@Y6458udoe@*cw1&96@j3+t%x z?VYZhi8{C!W+sl2uQbWhAgv~lx~%3~317RS=1#%`BWk{b@aucjJWTjhmt&BD}+0eYW@b{{Ez^Wyubz^>^rh}R;n`VAL6HI$kGn+;b=niJupr zGWB3JSn^xojf}5v5Tsr{`fBQk-ya-M%>F)NdZS(%;0+B%DfI`3s2LjEN6r4;Ffn~T zube?Wb=P2P@{FKSU4F7wxC!P;U9lo#`C0cjHZ`vUY0gt`Yllc(YMiH)`wd5&HqWDP_El zmMtM&rjTb$Aw5b+jgJcHHHAEH3V9Z9%JUb;vBdY4X?1d@IyqCFoT*Mut#i0}>!zHk zPR>*(r`Gv59b&4JGu6o{bxObXSKAh?sT@0I$<a;+y)l+UXjFSYC{MRQ)rEV8wf_DlKc0LF%S-jjnV27kGf}|jdAOyv z^rwwv?zVg*C{>j#iDS~;ibnap{oVhMjQ?(ASZ~?%k#R`25Bz8?hCwl-3<~8SkEdE$ zdGyg@ILOW&(t3Vm43)|+Tz8a6{jYDAgM)*ma`my|n9yy?1c4(`;B({jdNL=;f4=#C zC3D6hPkwO1Qs4($xyxA8f29k!v+5nj-d6R8kGW%!Pu6v)17i_O znagEz`8R_*i{L(EM_?^KG_G2ep-~+D-nAj`O)B$EZT0H!#n9^7ZN9Z+F0eKulwg}U z6ZGLAPWT7mfe6HQO|nS5Nb^eIvzKV@0RGq0G;apZ@20sQc(zGX5{5MbhFViV2k>+I z3|CFr5B%!^n(NoYqC0w^VSEMrSM)&O%Kk^eaKWcP{wym0mJWzRz{+pW57SYN&8O#Pu`;W z6mWk+@3~<;3x@JF18K@7;PGZnF^pdTpQ@tySHLqY&A$hJME~wkFUfSB7^>-jpMkeO zMDtbP8G+{gjFOQlL)o_U!XErtf)`^bKJenfc%LBnqlOyeLFL0?;=p_P2%)a1p?diM zp&`LgV_~;JV+L&vhk=S7K~)p#&aOy^c({CSLv=+ARS38sPI2%O)g$_d8VMQH&l|p> zFmVuEQ6pJQBt^Jd6J8Ng2s|$k)$Jiy=k^OmQ=)laF%TlY9U6dPxyNNc0dI zijkd_yqcO&8y^$Mc0w#lvbjZ`G~O*nND5I15MOtzmt37YU^Lzx2#|u^QBPiNC0~c= 
zRyxtw+u<2_>BzyulFi;Q^U)Kyj$yJD>4gw(v1Q3Wat&!)7<;*zcKrm^ICZ(qrn?l* e)Hu^`gz;Wx-kH9p%F*VuEuzTP=Jb;hT>metVt@Vs delta 9134 zcmeI2Z){Ul6u|F$wro_C^*Nc2sk3x+og4Q3X*?bV8OGWid#W5E3$s^}g40U%8#@C-r^s zE$Qy*@0@$iz3=Swp1s$ZE0(#v*8Pp_++zotj2la*JPELWFDZ%$!%sXTg18zZBEaW256Rs>e0GWFF5 zSGw)#+^gu!RA2+jedBAn>gcwPB-R9~QjfOTM?Hb+6yJ95=scch?_}~W&8Ldnl7HOZ({OI zSl6WL;_=!{y~FYBA~Cua#3+V%x=e@-B_W0u57AR5#O5*~y6F(g{v%^daws^KL+JHf z-ucfaf-92BRfA5#Cr z+Sq;FX=l)$+|FC0_;V_yz4j?BkC-)5Jf4iH5$2TAFRE!U=HxJ|12-bLI;=0Vnj`Y*e?_Uj+IP^0iWsAO*P>e;FjlUz7=?VoaTeToxQhn z9Df89(&L3kutc2(ZaGXyVcZXV>@}Ld4ZQkAnjZx&4b%KH^=%SWpj3&YGcaMccyToH zHSn8G|gWIE){p68H@r?{Z8Az0es^e%?|>fO){FKYu^XOSqsfS0e&+;^V7hc zjWj<8yt#(v-vLMW7kCee{3B5`(-VFHezAq-bHJNJH1A`vN463LyR)Gd=8GiX+JOeP zJl8+aD|iK27yYdudLkZT;PG`t2o=0hBKkVoi5QaoAo9FNAT%ry>WO-Yt<qUR-Z&%h<+atC5bEo8}Nk6Vu*-Qk*tSr7s7-_c(M&a@{Kt{dPxpM zIYibY`ehPM^2sEI#7pF2iV+Xlvl#K~wt`d?Lu5UY91f%2o|t#bw&DIA8Jpd8a;1l9 zVA+5(8xoi`HvRiDa&|;ueB1_=gpmDVgfTNBCTuKf!X`@g-1T{`y^dt=lJ@!C z=lQ)n-|vy5eVzKLN4n4oIJKv1<;hmB7gFLyR>~mVQUEQ9BHxYo8Rgm#Nu!R_IKXvc z*3;DCdS)|NWc=?LTsyd!q8*^RuMJRNUt>IJ*I#&I@Cyg1@U`N%C}AkyFZghL>K#9G zr1~;9y?6pJbdHfZpPR|l)SA({nwzfu6EfO#TGy%b3z~M7Fvw)2=SnG8Cx77@Gjbqz z|K8hbhR&xM(~G0<1IXN*Um+%NEjj80?tJdr#N=1z+69VglRiJF+L_Gsm&6Qq>^TtM z!{8W&N1n(%YKl;g?pKQvvmJ9jLF}{MSKL`#P6-CJUs&IP;{C;e;{B8m;C^>HRLuQ* zKXZBIS~UzncDxRJ!S!!Cn}CLiv|hzcY`K&GN0xDis}mCz5UW+|H!q3^J*{PyrALHZ zova1i=ypkJ5zWvCwOsutb^za0$>+>y@+WX3&HIv)t7oHHPiSSWJkt8FE_J0bkh#asadR_IL5h`Ct#wY8cZbtw%jes{;81RcmEluV7UxOjQ2YC?E zK&MugCAxO1ZX*Qz!+`}{8N8-e9 zXXS}4hyU-yyliNP#yvAWL7Ylltx8q8dPWyH&P$MVw;>Q1zRy6tXA?5F?YoY*opbn#n^l_zZq|MdSW*y%8&Vl zT}mFpd_!FR5zN!ZmH8(z58hDnbK-3)rJ*jxSsKO#%RNe~f5dz^TggW-Zv8#0B>zi*=EqzQ@J7`x<$!s@4@CZxk`kSirlC4=l{^je#=FWsnV5S{$b5^`YbD;O ziz~2U{_s=ffOVKxzOLj~L=Lr-mtn8$hZkGTYur@*d+%?nueF+N7984K7D7#>B{ob= zg=HoZTFiD57P?$GG@D9gT4p89P+v1nuoSpUNUqZ+_H7j>o2?~s3#WVtOPP~Qwm8bH zxDFky42K2I0tXJQ?$QDhTFp)px=YDqTZxNMo7GW>6C8GznZ$N;2|4i^vzeTy(BvS8 zK$INbQQ#&C_EMXbdAz8BH~Qco0D8S_N&v3bd&w`V-E2hw7N!j2`(gZx%S5x?h{NIC 
g?2Q0i3)>o5{Xw+$HL(?F)n$Dad2HXaT>+T+59)!XUH||9 delta 4994 zcmdT|TWl0n7(VB8FKpe~JM^aAHUt*5WbYRl>TY)_AVdK{k(Sb;L;@tNA`xNPUAEH( z8^`ZnrNw123Fp_k8o6 z|NQ@)f6jOQ%v>AQ9p97PX+$RagPDEdBkK_mo7986#3g{Z)8JqKj^ud%!OT98Gy^M= zG?3M*Vo zF-@2GF!qVKy}gVvG%l4ng$pzqvY7ehdfg3Oo1l%Vgb`*{`3#v}aNufYwB;&CS4=~4 z=bDVw+@ZD4sF~e0dM0n`WzBX(3tItmzhSCOSBDLoA*Y`v`xZEp(WA+I>~2)Rc})Cn zsiMs4^64_;C(_VEKZG=xoR>AN|0qqvxo%}?SUj83RO#;0f|!bVG_{9X00g7DO_;X} za zpbp3nuYt1bUaTH1o6KXEL)lp!vx!0tkO>T~z}4>p8}EP@9>MZw7;9LBXtQBCBrI29(LIS(0hRSRy=f|v+NlYzoaO~&|-OVQ~E(PW7M z#`pyu`A)F-nrl?zYboEn8F^s%zy|Rm-Si2l1#kV4=0r3_;y)5!def~s0kz=ri)l_o zHH62niW$vCc*D~P5KayH$iQFErlG;-3_+=C9_Y>ap+jwgL`Ut=4$4vf&co_y-5SWnCGh{-rAv4wA z6MmgBlfOjx`L_}v_4fEtaiehoHmmLEjE_~omR=#Y<8p!=KAIm1MJsFpv7TH+4}9G zhp-sgY;#HT%znER&T2ACtfkIkAujN(N#1O8&Gm$}neAQztv0u0i&<%LcArl&u-l!| zy}P}BuVmq}nd^uPr`6#g&|&dP>%gp)ux_VIVqJc}i|u^}Ie7eMMCizR_WX7o?u@OqEb{6VxvJ+iv~jM*SOCfArCVEl19^apwjGh M(d5i;k~1Or7v*7zlmGw# diff --git a/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa16_1tg_4w_mtp_msk1.co b/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa16_1tg_4w_mtp_msk1.co index a8e960242360d38a0de9b71a499064e75acf0177..4c2875e3af814bded2a613ee14293af113f913b3 100755 GIT binary patch delta 4503 zcma)=e{2)y8OQH^ZRebjC5D=|bRnc!yM%7g^Z9&tW+{t*(hf!$oq*B^XjUd#MKo2T zGDT4wj6G6TRK^vI7}G4u5E|(Sy~1@}6dNtwK+0N#vK6&#Ni({24Wk}~DhibCemw7+ zzWHaf6y4*`^Yi<>Z%%=;le2m+&)U<7d$0cVfupJ6Va!<{^5lH>R}PtvGV|yBsN+BX zANJYqgmD7IlRPWx|H*@V?NKzl{(tw-)9CaZ?-_3I;2CcJp!R+L&{t30kuB`t1_qnf zaE0qiwvYk))3&TuJ$1jnZf*zG#-6U%#AjCNuRl}k(!4d5tgey#XvV~>6roFxC$4%S0 z!W7mQFZY&d>HME{-{ls!*5B&Wj;%R_O-FLA=~%tUap?W6XI(gQpI$Y!AL+g=8T2KQ zvZs8)?+AQB`}ea84O3t@(l;HtW+}Y0>h+9-@DdN5Z!p{JG3BlI=yksx@ETVZ;YOsj z9rRW2zA>%#`3hUDK7O#>V^;Z2H5$_Xcw(WoA7jH0&`P9N?lkK|nNs<^Ke>@9SOtjI z{pQZUto588cspxLHA*j{H+e1J?be4UD*0D4A0Tv2&t5XUj^0~y@fVUQwcABh8hihQ zWIi|ML6sG`T1V@OYOS`W(Gq(b87sW#n#&mf2(`{O&d*1F9{-Xjf{m^BqoSkOsN0Oa zl}51%-OX1UrD?g3R1>+~C|63O9;RaP39I7cleD6_0k&d)rx{Uys+1{MRml$Gv)LWw zz1rgCyNvcU>ZF@Eau3QI`;H(FZ>cx(eW>&`{8qx0e-#-$7tnQj)YcMKmRH0Y9sTuw 
zqQAY&QhxYe?5m&?R(WxAnSDH#X8%3bF{}71A5KhbUXTk2c%ga1-N-18p=)ld%GL$O zfi+md`05Q>%T)Ug%GI63Jw{6fhEuo7K3r2jI*}<@MTz8Vce0YV&9WsokD(r9ys`s# zdaT=ST><|bqwx@Ky8~@A_MO04D&OI5-spM(x44Ynqqvec@*m*#D!29aF?^rkW|X@0 zJiLnFQE5MU?y}o@hfRO(vQ3`5bdlxtz(>|i$6D4R;B`@!tH3vX!#3tnZwwqES;Wt>m0C=Xz@&|zTpJRE5aC3s`{hKk(;CSgDY{GKj z!y_zT1N=v$EPoPsX@cc#zz=;+xzm3eI4;h(^l9K7yKIAI?FPQ;yDUEdyw1n+SAiGj zvi#S`?$2{qYFWo`AmKm*%ZGr+XEc5n_$rBwe;;_)LoELYcw;KbI!fRut!DX6;598Q zFT>VSc<$^|ET0X$=Q)&QcbPht6S z;Jtrm8=MAy>>N;2gGv?9Y9`Tas-2gTTFyvU~~fH`cN|;j;Df zT-#>Wu?i9nY-RZqz~kFk{zKp;=Ms=Px#xgy`WqYH1w3wXZUm~ zsBmphwr+0Q7*v%om~vbpRER4fpn?)pi75uC8I%Mc<8I%*$42U!~Hl3CXBuK0nq_I+htkvAG(>T?(F%+h=Ns+Kb@}d%% zCm4$op}|Cw3@-&lGP{{Zrj(=*sUXQ=vOr0Vk^M-~B$-D_NJ?3`1jw=s;!+!BHAy0(!64BQ4%5A=A%XNz zganyV5$TaDQk2{~MW%N`2@7OZW(87LiP2lGBw`YmaxT9rB*+kIFhF*t$|~8Fs>aFL zsZoIpp~gibDJ+CZyReubCld|@NawIZFG5tHzdkJyDMBM6D!EJHxJWKgI7xr2B4UWl z6A2~Blo2&TP;s8S_GnnD19a2~rMQujYU6m^n!$o3hIw6`?X~`E6V42%N7g0ubAJ2Qv zn}4=T{=!mpkAKhS`@HXSXUDsz%f@HRGEOu;yYH@}ar=2hDKC7b?CM*JmMRGqxPSY!zbpbgX%Ss3hb>4R3&Z; z+?L2TJI>T>boA%@sFSAxk5c)y{{DO#nk!W96oyumCjrZ{HH2+90 zD3PcMQv5qXMNXw^Lb4oEZ7^k9YRi}9%lh+7%ACKL z-CX+>gHf`h$;RwV=SO0??lY4?-&`h1Pl>IoIxOg)D0jS`b{Y&YJes7%3#$z$7AL;i zTw?pu^;BN&p8EHpC6#TjU2QNwEdD<6i07c$ps%)gyC5&#_MgQp{>85KXmP90gaRdv ztFEW>N(b&w2Ywd#8kM(~^yly2Ez8Nk&OnmNmzczvmeLY~tCHj`_!p?e zl&d&I?-92?(rYmq$$mtRB-*!*n@rP9sqD(?JAc!0^%fnpHI{6qv)uYXwKix+t=TQL zMi0Ak*u($x0B?VOVFaL?^C1qI=KTYDnxspqZ>C}OXrnV;K@5K z-+t#$-g_2lZ#QkF#jexcMi;5fU#2uWdcrIY{jHqI4ZJ|#C^5@7IXS32iE_{bQ^teJ zqo@aMt}qSp>Nt+QMUT_s#Ag}$)_nR!80)|5wSnxhEpi~GbMMn1(NYA_hEGKCJW81| zy6%hg?{dGcr;VOSc^VURuF#4S(qx12BBYCtppHt(>7`dNwq1qdd_HSi%6v+b>*)PP z&o0@J?^cRyUUJg8hI)Dc1!os?r&6AfXen*k_%!WcR`nOsd8IXYUfU|kc^#3S>!vM? 
z(dU~ysXevyi1dX+i*S}Fa=gj3Ife)Er0Le2L4lDvCg~N(I_%UYaf4$Clh|tX<4W|PGmR!%G>qc)e%H#1$l~l+h zGb6P{P@73={R1jzUQ7%lME6O_U!i`(gDlUY7W2YNY2r_)nU44T7{0l}rChx-l-;U) z_btAbt0F@kxJdFw;Eqj2TSdAX_)wVS0`P`4s!*kCH9LkXOY^y$|4mR_E!LuvcY#N=l69{Sfe-D~kt%!)JV7L&sDA<9{XA)ZQ+XSw7;1W$ zR4h|>o}n(hLUIRi;RMN70&n;|$?pQ*bB^RZj&Gvn{PSMTpxAw79+brgfDitW zW}4*NfL|$)yc78N=Q`K&{{$471&2Nb{Omqe(7m1m{_#U39|S&IP4Z#j;DKu-KMj2FCdtnNk1(<@fBv-pjmmdsn`)s~`WJ9;E-2_9z^CpY?Pq~=+Meoh zOeRHSsEhX&?bZAhpcoK}3dQtx;OQ92n{c4@B)=EO*+}xWz=drjkDAo=GgQNqq@ooZ zdUlX}6L4peU0JL?Z~AbMk79z<8vrHg_`z|_O(BI^rdDtL5TuQ!OPpVtk-9Hws0twxB;ncZ%; z2OG!qbZ(C;1XM1K1D9+|_v2j0+#87F#9gwyejiM~eh;X{?LK^~hoMWq4U^D|c9HcD%El4~6hLxG;+w#D+t- z2sWzwaV`ND%y~k1zhpODJI6<{8z*phJGh_^FQ1D9u>tRL@o`uv?_se8&+B`~`@*=H zyg!P!(HqhC%gcH3n)z@HFN2Q-@iM#~4oC3vetg!v0uOHSc5g6>dllltxIzAqAEy(1 z95(QIVz`+;PQcN8-Y71@=Xc|Dz7RVv0tqQrkoK!J`9!JQ2KKK0sy2pxtEVEzA{jz|(RSEd7Y>vHZ1`D&%EYB!QDN;7&>HnMgB{DxVDH1%KYp1&Ia diff --git a/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa16_2tg_4w.co b/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa16_2tg_4w.co index a46abd077135fff438b88048c3d1c0d2b54f0910..048095868df2975ab02539a9d420ed1fedf8008c 100755 GIT binary patch delta 748 zcmdmRnDN1G#t9ls2~iWZmem`$GJpY$W)Ohz8G#fV5F3O;_y*;ubOD6RaDfBLfa+rc za*5Fg)j!#vQBaAn9h;jNy97B7GECZckYU=s%^JcT%##~bl_ptSYh)! 
zaV}`g5DkFd1tuVCoeKY5QnNR`55Jp;SVEHcNKHYXV;u}+RrxxkUg1XThw zX!9ji6=tw0>ujA^CwJ&w5wZW>w4Z^Y0gMl_G@+QbSvy;a*R>rE$ zKYa~ZKzb*yvz@kCAlQJjJ|X}byikec{AZZ74`_VDe}-vb+NH^+Psm27LCi+1VY7&E2eZ$A1s^pA zhX4Qh85;Jhu~yXCFzniA!{k%v!sJ({!O(U|&7X-ujiD7x!_%+_@sUzzFukY_V*B|?X#m+8Aa+22lqQg^31SBYNljkh zqB!}km?7!fls4;$b1}ylbG7bgZ(`Xm#oDsJf}v;seG&1aO+g#{i$U6Q!8*tX22!<7!5Sj-}u`}e@ql$||#aXmb#g(Aq z325qdq2eMLsOl}C;(ubn3}LwJ0uYOdgMp!XGE?L>D`#^fhMdHt)SP$&b5kc6)5ydS z#xyszfHDmYElgpeu5NCV3k7vI%SS0PF)?^fJ{T=GIUz=X3tm-3Pp*nFlP!SO90kyt zWC9D^9SjVU4F!cKKZucFDu|x^E5?pVA!f2wtR0g;?BuFgJI)4(!x>fr&6WoMqco~6 diff --git a/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa8_1tg_4w_mtp_msk0.co b/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa8_1tg_4w_mtp_msk0.co index f638a1b9250dcdeaf286382f068159c1944f614e..3ef7dfd94bb136f96b58d667f3e864648641dcd0 100755 GIT binary patch delta 1873 zcma)-e`p(J7{}lD&80adtw~6=&`{efE2DNLznf&)XqqPNM6%fyVQ#Q>t}r#jA{Awf zC5=fbqtLonygxQHbhZ>MGJUr;O6}P{HVYf2h?5~~jNni~>cGMz>w@nk?_F~JM+3+6 zyr1WJzwdqTz1*9<#ofKeP47W>pZ}A_N27(#fc9zY4HOvDdK61mE5bA z&ODBL^!#&0wKR!&ZHUSSv@I>$X;G5xO$Vof2Z6Y*$%wrZG#?9&2giW;kUZUg9g%$3 zeGWb(tC8yn&-fO}v4bDiaJ`O{NG62_y7a3Pu*!Iw#xV6v+4LHxGEHx@{Y*i+%?_eJ zs;}YSKs#=2b7%_@)9`^B^S&n)w>LPITbqU{1hx44(GC96Q?&|?c?;yk`vpjq-qPZ& z^BY$Gle8!)%0>-Zwb5~ei~GOC?PnH9*Vuh&Z#u}i^i?e!CwnjWIcysKnZ&2eJnC;I zDWO$UWw%K~e*Lro84elCp6B_BXPmbbN%rPkS#V+Ym0Es4ek5$8$vPR;kaw5P*P4$$ zu2`Pst1Qpl_`Ga+VYWd}mhZSB{{=NvdY$C=CFWfxm#@tsDizg}tG~IS^~mG2C~3;3 z*XXWP?{1!>rZ14`KRURQVasN=4(7>s+uxK7FP%ehurXmf#{O?C=u1%t{`pMJUtk<$ z)!fSX=*w!pmvOL9&3%j~Un(UDr5a|1t$>0`NsRH_5j8Ilb->oI=BHSHS4;T=BUj&H zJUby1Dbn@Nvclwh)dc`DjK@!^`84Cn9sQRW|MRlCe~$5=E~@!g(l7yZfZJ3TmPL>9#hL>hh)CqY$t`uPMilxSV|C)d?xeQO+oQYl8>S WX_HWp;v=&V*iGY!dg1JD*!nL>lILdt delta 4801 zcmdT|Z)h839Dd)+-I|n4+t5F0Y)czy=%1`!k|xdSMf!gXT!oO;sh#a)6=g#sR4lV3 zw=9`OYJ2u{k*Y9gokKQ_ech~Ddi^j^*u-^A=!ZFiPU#03tsAZ^*m!q&ul4#xj}FWN z$?tie-|u(tz4s(f-n+#w$deaIu9~Q@zFm7N%o!k%UP=Uc@(@VuHi<9Ac}5+BnKBU6 zVSxLH|Js*nUOkOJ_9#7l+HUQG;2LUV8P=g@`d_Bhy{!lv+2-Sh(S8DbW 
zpa#U`sMJWYjye*mXY$xbeEAWYDPSGG!U%t@*jn#isCiX9DZk%R3)~B}5)K3`bt0~l zaCf)GE@Hcc_wBRroZrfqn`y0Wi7FZmnOBLPL;9GI6Cni;85IV>h>)(nhwBOYwXO$o zemg{$OKYLl6*c>2P@+6a1}p?#$^Wkm3O(FFXj}+NR5eq5-=`visIdHYk&&#BCegnI zY3O=Unj3#*k2Gse?b6V0hN`@niVBiEyQ0h(vl|G%>0-jJVG$o?jx$Gru!7!dI;UCn zW?zzoE2y$nb1R>ynn5ES?`$KhToGZKGegAe8HV?>GtKhL#0!!e0q~}YFuz}ECxVPKP- zAM&TKvuI$#ruDB18`+dG5K{JxS+LW3_LHbqM;5({-QGvt+3>8CJ}W$HGSR;^b@Jv# zby#tqBJb$4K%KQq+SOq%4{fj1-H8g4LW)f#4QiXZiOQ~jrPW?UXKJ8$E=3~a(t8GY z6MeLz`54%udPu~jR*S;7d+~iM4n{fVfau3+z8Lc(n|LecZ|qUm@4&p-r{=wwZ??-I zyq;~t3FD@QF3b<_UoVxxQ<%T7SIr;9{IET9qpoCM#{8u*g^{8-{}G(%d;OmY0AiS5 zdS1YQ~=Jj&tM(zscZ(rTu zB=i&J2P@Q9VhQu}MhNO(IxKE_r z7&>Wphr+hEwPEOTH?}GTZC>RhoG!02hqJ9wS&^%A{Rpm>My0la?q~$PPj|Twyg1O$ zH<|2Tju$~2p%b4L#lJDb9l4zHSg17qhZ%ZIz=tnk8IC%st}Om!2`n)HOnw%B#RLoE nqb1NvB+BB`CGatW7mF3lJ7{|}K$xX6MS0^dfqHBFC;xf6pxrxT5qA}6Bb*q!&y5Sg#qhrVxiZEB* z5>qYirCZi3%V4Dx2Ub`qbjKR|f_M61>z5Xhve^$U;^-K|GI1@0E|uMv=Omsw*aG1^ z=lA^2|D5N^Np6--^6|7eUdhGlzO0&vjPAlnd(jc)=~JXpi^fIBx3go|kfHuu2w)xe zKi9!}`2c>p^uPBYF+81bn?omO=g?QP@M|D96y%S$_vohDt`1VThw8#Jl}6 zCr1ZL<;4L(p53uq9T8xD^hNlMwFIOH6HRG_YWcEE_t-~B1cvl z?wIjiG8ekYsYOkb&ek!DDdCIBHQOu5j<)#LvYltwxFW4ogg#MP<96lnlST`sc%}vF zY1YDvCbHi54lhgH3$|Na_Ox34WMOc_q<7Le7zMo0> za~sd4IdW9avuK`4e+f48WcBoniMv%nCdTVbTdsohtc}n6+hO$KBm3q)oa^V+7QD_U z8unmwa``FTViT2XmB%X>f{4H&;oH2WTKN1PIERQEp$iM9_mClDF|9_?nivEGU*53~Fm z&<7v$nE*a|gwbeR>|rdaL(E?i={@l>&*p7{(nKL09MaNjs`ChGxSej|hQ zpaJ-yUs(Pk@XB9V-T}OGmE~Pn-(S7$DK^my3)-+AWRmX#4|`c21MY_hCtYv?cwY{y z{~CB@R{@*&77~+XEI$vtvzq0XfRFiE?$RE)oDGe(C;kZF=}1WOq28|Uw|d_cMWH4e zQY!d@L7;+H5NN21Y3OOr2!f3vN&`*QL8=K+tt%jTK;ZKDBsGj^I}6QOg-dA6DqJ*l zheR5B19T6U&*udyx+Kj_GjoNcS{gP+ni+VM7Hw(@5Y>ghehdY>Y zlLV1AMG}M5Sn_&3vMb+{jE8U~$BphvTyDYD_EGxROH8b{U~hg3<|)y*2)5kBl@?r_ qcy%8x delta 5378 zcmdUze`phD7{}kc{Hk5inpG{qxfpHfFJtaUn%G@Rk|wcZ+CdkqgJ|pA#@xy#1DPw* z#4HtY)nz@m6qMnxsK}U|FKS!Fb0AF6jVgsX*oK3$mBC3-;7PVPQ%PR$OwNp4%tA zgU+27I*`=4tWU~--n0z67L3trrmNel5t^Q!hf-3lWk;`o#?A|LdvAn}_tvA`*J{_% 
zs2269mc%Hw-qO?S7RpmA1M|}!p(4cu7NiG8U2gBy+U@dr+S?s<$a}R;(eb#WUe@)B z?&xqd$htw%8#XvZ=O`<>E1gEuhO0byXXPC_@mfirWJ{Z+Msr5mf%>KCn*XjGv217- z^|-{gq3`j~;ttAaQNh3n9$u0$#~m~j^S}1yh;x*}iRFgtz{6EROV^k8(~{g{^_MfJ zi>NuxI8%RnR?YFa<>pNORSeCMKR8`j;0)J+Jzl}G^z!|Tq%>%CMtDzn8c9Ez`lQu+ zWW7t+E$l+lO}ww^G;Q$t#lgT0Gj$U$4I54shpR?#fAoNr-u@JJtvT~(kEJXvmb7;# z9xuy}rc!H>ji%=wnvAyg@Or@-H=By#$ydgbm{{I`J)PH#)^Tb}(AstPZecRnti?5% zCboWLHrktQugkr;_n;Vhqf8}H;I7Q3H+cMQ!Rq?+ZbniXL?%;(RX>?ZOxT?&w;H`i z*@wJE-1hD#w4A@B-*EoGmg{E^Eu<1=+?5+K(U+>R<6x8Nk^bLF9a3F*^5aT6acn_; zQcaW=&7{I@LmMncyL@>YX*L+f}sE$1)k-zz%DQfDLORNClWhso9OMDmYL_*M$0Otm(2{FI(A0zN#0 zyMRxw)yJ;@UK!AHKk#6KDw3MF1sr8F654?8*{Fr8;(6c~*Xj9-z%Q-Me+?(zH-HCo zgeX?tzXu%43ujSDAMgh+t3vye1@7$DkSY!Wubi>YQQ!?b^zol5mq}zptr@-JoO)?P z@%&Dj^9$gIPHIRM*MRRlqUYZMZ_S@fb4~$&<(OK(#?kNKn6T?_-9zBnp9-;xnxlr< zP%wXD%~=6_@^cNT;z{7Gmo=WhSNS(9Hk6sMffo`oUuy}f2m+tFq36#4=YP@j4&Z~g z^?Wm>?r%dAv>Ks#yCK0}rsr<~9|`Gs8h9Mu5s`)V0Z%N_#}5JzCS7{R5pY~^>-nd^ zhgRwNDDZDr>p7#mU7}0SIJ}G^2f8SS*-bWtmgPzAf{z+Z#<&RT-u%K#-JL8K4HaYy zNKYudipWNPBay7vLu8|$BQhFdiEIfoAo)<3P$t5VG{)yq)=erqGoDxsf*F?g6Y2Ai z?HDf~Ce-iMRttN*Kv|CU5$R#H#i9&}Wf_JP$Ax1gjboB5Px2bIl31TsGaJ&D;24&a z#6}7-=F=8t{p4P9e3YD>^A)xW@ln`~_k~p1ESJwS5$!+`hSb5uBII82Y)q5h5NQX$ zit~w#-FvpRiK?OL@w(ib0_83xZ66QhP6$+^J-8|Vcm$jC_5zjrP3Cm4Iae2;n(1Gg UbFV74rX_bmv4btSp956&-yXD6ng9R* diff --git a/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa8_2tg_4w.co b/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa8_2tg_4w.co index 1b16d360c1c28f594087502d2541e9c9f78f090b..92b5d718cf509a0e1b995ad91cf4e96e979645c8 100755 GIT binary patch delta 767 zcmdn7it)uF#t9lsFMKCzEvr9Z!2kv@nn3`vt zkV}j{sQ$_RjDkvp?bzJJ*d@qukYUokgACL5ZPpMjV4gfjS!r^Tj>2R<6@kfV;tG?C z)but#5&yv?u%EqkKYJ6){s+u0`zsiF_D>ej0jZ4J{6hL0BT$*bW<5D6R*v_KP-&<^ zoB32sm^K%w*|CZ!|7qIKz|a83cbS{urfg2q31Nbokj64O$9M|||36eyfX=botYZ3& z5ol5y%j65zTND-lwCo3&=-y(}Z{x|Z{~UAce)kR_yV0i2rjcR))6Gk5wt|hFtS2{Z zvx9>M3**zxnJ!?lZ9x7C=FMxoRagvVJfUF?rFg(JJHrY~RB>^rxW;700CBH5XtKIc z*@OsGU66#p&Tu9GReTcM;Oz{0M0aZg)4Be 
zw16paax$J=AgBYx2AkIfzG7rLU@`fifbitC!O~ptwB_nn3`<@!k>_bN*6%53wO?&)&qc-;AYYe+5I&{{Ko;^plPhIeyxl zD`m<=qNg@5kg;J6kz#^+V&EfYv#Juwo&ZKDC}yT@7EtYANjUqvX+JQc!PuCkY0$*Z zTTPNZwpl?37&}k?4xZRqYDlst;IT8oq=iNC^zWAapcry*vFW$*WZ3_VxplvL2aw%p z(`M7iu>a#`Q_HQSredHKJ2o${(O_ZxxcQ_5Iog#r&vQ=VFzoSx6nHR_2h3n+IA9GE zfzaYm@fVW~eZ;-Cpvme&Wes9bby+~gb3#zX=YhCP91IikCQtI&X65K;%#f3ql$sN7 zU}kOrV;UJd!kA`eE-n zMjurFWPe6MCBk-WZer{bA5GPy@-3&(p#s5H!wdnR_9 zT~xj?Zgw(nW10LxYln#PpQil`3=Lp>m$?aU8qm(T%{z3yF+xq0V&&lfhicm9dnP7K zK$EOk1r`6a><5|T-eS{l^rxWZ(|2=V$UXtKIc**OWQx-6jLH$qUwouT3p38>c3mDVL0M0aZg)4A#G>0j0b~1x8 z-7FDIC#TIfA_JJ1{y0oNC?Gu9Hb$BYo)m*8C&icv6+lyF0W_sf;F+u)BRlzcj3g&a qXgyf(pBNP;g^=Kmu&oF5p(D;V`4Aa20OOs8Xkd07-n2lJ&W)a~GW}p8G zK57gM|NrwdH0)Pnt*EnM*tO4w$*0bR$*)d>q3x2IKNEu*Lo1kusR7X%tW|Y>HUbU) zwt`IoHX;pywxUgwYs8fFOfRaH0&Vp1ky2+cy{Hai`}s*}0NENKc0hoXCXlTOVh06D zO%6~|oV-`ekaTSZn`Oj*FvVnZwC-nbV%cxT(z3sTp=bYpB`W$yMT#8%Y)+LjWg^`} zN}G4dNU?@UF+n{t@Da0FR*7VfYz|OyU`;styJ`$5I`po#ASi8FC9K#TRs?4cV2932fAauSnLbK(um%v@niBV!8~)6C2X z$}}{vgfR_VEMaO54J;ht3LM=wbA$yjF`aRld{97ma&Ck)7rYJ$n>;DPOvs@e>^KI8 za0q>XV{&wa?BwqelAJK1^=NWS$S`T&L56AjHfsoHFi*ZCrZibkJYe%WaX)5(+1#!B z*_&AQpJQ#=U%}9`e{zI0NLFFPlo+_SX=kI zcL3RqHf=VI4EyhFHuc%cG+9q>+vW@YEF6q?HeU?~^7FK|Z7ztO!Ck*50ve1^iU&-y zGc-7&ii<4*R1k7=Tl-)Bj3 z!BbJzuF-as(zLagp$pLXOLjlxK U&dKIElAIM#QHb*QS)7av0M=8oAOHXW delta 1735 zcmZ2+k@3e3#t9lsKQbn2Evr8f%m4;3nn3`I8a22>vt zkV}j{sQ$_RjDkvp?bzJJ*d-|QpJCEIpz#g=8K!}0mnNG&Ase9vF&nXl%_71X%s&4W zeAE~i{{QD^XxOjDT2W`iuxpW6`il1*x1q`Ds*AM5N7v2V68uwYEM|Fda7FnYmQk*#TvM9kzq zDZ9-qM&B4okDMp$o9|egFowMSGbmzcl3K#%8MY*Q1R6`K40H1)br@rq(p^%hX^8|h$%FkE&Sh{eRgaH4hc zspM@|t`_DDIf+TBIq?P-W@a#^k%a86;Fi^fC)o6Ky$9Dgm0Fe%U) zoF=Q@^f2QOB7`l=rXzmt!X_dic8tY$*Z109C2?MIec$`M z@Avb0-_N_e-t-^3o_Sp$A8O8jR4~-x4M0k~$VjD1j}$;dl86uB`D^mAC6Y#Y^*F$F zV%Edd;d-VPY%%?}2G<3GM$J{~7GvziK>0hQ_fwK(4z7S)jcTgbiU{8q=X=sRefB^(BWCG z@1-6U3_|vaZg7&jwD+z8sfp;4k_)^srvT4#hwIgfO}ag#(*1yZ5<5MlWoj}bja*r+ z1Dvv1>a{JIah}p}ne7e$Ic3~BPfRGd@y@Xn!MCsB 
zh{km8a?1A4)_j6UMcVB_I}qP)1-B6lN7}s~au{HRibw700Oy*D^vn0X4+>Mb`US5V zwwk%d`Yym1egX0TbMp3hV()zOFz8h?=2gA!=(dOfqsegwc$DHkKL+O66@pfR}GD0_4qjXInDsY6VJuk_D51Z`Xi16`xrELcp+lCFaRj$udtQ=Dc+NZx0v& z@X2tz7Y`#4+7Cv7Y1fKR5JiYvmke^ZX4Tw-pbqf;4d5MOT44&0miw~~%<@^?;J8YB zNkr2|Opt$G@{7~|{w`qmehI$c%&!Hl&X61NN@FBm@l)S`3DK;_)A{};I1TsdskKp4F50TSRO^vR|@vYLMn(7-758Bvel0r3geP0X{qauu(T~ZMJ zabP^~j1+W4Co1tot}6;jK@G+OjdJ{TjE~mI@dk{KBs6do;~C@e`AEMFPx$)cCn(B~ z@n4V0@c_nK6Y|es+`S~vAHulvh8!OjA3G>D)tg{x2p8lw%dP$v<9kZv_>UNGc_0PR zpIAfG0Ew1(AiTsnbJpA{WkIYM0|75r-PedyTz041*nXMKRU9|G>fKy5eGw2ZuJR z&5lEJrQJe8yS0*pG;JlJ-CBx6i^UT~t7t-oc-=UGwpZFnuE!?wc8JOrYgLrOQxdJC zONcV5KeY;%)UB9t)LEL8+8DW?fTxz%kUa*j%KIo7b3n@fnC zy^1Uzp)^Lz%^s3aYO_}`Wkx%?%Xs$)M5mxPz|ujuQ|Be0R|9Ng5Zbbq@$)kN{7NOC uS>gsWe{Ok{oeshbxIDn#4#MYP{~%kyqjCBy+lWT3>s(|fC)nX2%>D;|bF={f delta 5027 zcmdUzeQXnD9LJyMSzqYhH?{%mgslRO1)*Kn?h1um*KMFNuqY}JkU|8a%+P2g4CmIB zZY;=4<^aG&_lzfY1$VdwCain&0jT4+W=|sMoQsl2G0Sy<( ztq72yUK}MKVJy*ZsafGn^{t$j=6}VR;afE?(@*t!OtWm~$+!6v^7*n|z;@mx*nWR@ z32&DOb|8@L=54oNH#B5ZH$BfQ3}MLC6>-#NW1`_md_AZ4Yq+J(jNN0Jj-XJ)WrwGwtuKg3R!A#b>~HW_+bfX}j;| z_%@2K7qxzBxA7^3^~Eln+41OIpqd?hx)&a*ZG(58sy1}a@6zQRydU6%PAmvGcRQZ} zT%)$0ThPecbZ!W1Yt@X{hkBRBGblPthqv%{jfDAuIlf zQ<#o1e}7cVgqID)#a+(xQ&Td(puGJdDmZY4%JQ6eJVm%yYnVMgBEY@WFYD9@kSSQ- zt+bU?M0`!jy0aRUmmAYtHK?>K>q&;yNsRfuQkb;iZcKL{Qgu62E&a?6bxS`Sk%i;YNuix|gg#Tj^g1+wCd7ExM^ zx4+}D+}MJHefszxj1f%@B~iPK`3mnd!u_0`!@w($`QWDFT}6-d5B{fq49^$(k#J9KKgcOa zeLwOu+3O@a0BQ3yn3y-|c7PMwBk3@dBG=(ncHRnDrB!e*|RXgg$c--U{|KsNj)S~8L4a}o)0%SlP8xsVF?FXeZ_9=fR&_B@VT zVQ?^{XPZ0VLS)!DH6mVuzSRlUYzu;TyVcx31>OEMS!Xn;yf|52tb99YK6)cn_G&rl-s3pR;xpzg=Ix$I0b)dQe|6_r5w{DyER&luocE=u@q)4 zlN@YrvvfW#d#OuGL6i|z>|E@Qm2+9_F)FpnWi6Jd!(lelUmrxp)ZFyB?C?Q!9ir|g z`dS;B6(9ZU>kw^fNA`50C-Pkoha+PM(!aN%C+KbMXp%vKN2t2A@pM-^nu>-F(^uQk de8a@fh`(2ydf0{%=mm$+r1(b6zlr35;2(J*uh9Si diff --git a/hsa/gfx950/pa/pa_bf16_pertokenInt8_gqa16_1tg_4w_mtp_msk1.co b/hsa/gfx950/pa/pa_bf16_pertokenInt8_gqa16_1tg_4w_mtp_msk1.co index 
e50e1bef010b21b373642fd1de21fff57abd4916..d20f4b751990dca2783e3f52beee5a4be1c2e334 100755 GIT binary patch delta 4492 zcma)=eQXow9mnrEJHEs<#DsLLUCK*FAqh0~o$tLR*(^va zSL><}hs+sKlxl%%F=MHyAS$WWA$M1C17&fk!f3m8%erhJ)=E~iX|&5m3REpx7VTbs z&*MDVADboTe);@fzrW{vvOIGsP#6ykHn4@|2b*5bPS3K8{lZr!=={qd>*BNCGu0jk z%-l_1JKQiP2=O@2uJ-@rLB75PEvf(CJ&Yc7x~B3Rb7<@wb9hYOT08!wk7|31KV(M7 zTDCF8!LRlfb6~H{^yagb=uTt%tr=D??5Wq;y=_L%-d2yU)CH`eJuixw`GEBGgUgYc1 zx74lGYfk)E`i4pNv)OXGW~XmF>s$57&0Ntgu`$>3{OK2;eBjknj|KH34-TOwLwd-|jrNK2j#4vp96eL1PvpEt=gk4`gm?v^Mnn3@@;ZV$m$5%lY^_}$EYp1I8u~St zFV&)e|KdtlY&xj7)-~8_=a9L184|qan;v$m#vE)$&vUF)i3OiLB_p3Rso6ciRxUE% z*n*C5)%mMswNC`G+AfrzIf}aH7d-u>UGMGBNIgqGRWEeN3aEeKYOZM4#0%J6LoXmy zr9aZzZ+09-XUQ6d>rlx2;}}YCwsdpl1=O~N?Z4L+|4(G5uA&>{r0o?f_WNV&T=mCq z#p(+_TlcQj?1UdrXr5|dyUQbPt+NkVVHdT3T$X$+|NHydM(cdLtVVN4sIaRv*PF^d zR%F@cPf0CH?Qc>3jStywbGC_vGq*R{x4eGm$GM_i6l;F!i?rrQz|ma#H|j>F_;dDb zg?+2rH^B2t&AtoltF*EE?gBm$qqzcnaGhghiPw$en2r!Z zj?wD;<(TmWg&qLj*Fwkd241*_=6iv^wRJwX`acVfj;H547^W8|JWlgL;PDjAe+qm$ zP4kz5w=LM{Yc|g^9Mkz{I$;zN3a4m(0r;i;G=Cqsa)9O^0$+WQ=2w6ZTlLHPx8QYf z482Gv%mME^N%Jb_R^XW6>ol(gKKm=0uK=EThj6!l6F5?n#Nikc@KV4rh;sw*mKy{) z#y2r9(R>^5k-yWt9Xb6urt~@ONJGMAuVWDB_ksU%!Ka`Hc;+}A{~Yj%6wQwSUzv4x zLK>U^2XiN#a0>WzGtJKc->%Sn9QgQx#=iqTk#OV5`CkUd=oZ(3C;BV!h6iXq1HAAk z&Hn{_sDtJ{=W7zjR6a}bEVBe0-3KTKQwKbKkmhTFUph*20eEnl=3fKec7f*iI6TWS zub-qH_d~);gXWI_ZyBTc6Tma?(mVsabdly+;ImT}C;PYd-4BkxtZ=_7;v;wg`1;oz z17qeT;DrZi{xjeWUYef<-sdlKr~f%{WNOO}yUV-4OLx%xBJj=bS&)IIfDbPi=pVqx zCA!Y%tfQY}3g4g|9%lg@Q@4rci-GsuNAuf&2is}B8u;vYY3_a(=a|Pc?qLuGUx9?l zeRM(s_~rqcw*glU)BGXe!^dd;DDdDb!14J%2@YndV~~M5fvyY#cZOrU1ZG5xCcz9PL>`+;Sj46v5Hl>rFjc}NR+a@!Rhb0DBAD`` zYSXOUndb=(B_&*f7b5dU!|j9wk4F=P1TM@+G#T3QYAAtCO;N$LVzJ35q7h8_tbhj+ zM3MB6HPSQosyHBg7O%5}PDuSP-H(Rw~CTQW&VUVQ0#$C0|OC9%3?rhX^M_ zxQ7^0LXam4NxT$MlCWkg4R1|UG+aT{1U#A;SMWSih-^&?%UQe%Ns8f3NUDI_Nt%k= zN%16Z7qNB)^T;YsOkTjNlC6nxfE>d^$We&|#KUetHoa3$CUJz84C7OZh85gdiIa&V ziimYY)C3+l5+j8p2@Q8vcph)ts(|Y%qJ~$cM1)ut##B@VZ$i;XVI`?xbt+G$R7D97 
zp~@Qm%&1XvxN0nhZ$ULF;LcH=6w_iM*zm(UqsbVF5Xs?2BXPX#XjH{(i^gR#sd&s~104;_o)Ty1$~XA{R0*Liqs1K zic~I8IZ@qT)l+zu8Xfa*qYCf&dkPtl9Z$*iRX<_c78+8kC~9G$j7k*()~hzeO zN$ra;>3t#Uz;xhQh6+$U`ig9{HDrBe-wwYc^+0QRdZ)iK)!tf_&INj#?iAh)d`~@4 zDpeh%gm;6QoK9DVR5_%{nM`$9mBX5R;6QbL&PH!Bxkm}Gblj5bsv=PuVfmlRVyW$zo|+ozAVS`ieZYXa1%gw0^ZJkK0TCxWX8;hi}OZ)R?{O z+HI5TpU9f6C7QAltDN|G_i|-z9pj{PYkTd?h_#rTCntJG%$`AZ3}hW;FV8Pd+=I8= z(8PIuy2b1nh&r4orQ34lZ<>ADDeBavX!*hqIkWSwR$zOX960Ntn_imF6tq&H*?UZV z{9ZqMf=wAXM3>U?@Z@Q;Nope)U!F@$l*sY6`ThIK!5D8;zM9auhj}%475N`?L}%r%hjq#Ve`A zYv`d$`kws9EwtH7sSad{&S#%yQc5Lf{t(J|H}heI;@m_ZX3HM=ba6yqblk{@DqFu! zPn*5IWI--f$meAzolmsT{mdKxD&}FOy(XC(Da&jhJ;*jb^l7G`l?IJ$cLh1Je&ygH zI?9@T%c;ZKT}S(sdz2zJt3&BrZCSS&596riW*x&btTOW%y^$$BdWSKL(^WSxj9Ifk zw_JDY65nLD)@rXv4?|zd?^?+eHH?ZnjGWoKug#&WN!hcJk(e^~<;6KXr+y|V{)bb= zvzgg)V-@MGXIA{PnCa7Amm_9lYjx;oZfd!lDQHzdBiDBuIdWmU_Ll6Um8m1ldebq~ z$?L3C;-4@XR`EZ|3|W-DSD1eG)rKjCp~Fh!H1m$>X;uf)@Hy0|%#~RJbh7RH@EH~s zyS}ni>Nf3D@Z#5(b&;i>yhidS;E|n+wwkmKcsxpS3GubMki^YLMiYfS*wPFVC>cdmbVAVc=gR zn@Pn{P%OKHLuk&jFX(413)Fzky<_Q7ataBJlBcV*^sm z0q^<_$^Q*}&?pr$r)-QSvJ|zSuuoFupg2B2D5zTC6GpADqt(D&M@V}P_?gd1-U@v7 zPbA-{^CU~ze?=<30UM5=Ciza_6Yr4xTfndWndDCbKl}m7lfX}2RynS}nqMC%oNJ6< zH=>k(4BYV>T|k%`20rs3$zKQFZzuUF;I+3day|arppfbo6x{4c;AE|UKP z_?aaK`WX0I8%X;v7(IWMnlYMzGqdOgV5z_sa>JK^|Kfg<-wwRvQIa>ka2I>Wp$AQ-xy}*9%2j2S{Y5y?r`4Ql#|Br(rQl|^pQ8#eu zBht}xz^BGZeh7G{(L3ztW#GL=?=Vju2gNa?c~~I>KmQpSU<`QIHIn}U_~0^Kze^G9yCoJxvrp6cAS`=QQuJ|en6^oqU+h^}bFjYP@kK_Vw$k@rc6 zN_F&~P?aY;@<-UI%X?j;EQ?C(Sq(EiX7x) z=wgPVXiy$#4{^9*K`*|q!JvR*1S1@F5yRPqczmrwBJOC&C!$h?!dzInv);0sv8aD& z%jX&_>q=edFKX5KvkexpVi>+3t~Q<$32VNh(IPU#)%m*{EgP74O}p0EG_ diff --git a/hsa/gfx950/pa/pa_bf16_pertokenInt8_gqa16_2tg_4w.co b/hsa/gfx950/pa/pa_bf16_pertokenInt8_gqa16_2tg_4w.co index 5bee0fa3fe14f3d4b5725975bdaa77f3faaa722a..95cd189849f73ad7cd4c49807cb3ccb04fb683ae 100755 GIT binary patch delta 747 zcmZoT%edhv;{**RgZPPB%jy}t7$5*f3qZJxK#C2-jDheOYEbC{D3^f+%7E%)0&pHWbWupOJ57`p^H4l+#IcaUM)zReoK9n6yp)RZP4QZtxbC$2C#$W&qTJ8>&! 
z0e7y}{p?LF`&C$5_E#|U?4O)t2vVgm`JSQOW)m4U=FLH-k60)Bs9xZh$OKgaG-&f7 zwG?KsDRB;mSSMQ;ToJkdt7$(2LjxEqvNoZZw)vi+3&^BRY#h0N5vBkwirdU#%LX%P z5}V@PUoHDVCb_rR^xJqc?2lk=-S6H3WH;Kh*)%fjU$t4#y^v+|CT}w~##NhN1{Sh_ zTroM$VcKSg@C%&v9l@}0hR{4~10^MNDYymW_6+qM81ZKE17#JoS3JOndNR?nJ gh?zVq)s9IacJeJCB@j1RD$S0w0pf0k)j*@=0X(mw3IG5A delta 1721 zcmdmRl(FF~;{**RhUAG_%jz%qGJpY$W)Ohz8G#fV5MPLg@GsP((ghGMg8@`MOdk_a zoEUvj{geF}1(gWfvAKz{OHk%N!=!yc;~V}nOas#{O*VZ(HbM zs4+17|Ig3RuwRX}qRxh4*FGC2pE?&Nzd8+uwo7XMObluatza6a21IMHR@Ld*2sG&1 z3N{(oh%^}5iZ)H2Bc`NhdQq(uXrqshlsbdyMRgF{&reDN$kqU{0|KNpfox3>J19tM zvVe!;qq-#^!tS4^89P^p0bw7I(%l<0Xmi-kBJ^TMFQPDpYQsnq&^G+#OX3{;h zWAhfdKdd2pn4lgR6cHo1nM>7#49_TSj?Plo*ktgZXqJAmv)n>L$9hW*<%FLW+s zAu%;?7VxlOW8Ajc(~o4Wzyd*OGBE9IE(pHBS^pypT4+Kk9x%<$AmfNCE)Eqp(MA;yAIU*gZvhp5lL%%A!(|tNSWFxY7wRWpirr@AV(iM0lbDp66K`PdYz|`@ znYh51=FTQirlFCOAxzZG&1v$7IK9mu;uM&eF8EHKm>@UVAW?t|URA_T_DVF9b!dRt z>yQAU5AZ{1Fge*!Po2c BxNQIc diff --git a/hsa/gfx950/pa/pa_bf16_pertokenInt8_gqa8_1tg_4w_mtp_msk0.co b/hsa/gfx950/pa/pa_bf16_pertokenInt8_gqa8_1tg_4w_mtp_msk0.co index d0a690ef02e74980ae953a499ee337f0d996c360..91a31c9482b993d65bfd71c4e145806a0b8a17e8 100755 GIT binary patch delta 1896 zcma)-Z)_7~9LJyMdTnpGZl$S_#I4Lz=A1&>>$M%Tajfm!9Aq>KM9CarR-HJtB*YX# z(ox%`!A7S~LM4#CVVS&OWAs@%Uew;;g&JdY5Q7>q0ke<=hnZP~xPS4keV(@0H`=6m ze$VIkeZF_!d!F`Y@3Zlr*pViNYx=Z#EF7wZK>5%S81qx0P@Teg()XyxU_l1r22zl9 z%>P_R)}!5UBmdvmz!8`+aaX}-xvSuG4(&02Q}<>~O!^RHbMAdWN;JfzD5*b^Yji7h z<^1BK3_7@%N9nzOEbRM}L934s7&mh+LxTvI#al*`A+w#T*nUn@~bvF(hyKXJbVq;*Y9>Ws_vN%ye(B#_o1o@o1#!D)|0 z%dA0^8kxXDU5yMfwPtW)=bVmR5+f46mD&Vx;*~foOTS-Z+`XZYyh*D{F5hhLkya)C zHoW?JLdT(s=(sN+YVfszoX-5~aw+o`ttxZBhB#!kX!6)B_fADjlyWU&|A|&ccDkoU zqtV$Xep!x6iX)91)T-lV(0JP=)O+M2UL3X*><3R-bufdjcV0sN6Bo}f4r@@n+k&nQ z+{AUKr#R+Y9^XhcYRZE5XmI~n7i0XvR7gKiRZ5Rc_3PMw{ca)rx$#5|hv~5(`;OPM zQyTpHOs>X!_CYCppsFnV%d>NZyE`}jssS(D5h1rz-pTywqQ@uY`W}Dx zT->AP{e%aF!XFOpzE8OSV@i}!zW#C2$Q~`s9i3tn#vp@Q9MrKQ*FAf;&a zP=Vd!a#AWdY~aBD9><}#j|Py|^wxBT9sU8CP;K(E16q=P2Q+R7(N`RwJnVo@(>$5a 
qlULv-LozLMW}Z*pbHEpv&!0>-JLR#VF?m=Xt410NkNpG1bMwDot@SAY delta 4799 zcmdUze@q)?7{}lDxB^9FfP-P3h~2ETK-XSrVadVDC?zwlG^9l|i=S-6J`E*sH zu99>|hpt-G)spV))NxL~o+~f2s4WW^ENHN-GHci6+^CQfriC6lBJ2l8g;eDo+)ssH z?S7EcUk#@odnek(BX_| zOtSAkHwHPkh@JWBugvL;r&fJuCLDtr{rGZ3kh;?vVUMvpfKWrS+4;FWp(;TsV_Y8UacVU5zuE>c`0 zdxrC#IQ`){3TAPq7Ry!s!J}k2kPR=QrjeIvm8G0_dI61%p3kK|-Gtn{A?xlovtG@v zgZHHkEUwK61DkaEA=mk92yH&mqxu6UZDiN)_5DDOjTvy<8bz1XIS|HJR+Yt?o=+7MoKc9#-TteFikg{GIydEGA|O9&TQW2vUQHOHGL9fLl*o9`yAu zf2wsC(U{I__|QF&MuSV8dC>F8H&FE?*rF2jv^Y_HS#V%Cet5+pUyeCo{*01uz&x>r z>oI?`O_^VVd2XYU*JIvaEsNmFXfqzT*|36S;lccmE+u~w^Z6%~{8`Ln4e6(KGP)1* z!M6yJqImxUcwlUGEi#E>{&t@%R^G-ikF3cb$NbbsE2*+LiTT8Yl7B2+20;#ZCtJDF z896TpocmE(@CD|vn37+`{QWig-(y~BRnGqt=8F$NV7_&#-|@iFI%UE0(j}D(W_R+} z9)TBvZkG$}?cTj-Zx_S3?W?j5OUrhKP>Z9PA(XK<6Y1FQCem$dA<|_dWwnkbLaohP zNv^eysTFgB(w(i=CI_w)u@y^(VVVeaJL<8t)whtPF>Zz|g0VZuJw&<{=hoWl$qcp@ z8_BJ;Gd9rml*iuPH?U{=_@l6$qGomc&qip@nEE8koQ_wdv~vDc w6TDwb*jfI)O|XQIo1mUbn)z!c*r@HrZR$^trHoL^w^oTo3wYoqc_Ik@248#`!~g&Q diff --git a/hsa/gfx950/pa/pa_bf16_pertokenInt8_gqa8_1tg_4w_mtp_msk1.co b/hsa/gfx950/pa/pa_bf16_pertokenInt8_gqa8_1tg_4w_mtp_msk1.co index fbdb979203c96845eb391ade405f0046d21ea5cc..10ef399cd990f590bbd6873efddf519c12955184 100755 GIT binary patch delta 2590 zcma);ZA@EL7{|}K?QILgr8Gr>Fu-nel>xcEy<8ZpY^CiOGGC@hh9Mhl!z_kGVz%k# zOiL}37*w{4cJpN)CLx-Tirh)-BwW~hOBNGjh!Tw-B>SM4`L_A8u@UdReaRrDo@6g^#W7Rh~l=lfD+DJhHY|i9y`FRu=Ql zGg5cNC0&Sc(qw+AL1}JoQMMN! 
z#!Eye%(lO7Wd!95kunTwM_Jcz3o%7gYl!(?_BA49(bTIn^r>-=@3^lADYvkk*fPV& zWA()>a|=uJrxS8y`(2EBv2nS->9WNXxF;%A=dG7dJv6ZL>c*4rD7E za?&<9EtCyqSS~%}Fw1+7%(K!^=NGmtQ|8E;2l7-;omqFY?f3*Mbw4@d{N;Magj@@4 zf@Ci)CyM7Swm)uW)Kmj}EGYF`Tvx8g6ita3&}N|xC^t!oEeWag&`7dvBc5R-`yR9G zJ5kI?dyjl2w-5HQxLp46^iGQ=nH=G8jFpRqXHCpbxBTZjwWbwM!?DBx*8a@xX&dTR zEwSO`xfH89@C_U1L~KqzxCRfIbSf3)`)CUc44lG<9OqZuMPU`jk{8-?Zb;Et)0C z?tE;|#TCh(yEOu)43>n4cEiUNmh`9Bin_*VeuuubqJzu03;4PnbbSTzUpLdd2KeYk zJqR1w03_=I}0jQ=XU3W+!JXzLb%kET2Io%SnBFSDZ5hX&FEC-Bl~nwJ1~UNU%keiNo2ZMz#wYBCBcs6!6jpnn!_m zRnq)L;QI?{ehT<($9g*P79{e$G*1BUucP?~z+?B(+^hZDWo#(5FJ9P;$HSq}Ce+oj zx3g;x?+w;x!XSj4UmyjX=nW9c^F}Coje?NiCj|k4C|o=|?9pB%!=8)x@c31qUjMl?cyh@4W`#OWjI)awhpVZ1jO zBr^oAo;V>0oDqs4q9*wLp7?YN7A4+Mn_OtYS1^;};`6(3Wp;{u2Mfeqtypxp_otr% zcQC!qu8RM(3$KYk---(ucQAgo71uGd!T6lIX4@OnMcpV})bJ=S$c_N?F*?NPA1J&~ A5&!@I delta 5399 zcmdUzZA?>F7{|}O^aYfJmnbq~26WoV%doxowzSKombO%kN*SrhjG5Ir=M0<8!klr2 z*4i3k&;i%2Y;J~WVq!GLaW+9hyar9C8(&nCIirRT`(QFm$fgNSC%gCdoD0V<+a*g@ zo3!VDf6w!rbDs0`K7F}#OyBXnzH1qsxxTynWCZXOP#$>9OU{pu690 z63U2uWZe63ksuqNlTH;c(AcIPNoRjTb`ZK{&kfR|8$T0v>NT|AM!t zOSCW;Y~6S>JJq6g7}Bdn1)E(ujdua;+AwSgav5oH2gv&8EK73*+5qC$qd8tUx9!$Iwwb!4gI&jNWzg<2xa0Wi887zND5Xz@ z;MP<1^u|{grJB`|`DxRv;Iq@cdX06K8r2;=3=a&>3trQw#r*&qM~1X!(f=rv1GoR| zph7zz&UC8o`RUcExL-DCO$?VOo&_f!a8%ItXIj(<^{hVFczH-`^X-3hB-ix*!?-Ay z3iqsC(ntU7uT@Ti;NlMsxDAwNE}iRs9xcsqxaXkIcsV-IR!og(J$T%|CSH!TZNa}+ zQ)9L=7{Sy9k}pCW43XS~_>lm~Uqd|8MREt?;YvkBRI3*mp4O;H5eZ6f)bh>xu& z`Bv=z7XAUAvhPHE@~heY()xRlA-I3G0e}SJXWk=u7vj088CBy^#Ot4%=Op5HK2cp2 z@vZzC0Y)&`K^lgYOCyMXM)DsJA3aa<8;DPwBKcj!9p8}rKH`OE<@IMh{y>I!33+tS z5O@DR%UP0HiklHkUMBeh#K*3YycqGRn<`JOUycmia|_rIe>koND8h|+;cp~gjrfu& zk~bq>Pb)$-Zl;v=jlh>f8d_1n_LoThG3HGqk0Bm+lKc?jYnGC{7xBr6g*2Q%28)a2 zXAvJ;Me+-XkG7DUmH%#{GXcQ=PNAHyNN)KVMgU$(f#hU8)f@5qe4usnmUmh=F*Z+~ zD!mpYEgr^(rQOS5$=JMD+C6?O{Vblr;$v|bSLesnT33z3tPZQ}5Rq55T5COM3M*^j zuyk1Qa#lNAjj7YF&gQMPBFb=n7E7x~oy_mRz6|5FBL~)R$6+iR@!>=n2aET{I6Zjt zj8{#GWxeWA`Kz&H8H+lxjdS8Ps(pCxtjDS@=X2H~2hQVDq)SSlv#EJ-7AMYwW7V_b 
z>|VSzZf0woubLH}=7k5~YU%++13r0tEmbuS;Iapjx7SkkyfC^B2U2g7u4L(2syOLx gpbF`5Ai1G|a?vG?$uH%xC78S|k5_}qq5x(78y=KjH2?qr diff --git a/hsa/gfx950/pa/pa_bf16_pertokenInt8_gqa8_2tg_4w.co b/hsa/gfx950/pa/pa_bf16_pertokenInt8_gqa8_2tg_4w.co index 01de55e94e045ff95ca0847d0e579c1b7d6f3529..8642f9d4b84f2d9b68d01095004cf667cafe1091 100755 GIT binary patch delta 761 zcmX@Gp7FzS#t9ls1pyPameo60Gk^h%W)Ohz8G#fV5Igun_ztHY|o0}NB1UU{eOxkylVcNdU8o~w4lXX;-CI{&%OrEDAFgZu(a&2VCdODNf)FrZnK1p7!y#H!sa@;C{_-3CWtJ|oXzu8 zLYOuysn@ZJeE!|EpMjwPjCom_V1{fC(k+1*u#9E$9+NE`-~XZ-0(6Ys<`Oe8kU_^- zCO6n_QTq71Wk1M3_ZFLe8&8J)|Cn3%yLSNDjW%sIjSTz$GHwpDy8qst zEKESbDmSp;HlV-@=FKvGCM@-5yr5wXrFg(JJHresRB>^rc!U$RQy5| zs(K5k_?;jyLl`c*0K{VAU~tHt{3vLfm7}pSLr!8+YEHa?nUNWcX=Dgzni;_rxEhi5A;6FJk+>Y~rKS&J&!%Cpp@&NZ6s1^VK delta 1718 zcmeycobkYV#t9ls4j~h@memW`Gk^h%W)Ohz8G#fV5DNrC_yUEfbOD6R@PQA?fa+rc za*5Fg)j!#vQBaAn9h;jNy98zaGfdhCG``_K!!$7M(qz*oWFyodW+T?HSwy&i+2_B4 zj~WBR|Ns094g1wtE9z_*cI~ra@~LxS@~hKeXuG85&%~g{&m9S|U;31n-6*g-*3 zlRwBPPChDTNV+zq$>&`8HmithFvpzcXx-1=#Iiq)rDcBwL(l&IN>ubyj1)P3+B{b( zl!-)7ZT=ur!y0151ogzgN6hA}N+f#%7@?q;nYQ_W>Is&FxxbtC10x!YlUbSuP3+v& zBH3e`4RnFAbLQ{hiJh-TBzpoDJDbdouqe&`-LfAPLGCR!{WhKq`^8vV_q%rh*^M@B zHjNDXIhi&;v${fRDh67yV{?IB1`88V|1BqS^eb&X=emib-o_7FP(mplFwM@e!3I@a z94aoMjVi7L6<>m;UKc99AP!Z%1ysBy49pOQ%Ps)1m^c^&3MUKsZ?keTaAU|xOiIm( zH!yQ_hB1vy%wSA2M@tyf&By}AbTM?BTqvl!SwBFLiAlhI^2I>8$pt|IT=04#aB^3W znQQ~J+Gv1QBMq?X0_2FvhJwPAKLklIH3UxP3btc12%78#q!fZDcLm#VP6!66VPIGZ HG+Q13-v_i~ diff --git a/hsa/gfx950/pa/pa_fp16_noquant_gqa16_1tg_4w.co b/hsa/gfx950/pa/pa_fp16_noquant_gqa16_1tg_4w.co index 9862c11b248d5b24a493014844e5b9f997cd0f48..ffa2f8f171757e2163baa2a9eecd81e6e50d2f32 100755 GIT binary patch delta 793 zcmaE`igCpP#t9lsD|{wuEvt_)V*mpf%^(2bGXg0#AWrav@Dnmo=>iCsVF5@H2tf5Q z0hz?;gX*8`&nT!w*pAIjj9Ee)2N@>qJIF9?-)0VxKTMMY#FZv55)YWHB;f(1#W;R3 z1=R_JGydPaLt+Z!g^CK(3&zd!RDg=`8Tv^_jnD8;(|!hq1~5JkG+39A!D9b5&od=xuo9d8 zpO*cGe>O|m)d{eG0(>gI8;1F3sqbRDn0{Ey)IPz z0h;+1Q1O;5eh@6c1y7ltlYa%930;6D t)C~^s9srSes)_&r delta 2134 
zcmZ3nfbqd9#t9ls4+1A@Evuhl%>V{4nn3`__!k>|cN*6%53>hGWAOO|J z1Y{DU52}B%Kck=$VLLWAF=h$L{AZZ74`_VDe}-vb+NH^+Psm27LCi+1Ve=Q^KTJOV z6@1hf82go z{A~rB0&GMY0&PW`CVvo9(lfoNRtmJy$45$?!Std!i0$Vmr2%AXfY<>6Qkp=vCWsvr zBsEz;Jcv{+3F2N#gl(C;K~aed$)XF25}O~0*D(70SE8biVK0Nf6--X0bEWgu;cu zjKv_5iGyKA-efQDZLW^ajtn`8NvS#U24)s;rjZGpX=Y&#Q{duAwM%gj(mtaIQ${>52!+}$?E=+oG{V#V9{8A hWu_helbihQI1dCslo!C2KL;u|2!x2Pfr&CQ007C58uI`E diff --git a/hsa/gfx950/pa/pa_fp16_noquant_gqa8_1tg_4w.co b/hsa/gfx950/pa/pa_fp16_noquant_gqa8_1tg_4w.co index fc6bb7141b852ea811725d11bbde6a2c68f84875..88001d517768b5ef75d941f2c438836bc601f829 100755 GIT binary patch delta 810 zcmaE`m2tsp#t9ls3PBUKmeudDW&i^i%^(2bGXg0#Al~5z;qS;pr3)Zj1`m)z5P<4q z0y2rw2h~5>pHWbWupOJ57_)>p4l+#IcaUM)zResWUzjGJF;<%VNJ?R{or%HZBx!}o z`%E+@8>uL4-Xt-Jaqsz4l01j&)!2@QnGn81uL?E;{R6IlrRa^-wo`I%b7b@Q2 zfvVmDD*hz~%y5RwLRjn!bG+d~UlUKx|qV4rk0K{rllENt*f~!Ow`fH9L98ZGTIdGz{Iq}dh)>txycEU0$lKP z>NmM6(o9ezAL1Gfe+Yd8s8D|LgGdP`4gbl%BJG%N_)oTqvSa!YFu5wqjx!?=qHHTj F8310)va$dG delta 2185 zcmZ3mn(@I_#t9ls3E>m9meqf8U;qOc%^(2bGXg0#ApQ{y;r}Q`r3)Zj1`Zx51FDY+ z$R$P}RR3guMnNUQc5H59%o39M&oF5p(D;V`4Aa20OOs8Xkd07-n2lJ&<}boun0)># z_^2^3{Qu9-(6C>PwW7|3Vb?wzCZ9SNCciojhPF#;{!9#N46R@qrUpc7uvXRi*$6cF z+X^-X*oZU)+KM(!zN4U|XL?bs6lkN5kCZxt=|y!A+s{u*1IX3@u>%66G=XeQ5IZPH zYVrbciOCA$R^(_a5I0gHY#Y!9B`$=8n-7RjVf6X0L`7eRNRsJmuF2CR4aoC3*W_oC z8pQctMnEu}@jptS>qyCPz2a=$&)&qc-;8B*kW?4jh)W2Q4**l%8ATo@DW=W06hQ$n zveSf(rUu{H-%b00`4WtcSvD7Gj^uRlAPXX0S6T=)goEv=UVO4VrpgsQ3|IRP`26af2i< z!x=6MVX-sZ2!IQL8Gc}roq-`5ReTMI%f!L(qj>VJfNcTJ#)b?ziAkwB@djq*t}v#N zi3N;lX6^)IIy%ACx;eYRM4imdU`$s7!_5Z*9hjKDI81H~mYb{)BESW22?S5J3NaJP vD1kU9BLqUv;GWzRV#bsaGI>{s9g{@pG1K*Qt#mJ%!$ diff --git a/hsa/gfx950/pa/pa_fp16_noquant_gqa8_1tg_4w_mtp_msk0.co b/hsa/gfx950/pa/pa_fp16_noquant_gqa8_1tg_4w_mtp_msk0.co index add65e7fbaeac8daa36302524e5c2dfcc3b73cff..26f0e85bbbc4773f4a3bc90815f7775d2680cf55 100755 GIT binary patch delta 4510 zcmdUz-)j_C6vywK-RvgY7dM zNWt1{nq-UkVAD+FPY|JyzO)#*t@J^%h)Ttm#ys>bw$QXFZNbujYnYhcnK^g5GyVg! 
z1GD$s&pGGb?;b8^XZ8opGrQ=S40Gb2Uo_3MF5gGS#YPk~=a>#YdKZ_>dC{vAI)`!4 z#HdmY;Z6MWQR?e>=eFmuxrmiaKE>zw%A06v?j|}rr^*}7*H72v3*+d<+#B7faIZdJ zm;hU7$g2ms7Rl`wzr-yoTxKNHf!7*QexdP04y|wO$o+qDZP%AK1a&i`Yy3rGTL*})X6(^@(!K6Lnp86 zob4^$ly~Ul9Xfekr+Cno;?T)Ebn?2+*@F#%nyP%kKjtmm^^g+{r+w}(t-$?;$pZnT zy12Smvf-LN+E@XhQRBYjAwOq(G5Kb zjoe2sll@&EaPBX2`80|dzu71j^hQCCh1i!Wi(~u{lT80*v$47Q1cz&+l5X*n-Yr13 ztAFYQZlm5P%Avx7?={*ZB#qqrDQ_m-pdZ@G!jnz))?Be)L$i^G(3}r|o<@@Lw}_{wnZ! z#m-xR&o&uG&hjR~aodq#uIooj?N+E^yaVwaeRkdtym-LQj{)D>UCQbHC&2O7a;XEM zJn-RhJD&vp%CMbZ0={G_u$(i%w>sjl={$%0Xuz?-4M?b-G7RhEN8oo)TgWgLfTMGE zz6ku(yq!M;zVEuef64VPIEv<1u{_U!%QT_%sxn@I{ph4)gAKq3|1OP2XbW)fFBWe! ztR`@z{;?c}aZ%@7C4Ey@j}*VpiD@y0vhN=`nmsHfqbbW&TEP@s(+V?V37MHvTauZv zgv3lGZpF4KY+bQ6+QtHsNfr>5<8+0bxpRxeA|zR63rk|0iJ+9_Ni9+WR+AD*rbklJ zQASfD(-TQFGa8Xoj7n|MG{iI3XwkK&TLpf&RY6-MpR!0%@1&j-{?+GaJrT35x;RUedP4;07QwmmC|IGNlckg@2rSv{2_vH@K z=Xby7dEY13hhN_3y}OykhS`mVDLXs6cHTK1ebG{8T=$zPb zi3Yr2VRmtwqlCwp-aVUXX%cxbe2e*z>!^SWb9m|s_&~JreE2;s%<=8vc3#-s_R+&P zTMNb0;M!bx2o%RU3&k@8&vq7s) z`U1GlZB+nHr2w2t0l4}C@LQ))DFCNZ0Ise8!i{qs&G)tyipIR57%;#utrJbUAgJc* z>+$~!LNIstuvQjkM}`?Vwb#|Ci-QzH3vs08dpHK1DgL~kFQT7a@#8hXUmUtfstc1w zBDwPb4#OlPltx?Nj7JxWTo8?f()80h77F|$H3BPZHCaEMx&*p3-?!-QJ<^tJH$F@Ar9JFU9V}aBDl2@ zMY7QV&HD%St+~AW*GMThr6voOMok9T@hYx}t{E*g)ATLsvaYYg7hXGSX5V=TE}b4T z=tqE-U$6g#Y0)CErc>j*OCjgz%Y&k`EF7)IJg9WN(Z(2I~eS3BOwca{Lj( zPac&i5lGi#1ir&c{uL5G@=Ps{@}DG*69(zoD|HtL|J%Eik?a)ZPbv9NDF2d@pC){% zF8(Xb=YSEE>JnTd1I9lUf&Axt!tc8%QzBR({IM%aev$BPx0L){%72IR*BpOQ#}_pu zg8vZSga*_e+r)pE5twe(vH(a=z>Q#IL&ca1L zzs|W`ejhmg+|lQcKkH7!0~8K;y#%^qz7U0hK0k#X{{V%7J`aV_m>fHhp!2%=Lvb3A zaMJ*92+xp{GDrJ^B*GmC%FyGLVIW9nb4UASnDEny-LX)N&M?pyCD1eAqgUh!RiV2I zT_GR6P!vp|HxP1>DZHKpy=HGP=m!F0al!Jvy)4V$7S8|KDY(k!X8L{tzaktg2e@VO oy23JfjlsNKxKIwT%PYc<<;U62{}!5;khw1$M8>=-Of2>O3$s*ZVE_OC diff --git a/hsa/gfx950/pa/pa_fp16_noquant_gqa8_1tg_4w_mtp_msk1.co b/hsa/gfx950/pa/pa_fp16_noquant_gqa8_1tg_4w_mtp_msk1.co index 
73bb3859163bdab6fe96bf64fa76757aae67af5e..c28403522018f434f31c9bf25f7d53b024255e32 100755 GIT binary patch delta 5223 zcmeI0e@I(b6vyvojC-bXD?Rey?50Z)A?t73Z95v5s|{!)|sIIuy)s zW{9jTTUMj2!_s!H4EFO;+A(J=SSR~cDPt>RKMLDGM21kIQeEBqaqiRm{@MQ69}yNv z?)QGqIrp4*^0>Trho^J{@92^?#<1=6nz4AxEhK-)h)l{Hi$kvXZON6CIPRp!E}0^9 z!3YB47;*Y<;vhb?ADb=zoI~itL#*KndSl`W8k$Hs3x~^26(lk*qw$Ho%_wuKB9Z9_ zy;PA%H8fQVrlNYrdxwejnj7)jE|j>w>nMZnFpa(DCcI`7=QeqMpx4}j*Q&*NwLEX? zH6O%lsif0Vw;|+!Cgc%KNV6DXa#10znvg@9kVmB~t`cE(^f>l+`CrW-G?SZ#s#&5>{%5m_SI}>W^089n|lA>HDF`>FL#YT*fl0~!u;dihD~rg<2K=e zr+So1v-%#kZ4IrgZyKKq57+L;!s4;r`c2SeX7n#{&#>eOhdWHVO|xeNR{Arc6(N?W zD`dXeA@q*CqRakXOHUf(S-&MqPZ-~ulqZa54dPy5$w}ES8-(Q8`TXqfu@yUotrxFt zlPk^gO=INt`lj);aCI80&g!RSEB;bAdSf&6*SaNryFW7`l+J&3-=(=C?8k9a8@wsO zkS~vVR6a`cV&Lb`(A)z2W{l>wz-N6l_X7W@Miz0^8Un+VCO}zNkAAOLU1jkY_%|M+ z`IEqx8)*JK@XF?mT+H7MhQ{s6yCg9ufG-acqKrwxPtg1=!j%eC;~4OgXH+DMkEP!@ z(xU~JmEs;ex12A=Gv`5bWX+fx1wALXT6k1F5X07ZFM zo{E3LNtzqwhUwAj5X}pLH=U*VJ-|=Tt6a%n1BRs;)gTKO@Q2=|xgU7vdo*tV9)6$Z z&A{h0>$hV$zaDu%qXRl2z%)Vg6Tp+}S0Fp|0WbWT_CE=H;ai#y1HT#ni8h=A!{uLT z{sHjs3S^-gKLtLysv=oT0U!92=GTB9TvT`~G5-jL(?}Ksqd$OmF*IKUeu1U=VdFzRoF>a6AU8uADi6fnDT-5GWr8@x99*FvgbOoJ6a*+^`t{_LG&!hTAd}LjF zz!4!1p%8I!guTs|H(%H&5KuH6KCcWt*N#jv%VOC;TxbhzUkT z($=yfF#^*fT_Q0tPSj}d$BF#NT=|1U%tR2yAi-z|CJPud;~$WXSl`{f_sW&5`iJrU zxl7vjz0ddi-uJ!x?z{KB-tB(JG;+!`>|jO~yjpiCdX<4dK1d6!$|;a(5&BG4sQO8d zw%w|lfw?7E!2Oti*$?+mwnNtb&pxC$Jit|Fz-!}Z_*e!UP})G;x0myAu}WWMEV-zB zWd6@(iS$k|Hty>JY3JfZdI-zA7blWs?apN1zR#qOEB3=^#vZy4fJ~+g#L~5e&qaLT z#JGs z*T762($&ejaM+gIy4c=3i;vDbd}Libn#V;~j*FACyXeW|;)y&i9?NlIwa=A?_iuyA zsYi;VnVSDL5L})>aQOnkR}81eY6#*z7Ly1cJ*G2rf4e=fc$` z*DQ&&Ic`dOP10f8%SA?Fklf3S$N!QTHP9w)?rn$C(7nDwBR6m?^vI2qPdetDnTe;- ze_rJynxt=>Mt)@XCY|!*sdWTxMOE&X!sAg{aKBGh4P8*+O@FR;O&)fGRH|TWA>c$(EX;@1}>+ z7pA3a+n1FyRSPBCzFj7x=+eFD8biCe@kr+-OxGol}1=orbjiu7L z(Kh3rglzCS?HtS|leDGRULgJQ^+1Vq@Q54{BUg0G6;yKFU4J86a$S)AICi$o*h6&P zLuz1B`r<1&3`SD&N&KpDNdTGz4iF%X#T=Qsm5nKVID+lD0>`|(SW)OQn%Venv 
z^WqP*+>Uv5Sj(4U9$v2HjhOGaUlF2I>&J?=oDLz(7pVfZe;o5y)={Jg>Uzw;y-LfU z!uD%c-_DW$?O1VniTW6rs1u~a>x8K7Fy^B%^_+aap$KO1+a^NP_7LW!>I|sz6IqG^ zGZ=|crU+xow;A+3ujOB2-jOq*^O&F7skQ$J^SOJqd=m46@5=sfJO0Fqp*L@XNc;=) zwh=A2C>IP(^ertf$NbobTD}l-$28@t|2nKV{R34fg8E?I3^qtwz5?6t{#?r+#=P&i zmUm%3mNR}oRQ#KP>$JARAns5+q2EKe$%ln|UG%E)k0^zq^r$-X;$b1y3sxy-flUoBhJCzxoe#2gk#0?HvV5VL zURE@&JfEm?;R7vnv|s~?fhY*%YKuaE7>L0DS+m$oLnnIuUhGb65UFj8J1hFhe8g6N z5UlM9dHOeQTc5PBj^q-~bM)pMI0XyPt2n9aR=Bi;zH?KNva`_3b$iu6qwY$zea|A> z-IIEWh4U-~ew2FA3@cLOEVMJlRjDg1e9&@izS?1{s>@iRCG`*o=ao3BRmDa|83}^F E0g_M2-v9sr diff --git a/hsa/gfx950/pa/pa_fp16_pertokenFp8_gqa16_1tg_4w_mtp_msk0.co b/hsa/gfx950/pa/pa_fp16_pertokenFp8_gqa16_1tg_4w_mtp_msk0.co index 3258dd48c71fd9e93c9aa0067375ceaa5daf5499..233ffd5e7126883c6e3d90b684df5eeb9a568a15 100755 GIT binary patch delta 2503 zcmb7`e@q)?7{~8@X=$lNia@3l*&wk3@}tlnYq3M1w6KhAsEcDTC6l?~EQkabMShgp z60*>Z?a(75OBNF60=l>z;f4~9nHh;0nG#~+jM^Z75JN@`iOeACz1R1>_KL~$lJ}uRVCai2bmwmZ=rWR7@B9YRGw-gXUEaFXgJs^)Qp)#r@gekHQ z-Ts$-ct3prBxXFg2kQkBkEm}_=cjK{A57B+lW#r#z0S|SM@>!JU!?e>`F`Gq^E-8Z z`k}f>b|obU=p20pJ)gCMo_o1KhKJ%Bjn&k}#{A2yl}gQl#L%2q)O2xbXpW|( zdcgXQi<1^<>8V~doBEa&z!wzKG3QSCV_0u{SA0U(5#b?jX zovH`SP$TfL3x5l~c-xD#Ov_6^!A-S;e!#kauTgSUDPRgDyuKFp<9)5n4^dX1#4YE6 zaRAvrB7AtTy1NqG02!~X`FPZU_%!ga+$Wcc?Hbbnu6i%XBX+H5IpL_O8sOprCxA_f zUJhhec^WvxCH8|mTiy)h&!sp`LVC!XJ{|Ff+cymS$eis-T+j{&0Qd1Y7*IinTABqp z^-*6X1Y9x?wpzvA{sbIUa#IbkU)(JC=KcZLVZytsVc-0^uy46PMRn|7M;$+3mpbNN z_`e;i(x4S)ytC@#QCH$+O_JQp(YTP8pQgjB5nfiCU``Sk0 zNqqna{Ufe5yr6E8H~0(l8=dISV)%*^%T&}Ay__dwKDZ>;*@1c6h{O+xYT0<9FKUAs z^W-*pgA&Z$`{evH#G#s+2{79`;6!(Uxr}ljedSemeWA-~!Kv9{-i1>`S!p>*Elw** z3kwY-wK&ZNJ*OD(I4eLX*^@z+oy73c69}0;A1FBpgO3+2<@n72?rs4i`>^ zqmfgZ%3MU)=(3Q`#zF@mHDK91MtOh4J@h*df2%BByL_(m_ zK9h+QSW9hX^q+|>oVN?!18}C6Y4yN7%?$Y`s)-r(KueZWgVFWC eN1>sa$?k#0FxbqTL{ZnujH2jiWq$2}`hNi95uj%P delta 5001 zcmdT|Yiv_h96#r*k9Br1_L4o2Z6QzwBYkg$j&yAq@(4C4GB;jBmY_^21V@G0mU6cd zLZs{pU4#!L;2>mC$c?>YlHD0GEEZOB_L#a0?(FH29MqNX(~qNk@Oe42)?B 
zAbz~n|M4UK^lNZh*?;F?A~2p^JPtbharp2!I3|qVi1XuYXLdN>nIEQIxt9xDr?qkW z!KHrZn}9oGY~v_`d)G6s@fx_X!8eqF#!{bP#(2cR1 z#}rlQ!Pvv&_Vz+L+OS%t3p3o{N~52)KzHclH+abxsHMRR3F!HLW0bB%YH)5Wuq zTwkA|j9JyRP)+w#=xO7uAxH6vra|()@0GIt~h}@5xubQAmd;8d78|4%!7Wb zn(?ifo|aZIGeXbIyGZ+P4$}Ui@921a-;1fkfLiwjK3{kZ-liKHm1vMT*GS1NC7sY; zeWqrcO5uiMuI=oWo0f?_b?TY(5iiV~_b4U2pJ+%aTKRHH*~;&C;FZ&O&nr4{82?d) zEBAfof~9jGQ5?T|QES8a44$a9`^<&zi_f67u}pZMarDU!f#r2NCoQK!P!{96dH#iL zZT?%QwHb#C5|SF!)4?Kx%wvW>(AmCDcn31xVcBhkR~EY=2A^cddZxg1=Fr1lsCiqX zaI2j;$w0(3KbHL%SZTw#Qx}bo{aW#%VR$7gLhl;CV$YGmv z(ODSVlm+#rj!}x^iz90A>=da6SF`ur(4m&Qb4bHueKSyNUM#v7>e;P}P%AvP398U) zE&FaYItbZlBb43y*b)zVBcH_~2h-D@EEF|b0_lO9b+`pyjn#L+t1n^?^xx{zvc@Q^ zLb~mx3EB3>c634|4S|;}rR3=36(WcPm1~?i{+`X2;odrqAOiCLUDW{L{<88q^gXwU+1HX z&!WodB{0suQ!PITufLP!D*kUNJMbQI!H|9%f0MfO0@Q-bzoj@Hy-fJr6a2s5)Kw-x zE%@{G6vv|q!k4%Nko>A5ylcJyLdgjanaFh|DFLh^{K>6S{w>1SZIJR?2tP1s0ow_` z<*SMIIR73pk<~SE0)RHcpWh|rql8yYvOiAv2Y03R-Gs0DQOaN7Z$m%}vL==E5^;I!cU(g<;DiNA(MiEvfZA~V7(qM_#P7H8d(bjA?IS6a= zTCEs{_{=7&-IJolhcMUHimcgfTtW=+qw#*co7L-ai_m0Drn6c^TdOI#IGd+ZG_cvs zVg!f7YpEq6sw{S~5|!o}F@)VzEBawpOy{sCS6AgV8;|Wm_V_MjKuFaTD{nyyGLo;m z0owJ<$ z(`F`fe|&z=@9&)4S$5wiRhQ0G4F~b1OFLJ-mT5YHIp>Ewxmx$19I_u}_OEc(^9$G5 zXW9$nCIVnUo)h)|< zir28&zP7x~kP3e`$1ZmIjfw7BHU^jp=mn)WjZ)ULQ z6d+UjJMo(Q(?XhdV4Vym?q zmseZqyV0$Djdi&s_x>6pw-4nD{V2;+Om{dHADX2V?G3OMH$7`dG`x9vu;^4JJBZI` zcTgxd+Sl%|mh_|TbQ61TLj^0b2L*UXy_MRJE-XbWE%6<6k&%D8qB0irlsD87<>@j< z^Q%j7Z6%%Ut^jT=a}UN*TX&mtz&ifiM^h!Wi}UMN;T87T_95%Sc{J^JsywmSIRb&s(EgaHr4Ow+C1A*4P<*u*&beaIEylxD{pEw!qs5 z9@XxX=M4fVW0jt^?2B=^7d8je%o{K#^;-16Sr0 zdI|hfnP|n{7K;YS8wEY|Lr7!ypX}413v3jq*=qjuXIx68b1Yo?wc%s4Y)98 zqrY)@hU2-<-enVxk_0c4I!^%K*h`UXya#++Kg-Vp-@Ak5mw-3g^-KG=;XlFAIl?Af z2R`%)%PZWgz;mhnEUyFJdx+(Wfh%K_d;M2}W7A3MaE%b*-UQUy3cUGm6uHJ);JuS9 z-vB)LAZ4>;1^ z4r!qSz(*Ic@xKGU=8G&J2Y$s{39a)ka8CB(z5Uznv*2iI_8cVRufWg5S$-LKVI9l= z4g6FG%gdaPHTGrS_H~A5xcT4^(u{+v2j2NC%a;Q$_jXR^fRF8G`5KpJ zc&_mWtYbYSOn8Sx8$1lW$vY#;Hv`{$kgfAH@YEkzo&o;)A8k(eZdnu25s;o 
zk}$~f0&qcL`F`O0F0zde1AqJPCGPe=4vs6+C5O}f6!3y~sC1zBfw%cxgYv%sZ=5sG zC&2v+*!WrO>gPG(cGltZIW>9i+%lHW13uKm@`b=tt6AO%yf?=3F98p>d51w2tOm#A z18hP9`1iY6o&tX4+bsVY@JGE%KiTZhF63*nO%vu0T3#(U=I5wS;}EGTwr8K_)_VNhO8# zP@L${b&;I29#O~;dQ1yJH9eUi?ZT2ou3%URk2q|RBh?XGoBau>%xEQq#OydrWn@;2_XYgH>^sA7w`REK*7Zl*Dkpg^G zvMG?Sn!$_mznsAfap7$Ktr^^cQy=8?2S2~#R5UVy81;)>nSAXd!{SSp$}fZ7 zYS9!Ca@Eb4-W(9XPqqGk{35zr@?N%};(zCm1wO^F`h1{mjUy{{bfbn%JFAfXqB3-?ru$+Okj<`!vHWEaWk!BJeOS zC>AT07-2$E<#@a}h~=OvClbXWEQeINr>8hM{^2&0E^^sN0ZX#2BzILhr#o$rTho)t zAC-K@WRzq}S4Hw<`&;rr>4z4RzNtWry(mY@TW#o{D7AAvTV*mpe@hoDf4tFjVtMwn zfvmjSugBAPdfN9TTRa&p*=RC9G{0FMa~-jo^wpbpin#f_e=n!`CwsS`<#qpj(4SRN zem#*^CvaE7|1?YWM7*65pd%8~3W zF?-t8(ItsFS z&*JCr;?;Y@rflBJI#{{vrF~|bR2Q%}JK6H0Rj&U>u07>H#(tb-#ZNgL6t5x<8op>g zDBeXQ=wz;i7qop;`hLTn$5meIWpB>+-+o?ch))DqPt> z>Ppd{uPtZu)Ys&0v$^pY2Ffd*n@*(FA&_-*+)A(8{v7q?cz{*ve}Jx=eZ;75$C*MU z|17fFl@19tS(L~tDA)dEUmrs3e&z8eP`Bw##>=S3EM2cS-$Vg6I)i^JTl|%)eXuhv z+3N4UnXg+l(a!LfXkJ11_KdA6eGd4!5Y0v4foe^N8d}}X&g63j(uA$rzjkJ5NuVzS zKkujQTY>l1(fm8W>lzkwoPQrEV&7Y=V3=;;Z4c7C7dYQc^W(s`e4FN{fHy6f=nE>3 zGIr+t1=?W*9Q;qy{1xB>nW1Uqb>MRe+Wr#o`a?9I0`A1|sZbw)LOe!0{0I1>{WM>t z9R)jc@Hv_n0iPPC`3B(QV+Pmr-vNs1*9?Uw2qb|;6AWblxHw}VO}Gbm-Ct?`0PxDo zG~dN)`R&Y!Sz6H!4ts9U{D;7uOST{myrWkW45yz0Ke~a#K&`a!X$n09y; z_=VeP{vz;LHOA6%07kHFh}y1jA#-vq^2P*;$O-US}fXG<_M3%sq7w!Z;kn%=8yY60E)IbO)w_$81UXhH17l6ptoq) zKLb2>hE9A2c+=Gk*Ydvviedfc8G-%)yw$1+2LB82{7=yQ@4)Mp6!aeO`C{6B9%=pC znTASQVX>&W?D#E0^VPry1)8q|J`kdLC5iJzntz_exnI8-MrU6DMWcSR4PFC0&`t-c z2Y%vVnr{bQ*Gcnl0H5mtPVWDAK{0lPcIW^;JVNtFfagC-^TWVrPSX4r!0Vo(dGs_W z24q?x1227<=A*#p^}}II=(oTd^uuBB3&5X$n~t;9icL_qmBD{XPrdUFIui{Bnf6^@ z{d#+|+b4)1dTPWlh|U_PA&PD%p)SD?-6Bt@=rU|QUKgQ0k6{oJ2<3Q}79ol=a~vlT z<60vf=i=NzacN!CjuP55xECdD82Oq5QKt1k| zhZNxshlwBWbd!R4mxtT~>_%$m1s`$ay|o+!<0X!?&xZxWpf<=w!9w7|#6l1_QiS03 zkYNfwpK&}wK~j&0=Sa^&$Vb``Y6TJjw@e~<1fC4gBRat?+U${hWL81msLdzUkim-{ zAL-fa;z^@kcbLr8>v55KyrMvYc_rgu;s~TlZ@8A&icUdbqWa+%T_PE#$V;S5Q8bQ~ zC}j*n8H1WIY1Efl2_G+!(tMsU$?Owr$s~fDac7z&kB1cG3-P39UrjA3LgI|#CEln< 
z@`gwbN%E2iQkWy7l4_-p^3W9e3PO>I)bFmKI~{oEwFgAxatci&PvIndo`nBqE6Yj! w{t8;3dT<)8Lz9vt zkV}j{sQ$_RjDkvp?bzJJ*d@qukYUokgACL5ZPpO(V4l1{RcZ1kRfEZO;tG?Sj1@M& z6W3xEXy*ft66@p|l?xn+Oi(32 zgEoIsRbd93a?aL?b@B?mDG=T9zmL?R_Hv1W{Fau38V&jL$9hW%BW8C?unHYa&(V`Z$` zEa_*+0@6GAob9yD2Eh#+^%Vio;Du5=V49tw#1>Ut94elog(|KD70*CZuM1P3f~wvE zDqa%>W(dP&7l2qy91IJ}COgG!vvRU@WyncPO3jHkFmrK%F^vo@VN5d@M<~tjPY`l7sLy2!Bbb*#w$lV3%fNj`wq z91oy1i2|&~n5-)(J~<#pg6Toj^MKfK%Bc0Xs$c}p&PgQ diff --git a/hsa/gfx950/pa/pa_fp16_pertokenFp8_gqa8_1tg_4w_mtp_msk0.co b/hsa/gfx950/pa/pa_fp16_pertokenFp8_gqa8_1tg_4w_mtp_msk0.co index 1a77f541c0364e3340c274d53f11d074ff3dcabc..5622b19e0617b46aa23871b84013fb8d79f332a6 100755 GIT binary patch delta 1929 zcma)-eP|nH9LJyMxl41EbW6LGrP|gc>KLIz?eCw2IH=o+r(d{h6w~R(H2%2f z!aIM|BCI(*7&W-?8K;Ub1-(S8b4qsIn%1quvOXuJ^aJ3yjz6sT5Y*6%FzTGZr`F5l zan})9`5m${H|Yy>kSmYef#gy!BjZHVq~oFGVI6=Yh2KqSxadar>?1+%9a!S{g@k`; z`AAv)kLXJJ0xgI-wp-lp<&ON=oaSMyNijWwfI}zeO54&LM(2f0E%X z@^kbYqM0Vh;mukFI;N7Tp11D}+%#sF@Z?sT{Y0f1dR8t8J-F>*iJqdD#0NABR(F4+ zJcGp3!$xSi{~98L2lrd>-u>4d8<$o>UzN*3cgL?8m(g&vRVZVBi*BUjC0U|Q?G^6O zQc5?Rcv!nd44WZeCPT7eh2sNxVoNI)m?5>VP&Rn*J35ccf}1uN=a(ZTko2w>pyeah<3IGDo^vPQhKk4NK7L0Kg4WAy&Qy&op)Dqzo(St zK5#Y0J)f*n-lyRTPZqNy-tSX})uAf)_PZ(FaN@aV>3u1^$@Appg$paWwcCqTaGta- zQ)cAWZttd#=iG64e$EQ>HOXIh^1q=u+6sdesBemVE#tv5?qU4(C&d2873PUfJ763>y z-rFzoEaT6X^^Y-r{SC4IG~?5wB0p88?Z4QI@p8kFDUA~ywwIZWTs-b8r zmjN}%rQcta0kzl=2?uyXIO1cnIU43>zEHr=sIMgyp)zUgti~eTg5plz5p65da5To< z{2?x>;=O)fD>tUJjrVp1ctdkrke_gKBoqd_c5G?g^W4xr++NdkezXy;LkKM8Ubr9D z=Dv2py8DVBxK+77T`*9cU?02$`v7KhQ!dz;+vbLKm7INGu7A}HJ;=2>H|~ZVmHBG( W@+XSkmASPZy2x2$7KRE&5c~_Vy3(Zp delta 4764 zcmdT|Z%i9y9DeV5rDdsFDj;;M>j*SZYHdr~6ucVxhYsB`7S@sJYADK#ZbbMnV>2hc z)s~pW0CLGnCLi3NOtMXazH@etaDG^#iGR{9eBgsl644n=mR3K^jA*^PzSr*fqBl(F zE@^+y`}}_I``&xc-P89vF-t66BPN>gCEdHmbJ4Cc3~+}KftqgujxFQ(T9j`S$KiY# zNO48nf%@Sr{4f1bf4Up1sQ>plU<|vgsmp=M1$KBU2Tt(OA0x*!q=u>{t0}sp^2^O1 zJe*+O0`m)GFJK1r2_}y4i~0op+m0(|zN^A_<_tXs0CKrU0L3)RBO@gEY=OjkM*Mhi 
zqy>!LXx@v1W{}{cP{YL*`H>MbsivNASHvu2C1rJ2#pvbz^=8NQ=GWMh>gmxMf#bT7 z!@;1og~cr#?(Nl@SZw0(zJ1!skc~Fi=q2`|(~TB*w5CQDI;2c6iWtE-i8vDh?=ZQh z|8Y4xr=%~Zo*tXJNk{gWV5ZK9NnKaioq}myoCs=hw4RNX=i`tGqch$RO*DQhaV;?2 z*dY&pvl?SqrTI@Q5i1pq%vaZ^(QY%V(aK*Xp%K&%-M>h?9i)8QSF3S`6KZ>$94DU! z%+Im}v;A!rA0tP}V}QxR58FS%rRpDcFB4c6dc2amvX6HPUhE{x@O~Tob;s!k*`-X~ zb+XWtoANHDlb4c9!N?wiWL4GEAvsMAsAyvUVsNZfT*@Gmcz4p7(Ise^jT7uzR>D8s zExOcZc`3Qn`ZQ{IbulV2S|UV?LP zD&_bjPV2JA>94&FbojzWIQ+xsIKAZ8NgNMmMoA@pN)OHB2H69=xiWRDWso@G!b7VS zdGGpLSt{NI$EM;^pLeL9KKf8_MA#*6?5^r`2%~4R6trKE?7zsjW>jJMouX&0Doe$) zp!uqFZRQqOZ76mV!tdZWR+}^Wq4kNAPM=lP%kIP(P9VXnmdnLnb(^4P+mA#2P4j&k zR)~Pqt?6>?Hr$fEB6SSbr0ndE_$>meR_&G1`9AK2wRos_r~vcliX4X?L;SThd_Cfc zT|6j!J&t&nyFl^4h4`?E2hl>-hZ3eW9rEKUz|8Z7QXV{u>R0xP`2oatH0NK^h3pXG zFO3Tf7uorbphVXj_a*>HAU^$)n4dsAxu*U-#D~v{>+=^(0cuW)`H#3}6ez&JpW?(k z-&BCeqL}{z@zk7{zk&F{HTB;kKC@MPC4ND?ON9aRy+i$h5|00#SCI55~}vvd|=z=~Ll$5(((SEs<*94-MmU0wlpdOQO3dD{@QTiYxMTD>;E z0G(DF3!~i3R!hKMpe@CdD9mB?`<$qt)!`5Zw)q^w`Pf>W!W=9soKI_@)hBfG*o9iV z#pwrq&+T&bzjF8h-L5fx`pl!40mr+_lG!Rumz=1=)SJ45zlCa(=c_QMCWYE5;cug| lWw!`zr#{{c@o5VimS diff --git a/hsa/gfx950/pa/pa_fp16_pertokenFp8_gqa8_1tg_4w_mtp_msk1.co b/hsa/gfx950/pa/pa_fp16_pertokenFp8_gqa8_1tg_4w_mtp_msk1.co index 489d5330bc8ba5ea51aee682ee4f99939bbdce4e..11efebcf9d11bf33c52e30eda6a19c679a4e85d3 100755 GIT binary patch delta 2543 zcma);eM}rh7{+I3@3IrY1trq)DSBVij%hB<{z|9z*|7l4IWa zz3)8pJ~In@9cPTwD~66@xU2l#lKzN4n*quX8G%Ke0)?h1ybR~7>9I*G0~2^8>!3XU zKkM*%v6|Up`|mwaD>Iy7o&=raliWd4=GC9)9eY$#z+N0Iz%%F|{Ln@UHm1&LM(fEM*L&$jAvBFxz@ zrZvr=(%ceeo$PUgIejfDDfhM?^|=336f2DR`H85cDDI5<&iZx(=^HjC?R;B~Px{(? 
zCxLVe&zN z&6VV2v?pc_-JNPyPM|K`8e2&^e1cI2xpraeXW%o|Z+<}f>w_Xu+#BeL`y z`TNtxG<7h6+5wtBi21$^+=hAA<8=Kt%<~?jxd-#!N7W#rXG1t)-LPN}<~N_wOVvP| znn1XQ<_B@TXNUIxhGbvHJaLu~Wt8`C#tF~s>k|OPFh5*R^A5}{hiLu*=A|2S`Y~@h zO4najrV(HQizn#Bm^w89|2s6lhWXu(X?_E9a}UkuF*kJ4{5Ix)e5m?QCV$6?gC_du z)-XS&De4P0sbwb6f^?(?R?IJq(tIoCKTPUeyZ;iLXxy-Y3-g{?eSsSIF~9yD&7Z)$ z_a~ayVLtZ@%?~i@{wC1%7oBLt3#QfASTgq-=C`%Z^h7J>L%C|8^LEUaHq*Qt^WMk< zbmBZttT60Y0c>js#{db_8o$t_ZHBv z!x7TeoOb=-oqUj}IbA|fYB#%^bPKpD-h_|F1U^e^ikJkkIF delta 5393 zcmdUze{54#6vyv-{b@l~>cB8SWrMLFWxV(Ly8iGD*LEu?4J=Jr=2D>$G!U1##2A-F z*HT+tB538QHEIlo2qT%(?nm}c?bxljfgTeQ zXdd}5=b`!hT4HwPKgWP+V#Kjv8tfVqmrqTDx8$)U&Asa29I3h7+!Wtjeq{c>Sy|yF zaBz%k2g0(dtdK$4S(W8~U%dpbn)3!3nXZmj128>Z22w(;si%(vUyN~Nw6BGX_0@yz z$7>tuc>TsPt-mqW)+QA)B*QIoup&2 zih5DkOS-+i!YOK}q&qq)_(N-=I;TMU+5*Wt=WmHWZOsanv{7g@W`rKFLzu4l5BJg) zy^Xz8!l9^*faey}5juCpCz?j#_yZYZtb#3Ro6pZY5n)s=BA!am`#GU7*mK0h4zTVA1`&it>w``e8T)9$g zOBQn}wWT|#(5KH9Vpp%1M1|z8bawp>gU;)wRH!^zsJwI~BS=-kovGZc?Mzk!Kja_c z+ddqG-P=a=-UR7d^O5}nuy7-8?)ZJ!^?P|P?IC)@r7NLI)rBx-2u>0%Hh65)S5(AFp z!42qhDjH_WqXiU(wfug>&)mW*5jU>U`Y%J=+oa`Q#1l?gBvoq&8JxFFP}a48fhSZ~ zSv-#Xr`BotX2esEDId(S^%=x#@|Z}Y7{3=8W)^diNfz;`r(~i2${}v;Qjsj)MO<3* z=GR`t%X@C70LTZK1QtMLw1$Id0;fb$jfWA>e5@i_97BBb9xeY7aZ1TdHJ(AdZJ)e< zm4iQ!A>z;;-3`R8zZQKZb&u?30i$1Pc{$?oueAI@#1DO=a^4E#Haw@wr<5 zHsYfj7ikT9kipxaB+7Qj4S9Z z$^ep7AVg!?=xxQ4absVq(Hq3HHK0xl1$~IJzE%oJx`o0Hw1;wuG$}=F}{I#P$zsW~5IxeI4m;K5;u$uDqK4M}1u#cEW pUiRlFeZ&fK;Ng6wpIBilJfb*t1I6iOKQYhNraV-@Ps)%0e*=PLWFY_m diff --git a/hsa/gfx950/pa/pa_fp16_pertokenFp8_gqa8_2tg_4w.co b/hsa/gfx950/pa/pa_fp16_pertokenFp8_gqa8_2tg_4w.co index 33a1a09cc9b7a54f07089393894605dc9f3dc1cb..fbf8ff45ddd15697e8ea53a562d2f4a117892935 100755 GIT binary patch delta 760 zcmZoz#kgS+;{*++4Zah#meqGyFn|G!W)Ohz8G#fV5O;V%_#IiObOD6RP{9giK=m;J zxy0y$>Ywb-D5yl(j?GPsU4k4387A#J$S`f+W)0y2=E-}Ml_odoC`{H<5ty7Nt}uC$ zn%?Fo;u6dP``KIfvp2Eqf56|6~Ilkjl8t95UY+fyxv%`^ibMa=d4RN<$6W ztfykaw0V-69jl1)pQil`3=Lp>m$?aU%H}4W5GJSzX)Kd_jJI&`|3fte=p4JvE~eiY 
zfhM)FO#WcKMN#oj%YKlF?kzU`Hl7Uo&oQ^|ckckQ8*SQb8X5LK-F(z$E7;h{esa?` zCpc)ZFh1Sf=>itp2IQ|`-h9UU40AnDGYa4V^Vk_qSfU6sh(pCav{1#BpyGSb)aydU z8zNBELy`tN!cKMaza+z>3l1y62XlV=5+N&bMQz8}!k$N@`r qlXV5fCxu8b{qUMB6=KJW0#=Je}+l>fW|lcXP5@2U7BqAglvQw#B9VGHj4-sF#G&h z@KIx6`2U}up<%xoYek(6!>)ZcOg?okOn!A53~iUx{FxZk7+S$JObv+EV6Ce2vk_?U zw-sy(un}npv=wcd+#;r=XL?bs6lkN5kCZxt=|y!A+s{u*1IX3@u>%66G=XeQ5IZPH zYVrXY#mPs-3`y6fG_(e5 zn?{EHA2)|uZY4Dp1FhJx`GAcE3**PlFCECyuC#fdGZ#nw8Xr(0i2yucCOg9eYlIkx zB@Pwm&_)$kf{I^3Q?CmZ4~RikZvhqW2>~;N;j;5UEG7fIZi-p5(`+3F$zB0=Ocnl=NWS$S`T&L56AjHfsoHFi*ZCrZjnxw8CURaf8W8=6aj& zh)XaF>}PM?&)&qc{{eH${tAYk{gVTvl{PPu4q*Z+QP?b}pT;tIjnWp5_l!_!m?8g6 z>^7&Ud}G|4WZuRySwMS-i1MGN{R|8ZV0@Rk32qwD&bZAtbiOe{O_XBg;Qxne+U9>I zCQLw+tXKsV|FrA}ndIJL({JO+u>bz#wKnFA4>muw*$Q^jWI6q5n;RTNIDlePJ-}ky zHXHb?;I23Ega#~>;sMj_3K4FXzF#L;(HQM)muQte}sS;&Tv@> zi=Ck&0WJh)_<>1w2A)Jz@m(M;69>bIqRF!&wgp%^yD;P=CZ*=Y8<@Hoz?epcaHgp% zT!EXT8%%+-lPiqrW{F@r8*g3^8NkGJ#9?w_wA^HY7y&ML$_$#U6=No3Q4Dd8MKFXu o!85ri#*E1#c=D=Kmu&oF5p(D;V`4Aa20OOs8Xkd07-n2lJ&W)a~GW}p8G zK57gM|NrwdH0)Pnt*EnM*tO4w$*0bR$*)d>q3x2IKNEu*Lo1kusR7X%tW|Y>HUbU) zwt`IoHX;pywxUgwYs8fFOfRaH0&Vp1ky2+cy{Hai`}s*}0NENKc0hoXCXlTOVh06D zO)gMToV-`ekaTSZn`Oi$m}9azTKBUzvFtZvY1v=F(6j%)5*7X9B1MjWHn&QdGLh~f zrOmfwq*z0wn4lgR_=wqTt39_G@*#CX9t))5Ro6V(`TgghZUN);Z zfZD&hkfVLa<^+!w-1SR*pd})d;sMj_3^%M%#l@lG4BDvTN>K4LXzF#L;yo#->Mfw+ zGa|qYXSghc#m?Y?roj)Y;SQSkE|54A2g8iA$)`fL1voldGUOyCrRKyNn3vt zkV}j{sQ$_RjDkvp?bzJJ*d@qukYUokgACL5ZPpOZV4i$OOlh*7c);d$;(5#hv$|Ktj3kgUSySJEa-oBia(I60m%L8TxX4K`b8s4#8r(~{y8G5yuF zpMjwPj8C&Rp_))-0yN>C8PEiYzi1|$asZkj=PbslX#A^XKgbmK7Mp$>Plo+_SX=kI zcL3RqHf=VI4EyhF4)xi}G}%vX+vX4cEF6q?Hh&EV^82*5ZJrSAz+HbP0ve1^iU&-y zGc0gK6&Ht!XK0~{D?!DlpsCk|il4|wRc`?mk4OPCgyFKwKrAK>h69b0lhU_Yxi}d! 
zTyl!+M}$ F@&IOGrWybM delta 1704 zcmexxf$_i%#t9ls2QnsVEvugp%m4;3nn3`vt zkV}+4h_=c8jDkvp?bzJJ*d-|QpJCEIptgqp4Aa20OOs8Xkd07-n2lJ&W)a~GW}p8G zK57gM|NrwdH0)Pnt*EnM*tO4w$*0bR$*)d>q3x2IKNEu*Lo1kusR7X%tW|Y>HUbU) zwt`IoHX;pywxUgw)Rgp0FRGOSE%fn`QfDx|s19QL`AKO2*%}~rK!B7ckgW+~2L(w@ zR!}pVd`?V(RBaDrls5B;`!UD3^R(_~Z(`Z6!q&3Cf}v;sek86wCG^~8Y2$z}z$FjCz=85AMs><_VTjxexbOt}BEX+JQ6!B~;4X|Tl1 zF}YC-yB Qt_(X)feeV6^+3bq0d>~Am;e9( diff --git a/hsa/gfx950/pa/pa_fp16_pertokenInt8_gqa16_1tg_4w_mtp_msk0.co b/hsa/gfx950/pa/pa_fp16_pertokenInt8_gqa16_1tg_4w_mtp_msk0.co index 3b787fe97b86a76f12f8c82c97beac33e912819b..03e31ba690771eff9190e5ec5f558796204ad880 100755 GIT binary patch delta 2509 zcmb7`4@etV9LMjyBqkowsOUOYMe8=rsw??(`Qyw+{1a_wS!I;{fkC?34)ITwvK8to zR;NRRZJjrIjMh$6S_f8Q&v~xIT!og6qO%kcS1S#rjKX4Rwb)s4+k2OHNzMxE4wCn~ z&-?v(-|vMazI*ij8TynFRu6o;^L$IN3sU%!S1L)mr2tyuFz+Gz6UwzQoJM7ZF(udG z+vlkx_3S>7oc-T3_%<+>qMM|;1C!M00J}f+>h^?`4)F`hA8@@#i5r#xx-)0_yqEI zpWM>0w@rH1oihtr$ORIC5ohpU<$6>+%nQUB7M`|=6m?R1y{c(r8`5)e2A7ZrHaF=l zoQ1U;(I!hS)a0Q1a@>yJpSFv*QF*)T8`mC6Y}9m!<((*Qb$#h-rNlA*&GO@F_U@B8 z-oLj~1IHluTPx_|Z@l|Jg{EN|QuDr#j4HUC#ozMAskV?isG(<&QCMnqJbNTPtfYR$ z3;5AmNo-RB8~R$uPkrtLkXJ_!+T46i#UE`ABpS}FM%5mUkEj)p+GpD2_n|fw>(Mrg z{iw~&3zrjje81W*;#RWWPV@+3^%nCN$JWAnhx|?hEY=7}FAeyIQ(?iv-cFF8$R`D} z;((qv|55vlU_n0&{?f9+Wj)=nJ*;{>Ax5=Fji2rdV1b)R1kz*Yo149Wn{NXrh16t# z3_04`l)yba2Gqid(_jh+`C2fq5vGj54-6H{Q6qjzP#7U~&PfvnOTj3BFR#TK@H_+R z4uElxy=S>y#6x6@dF&X#^n zgISGmwi_bvP&M*S{||6M!}?OxJ55T|yJ-NGBZo2|8o9n29l6;5e@BMOp%Z4WEwzhy ztnB7|s?yCGnz&g3AH=xX=!HY6phn2}6s|?wJh6>Kj;OJ)&NRU|2;HE?RZqSH$)cB5 zEv0*;fU=#?#nTI$~ZD8td~gjc$yAoAuU+?OW>Es>29vQg@eASrl#aTwxeSGk6o!4r0T1aTOMyRR8Q9!84VPoXbVkwbBhs{hvv)L1&WmY?( zsIQ!0aTLoV%pRr`CA8qlX2xTaTX>ivxz1w6892dGQdUetbTkhMZRRq$jA0m)TxNGV zNyspf4t8^qOl<|F@*#>WrN#0woy9g8I|`(;;mLLfBhNroK3b8_x1>U3sOID SU3fM1ao6$M+sDzDV9vjzGNQ8p delta 5027 zcmdUzeQXnD9LJyMSzp$&xfeE&wL({`6f&S&ds_!(-r5dO2@qfB!b>5z;KZ<)NVZ9~ zqjU=)i@cok7Su2!=^(tUc)CNf>=rU{g5fRugCLSJCr%@XV93DWylbCpcl$^0;2)l( 
zd4BhKe$Vf@->=X2p7uu086(5SU3L^Xx_frG$<~H|*rXn0#x4OAI-T&bABd0J2jqQi z+znjPV*}3(3;6$$nQH_2SZ1jSb zi{v&0NYEVXB_FZ(Y8x6?dNQe13$uc&Jvr2xg}FhR3|Qw97s&U78=|PZazI=t7wup$ z&nwto(GG?3e1h#0?Ut52IuLkC;S@F!s7;~UO=*TMV>7Q0YWPa}h-sF>D8j;4`f~ME zcFUL-&Kc zP{-KkRc(;&GR;)Deyj`)t&gA#eWioCmmjDtgJqw(GFZxbbomGGhj_6Ri$k8Bp63C- zUE9nr-Y(c%JuRNCfIrDT@YidURz~=e>E4=dHo{%e?5WD6-OE2gClUSIsU2)!#my9j z;n?VzFSTD(xl>Zw@`Ln@+^^}rYXfw2=rrB3=EUQv!b@7iyvcC|w$r<-)hLj?T-~p< zl5|{sd!Nvq(Wu;RELDG9>9SmAI^2@ZMtAsO`rCJtx@)5gTUDjr(5f!=)w_h$oqwxM z-Lg+@>Qsb;`ET6~@nZUt);fe|@OZ5)W3L^%_YhihXTW9bucs7;5cFlsP3dyPw8i#a zi#(P`TXfi!YCLC(i)tjDK5DWne9njubgn7_uR_-UhvH*J)D~Cv6}+Fr6j)&@_q`4t zhq^mDm7A@T9E6BFY=nw$L{wXJN`~onQfNP?t3BYjhS5hr)@zk+AAIU@j>TxpR3@_>rg5no&=$y$ z4N!xI^qjEQ@RKPL&*0%^xkrEn6SW#;dPXCKF-Fgk2c zr4ifdy^BidR^(lqD;-X9C;HC8`96* zNQ#wU3P;uDFKAVD{$Vqx$wMxr`$;&ZRQ(?31cj=tLZBE$4f)^Q2=QViBwgOhk?V3Z zx0Xcr9!8g!tZ*}Ds8Jq75MXqUM*MTiML$I(Y`U^ZyhygRn5hS@yGbTmQ!)SRxcDD9 z*{m^B4{Wa|nP|=-`)Kr`midO6>PxvN9YKgQhhr<{i|ANfJfkBOH}Ko4$BC}|I# zQ1Gt2)Zb#>K3~o+Vm^xG{71}NC-{$GUU>`(>yK-9@WA=q@<9J!Zayx)$4E9q%uNpp zs^mNe^U5qapMiPZKq7v8*UZKPOD9YqFdsc44@6>a3CVen$Wbb=4=}esL)}eIX9?K2 zcHNsB*En51KQ@br63KM<9i^B$O1v(~EFyf8S>i3kW?@lj!eNoW2vZ@hKmY&uI8RV1lXn&vE(90+eB zjdH^8lG1nk6VXb%3FqQcLW(9(DO!1vSE8kEmz&Aok4oqT*$cRp`_b~{PrtR24^+n{G&45O!*U&3gbL4sZM)~v~lxf9vYl7o!56O|e24#&U4 Y+St&lFq*`4cA{y<5zN1Vu{aR?3mpTSqW}N^ diff --git a/hsa/gfx950/pa/pa_fp16_pertokenInt8_gqa16_1tg_4w_mtp_msk1.co b/hsa/gfx950/pa/pa_fp16_pertokenInt8_gqa16_1tg_4w_mtp_msk1.co index 3fcb1c684e870d62bda372b6231fa80542b17357..958dc23f7ff068ccb06fcc69f762e060a5d729b8 100755 GIT binary patch delta 4488 zcma)=e{2)y8OQIvi+zb}hDk%Wt}dmilO}1Coqa!i-m)qF320G>w3KEmY642vA9Dlk zm_UeOlQ}F!bn|+&E^De~DWXw_kh|Nsf@mD7WLw)2yeXp_+f;_A7BMzx%S0&%?S4G( z>%7??nma$&=%J}Sm8Dw7E=DXX& z*`T|HzHWBHn4${|IA>-1f6l>qZ7o_>|G#VKL+D6t^$F&=$rH>AlUh&R^c^#G!=>*r z$0ys@F{QwlhD$lH*M57raLSDu^xoxLSZ(ECz4r8AxBk)KAgA^Ey)}1vhfDlWRmrD4 z>2EHupQ4-k^qoqTK0a~ArR|Wsny+b&5z1q?AleOPV 
z+(X(X|GnB<%nUQ8^y7j;}iAzWc*#xssJ)Rj&Pq?Q57)uY0((YQ(g6weN1< z#gtm~?{=+mYvU8A_2T8fRB72qecBt}7)34m0}ole&Eb0XQI%siG#ayR^a9em(>b(2 zj7#sB*->L0>SCLa z);aE5dh$v|t$2&AR)1@}Xa-O0@u2~&^=Pv-A7gea>Oy+@S*w7Prs%o*-7C40l>lp9 zXAXU#)@FTKdB)aioE<~ks-2tUdn=9l40EkUd)i_<_e0zt7P= zz8q^Wx=qy^18muY`!i-6*leZ29>29=m0hyoLPhXT3Lkc}&E|4l2?)TklOiZ(f>QLBt;7{xf(eCgZZ|oRlM>*p+r`Wx`(O=C8-YV<+#>m%k9Vp}9 z246mK@!D6OIk3$7`l`G>Ynwc?{v(>#0}rgP#9CG>@XK+UtH6u**+zzVoj9J!3k2Cl zm;IM#&M!*zb>I{2bo>DD@@krI0zTifkel_lgCqZ)g${-p#s?mwc^>#cKh0kP?tPNx z2Z4_*n&|5m&oDf5|DWjt$KgQ#VVeIM_{c7r{~q}KZkqoU_?*)*an1wpG4ogEZ^Dnk z(fKlowT+v=)mLd=W8VrqGo{nK4)~!rX?_>*z&nIH^;^K&oIltA?&6cj32l^PxE_$kL;&;2>8?mntvHM^DCOKwt0qU z1`g7Wufc)x>ok8D_{HNie;jz=U7BwJoRIqyB-NPOP;orqWPJn+uL zwt+D-2E6+x1U?W1q7NQsB41nL8>6Ea!W`(*c^F13tZ|p??BD z@9c==|Ae*m^Gvy&PQ=**;FW&#|=KhO>x`22aAPXh0`O!K#aZ@y0RIpBNUwn2n-TR%eBYL4k{Da2NA zMn(!V=?xnnN%x7dXavl#n7~v>N|*|&7>58M6e6amNSLZ3iB*&+rjalSh=(x^N>Q6; z%*w$a!9q&LDS{zYgcN3LDwv_95W{9DfvK4w1T?6oNR3!nuEb(9n2g}Op`egS;s{9; zu&G2zjYu5#6N*b%hZrTAMM)&FDPoGDpaRK6xe}|2DWK+roq$ZbPfn@0hLj57Cd6b6 zmy-lCZ7CGRn~`JzYnD>M1SFPJ1!s_=3EY~L5OF`U5W!QE#h{phbF!@9Nyus(FISF{ z+e1!}5r=~*TtiXg#1s-(h9Z*Q6(x+DP@)0}h$lf@rcX(cX~z_8Y79-B$-4cNG1^x z$s{6j3J*RKiQsaPIN5$A8OD{P0!bE&Nq7hAqp2W?kR+@*8V=%hL?cP8E*g*G&BO%q znTd%q4u~m15}+nUW44~#$a3Zbl7iU4wR*^t5KrNW22QD+f+r(W@T8=}RrtBd*(p!q dLId{|cJnQTmPYg1QCE1vx)$pSuQhUw{{b{Oj1&L> delta 7106 zcmdU!eQX}m;6;r$@}E@Jn!>$?{jCznb+-?FW58nT;s~4t4}5}*n-p-U6Dik6JfFxtK12C zFT1wtW~g;B2(XX)fA{IzvJc=Tb^kktoWiHeYiH3@!(>}>7X3t9qp9GL@?d$gGFX|+ zhMXhS`zku~kE5aC;5L+hB-oixgDgHOFRl0)w|RDX@*ad{XUkABAF}R05JYbb2f5^d zD3>}AMu)~ik8>!5I<=MBsB75z*n#aqNAhb8OHw<6&g9O9id1&`aNSzrt7Vw~P@KyCvSyXZ_)z?Mx!wEw`4zu>9V!0$t@#xHyz9H%{Hos-8!BFL?t^q* z?ZAd~@JGS>QNFdfGrysW$OnUmf(KFFCpT=qzu06$6mmsRS+f((j_k&!W1P>D`}j57 zEw?^6Y&GjC^d|8nPc(}bOVO%Sc3H*Ua%kVo9Xo2H)@*B=z4UjL`l#($oPDLn>~U9* zn|!e?W44!c&WfzEk7N-tQDLbxAB|p759*c+!EUGI{#E7uP*Alg_KHptJWaas$47 z_Jm?({ZU+s<;KxB%oeGRU}H(vF;X-*(__nZoWi$?i^yG$_$0GPe9lao_eqA4`aJ## 
zd|7!87^mHJocV|dIl&oNYv{2gDz%1jwHeGUqz@N_jk=EPsj^{vHbk0BYs zB+d=Ab4g{yWj+UG;s@MhxpHL#?zNTeeXlT-KXoi*D3zi{JZAR5GX0n;mwV)ToO7(j zJ>0ARp3mJ$eJ*kf=}OlV_-WgUhu=%*)y~iv?I>qwG^jKj#W9=N#~PW=j?dtr^3iMa z*i6Q->TXNlChc*kB;UqEckB?>FH*+e#rwF@6Q9*bajaqiqnI)KMx}7*l;l=!YpwcB z^m2GAcd(Y5*DNYz7CmNrBTWlh@)@TK~eOZOZn& z+%b!i`4`t?dp>l5<8VZgE^)sxJ* zOJTVac-601z6|);iw4)@uLi}{D~3W71mHtUG{I1YfKS~tkS2Tq_?bzTZv(z{n&l5+ zExrx)e8ehRU_-rLqM`ggaL1e}NCEHY*961-APqbMznJ(eJVL#hhwl zLjwthC(sg5H0ixE9MuAE&oLX&D&P~xS?&S8{~s)G0N!x(*ck)9i*A@a0*S zHv$j#vHYvR`%kib4{+xIaC-gQK+#;S2}VL4z?x{m2g@6ahzYmIf->G5=d>t^SRA!HwEN+f zoqQzj1<@}=1Q1=ZXn=}hAVNitV2HfrqErkRw!UzT(m>2Gh{P#%3tr7bf<$(E+&*d? zk5LzH-hEqYJQ#T5JaysWqhX+K(N7b02aJT>5g(0AY(;7t7bWV<<4(A#Ebug(#~-2j zc%-=DBI@^pEgzPsEid?~H{Ru=sqwr(^Wl9G&7BVzkx7U3W+z&9`rHEbCb*2qUYCbD z_PQnN*z4t~W3MkjQzP4G3~wNAY>SArIBzUUub1HQQUk#obtj-C!S69FqyQ~nh`OWH zAnvCf^z%`gk57_3VBjOYqM~4U@QDKTK#l~xUcOiqwk3A@-NvbjxO~)`1m}Ys4>9hi zF8l$By6}g+G@L&gr3s5Jp4KdSL|PH?Kx-C#ykRSbqbQ*telhH()rnC*ZDxR+OeiV9 zOVl9Xa~TGrKsynLxD11sNK=zsF&bIoz4X$BqsAHC9(EhOl6+yBu@o>)jTG@2F5*UY zVNaaK2n#V!R diff --git a/hsa/gfx950/pa/pa_fp16_pertokenInt8_gqa16_2tg_4w.co b/hsa/gfx950/pa/pa_fp16_pertokenInt8_gqa16_2tg_4w.co index 51d44c1b4f6fb49071d8ed03ab76c17ba4a2b9e9..4c4d2d43128a4d2fa0585202920302b3ee6e8e90 100755 GIT binary patch delta 733 zcmexxnsLDq#t9ls3*shfEvql^WB>yg%^(2bGXg0#ATEf8@C&NZXb6{K0t=J@)yD+n z5~HsGqHVH2qo5LDJ2p2lb_sGEWSF$?Aj7nMn>BcVx63$dVymi6I2P% zpv{leQkcP})Hxhtog86sMdbdkru_^I4PdOu+Js`-=6{ARAd@z+ape9*m;$sYZnK6h z8_c9hY>Ib(wd@C(!yv^7cS8e7D zDr5n9l$#RnmQUti*sVjPNSBja?188D<08M-fER!##m@z$wp3IeM b$21{kvKNqQh@IS(YRCB@7UIU$K*Qt#YWl26 delta 1709 zcmZ2*gz>{^#t9ls1xXXNmeo7>GJpY$W)Ohz8G#fV5Ie*}_zv}`bOD6RZ~#if^f3YX z#OQls6^O~%}tD5f-?UZChY?n-|(Md8klxzvgs4D5o!>#5o_2iBHY33^IyS7 zje+6+e}0CB{c5Zgbv6vU_SrD`)VVPE)oC!aT~hOBVo+mf1=BD!AXFph4eO zu*twiq`}Zuv}y7jF(p0Ii)y7n8-0AF)EP`Ks)N{meo`7hwg!kD5Fn)qWNU)hK|xZJ z4LlSl-xV_?U7ONoJ@F{!n9p3T``MdV_E)jC?5|+x+5caOivF1)MUHW@Ql*tI*n;;37x;1_5&jtjLTV@22BiQ>XR2kDaODU 
za{4>CVrZ8oc`?LiJBdxP{a4F=P%OE(*!0_YGVCv4ZQbwQ0c1DYwAnN=?BBNepmQM$ ziK%(BfrkYfL)wJZnJVRF=NO{OiIm(H!yd0gE5Uv zO<_!PXJ;tW(8$RW#xyW6o}3V;w>cqBfr-h%cQRvw+~frb0$lK_B7X9&1T(=6(3)dI sJjDJ0phEe{3W*X-8{#KhCE771BuuUXQUQsRcO}|!9!P{JTMbeM055;A6aWAK diff --git a/hsa/gfx950/pa/pa_fp16_pertokenInt8_gqa8_1tg_4w_mtp_msk0.co b/hsa/gfx950/pa/pa_fp16_pertokenInt8_gqa8_1tg_4w_mtp_msk0.co index 134022127389de47b0b060a22a3378c60a4c8555..e86513e2aba54d76335279111a969d85b6d6f1ff 100755 GIT binary patch delta 1940 zcma)-e`p(J7{}lDa+h3c+GVqw9lACTO2*r z^T`LdH!{dyBOqyu4NcaFNNROp|K=`(wT_R=rV&=QvN8D>NXcmFF#$_kh8eUF%%bim zSv+c+XL%Dt(gl&0oj6>-iNR9qXDm9<^l|mwl@S?$)A?^DJ5o?n)*Qh0=1rxF78bd^ zAMt;z=~eM$OITGr!3lR(VzR2(8S}s6e;UYt8RBwhQi;#_6aF(mUW54SwnYXn1Zz2F z4Wh*8Bp&OkXB1~ye06uq$TYigYft7u+b1)-4Crv5#k#jbvu@`~tPA~z42pFJzJ7F; zLu1e9c=T`9ha*M<&YZf%p>r=UR{wavXnh-3X8qWZkt(K){7dDQ9j}E{r5WSaAMVFw zRf$?2;`ElY`1Sa4R^1%Beslt_zLn%)7TxKbLc3nLikHsT8c^occMAWF#kb$9F)$4R zu6V!R@W92N%uI0BZ#HP|51UKeFHHQR?bWi_TZso}+>pog!6?1?@!5m{_uNcW;-#;A zu=>(+(SO8T=HH%6P|Z{ozgxN2{BuU+@W9WXs!}smTQ4rhWL1g!Pb&9QdgVdfw0y7V zzj4*K4hE+q4?zX0#v50cSSNEUuzbHe|e z((?tvQ)Tr(5#GB^e-ghE-fe}zQ7UzpB=$Av8;+<`h6iF>`bHDXMms`5FnI8}$l!iS z>=28gL_#s(anjK1cGFOD6+*AOP|)Eiw6%LYq}J^c10-|?+B<0I3G7h9sJgOK^ic_? 
zC*UO&l2{OShJ19FF0V*Ksl8b1a#3N0x`={X+(E@$K@Z)q+Zk#B`}_BJ4!$@XnLghP zdl*Kvq;EN)qe7&wxw^C23f3 delta 4779 zcmdT|ZA@EL7(Vy*UcLg3Azv~!OPyUQ*tM6ofVpF^7Em&~K`PFrS=nUB6qQ6w>NeZE z(k{(1v~pxMTNWE3`y=sVmkm_HCCe5y`eR%6V_7=%#}Y{v%@W-b$4tHVat`eBM=zMr zHEG}HJm-DSN1oFs=icedRODAGY9KSr9~#dDd_@G1J~)D0xd|jTP2!EHzeO2)=hHxh zMgY~3|FI6$!yUw0-M`Pl2Z{4}rAaV2B@WLe!3jA!gY5fxY#vw07ILsY|NC{#Yr;Z5 zn4DtUfl#9h3n7H7bz%6r>+9h}0l7V?Z_@*iOs)Z(P)_^&Ecj-MCENT?((SJRN3WK@ zNP=<@mLtxQVg-HBZ(?(~&0E$6&1^nr*-{XMJH0xS?MnGO;-cExN{qmE#VBF7yQD(I z6%uZ5FR2u9rG$6wDuKOT2AYbsYRfpinLk|ooW}EGMp#G>QbHFM5`5sekTm>{`+4xm zGWMf`b@3T!Xs_h8>stu58#8a6;I&U)q}&XNHnh3|@x9(kVzAKzKYIFim3!eiJx6z( zo(l@1*7EliDwr}N&0JcUNFAF+k$(C+HAKqoTee8k_QQ19skx9K3Dq28KVY8+LLn6v znj<1U!X9Oh03j2$`oDnM^&Z~SI7-0L;0Cp4!Ms&VlbPg@i#Wz7oGqlvddTS~_(b#Z zG&0A|zvlY%kv6W;*6kY4T-LmC`RSB0F5b}IrBC=#(<96^WGp_BD5X%J|j_MB4e@FJ=ArLxLm}Hm(S*G_DJL ztN&d}b_hN`Fh;{Wcd79QSwvq!fZd0lWKqbL zV>-zGT*((9K7R?XL;TndW&SgWH`FQl8pKB{We`~Cb)b&YB?T_T7k4bA%HUv-6Mcxc4o@cMf4jisJkSQHO7FEiw~EJa<3_3qMiB_bkajh4|=a3#l?VgZM~X z$$up^10WsbrzuxDDd(jF!_Ufs?+|aEQ1Vw1U%Dj!CgS5}<@mFR?_Ng$)5AmkfjUNP z%7QnfCYb?zEwPh2;(VaV(FA(-?%UtvWf(`*BD5iB-o`jFv^yFwWb9T9?T%^;o2+&W zomIHZYI8aeYjM=pVpwfy6k$M`*Y5ZG!X_#mWrChS;>8$j$QMmqo_d v4LxDcn?>ze^fIZKj_EcLc5>Dj+r5dXBR{T)os^>16uTkCs3}%rAoTwL(jgeE diff --git a/hsa/gfx950/pa/pa_fp16_pertokenInt8_gqa8_1tg_4w_mtp_msk1.co b/hsa/gfx950/pa/pa_fp16_pertokenInt8_gqa8_1tg_4w_mtp_msk1.co index 090e5efce8e59a01a4a40863f5868734b7b09c90..c60a789b11faf6de81e909f87c15aa0c4328cd84 100755 GIT binary patch delta 2535 zcma)-4NO~A6vyv-ef^kYu3-12`@rIaAdvRG*V3|v(JzK+LmykrER-lhnB7y%;QYc`2J;nPaZ+>Xr?wk+;xa;=+TRi`uFFaM znkLEDHi8@1)-v;Chd9g4*{q~8TN9orU(PCu9Y*{0I|*4;+?4RW?Ryl-bB3h6X}=Qh z^S$odhvZoXU)p$;#iN0>Ms}7--!5Vf;8$vDS;gA`?%&*HVq-pBQ9tl6#v@w{*20{O zahr)UuDEJ`Hjl%7+s~OSiwxQE6vsXz&bu;Zl&0RWO565cwf#PmHJodr4P#&1PpZmq zMoElaw)}ZFtE3v0Cdx;o&cpqs?Qn@Ls9O#FXGG zhm^R!cg(_`Da2(bD-8d<;z*SNPoHjKtuHUn*sWZgwX0mrT)NlFbmIM=RB-r@v)xw9 z2S#epIb1Yyj#CCaXtjMgowd7RF>P15-`;srfoqCX^KD8>_1v40+xb_w)&BW(LRKY- 
z*Cypc=3j8BNxHQ0z@Rc;z%-b*g84OcF)_Iu|_r+S9;HL*m zj<%Z7u`@JZ0la?!w*wDtq3cV5TYWV50Kfc*7R2;y5E9&i4UYk@Y1B)#;A!9u4K#la zcuQU8e+|jL0KES_Ley31-vNo1SLY`XN&;WGi{>fd=y{sI54>@K&T-)RFVgkeLB^07 z)y3(=5NwFPMf0zL7kx-sJ z+@$#qM(b}z9H&+2*;d$aYYEL?0^U8U!VCl(F`Q0tDJyIQyNg5R4B z33d2gPM{9AD3XvD^w2Hp1z{IY3W6>dgkr$uhtL`H6J4jsEI;Y z58Xi`Ec*F!>27Xx5I@OGGHkRo-M5t~yN6tRv?@Kem2u~d!`EOu^UFc`OVjoU<7UUJ c($7SgN;c(BA6D0k;q;igdaBd+KgB%!H@I#%=>Px# delta 5408 zcmdUze@qi+7{}lD+EPR^DnnGn$cV54itXLCwB|&iv>-GXSYd85R-D_IROJtK#2H#^ z>&)Ud%_|5%pjvdm1KZtU**zQvb+w! zB;@_}^E}V{+|%dn)9c-f=;A}VJ|h)>?ojDzf6NAf{D)UyP+kI=rpdek*}KTKJK+ZE zv`C;h>R-m8cz!*cSNP94gdiNw&YuGPqtf<)De$4P)&#ltvbpR)E|(kN*XCSUT$>dZ z_kod7t`&%|C@hAM21Q}MtFcI^n}3*cPZc*8128p}1p;D;uBVp+=SMlJxwnS$^p=7B zS4vtaPy)irihbm@Ot-tYoHGPguAUb(aXA6=>iI#wpsBFja=B!ubXCpGd8NQ|xm4C3 zPhOd%%Vgc!nzvli%VoV`Lmt1jsaa)+(AQ+p@NJ7b+rCK;i~68ed`%k?d%!_)%J?7F zlHW3mS_*~smJBEqEUAYotKv;-VnV^vF|CKDP&NN^XX3ZLf^S(arLqtqP#n;CKYkDt zr4B3quV{lQ)1INNkI^3QN~t~H{+SZ(IT^nVh0LN|YR{Pvx9-su9(@oJLti?J)rRwd;+Sx~_Fx+IQl+q^ zarg-xPJ1+MS2GiXX-irv4W@jZ%8#-%lS~44W>Cj_yptYQlM)A?m*>(*#|^E@VuZl> zbsfLMchmdP{ps2CdRi*YrpM^esh&yC*AG*oL&|7s+f+Hb4vmE-3nTYKqTD4so_3^> zgf?Mke^Z*u1H}i%ClY%IJ)amg z)N!Nm>4{aEC-aV~TO*7NbW&~4E=WwS!*f$+a`EjgHw5?h=V#MrX?krocrNNv;&UNS zF5b87xDfwEburTPI%4o{zcW3#4$V!S$+d0b$Sl8GOj*L_TP*Zg{z!=y3Gv(K)m9?4 z#+wzL&_EVGv-M6$l$(SH*DmQs;L-akdvt{kUQ#`29+_B%dC(sHamg$;LE~ytD|j!R z(nQnC;lR^=ZWH>!ik3`e)dSNxk}p8qHiH);es(?SzXEY%CCP1w_bpe1KWTL!L;j3_ zM#L9-ldg(*0r5M}lYBGcUu;bLLdVwah)*2DL|&!*yODw0_oxAYFye>bAbB6+_qLJz zDB`9W`@Cv&jQr9M^x*gb(lDaD)C1oKBp*lo>@dl%Ab#yLlK+5s!3mQ8gg6*d z>Yq0Lh71pvl5gE#h{t}N_LbEf#Z3=7zan`K;_iziPi(6P-B*)5(f?8uFgPQD1##bG zGC&b)5%2q%SbyQ&KVy#%#FgT2L(s*C9vjRJqZ6-OfU(Rec zS6NV;nRemKl1DPm%9yc(-D<tU;_6ls6^O~%}tD5f*c1KCha@OFm2yv4dDXj$v!GdlZ$i}Cht=bn4BiAFxg36 zZ}Ssz4`u;*j@JF`O)UEbSX%a1F!b!7te^{08MoO&MvMulOkwjpxhPf+b|$Da)S%7# zR6>|GJE_;PihTavw4Z^Y0gQQBn&75vF48T5nXrsy@*R^c9N+(E`fWTJ_WxsU-S6H3WH;Kh*)%fj|I4_!%!kK19a0RZ$ z&M*be#%_}f1$8$c4_0Ji;;^2~7$iFRf2brEJhAysRthr{s(_}w3TRsFV4D0oRCaQE 
tm?S4mXgyf(zAz;w4Zq2c!t6LD{2^{FfEvm<**#p6vjZx+5+=&X000}yutxv@ delta 1740 zcmdn7oUvm);{**Rj*y93%j$2~F@OP#W)Ohz8G#fV5Z?%Z@NX2L(oimg20ug`rjH58 zB}N}q|73qgK_$X=Y;I!g5|sJRFlisq_=f)s)4;S#lTDwHjZlM_jab8G5#a)6pZ^Lz zY77kj|MN36>{ny0sIy_%wa<7h=dy7rKjVHr?F_zZ-?j1mOqfMJl zBg1}9rp;{DS4d69Ks$D9o?w^3!UWX+%ZVKQN}J!gMsd`~_(4lbD8&P&*%>a_po)t_ z#Vxc^#g(AqN6^&kLd6fnp{loliq8oHGlb!?3qUL;4u%^AlW+NLvvM*tXUIuRO3jHk zFmp12F^x=IVN5e8Lm1P|*Z{_KF*2WAD5$&nyT2k6(+#`HhJK=x^@Ak2;1xx{pG5rXfEEH_V`639Swg74<=j8HWNzRC1 Li0Dd~C?f*^{SUiM diff --git a/op_tests/test_pa_block_id_truncation.py b/op_tests/test_pa_block_id_truncation.py index 512b9ef2b7..45c57d7ee9 100644 --- a/op_tests/test_pa_block_id_truncation.py +++ b/op_tests/test_pa_block_id_truncation.py @@ -225,36 +225,79 @@ def test_pa_fwd_asm_block_id_no_truncation( ) print() - # # Cross-window test: pages within the same wave have block_ids - # # spanning different 65536 windows. - # CROSS_SIG = 0.25 - # cross_cases = [ - # ([0, 120, 65536, 65700], "mixed_low_high"), - # ([65534, 65535, 65536, 65537], "boundary_span"), - # ([100, 200, 65536, 67000], "wide_span"), - # ([65536, 65536, 65536, 65536], "all_high_same"), - # ] - # print("=== cross_window_in_wave (block_ids in different 65536 windows) ===") - # for blocks, label in cross_cases: - # for bid in blocks: - # k_cache[bid].fill_(CROSS_SIG) - # v_cache[bid].fill_(CROSS_SIG) - # bt = (blocks * 4)[:16] # repeat to fill 16 entries - # block_tables = torch.tensor([bt], dtype=torch.int32, device="cuda") - # context_lens = torch.full( - # (1,), BLOCK_SIZE * 16, dtype=torch.int32, device="cuda" - # ) - # cu_seqlens_q = torch.tensor([0, 1], dtype=torch.int32, device="cuda") - # query = torch.ones( - # 1, NUM_Q_HEADS, HEAD_DIM, dtype=torch.bfloat16, device="cuda" - # ) - # out = aiter.pa_fwd_asm( - # query, k_cache, v_cache, block_tables, context_lens, - # block_tables.stride(0), max_qlen=1, - # K_QScale=None, V_QScale=None, out_=None, - # qo_indptr=cu_seqlens_q, high_precision=0, - # ) - # actual = 
out.float().mean().item() - # ok = "OK" if abs(actual - CROSS_SIG) < 0.01 else "FAIL" - # print(f" [{ok}] {label:20s} blocks={blocks} actual={actual:.4f} (expect {CROSS_SIG:.4f})") - # print() \ No newline at end of file + # ---- Performance comparison ---- + # Measure latency across different block_id ranges and batch sizes + # to verify no performance regression from the rebase fix. + import time + + print("=== Performance Comparison ===") + print(f"{'scenario':<30s} {'batch':>5s} {'ctx_len':>7s} {'max_qlen':>8s} " + f"{'avg_us':>8s} {'std_us':>8s}") + print("-" * 75) + + PERF_NUM_WARMUP = 5 + PERF_NUM_ITERS = 50 + + perf_configs = [ + ("low_block_ids", 1000, 1), + ("high_block_ids", 67000, 1), + ("low_block_ids", 1000, 4), + ("high_block_ids", 67000, 4), + ] + + for num_seqs in [1, 8, 32]: + for label, base_block_id, max_qlen in perf_configs: + num_pages = 16 + block_tables = torch.full( + (num_seqs, num_pages), base_block_id, + dtype=torch.int32, device="cuda", + ) + for i in range(num_seqs): + block_tables[i] = base_block_id + i + k_cache[base_block_id + i].fill_(0.25) + v_cache[base_block_id + i].fill_(0.25) + + ctx_len = BLOCK_SIZE * num_pages + context_lens = torch.full( + (num_seqs,), ctx_len, dtype=torch.int32, device="cuda", + ) + total_q = num_seqs * max_qlen + cu_seqlens_q = torch.arange( + 0, total_q + 1, max_qlen, dtype=torch.int32, device="cuda", + ) + query = torch.randn( + total_q, NUM_Q_HEADS, HEAD_DIM, + dtype=torch.bfloat16, device="cuda", + ) + + def _run(): + return aiter.pa_fwd_asm( + query, k_cache, v_cache, block_tables, context_lens, + block_tables.stride(0), max_qlen=max_qlen, + K_QScale=None, V_QScale=None, out_=None, + qo_indptr=cu_seqlens_q, high_precision=0, + ) + + for _ in range(PERF_NUM_WARMUP): + _run() + torch.cuda.synchronize() + + start_events = [torch.cuda.Event(enable_timing=True) for _ in range(PERF_NUM_ITERS)] + end_events = [torch.cuda.Event(enable_timing=True) for _ in range(PERF_NUM_ITERS)] + for i in 
range(PERF_NUM_ITERS): + start_events[i].record() + _run() + end_events[i].record() + torch.cuda.synchronize() + + latencies = [s.elapsed_time(e) * 1000 for s, e in zip(start_events, end_events)] + avg_us = sum(latencies) / len(latencies) + std_us = (sum((x - avg_us) ** 2 for x in latencies) / len(latencies)) ** 0.5 + + tag = f"{label}_qlen{max_qlen}" + print(f" {tag:<28s} {num_seqs:>5d} {ctx_len:>7d} {max_qlen:>8d} " + f"{avg_us:>8.2f} {std_us:>8.2f}") + print() + + print("Note: low_block_ids (<65536) vs high_block_ids (>65536) should show\n" + " similar latency — any significant gap indicates a regression.") \ No newline at end of file From 603bebb533c27383b676fb59f6a6cb54b187c66d Mon Sep 17 00:00:00 2001 From: "Fang.Che" Date: Tue, 19 May 2026 02:55:07 +0000 Subject: [PATCH 3/6] style: fix black/ruff formatting in test_pa_block_id_truncation.py - Reformat multiline function args per black style - Remove unused imports (dtypes, time) - Remove extra blank line - Add trailing newline --- op_tests/test_pa_block_id_truncation.py | 98 +++++++++++++++++-------- 1 file changed, 68 insertions(+), 30 deletions(-) diff --git a/op_tests/test_pa_block_id_truncation.py b/op_tests/test_pa_block_id_truncation.py index 45c57d7ee9..994c5a5865 100644 --- a/op_tests/test_pa_block_id_truncation.py +++ b/op_tests/test_pa_block_id_truncation.py @@ -47,8 +47,6 @@ import torch import aiter -from aiter import dtypes - # ---------- configuration matching the ATOM Eagle3 draft signature ---------- # Production layout per TP=8 rank: num_q_heads = num_kv_heads = 8 (full MHA). 
@@ -105,13 +103,22 @@ def _build_kv_cache(): # K layout: [num_blocks, num_kv_heads, head_dim/x, block_size, x] k_cache = torch.zeros( - NUM_BLOCKS, NUM_KV_HEADS, HEAD_DIM // x, BLOCK_SIZE, x, - dtype=dtype, device="cuda", + NUM_BLOCKS, + NUM_KV_HEADS, + HEAD_DIM // x, + BLOCK_SIZE, + x, + dtype=dtype, + device="cuda", ) # V layout: [num_blocks, num_kv_heads, head_dim, block_size] v_cache = torch.zeros( - NUM_BLOCKS, NUM_KV_HEADS, HEAD_DIM, BLOCK_SIZE, - dtype=dtype, device="cuda", + NUM_BLOCKS, + NUM_KV_HEADS, + HEAD_DIM, + BLOCK_SIZE, + dtype=dtype, + device="cuda", ) for block_id, sig, _label in _FINGERPRINTS: @@ -228,29 +235,32 @@ def test_pa_fwd_asm_block_id_no_truncation( # ---- Performance comparison ---- # Measure latency across different block_id ranges and batch sizes # to verify no performance regression from the rebase fix. - import time print("=== Performance Comparison ===") - print(f"{'scenario':<30s} {'batch':>5s} {'ctx_len':>7s} {'max_qlen':>8s} " - f"{'avg_us':>8s} {'std_us':>8s}") + print( + f"{'scenario':<30s} {'batch':>5s} {'ctx_len':>7s} {'max_qlen':>8s} " + f"{'avg_us':>8s} {'std_us':>8s}" + ) print("-" * 75) PERF_NUM_WARMUP = 5 PERF_NUM_ITERS = 50 perf_configs = [ - ("low_block_ids", 1000, 1), - ("high_block_ids", 67000, 1), - ("low_block_ids", 1000, 4), - ("high_block_ids", 67000, 4), + ("low_block_ids", 1000, 1), + ("high_block_ids", 67000, 1), + ("low_block_ids", 1000, 4), + ("high_block_ids", 67000, 4), ] for num_seqs in [1, 8, 32]: for label, base_block_id, max_qlen in perf_configs: num_pages = 16 block_tables = torch.full( - (num_seqs, num_pages), base_block_id, - dtype=torch.int32, device="cuda", + (num_seqs, num_pages), + base_block_id, + dtype=torch.int32, + device="cuda", ) for i in range(num_seqs): block_tables[i] = base_block_id + i @@ -259,45 +269,73 @@ def test_pa_fwd_asm_block_id_no_truncation( ctx_len = BLOCK_SIZE * num_pages context_lens = torch.full( - (num_seqs,), ctx_len, dtype=torch.int32, device="cuda", + (num_seqs,), + 
ctx_len, + dtype=torch.int32, + device="cuda", ) total_q = num_seqs * max_qlen cu_seqlens_q = torch.arange( - 0, total_q + 1, max_qlen, dtype=torch.int32, device="cuda", + 0, + total_q + 1, + max_qlen, + dtype=torch.int32, + device="cuda", ) query = torch.randn( - total_q, NUM_Q_HEADS, HEAD_DIM, - dtype=torch.bfloat16, device="cuda", + total_q, + NUM_Q_HEADS, + HEAD_DIM, + dtype=torch.bfloat16, + device="cuda", ) def _run(): return aiter.pa_fwd_asm( - query, k_cache, v_cache, block_tables, context_lens, - block_tables.stride(0), max_qlen=max_qlen, - K_QScale=None, V_QScale=None, out_=None, - qo_indptr=cu_seqlens_q, high_precision=0, + query, + k_cache, + v_cache, + block_tables, + context_lens, + block_tables.stride(0), + max_qlen=max_qlen, + K_QScale=None, + V_QScale=None, + out_=None, + qo_indptr=cu_seqlens_q, + high_precision=0, ) for _ in range(PERF_NUM_WARMUP): _run() torch.cuda.synchronize() - start_events = [torch.cuda.Event(enable_timing=True) for _ in range(PERF_NUM_ITERS)] - end_events = [torch.cuda.Event(enable_timing=True) for _ in range(PERF_NUM_ITERS)] + start_events = [ + torch.cuda.Event(enable_timing=True) for _ in range(PERF_NUM_ITERS) + ] + end_events = [ + torch.cuda.Event(enable_timing=True) for _ in range(PERF_NUM_ITERS) + ] for i in range(PERF_NUM_ITERS): start_events[i].record() _run() end_events[i].record() torch.cuda.synchronize() - latencies = [s.elapsed_time(e) * 1000 for s, e in zip(start_events, end_events)] + latencies = [ + s.elapsed_time(e) * 1000 for s, e in zip(start_events, end_events) + ] avg_us = sum(latencies) / len(latencies) std_us = (sum((x - avg_us) ** 2 for x in latencies) / len(latencies)) ** 0.5 tag = f"{label}_qlen{max_qlen}" - print(f" {tag:<28s} {num_seqs:>5d} {ctx_len:>7d} {max_qlen:>8d} " - f"{avg_us:>8.2f} {std_us:>8.2f}") + print( + f" {tag:<28s} {num_seqs:>5d} {ctx_len:>7d} {max_qlen:>8d} " + f"{avg_us:>8.2f} {std_us:>8.2f}" + ) print() - print("Note: low_block_ids (<65536) vs high_block_ids (>65536) should 
show\n" - " similar latency — any significant gap indicates a regression.") \ No newline at end of file + print( + "Note: low_block_ids (<65536) vs high_block_ids (>65536) should show\n" + " similar latency — any significant gap indicates a regression." + ) From 5b0b35b6faed45ee7868aef3b4c8709d4d2fce8d Mon Sep 17 00:00:00 2001 From: "Fang.Che" Date: Tue, 19 May 2026 05:27:56 +0000 Subject: [PATCH 4/6] update mi300 kernel co --- hsa/gfx942/pa/pa_bf16_noquant_gqa16_1tg_4w.co | Bin 24144 -> 25176 bytes hsa/gfx942/pa/pa_bf16_noquant_gqa8_1tg_4w.co | Bin 24584 -> 25616 bytes .../pa_bf16_noquant_gqa8_1tg_4w_mtp_msk0.co | Bin 73616 -> 76712 bytes .../pa_bf16_noquant_gqa8_1tg_4w_mtp_msk1.co | Bin 82800 -> 85896 bytes ..._bf16_pertokenFp8_gqa16_1tg_4w_mtp_msk0.co | Bin 165808 -> 165728 bytes ..._bf16_pertokenFp8_gqa16_1tg_4w_mtp_msk1.co | Bin 195656 -> 195576 bytes .../pa/pa_bf16_pertokenFp8_gqa16_2tg_4w.co | Bin 25008 -> 25680 bytes ...a_bf16_pertokenFp8_gqa8_1tg_4w_mtp_msk0.co | Bin 75168 -> 77176 bytes ...a_bf16_pertokenFp8_gqa8_1tg_4w_mtp_msk1.co | Bin 84312 -> 86320 bytes .../pa/pa_bf16_pertokenFp8_gqa8_2tg_4w.co | Bin 21816 -> 22488 bytes .../pa/pa_bf16_pertokenFp8_gqa8_2tg_4w_hp.co | Bin 24008 -> 24672 bytes .../pa/pa_bf16_pertokenFp8_gqa8_2tg_4w_uhp.co | Bin 27768 -> 28440 bytes ...bf16_pertokenInt8_gqa16_1tg_4w_mtp_msk0.co | Bin 178128 -> 178048 bytes ...bf16_pertokenInt8_gqa16_1tg_4w_mtp_msk1.co | Bin 207984 -> 207904 bytes .../pa/pa_bf16_pertokenInt8_gqa16_2tg_4w.co | Bin 26240 -> 26912 bytes ..._bf16_pertokenInt8_gqa8_1tg_4w_mtp_msk0.co | Bin 79568 -> 81576 bytes ..._bf16_pertokenInt8_gqa8_1tg_4w_mtp_msk1.co | Bin 88712 -> 90720 bytes .../pa/pa_bf16_pertokenInt8_gqa8_2tg_4w.co | Bin 22464 -> 23136 bytes hsa/gfx942/pa/pa_fp16_noquant_gqa16_1tg_4w.co | Bin 21856 -> 22888 bytes hsa/gfx942/pa/pa_fp16_noquant_gqa8_1tg_4w.co | Bin 23264 -> 24296 bytes .../pa_fp16_noquant_gqa8_1tg_4w_mtp_msk0.co | Bin 65080 -> 68176 bytes .../pa_fp16_noquant_gqa8_1tg_4w_mtp_msk1.co | 
Bin 74264 -> 77360 bytes ..._fp16_pertokenFp8_gqa16_1tg_4w_mtp_msk0.co | Bin 164368 -> 164288 bytes ..._fp16_pertokenFp8_gqa16_1tg_4w_mtp_msk1.co | Bin 194216 -> 194136 bytes .../pa/pa_fp16_pertokenFp8_gqa16_2tg_4w.co | Bin 24864 -> 25536 bytes ...a_fp16_pertokenFp8_gqa8_1tg_4w_mtp_msk0.co | Bin 74328 -> 76336 bytes ...a_fp16_pertokenFp8_gqa8_1tg_4w_mtp_msk1.co | Bin 83472 -> 85480 bytes .../pa/pa_fp16_pertokenFp8_gqa8_2tg_4w.co | Bin 21760 -> 22432 bytes .../pa/pa_fp16_pertokenFp8_gqa8_2tg_4w_hp.co | Bin 23952 -> 24616 bytes .../pa/pa_fp16_pertokenFp8_gqa8_2tg_4w_uhp.co | Bin 27712 -> 28384 bytes ...fp16_pertokenInt8_gqa16_1tg_4w_mtp_msk0.co | Bin 176688 -> 176608 bytes ...fp16_pertokenInt8_gqa16_1tg_4w_mtp_msk1.co | Bin 206544 -> 206464 bytes .../pa/pa_fp16_pertokenInt8_gqa16_2tg_4w.co | Bin 26096 -> 26768 bytes ..._fp16_pertokenInt8_gqa8_1tg_4w_mtp_msk0.co | Bin 78728 -> 80736 bytes ..._fp16_pertokenInt8_gqa8_1tg_4w_mtp_msk1.co | Bin 87872 -> 89880 bytes .../pa/pa_fp16_pertokenInt8_gqa8_2tg_4w.co | Bin 22408 -> 23080 bytes 36 files changed, 0 insertions(+), 0 deletions(-) diff --git a/hsa/gfx942/pa/pa_bf16_noquant_gqa16_1tg_4w.co b/hsa/gfx942/pa/pa_bf16_noquant_gqa16_1tg_4w.co index b2e2fa614ed740fd824a009faf9743cbdfc4a2b4..5861fd741cfd3ea3569230dc9021d4c3b45efead 100755 GIT binary patch delta 2483 zcmcbxhw;V{#t9lsH{vF0Evuj4!2kv@nn3`vt zkV}+4h_=c8jDkvp?bzJJ*d(g+i^-=>Ab|1zFDAb_fe^<3YW_YlV0HvZETB#xhVj38 zpq~s#d@_%?O+8pcAXGyTl7>*IhA@N%utAYf@hFHm*w9!cb~r*kKSRTQ8-~tHYK)8w zwjf%Ok-^4xPcxY92xbdFIEfGm8;0GJYXa%X z3&caE%NZEJ@<19a52V5JKzg!*gs(6bd7wTCfnQ8Pbpk*a!W3`bAu)v!9B6{jKodj? 
zG(l*f31SO0L1>@}A_bZtG|&Xm0!`g5D-B>rP$^2sQ5_F@9 z514C-N_>+y=xgxp{nfOe0a&(xururCXZoWlkkoKq($D&CW2x2~}JiD&C-rDy{?-??F=!QO3@&1kHR4 zsCte}Fhd(|nln@)B>^r3W-JDgOdJdoswWpkZgX`pa%RX$OiIm(H!ycKgE5U94PZ=j zS7R8{#mExMG&Hm@f{D7BT5isba%N(h;4#@SMsD(j7y&MLn<;wos~9sOh8n0FVxTn3 mWTRL!CWe^FS+RCZ8)7D}0#Z9-C%=ld^2#t9ls7osL=EvxTvVE_Xd%^(2bGXg0#Anpi<@H@&;=>iCsp@IX-fa+rc za*5Fg)j!#vQBaAn9h;jNn?yPOEBL4}F#P{N*+AT8@*Fh-jRqedDRlK`_{8eD0MzqOLR+4Ip2-HXGLtW;bF#zi zx}z?!xkj>vk>kG-l5-UdNpfx|iB9z;!kMaYCngw4OfHaR;d;c`x}Uv?W&a13&2_T> z*oRC2t1y;tX4Lo(bnqY!Vhujw-%b002@Z_kvTUAaG^B#KeY2$PAP-^{Ho@O5`-OjR zUgVxH!e}&ED5gZi)FR2iEG0SBFwHXA(A3<(JlQnO!oVUe#lXNQ)zBi@C^0!X%{*0e z@UE*wbI{DUfQmDu zGU%h2t%#VsDbbE|LnOrQ1yDn|CyOUZaw}iMI9kFXK1F)%`GQZUKkWb;4CXB zC@`Kbyv>WQf(D1WDOh)P5s_VaxuCy71AEVV-jioxxuDA(IDDVy`##S*&WDG0@5A?a z^;qA7Oh*S4rx;eDO8b)}-IZMMBZ#cix zz&4uT{-oivO{XmJ_)dK_x|0i`M_*epM%}s|!zk4K?jDPKSu7e!(pU(gw@_WLr8mg>@DooeA;Ine#@Gdowcem`LzLs@7 z!;BZ;9n^lP|( z>tvzt;{K1GLfp$PZCqze6)dMZ>PX%So(-$eVv zU1&a;O3x=`HRv-#U)O*cs%qE_`xHM9z21R-)9v@0t{e^rAR3djXv`r9?m<{5M<5fg ztNe9=nu4{sa%h#`r&js>YEFQw@dmIZYVD@=iuHCUnuRlQNF`3&=Mu(eEbvV$;R7w- K;O=YF#povx&XpucmPra0#JQS zKqfKzp!z5KGYTpZwqtV>W0fe!e+3^k28RFtC(B6NOunFMpwZytBc;w@dQrWUfuX_A zPf7#G)&Q{s0;Dv7Y)ue5C`f9ugj%RJh)t~e4mDqS0_rEfP*Y`xnJ=I&vAIODhLPjH z5|R@IWJqvggbZm;JRw8ai5jv9CkDvMaBkpg-Ot{{vcGS0sjM~opbk$JMz_tT>L5oB z+7P{9put=EyJ&*NAF4YxAGI3fp{l}K{JUj;>E=!s1xZGu$*W>ZG)yg$ z49rrJQw`HBlMPMH4a}2G(<}@u(ozfzj8Y9Pl8q9Rlhe#oH774*l$x9n$5DSH3|uI| z2_7(uogu;wE(B(XL&X(zP{oy?;y!5Vb)n)d0jTONpyDgizzk=&EQH0*a3mNm1ZMbw zNp^-4aj4>JKwKsc28GJWTJhTgoZZYBauSnLbK(umT+CohBNGc4)6B&f#&j|>gfa~c zT%BN|&c-lN16McW%?1e$OiVIvlRqZPO`eb>zy&YYBPXv)G838rE#@aci~Rz&$)A$U im?lI{)=IWxa)_Fo1*8(9C$CDj zA(BDj-^`=)l_#OI;r4r|lkl;P?g!a){v?>a8rcW(3ytag6xCziv~unIy!yqabFg)h z=^l6Xz~x1zXWY3LF8g+O*I~U^V|&J(18|vps;5rlid-8>;R(Cxggs>wo~9G_l_ubY z_EX+RI9~KXnf^@4{~j=rkAPR_d=P@|nnclzY@a=ebreygJIEkG3=wcySq^Sf_>fC^ z0)2R|ODWX4lw7FE{Sp8^ke>4$VE{e{Ix7c)-*Sj6pLjRjrO8njkq=0#{k8Mz{(KYh 
zD-eG*bAX19rJ-ZXLdVk3v1^5nrJ-ZXLdVk3u`5Ey($KMGp<~NJ$NnFotM3Xq zx)j;(QpWsEI5HD%8X1|$>wY5tB=)m)WSFC7xs8m++RzsuTQ&^boz7b)E%~3-zvOGv7#JhvwE0M!reb3?t@Mnh%R zw)N!NY4uCV%r#sHS64W1#g@DsW~)y-h={ zdcBG?)UFD3NnRptz^rZSscr9}K8kKh_~dk@*-X(r@ah7`*{>{?nrUZ+diA)OaqD|h zz;|u~>Ko(B=AE1Nz_mKryp@+1r}9@8)nl*xzLDwosE6Nq)?z-E;(L!fEN`fbo`Q;P zdZ$^})=V}0Xj=8WlXTJ<8|tLpP(AT}rpC;-_2e5aZhh*FnQL}4AU??#ti0wmsXn33QF1$bMsD-xF*b(}gnu=;Htzhg^Bd?{r@R^Y} zP#)_GNPkHroMm|J2*K6c2P~Li|oTL3; zriKN9yvWzhY05J>oe-I!{KRWU{x;?Ei$;Ed^5hEtiyBV>E11e*kK&J6tXaVV_9$ZT z73J{{OH9+|C_i<{$mc1KR~Ub>1CxZYYcVE&diWNe@i&3=9#oJomM9wZZk!^wkUAXHLgaZ33e zf=+pIgHbGF5U#uLri!yefDH~);kApigX delta 4322 zcmeI0UuauZ9LLYOq|K74B*t!OW@i`EElW3h?`>{!%SxGP8j24!jG_#6YnLY5pkOyw zgshNK8+=eC-5o^qrKQ`PIBDgKxrgEnBIt`4`?5SZoPP$ss0EpuRIK;h`#U$ec~Rf) zgMo0q_jA6#`}^gW{BlUH-g1;wM=8vC9$$!DDbzTO=!31uqhBIQTPbgYeaX5`88#Gl zgMc{hU&cYa@+|fS|2YSl!_WA3?e`Yd*U;K0{m&uw$2~=L9`r!4sQh~QHusLyu-!VHf5>QHhO{CKSr9#P(BbuOObTpuBs9nx!gsAO8dz&t)k*^q< zdU@c~K2|a9%2b1YvBAIC;ICHvq}o2C6&9)&b|DXSd9qL!PYKO+m=ZlvzMg0{TQ6)e zUoUJGIQ3T>{2L8^&-uE)GiPY(KYZS=q;n3}s=KH<=N#%=Hf z^bhfJt@c#TF7?ldyL$C&Uj zvas^)BR1?`2=|bC60bXqESAp( z@wu=hpWtH?nXHiO%Lr1Ok7uNu%*(k6o)@!%oDs(}nOr=(Us=KdW!CE`Aml{nPQz~o zSRT?>Cz`xwm(;Dy&z2vr`np9rz!Z!*+-w>J{Karh0E{f8iDHDAl z`6SN_d~7Jms3@hFj14`_WFjtu6a-#mbTG;&pW>M}$|oevP@tKGWK3Xja@>@?vMJ>l zD?&(0Oo~Dx&caf%Nqa>eXs;xjuP7;+GCF9=WRJ*RsAe|hkQigP=u0M;OvQVVg1!95 zclY5ZIj+W)-4XnhyT<-HXe%coIQHmp%;@;=4r9IPDsPYAj`DjE-0os_Ou zgO%-g`nsiku(AVB*RPD$=z2_Js|G7O@pQxL_8N`L@#;WI4_FBSt8xM!f`B#I0DYjf zz}FG3589QZKb-ZSg|=n_=#`0jjL~9E-h)wK@riA^9@O;;a*!Z~0Qy{>_B6`8*Dkkr z+GXBhm#3=j@~GEYv>73JymPiLk2;9lXeYAPN9^rg29>9IB0_szA-wMXr8yvZ&PG?A;{ zI+tAa=usjsG!c1|*!R_t!`d+-Q~XU<+z{@8P{ z+VZPIdGr14#paq-Xw6c4;5qUy#@tS^+bQ*L-;BFwlm53`d2wtHXI4<53ewl3wi4R) z&S81<<1(ec&8;sj1)0n!3eq%2D#n z)#1#u$6ZBuuZa2dNgwWS5P8YB!P_kg(gwcQyTRl0^mZqDq=c0431TwQ?GoHc-rL(K zzk}zb<3U`SvZ8?};aM#R^61uz4yz9;azEAjffeyzUUirfBKHE1 z>|yy^!26$L`61wMJkRnma{X2$f3CpX1UpUF~dB9^t$^QNez)*XY4e$WJbcW?E 
zz{|d4`GdeOjI(?V@T$D=Q|e=O^#g=NA6|$S>Sf3rgR$fqDg9y*no?`FZ)?O z1w0aEd0czhV<|(ip7cZjf0`14JnBny_x7z9d_u@jr3k2etBX4DKEIz*A>^T|&+nyb z*cSxVCGaAp5z(ML9|kIhU7ABmpV=kSAzUGsK!*?`tyJ{{vkrV4rBaAG2rki0X((b$ z*V;x06TA`~L2$Q9;426z-bHoMPX`u!ZS+FjZ7v@;xJ8Ll!QW=2Dj23S3n3qMP<`oi zZl0&pxq0J+)p7Owq}0T_Jsx@zZZB_47c}zf4v9wE{1Q#PD0=8GBl1Spv>pQ|EPBMO z8Zt%*Mg+8eUDT6!tiN}YTwYm!{K);-X|Z(h>4_k2E;rs9hSTLC?B!0wTg200Qc036 jy-MYl)8X{1A-u%0C6Yd)$%dur@~|rRFHJ8CZ-%icKxA+igmOo zQ^)Q<()d45kHwRQEiJqcxLx>xXy)<{vs>NF}@<5W@_(x(X`jM)9aAw zhL)TlAkOwL;~-vc#<}JHoI@SO5AWPnlRF-J37uc8JB(s8uJPCy=OFl zDs-tiR-^H&)lb>bhOPBP?jv}!26?X3=r%o;DK0QV^zc6t;quVm#CL3opGw55(Q{Fp zGgKdqA@oXY)0I?wWFCrNx(UV4>O?Oft`j}`^1M~N%=}GHR|4CuYx&LxfWG~)n zPKxg`)E^hh?@-?y_&hT$=f$tuylBR1*>KdExR(XxJ*aV+1Q9g@1T@Ooal zHP*Px*ron{NiRKR?=mjlN(>B-TDp>^eZHx(8>!mojIVBu#V*c0v;$8Fm{YF%@o=?A zI?8x^d-~XZUk~dM8L`LHFELVoFT-$stklDG_w@9OeKqnsxI~! zXmXC`g}}?^XpCZIuCxAy!(flpo`OnaN8n`e_^H21zVdOxOEgPMO zfYn5E4Du!L^|OYsKCS})@U(>t@h$K!h32ck?VsuSlU~1pA^JfQ)a0MQvtMLH=6l(gQ07U4)6evd_nVi;8Pc9ehBz;W~(gYVc>0B_V30< zeg|@XWBD3l5CWEO(EM58LgEC5!zl2`FSP%wz#om%d=mI-7#qs6z6XXSJIy}^zFtT3 zdEl;UnqL53TS)Uuz)`WukC4pQiJ_JbSOI=*56$DiYa3|Zr9YOjIZ=JPvUU)k3$wmP zG}wK#Z?Kc|aH1t8A4n#^5C_KF7$B7M1c~%E?kBQAlt8j9!x8Ei2xWo{@fCc=zF|Ex z8x)8$>+=d^3tsXPDF~zp&4DFU;Hdf<#hqY@?MT7!Hynah#9r zrX9`$Tkw9Cq~qCuwVSw~P)_n%)W?z{G+$CX7m#?^jBDhHFYhxXb0+v;4(S`)rM+j_<)D&-;U&6|a1ezT%^HMKaEpPO*0@^T0lXAwA~ycoa*%3=tY k+qxv>`w-rjmGgiZFl5>`n{bwLpcxluN5C*(Gj_u0FEwj0Pyhe` diff --git a/hsa/gfx942/pa/pa_bf16_pertokenFp8_gqa16_1tg_4w_mtp_msk0.co b/hsa/gfx942/pa/pa_bf16_pertokenFp8_gqa16_1tg_4w_mtp_msk0.co index 964d7978574307e475656ea8831f7b1f029382dd..a5a378fd5bb20704f0929680ed913cf0306ebd13 100755 GIT binary patch delta 4113 zcmds4eQZ-z6o2>9(T{YlbKo}VWF@E@8S?tw>Oc5tuPc2AKha$rK`Z-|Kzr3dwt0^eyraJYTXf+PN#cOWG`vZPGXCk3#WiQn3mG+z5zw%nX{ zqk7qVP?-Oa7wZMjS{|~J7!zZ;7(fNZ`|~@5b}IEP^9Cj8tsO#~=rPVZdq1qW1EZ~` zSHb-|u&vcp1Md6Uqh?936>afW(^_!f-_d3k?QHKlCL|45MFzY+X~1eSpe{2YkBSNm 
z_1;Av0MuepS^&i?PPI$Af07kM^@y;fMxnu_QmKjr+%zm1|Qv8=>0fMoznw!hrM z)`c8%wo{bkgQetyrR0Mp_My44&@IU7%trlreTSfJ(+Hz%iFuK7+|wi1xUD7SP$@Wf z>-yRp*0{*X(obK4??U$6z2_98axKs3W++DNFZa#Nn{&5Kpzwl;?}(<2ooimG7#3YA z0@KOXoKO2iBUo~_j`cR zC{@kgx_+6f23!;cXR1L^Ry+niM?pQ^}V;xHH${9ghoQ&N78B<8xv%g>P_YkEYZqoEZ2k7v9kD zn8F6YB?r|x%$qlXF(XG`^l3pex8>)Jk1&b(#K%CVW$6*4VN-FM$tR3cVX{tf z%Pg>lJ$q02MCVGjhX5LV8$|i}0H`#^g=WTx2G;=%Z`=tIfJ=?n=I{re04b2S@^*&X z-x#OHZPg0O3V06ytaAsUW*Uj_K1K zlAI3*6t^qWA+#G-6o6xV-ODgFr4HFS8{uu%h%5hLA)wKMR=BmyL3`Ydu6l>f(?~bC z8l7%uLw$I?C+rEkZH^7$daKR8fp#^NvxmWxXt)*Xin}(Fdjt_iNouJo1a`r~V$bK| zJtXtYlTA}JdP{1Q4N!hNT* z;bsEwMSqC!4*XAl@{=TR0$HLMF8XaUNKr=!ycw+$VaEf7r3m~MiivQdEW7TD1nxrv z7!F1sfJX^D_DS}z+r)l+N4EVQu~(t|7@m!>r|))>75~hVytPy>@{6$U>+I1B2z&r- z5#iBj_Gll0OVMErgIaVl22bv?QIT+cLnLIkdhJAYE~l3fl@8cFvg&rq4z$fxnX%GD zwOXAM)@6aE#ERp}Q`lTK*}=Nh<&qqBMM74GSDp@=fCJtV=TXyg) z4e$f)Fa$vj%I}1W^b^-{6zqiV$A`$xdWhWCJrI4=30-ig1^v(oUj#dlem7hKdQlJy QeNkI_$G$^*I^o=Z0m134NdN!< delta 4309 zcmc&&eQZ-z6o2RLtE+{z4&2sM*cKeyi11!N)>n86UB|~5Arpxq5oM!n4wyO;Ac$k@ zMmNYZrSOc`1XKtFCu)>De=?~)wN=SxGOkloLDF8Q}`T&T0tUPtq?(VkB1CC;O@>SO?dOy)Q zwM&YXwyVQv>Nc)do$1;g`E$pd3FlV^9A#-nk}IM0t25-tb1`^V z-c&O2M2x6a`Kx3I8A&Kpw$sX|W#&9}uamG1#XtX~cgkNlo}W0+2%id1-WP+NvTbe( zJbUJ*z_b4aduCDgy|6~PE)Gk3q-stp&FIn=?3DO7T-RVRe@(PVJN-A4e9SR>;w><6 zEd+}6Mwy}ZBmLew2{y_Pwx-epBg;WKx6N2aT6DEP4eS;V`#o`Qrv&RdM?qs&kVRO?}I}GjaiNUk%AT<$Hsfbv69^G!D&2>FTmZ)C+1Z>53oSK_s zBKg*8OeCL?bK`MpVzR3e4}E~o^g}N?Jd{#-sAm3``rI!>A}X%F8P;fxbUn4?ChO^B zHa%Q=Kqajq7~33)tX4jwlIL>33@6oRm!ha4S&kQ1>(tTb`(7S6@s%BDbE!vM%U7%m zHVBQrpy2WH-k_&ZV-U5(_-z(fh;8izzCAw~Ue4eU_ET7VJsDod;1;}z z!Y95(5F%HUQkP`IfynfFf& zUV`fhT)p}s_&S42KTMuB$o#Qw$^K#HKY-8DY|XC3J8IlZzNW;Zv$&7&EefCdDtUG} zgL`pN2Z6+_1bSFEa1>vy6x2#(;hU1(1B=-&$XZWr>}@;^yDgZ#F{cpdw9q3O7;8|925S1?%=z0Xu`f-Xy$(qQW87> diff --git a/hsa/gfx942/pa/pa_bf16_pertokenFp8_gqa16_1tg_4w_mtp_msk1.co b/hsa/gfx942/pa/pa_bf16_pertokenFp8_gqa16_1tg_4w_mtp_msk1.co index 
d17d1af878c16ce534006d39c81871b38548ccc1..f2ba9ef9bce8cb1b1f6010f82dc9a33937525d85 100755 GIT binary patch delta 5811 zcmds5e^3E{$PGl-})2^8~Ic)93%p?XfHl~v{(v(b%k_PH9MX+Pv?!LG9 zoZ*iC>&?LX-skiE^}g@3k2}s!&YS$ty!};R_km@v#|J)!objS7=QQ7P057=){w3k_ zvrJ-utgV(CS7SICFA`^X&n1q;tB--=C7Ekz?cmUT<&T$iraHJY?*uk*Dc;|iN)r1$ z&As3Vd}swmf;OU-i?=D#|xGv#Q{P7SJyTq|bH_j=7` z0JNQpZe*p+n&@fBQTsfFFD<YCgl?^Jt=!$UHD zKFDcT7rUxwO?zzk@c+@Lnt1pkEd6ihMn7AXq0zx`Ah*&zl}s7Uu1*Gi8K~t_dAXgb z)m_+b3;Zn5#-*TEy{0fX-GjHv%nJVA$9!O*+r{k(I$ZQgYJn;sx6!U+oS;CqdA$C@J?qa zC*28Zb=3_yTE*LE^7PSaSOwIU!_J}~PRuA+El96mZTRr6^c504Ym2~}>IWxh-@5U1 z=}$fd)&Z^Xg$&y#4UgqlJ~NR_8Ld+NzJ~9H+x0u7x&PFaQNQ}wK5*Kh_Uz5p-oIAp zIF|erfFZ5(uNfjO72jL{j%fEx#k1}{>FWFc3}goy&H;r5m+rR&+)||8@0$IRG_GB! zugwGLk5C`}0&FSP2N!`72mGBQqlon%JqZ4oUSozR^onMfSEvsJK$)Wmjm=1Sdr?LS zCjbrri8=Q^u-WLnb~1DCSk-gyz5k~;?UI9I$&}GQ)pdO_tLx)=>hd)&>aFeI`8lb( zcOf{gAM6Erj&wJHiWRN*^ntMlz()PRAo#~!s_DA?f{rRn&cTwjayBmHU1=w5D7m*1 zRup8`!TdZJN}J2Pi46^xEHtiS8QydD+Kh%*AvX_azd-RVoB;a3FqlaD0mbOIb+P`+ z1}H%I*+k~9v3f56RB;L>^o9bM{;{?<@Q7RK{ov`-}rOIZoW=a~|&8ODF{^p`u} z^JyP%<70+#O_$uOS9Qad`#@YjcoKHaNzPVGwnN|Z6WEZYcfAG+9eU5FaC|=C(Mwn1 z&MJYIrLDdu!6R?wn|)hFNo;P4JtfEFnB)<*#G2e5?-t(Iyi7d`7Umn@1~|^aZA0s? 
z!P4?81TazjtJE|c+(y)9&=s30VXf7K9z~}x9q)UHr9*^nakKb3f)85`Ho1x5*(+_x zG`0}D?-3S%mf+!f(?F|7VR8A|AG1!K)CuhvHMY>m(nkngg^n8Z$Q=VXMQA4)!gTGq zH`wTR3Eh%l@s9}JXH~b!zY%;Axxd2IPT!I4I-v(q4AZr)UOUhfU1U#RRWHXN_t*FR-*o=r_^0LAQ>w^cq6%N0tA=bi)Ws1nn^B_cyWhG@;ATQG-7B3`-ZAU$q@vHySeND{U;jl+Z6D zFox;EODw&T&@ISq(EW#4`f)<gx-zL8uaNK=6jny zNaz|gj_KNP4NIRObl_{Y=XM)Ag68t<`KaSJwbg=0z5xT<3;-3>-zN7qC1fN8w zaJuraHAuVpEb~7b4sHzn9n+v2-57@}$_F1~ldm9Ndyw*PoTMto(h)*)Xa}Zi*Y0NN z&4ivzu=rMj_wHly9R#mOLq@fYDVBbL&;tlu$JKs*l%;<~=quLX?VTPcc-R`ejcWuy zgJL+{qg@}e>E0vs$X{6eBEf4$SbSYJ&J6Rofqe9b>+p13;aXx%%`J^ye~_BKsOTdo zAK_(c%HAL~JwDsSaTJxkwy#fiQ`A4>8xkq%7TjLbA>PvH#xLp|wad9ZJnaXU@Dm45 zNEWE+j?yrg@$=~QKG(HNu;LFL+67hOwf3v7*uEqAtYE+!N(B_+bJRfH3i;J`w=`Lniz-i zq=q0y>GlO#pnDb+i4IT*2WbhB=Y3qgqHJ#d78o}*n<~C z5xNaA>W`9sd>${Al5b|LKH1(;+^2+Tn?X^bCO&bxC0VjnBYAn6SQ6|Sk}v8Z>$9Q4w|mBS)!LXO(<@OydMuwNAE zeEgC`%lZ8ZJ%gafPiLuYmfUne{!r9T5mjgk#p9(%t_We;k0SeM6H3rSx37fl2PMdh zyuNM{ehmi^81% delta 6020 zcmcgwdu$ZP9o`xHe9qayca3@2aT~i0!88!|dAD1+340I6gfu+zXo!ihF}Bo}wnrK$ zl^orkITNQM)b!GfTUyD8BtT*jJBzxG)bT~=5q6Su!7|uw>FnLi zn(;sN1_?{{{r3BQ^Lx!Vqw`*!DjB+3(p?LN4lF;C@Sla8{X;)Ff9{b3_@l_iFA?5X zkR$x$GoE+jvL0fPb+-3>){*t;bs+H7|DVI`059KB@nm^tW)JtfKh!UmH zX9^1s!s6@I#yuQ&{d&6lKXvK&cllbwFe-IUIi>I9IHRMZGK}r8ZKqO|5o||nd+**# z6h8wVzt?`4=K^c=45%(xxSQLT;kch>)^z(anUcEr`0I<0QGxlW@Govx{wPG#(FA57K#tZogL@fUSw9_+$)vTko( zlFJmC+S<}0H={J0HjH{ek*R;%rer>XtuW9S#XPGzQVKUMwzc&n3wr$ zzVz!=_sxIz`{O%3uEw=`*TZFCWBSF;`R}}iHEsa2Z(!23$Ontf^~bQ%hu9o4i! z_E^>XLBk^0S8V=Z#0OCJ9$;8kUIov&+IiD|p}W|sIsw!w@U->#Ft~85HNXBg-{IQ4 z&Nr63)4A~vu+!DGVC#Haz{=hQAM;ql--3h1mf?Xigtb0bim<=aKs4G94XfP;-Cei# zH=BG7QUC1&W3wk=9nyI?|J(iuJ_yh<&V!wLwv7CXX#$ceL=|`+nCvj>(HRE;* zTUh!ap;w>{HeEVr00V?xf|8hCv+s3w^`8iRCBfof}+WbyLcrg=H_Z5F?a;MHgc4p-6sXO>n7J%SF|bk{gbuO)O3 zI%U((US#QJLXRWx8K#G3So+6=?nVNp6DJSko*kbT3H`~VEZ#%#s|#8DFu{lJWbxw! zzk4Z*zfJJrl{w?|;W|gK9p<^x!6zo*{HO+F;Y~ ze2=9AxpxO%?kGyybYTZeFC%n68nEexvn>5hLieEaHhuDCmVVR;9?06k`&jxrgl2C>MH_77f61)e22^=o! 
zWy9s)!FaiB8H@juEUTT9{!@aVL(Mo`g`6LxQ+z@0RgIS$MF%hq_Mu}Fa8$rBH;Ts_6IzgE~bu#c_zp6cuD4 zA0e@Qiy#Ohb*^{P2_hdRC@xJRrWjF!)D&VA6+%*!pn|Fi)YN0NN>PaM6phAcHj$UK z1dS$1A!;gWjGC(IgwUhTs8q*S5`#{(s7_0hc!k!3!#ZsQCF+YOOopA=;y z0ZEZCEkcSp>ttTlNNKVdru&7L(b{DtMwiK|Ot(YUL$rN4s#6Dr$HPwKcV7`D>Y*sk z-YKC79j2ni=r#tU&VB{uAnjR+)YCSUdY!fr6lEGAsAzP6K~-1O#IH&c-Dg}H?L$># zG@2Tob%@S7)H_PiB%Vgl6or1v>+`~Pb=2y9Zl$>b7*wg zsYh2sG>0CRXaqePp`!}(LD~B66nqLo|5^0ZG`!D8Uw8X2p`TB~;Fk>Y`r9C{#SIT? roQ7XUQ`4{l7$`6UAAtNA+BgI6_iV{4nn3`x2 zON>6K{>lD~f=Yz#*xbaZC#v&{$)`>rfbst?Ccip?5XS#%{ys8bb_7T)piUr$@xOYY zpA1NRvVgQrJXk{@R6{6|hDfM}ScC?ChKBt%44s$M7#SICL9`+xgN^N;W-!|kiJb^x z+c4~&yxvlC@&ii)33Ue3i|VBe3>rXM14wHEX-yzK`GTs^>2d}Juo@r@Rs*EL zYJl|Q1Y@Pm@5H&7ZT54u?q_de*`L7LvcH0%XFtgGf>7@YB6(L3>Rmx>-W7y;R}jg& zf>7@YQo*}E(s;eAG+EEUZZnI_NhW-L-ke~Z!a6xd<$}q2CWK#yRMc+1psLA?&yV1E zU1#gWI=Msdio@bxP5T*uDGY>TS(}D#{BG7WV8`okVBDIpnfU)5qVWpMvI(1KSm95< z(6|+1Q(X9~Wj`ox-CJz>Z9EzFbFjAVckckQ8*SQb8X5LCZGPx%!h$cIY_bs#$@ud|)DSs?fWI}51LofO^3XfU}cxTZ4?tH{JO$9wX`c)7_22?AX3Ix2RuSAv%EwnJdwbX+!*EuS7deg#?JYl|ZxQ0h}rH$N&HU delta 1359 zcmca`f^oxP#t9ls25}R$men(OGC%-~7JzUWffO5v84ckxRHM=bP%Z;2lmXSp1mqH< z52}B%Kck=$VLLWAG3tqO{8#W%V_^9IfARroo5>5r4Kx~je5BMFOfRaJGB7mw`AKO2 z*%}~rK!B7ckgW+~2L(w@7En-}%qd}_4dN58M`?4F1Q)Y}F<0w;_9mA7QmifeD;RqA z|5rkCMvDw-&e;4`hJ%?DcWf3=FkutuVS?ICVGsyz-lqPYi4^;RVW+hDm6jNr%*$U* z`+?yH#^S6^R0xY)BND@6f+;X8ivCh7EX-`l2#YKSAvVS5zgqT#!o$7Arr*YsVZQ)t z>wfnRAiL3~&8CrIf7|A{ZYC@whVzciYeKCUH{bI)#?08Zd1(O21}IH_r8Ny`$Orbx z3t}4?jV3F_m1vk+BpH~cB&QmtStc8rnj4rWo2FS9Sfr&G7#O7*S|l4KCMTzvr)o}K z$S5`0A&JeHo#8|bV z#AV`OV5pv46u-^N+1!XBCow5CC*Hu^)CtBkGBJcP%}p(!OhZEpQ<$i$o7?0ljJ5}ND|?T2_r3f+bzI0tOVbY$ynm#HN#Ix{58 zww6vNS)k@(uJMZ!gt%le#bex}#S4kb4;L~?NEZA^jU*zG#Y6+7IOn;y=dSId7k~Jt zG-=4=fKWqQ5()UdAz0`VnVpq0J?k?+mLPh zQeoiDl?0GH>Oba@eC`QYRB`7yPzb(NqVHHeDjWdu&)Ho-xKchUgbDr$V{V+EKJnYV zlsQT57%FLji%F_=sH7Dxn%Z{gCA?eYb`6zuz{Tn(TlFH(#SCnxl(3y7v}Gi;lY~9# zgwcvA-UrZ5XW;~uUxalaZYrPaXxP4&wF)L>hKtr=T0 zwwz^W+u8vDBtN#2A6vqe6a&76$3Cun&w2;>vZTR9!}?I#R0c 
zpGwt{Qgs=n>PV@&|Dn{b+J7$YwlZF$u`(V)kzcIJN+&)o=;x#N zz-#EtpK;|_tcp{E6x6Mu#sBksL=S_|!WZ8U`;Cmni;K5w_xR_C)g{3h-=T*F5mgn7gHAks~a1&Gl|HXp<%v-$f7|HU%Cg7DFn?D$&3 z$2Vs4F2c{4F&NJ9d59ynETNh3hOUfY4BH9cw>z8n5x#>-orB2U7YJ{7LuSbQ`-vl( zxf;xk5?=dUIzdDc!XI50|2E-c6B1bZI7Rq<$CfAwoKDT34-(QDPTCnK3EzF2=0tR! z@H3xe^A`y}yDa{D!f!KXUx}-Pe_vEF0?^yQCm(=Q!;K6Gc)R_9 zPMgiqv;>`20xg>zE*Ux+fW*NrrB&^lrk^`(xn>6k)p4n>k z$cwdTfn#^1wExNnQ_T<-r)MU1sY%4tZwA*5Co1Pc_@U z&-VLYA>#0M+kI=;*RwJVlR-VFQ(19TM6k(0PNax!OJ$%imc%^CPl Z4R9khR>>ztt2Oc)DzP0g^7|`c)!&UMvl9RS delta 3741 zcmc(ie@q)?7{}lDdM#HZ1aOPcmDyE(6k5#nO50m@EG-=y3pxowoD4x$Xht@)F&K#? zYr)nKGvUTfFPr=^)J#bh9J`pZgn9czqscTuqGp*fBXOE#G1KXmks;z;`(EMpkDN4F zlji;Q^E^-Ar{CVw_quo$KYbZzZJ5P+yz)$hH9{ailojM9UjmsLWNt=rM!kkpX&|ga z0@Y#btxJ`0&%k`^fA1j-!BdZJ`7}2!z5!-GV_Jatt|czUkbcJ!=SG_@eDKF+j94ja zDg$6;h4ZyC+(IE~i#9cxDuMgk%2xr1HaDB>lD5mbrN!isv_sbW_nSCh!#1uel+U#m zJ&}yBzm62XDq6$}4-;`QYX}!TcuedE$HW!enpQ3qYimX5J@5$R4*A+Oea zCxx??9DSY%Lw#(-5}_(2^*b}GDEqS~#*8w1f%vN~F76qY^dR#lGYG`nklPub5)S!| zn5K(Tz6fRm=Wuvi=&hg8R4e!zp31^XUBXDy`wwfLi0A&s3U_3u=4q=wGxJ2~v{)#x z?TjIBBttW#aqq?+pTYdg{nfeIqR-6SXzat%CwUOg(5z_Od*LW!*!J`Nm?-<8%!f^{ zo{ur1CeGlRzSp_C)%vlMdEAweacm6ZUaFr;%<19Vnl28mvFA#nF&nnw=TFY)pkKH! zUaoscXV1(8p)Xv@PtD3dLpgHyASNfZ{cN!9ZF<2rZ6WfGDhJ?34j$<&zQ#NB zFe(tU71}y!>}k^{&gMcE3iKBz3}*ek)j?V!GB8Noa`^umME@We6qSDUlXnOkGU(z! z1`Mjd$r`M>dofWqNtv<*qk772;nFGe7Dr4*1^mxxO_L(b-JQ#Yw-OULoYgd^%7|?; z@rO#_Af))4C9u!trVo;yu3)z_v_0r_dr5E59imA()J2k1x04Q1oxxzp+g-s;Koftu z6cUj;9q0}fEjh_G3&cKB^T!ZB_5imce*I~6eih<{HEK>H?sFssUtR|DBRLNw?L$e! 
zFQ|*3NBr|=)x1l|ccgB^mFUZee?6*@)cQK&FAgL{Bpr>Tz>QEEB?XIkbE+U6d=K&d z57Q_q&LB>nQuArVe_LG3rTu470DQkT0Kk`s6RFB{a31kH-=$Gf{DgSFMeW25#4qKj z`Cl?G23cUdOdU8Zzr+Z@GW>iwOhjC}z>&^_-A6hoC*@p~E+k2R?RG_aXpbT(H>pSu zozAWEQwj~#DsNUzc@*lbbt$<{kCT>jBht=JahAnuK%FN|Qq%%UU;4Dh+T=V^OW%U<}V zG--an`#itj^W67-&;7bRz?UQpC9r%%}rMlXn)fbzAYPwB<}kV6*couin0 z)cSn|Vfg4CwP9bu2KcCS;C-(n0qsyTKtvy?Z;BYtN8t z$8zoeCfAPT+B4+Zv0VHAAa`5&f6JAV#r0HHtx5lgG_WO4o^p^jSyB$OX_g1P2WTnGjQHQZ-=hGu%hc~4x+gbleB3z><>bQpteT~CojYj$ z;)5Kmv7Fe;z?l?SG?(<;cVds^nfIbx7>+*4&5=1ZNv%>FA^`6xwNab$RleRR!7nnmG#MF+GsUo< zl3r+_<<;+(WbK*KWc#=r`7qB6X?c%iw><7v;$jJ;&GPLpM@-a?rSi#RD@;%DUzNkU6b4r+TK`bt$`E z+%Fk(9BF>dB;QW-Tl~M?Nw>}Uvvag<%FglZ(*IHFBfp?h@4T0*rA|y_*E_w`;#Aa9 za`R5FqL!O<($lkzo%DKqf`ZZPy%6=yc$W<(=D_Rmbw!VhM*pI!owL7~Ru2PW2_Ni+ z&e-~uuzeZt^mxjIYN$ zQfK7Nm>=1wOC+N^Fh6`mbvg+K;4M7Ryg$uUuMcy_>qZ{M{AYc})ZiDG*G-$}B<39* zsZf3WGkCypFk=9@qnM9=YUJNze(kK0U&Z|LpppNC`M`jY-$Iv2zZDD*83+E*F0Ej! z$oS|UV&3+fF@Ba7W(D*mBe!E-J!a(3V}AHb=JQ7}Wq4rQv<2LlM{XD=@?(Darjfsb z`OqCB--J2Pm7qafe{_I^K2|Vp)0|XR8|I8ICB=7PeyG+M--EdqAMufd4=@kw(g_BK z_4m&T9J=(>z;T>#LNG3H8uQ)g_$%}CE#|{(jhs~v2dO+j&Jsz^WfV5oWCfoE;L*+? 
z584`=TH3a9ZkIRhR$|v#B@pTqf+0dVkBhiMFhJZ8UqRezH=%r>(t{}*6a?aWSuM5` zt<16klE=DSDc8%Uf(18OnXT}q0s@tUa=gclC+1iWOV$Xb?w#WVazC!>5QhU?Tyg{}Ig zC=BWL)~wk18n`gFc@kC@Q71~mV6C{jmx{Wbj#K@Ew78k=( zz@p274mr&cT?i91_=70(bcKX?omq4*QJEo;C2DGdx<6*ZqSLV#@m`+i*zF%VN=Cvx z-+q42?|FXr?en8OMn5E`dx;(!J{@?+ej+?+2blbWMl4l%iODos=7li6TR*m_(=ZZ; z0_Mpt7n29`xpJ^N=l}N*I>GyE9{hY&m-sq1@rAb<6Kghhi4myxZ0zEWRzA#S4Zk}$ zyAHR^=T_!o*!(=#vfazY(@9%Wn-|=gE4fl$mM+oUX%bzghDvYyZxFP@LGcZvyY64e<{UKwy0$cW!89(rZ@OI^?d)q=;Nu2drYx)|i z+CnMGiMJ3SjMyfyzwRQqmRGFYOv6;FDQS6*h}1q!z+|KjZnl3~ZXaQGe?5^oH@DC_ z=H$d%XQ`fS3QwipxV1c<X8W&L@UywX&qI$V-P2DT50O5!PHmN`ltf#F6%H9~m8@05^{*I^`U0^G zXF}*dS7xXHtw-YDwXMp0?Ak()Tue!@M+OI4fxv!t$k6$e{&rN|pZea7W$;4!$j#?M z1p|C62eb=U&fiWla**#Z#(!V5dj()X_LgfAQML%R%c+(6cDWS&B?a^t?H7iK#vTBz zEAdMN_fqIcR4SJ+v_>>;17|Y%;Q;6WG;a@rR-4nak95`7HBgNOb(GUhy6c>c9@5iT zPm**4<*B1<>*^Za4UcliK?WZW0!#R>8u-N%4pwC}W5b{7`FhAl@8UU-&ur7r-vW7h zp`LpnXY&-nKVJ$~hpl%_dKykTyj!2V6Y?|L^}HVPv)j~f*C_fj;6^3W~}FTMQE25$}}^^sugQ| zE#xz&HKd3QkcTelx%#|h#v;=CI`f<`A@YMhfrb2XT+hoOXa3OhYRLQk*7I7(r*K8c zL3RIT81PuNfRgztjNLdgpUgJ zd>ry0OZA+R9~yBBHo2Ssaw~|29e%92c3(qt4ebc{7Gw#Oq_4ODNtefkB<&=Tba@;| z27EMgYsP3Ezxf6o@t-9gfto56764*LASOr?Ll5Tm;kEX z;a=Fz#{}SRjN7Nm!sZ%=@@ob8DRdu>pdWd~&_PW)edz6A3LI{ZN^?e=wgDTCPult3 za*&@giT-y`$e%9KFrjS_N?7Y!Sb;0uPfpNLn#`vnFgZl1(Ccg2=%fcHZKc8 zy)202WkINy1*za=18KZoPT2fH`X?g+|0-?PlapdKv1LN|bI3*QWd^kXX+Hxnb%AgtOViMd*Ubq!k$C+IOv9i!O=FpyW4y)W;NKw}ufQy8w^_v$e;NkH zZ3oNb3)Wi{TYtCg2gR{_i%q|cC&T`G%&q&~JAmv)n>L$9hW&3hFSXf0FzhDl$xYkr z;Gn_6_;z!q3qkcefNEDTZ(ig5goy=I0#Bac*T`rvS;)Uc!^FVCJk2c0#KRDKXh1(IU~@$Rs5($}Q@ z0w-SzHWOL^t=SepYq|u!$+LrHC!2>za>9hxgZ1Wws4xWtO`a5D$LSCZac%+BQ0~d! 
Ofyy^PMVG@w85sbGWd3mg delta 1376 zcmcbyo^i)2#t9lsI|3(aEvuhk!vF>_nn3`<@!k>_bN*6%53N@WAhn}4i=fS zznk_0!wrm$S(+#ehXr~hyG3cUnISM7p8TaW9445PY&Rqv_E@#BD4zb^vL6%{?kzU` zHl7UopE0-YckckQ8*SQb8X5L~+?;5?gY-BKbK1wk_;ItSJIQ(tHcNPkuunD!Y-BW= zyeY6m!_*?lz$_&>)iBL6+0fM7z&zPB&BDMUEycjVDAmv+*(fnNIn6v(bMit)V15u_ zsb^qdXXx>PWJ4Iq17@%@9I%FoKxlEO_zNvmaV4nu7Buy`P;rA8RP`26@thC_h>aE39REuAJ82Y_ew<|K#golAH|w5aIP; neLUeROdI?s8-?3(E(m}~6hIB+o;*EVl2afMBDx$V%E$l!`gKpz diff --git a/hsa/gfx942/pa/pa_bf16_pertokenFp8_gqa8_2tg_4w_hp.co b/hsa/gfx942/pa/pa_bf16_pertokenFp8_gqa8_2tg_4w_hp.co index 5e817824b125ab34e210d812bee5f656ad9c2f82..4360d0e28076f7ae59fecba4578f58a761e72690 100755 GIT binary patch delta 1938 zcmX@HoAJQ`#t9ls4`L>2EvwIPX8;2j%^(2bGXg0#AkK({@G~k==>iCsVFpMM2tf5Q z0hz?;gX*8`&nT!w*pAIjjC`UxznFaL1OgcU|6=m169{4aujcO~17=5n!~*IBVi^Cc z2l~l?#3u(R+r)!41VS~0B58<(YKTQ>;Ad#qZ^O`eNsW<_!4^a-GBViM?r8?I9g*0H z5Vj4&?#bu1H7EBd8%U@#m|j#bWnj<%(i%Wo6G&?U>B$Y!N|XJ>jit*O7{F?PG*}Ig z2CD(mlO47@YV)L#b)VqR6 z-W7y;SCDe=PSU~c+szHqkxT@QG1(%~BGKH)BqcG) z(8xH&z${gB@KPc=865n3fTms- zD$bFPs@?)Bt`f}vt!JR}&QOUXX{Z|fpyCl|2JHfgGjT9vR8GDWu`R&K)PNx;F)1}C z-oVVo9L6*QB zE^=~GjG534Xzh0cS{rJxP1cT)oqRk-k`pGh9<29Ij0#gn)MTYtJI)o+5U&+L4dtHP R9xKTy5d#rD3=?H!0008a0eJub delta 1412 zcmaEGfbqm`#t9ls9#IpumerqeVE_Xd%^(2bGXg0#AifX=;a@02r3)Zj1_Mqg1FDY+ z$R$P}RR3guMnNUQc5H591K#B=!Hzh%^d8aCgcJpXBu*#hM-LxMV zZeVQ8(nN(Y*r`XVI}Cwg@Z>Kw!eFI2iFW_7v|$xI{kvs9C>-2dZ2E0H8TNml>}hY# z_-1pX{SK016POZrZ02zm;Q;DY^d?a+kH(J8Yy9?bv4FDNP0bC=lTFhs3@p-83=E7?4K0$55|fkD%u_WdFJzROyda9BzQqTcX`vJk zm}Y0#VT~#-4i$f(g(|KD6<>p5R){$2hsk58?#4;H6&J@SvJT}Cmpg<*=mQ)O8ca?$6cnDE5HG>x g5I(so-j3-*_~czcN+5FbuXsDohDeCg!$7m;0rVtLvH$=8 diff --git a/hsa/gfx942/pa/pa_bf16_pertokenFp8_gqa8_2tg_4w_uhp.co b/hsa/gfx942/pa/pa_bf16_pertokenFp8_gqa8_2tg_4w_uhp.co index eb92fba425cd7aa02ff9d38250d3e5b0a31c1cb0..72ae9983c009be03fb3a198923c31fd02d166aa5 100755 GIT binary patch delta 1848 zcmexygK@?^#t9lsGqNXYEvrunV*mpf%^(2bGXg0#AkN5u@H0A4=>iCsVFo*t0oBI@ zvFV)CgI2w?pGi^;D}AcXP1n!k?>m>mHU3#b!_Vf?Qi 
z=qCdbpFBg%CLXLI5UL>*Nkb%5Lo7lAKSRTQ8-~tHYK)8wwjf%Ok-^4xPcxY9h{R5W zux%K2PnMU~oIF9@Kti3t^rCtx1A_*T)&SC)Kw1+>Pu39+l`dys0Lueuuso0k%LD1n zYs3SXZ7%b)?q_de*`LMMvcH0%XFtf*f>3V?B6(8~>P3V?Qo)-6 z(s;e8wE2{@ITK#L>c~lOnwT&n{5aI2(rB}khB|&f=4i=qI?Vjpw4VW(nm{<6t!W6y zX_N_pINf6gjMGzphH{*)av%_=f1D&Z6{r7f*$;|W_ZFLe8&8J)XINYJyLSNDjW%sI zjSTyrY&P}Tfj4bT){)z>`GP+S2ji2?SHlUY&C%Mixga`%iv?7;PCk*^$Y?OxE3HJs z#K6Kl%`C~p$Rf=k)hx}_+%z>QG1(%~BGKH)BqcG)(8xH&z${gB@1)9eg3uBhVTQ1L0+sNzacaRW5?e zo0Xff8ADEDQff}TfrYahjA>+M3S(M0J42aI8a22>vt zkV}j{sQ$_RjDkvp?bzJJ$S2D2U%^L>f#LuE$rCheCO3#1Xf*ivNU1ZJUQ{n-U}*63 zlhOdPH9+iu04Yr%TNA_%3X+<9K}KovK23qizr>ZbL1M(}6VNf(oFSpWEaA@6x}Uv? zWxonr%l-<6p8fxokep&*M!ZutUyzApBGC;>oB1pcv2WH;H(?YIWQIC`lCapkPt%+v zM{E|*k>QlN|Fda7Fz~=wk*$fcfB?ox!sZ)hKN-mgjBgwQZ~sss9Dr%UZu1gn(%b+H zizn=x=lGg1D&GCsvL6&E?kzU`Hl7Uouduf6ckckQ8*SQb8X5M#*!kEH%5`Hf5&E?xCpMv85xa?Mw4G?4@6xi9+_Bh!iC$rCe0C->({a>0wV)XAH2&4eu4 zAog0MLFf~lldE%OCyVDva>9hxgZ295DKWiBn_QG<#|d>YLjlxK&dJyFBpE#>8|KMR I7RcoQ09ejpsQ>@~ diff --git a/hsa/gfx942/pa/pa_bf16_pertokenInt8_gqa16_1tg_4w_mtp_msk0.co b/hsa/gfx942/pa/pa_bf16_pertokenInt8_gqa16_1tg_4w_mtp_msk0.co index f21883f13feddc953c04469f3ab32ef4f39f49f3..40235bd0e70fa403e6dc070c1281eb5fcffdb65b 100755 GIT binary patch delta 4137 zcmds4Z)_Ar6yKTNalK31yZ(U|xoX>>w6&GJ|J$(OT}zMJD1VF?jF96>OGM5@2x`>> zuI-f+V*dagSYiXkFE#?fl7>hv;OyS)T@RAIBl?L= zcHi6gdvE5wnctgahHjc7H%+@M0kx;N^uouDWOYzlQvL4F@EE}-oMyt!Y z+E(YB_b5v7!BX+TQt`nO`_R1DT9(Bz)*Rz5Q@3R3)JaX;OV;_?jul%+Z3Xm$S#^Qp;BiNY%;zm57%Zke@Kb6B*d z20Ti-kp&mw>vuCqcaTk!uGcHhC_gzFmE@F{MA=>J3zQVmcS}opu-?J$U^^)33}3n= ztkWu+qwfs!`(EtP!!r;)*#$o0kG(vuxlnjx<%sJ#e)sC(EHKP(SZm07tU3IK-efL} z^S8&yEw!d$+Ubb^;B}$23=ZdV?5=Fy8fF0eL6-?9whn0S4By|;pJ)E?c9O!~#`F{x zFoho};qX^D;RXL>kZ|q6Gzpg-7W2yAza5q2q*W5{#b?EI5;yaAV^ir#JS8$_*sK>Y ziA{j_4yI?Zux}@*$>Z0KMhsvUfAx=zk1>sfP#@T9;OL1Q)6qF`E)VBs!sRB-m6s3V zS>PJVwE*p7ogds90H`twB0_5cNYfkr9pPN$Z3jBxECOeN5Y7g}dcj%^L|}gQR+8So zdf=!yc zpd8{;Vojca7j47@E(@hyAgtFs_wgw%^Z$4SSShUN1?L`f?vENx%jU*;Us;fr_rl;Q 
zFrc~QnE2M2!v2kL2nfBu0!ub5G-f~@xeE?uw?iO&`un4HLCz4ks29SYLEI5|O573G z0c4^1w;Rchc)B2ijyR%&nHTqQ8N|h1yPZLCwFm#t;?6ch23B0U5tZZu)gBouO6`%& zrg)FgOX0ZYh9!Gsc?;|<0-J@q@4~5!dL&$o@J8$8r5`K-G;011ZmD$AjAy;O#c5~O z({1kcE|06NCAf|WGC_~s*&1xI*&VI4yRC{l0G{~#D9oO{O1%`2dqs{S4U~w=kHX^F zC8B&GNj?>dYM?A=g$z&ml8~%11Rg;hGW^#A@KOR>QJ)OYZBhOcpH>2&Kv!jW!e=*uMdXv=rtt)#DBlve3~iF(q6Z2S3A`2^kl`FQweAZ9?nUQi zn0f#nBX9_TJ`7KMk{W$40o!DFWOe*br&#f)NbxpMO(-bChks6ut|l;x_F&l7b1XI5 zOW^(;sdlA6m`$~#OMURQJ)Wesfv7e+ zld$?5>WCGO+elP9<5wMQeve(TZVv@)u123agx{^s;c(Q&JwgF{qmv;HI5ihhT~3>7 zb-JAnVs$X?dZJ=0t%f-kt5#RNe=)K8+;#Qp%uHjVBu|4&o!RZPs}7z9r<$72;Z_~$ z7{73yhXDw-AX^O1Hzw}oXhRHoO2^0*dyHJ$OQ!DyH=bXJhGMV;jvYaN#^55b7gY)H Xc{tFA-W1>>BlCUS{YTUngHQbn%T2w% delta 4313 zcmc&&eQZ-z6o2Qk?xT#YgH85@Zi`_Xo2(yu?OSMYrEUXBnxUvjG+QZKGS(m_NRSNa zYKK5bD0v~5Ok!pvf*~6l4|c^=-Xx2u4p5AV12JwB!xH)EknqP`#CO~GS_=Q{RShqB zz31NFIrqHZdG~j3UOjiT-`~;t%b@XK`7zsYH)80ew2Us?Wk70@r90Rg!Gjm$+bt0{ zhU2qfVb)__d<~QqKQ=1Q5sei(0WZk=aekNhJ`*@+eU%YUR(FX$*$xVZoDWgzgCfsj zhI#OSZ~Vi`2Xggg9*-yA#;})d?=XzV>&;&w*(<2+^X1zm+fMD>yYu;?MJ4=uH%%3^ zKiin0)g{91n@i{9Y-je146{#M<=2WQwOwM0H%VN_Un#7bb$+2}QQI6f$%WA7YZ~6O zcnnVP&eGW@l8NestECoI33JrzoagX)}D7%3=00&hEvfm#ljaiydaD^eegt9{3Z?W-;}PcS`b<3htv40g-d6@ z1VZ;bAh_LaNLA@Vn*$jz!uRe}zXr&+I&grnb(gx7wCZGe4y>QYyE{^ajssT6Y`-5{ z`-jrht$os135hV*9SUS1(_ehw&e<1PkvUni@;{({ zcDO(br#_!jwXqOg8zf~4x{-7Mhp_0vj4DYf(jd)yp)!M)X2Fv^FeB8CZ%>8-;p~m} zIZs3*r&6@v6-4wgIZv%V+EkT*godCnhxjvK2;fh=MhA&W#G)#zqKP3+d-B2B+I~NZ$Btln`MPhW5bI4IAoQC4OyFQT;SL6`PjF-a%dDjIU8Z~5+D|a5Av1uiD0Adi zOeO4mbt*X~9UcFsD8WK9QwPTZH~$Lz(Qo$?tB)j_|D`@hUPK%!`R0~Sq%~3m70Qkm z)HW?WU-ki!*2deawkoQtwYfxkFK^6+ZRE>sXhxMTA68e#!sz?Gmi!u{yaMj0N% z2Pu60>nIX7cF3?Fe??(^q#yP1%dihmQFzN#6bT!j$}oqE`z5&X5n=%u?!ap(Jh?2M z_YX3>5_eMg+9U9!3?Ih>6fWZ9qyLuS03N4s`kRr@((tJClO6U>VK|&MAi?WK;-i@5@s-RQHOfFO?Hsx)xtk2EU5PW)o0W7XQ;jletgEh* zr;r{^$;?(8>gCaDvniU6V~w#1+mbY*ZDv1C9 diff --git a/hsa/gfx942/pa/pa_bf16_pertokenInt8_gqa16_1tg_4w_mtp_msk1.co b/hsa/gfx942/pa/pa_bf16_pertokenInt8_gqa16_1tg_4w_mtp_msk1.co index 
473df016fd957f77242a5f89c3e7489fb619214d..04249483b91c7e9f48aa7d0538a1e9459cb80ba4 100755 GIT binary patch delta 5670 zcmds*e{2-T6~}jWZJ+Hp@)4l6$%SB(1{;joz1!W}t&ZTmU*IG)XF(8YoY?1>AC+9d z5(p|8-`PGFNG_^jlZ=(P3dBfpOM-w?x}tS*CQ1a6P>U&5UCB`{sjJeoQB32iB_*XZ zcQb1*lJDrhtYpo*_j&Klo0;#8y`G*ZIX+P`Pz{bBt$8bwD&QFW(2=pHKQaIf1=wFf zxN05uWWtzbI0!!^k45I)}hcdkEbAhVAXPZw9yP`=XAty%qan-S%d1TkG$2V87n6%hQrhXd($u%}IEg zBs@EtP|8G;9_Gl^dH|TUc&r4BXYFWT+ScaS;f(EJeyx96R-@a?`Eres#Kh%a=dC(yAO64ktiQ(oZelwQDy{?aWe(Gspk~<6flj{#7yT|hr zPY;uHCZ274vLM@f_34p`b2ldQ^wu4AeeefEz?qPq$&nJROQ$4ScU_i6wETRJ_~Ky)ak2LLz+SQcm$2{F~rFp?)Qnm*~0~w+)Ne0T2?> zUvp%pHQZGO+{VE|5aI4Q;Tn7X0WR5KY7rPG|m>@Z;WjOgSOKBU(M?Gh&`u%qX1672BLS{ z`Y$_i=jZ>GvvZ{AyYHMohN@QW`zn^i-BV5PeZZ<|rAU9S@dabgYv8@CdxEcJtn8Bx zpcub@A8fE?y~L54jb#@>=uuE@bbbl`bZ_d}UR>O_DkJ3&7H5?*zKY+?y1`h*?T@gj z;+!ftUCs7oEoJI=vjgA33VXly#jJjlaC{Rx|ILZ6-zYFz{|lt7moRRZuQHT_tjMxU z|CzICB;DTtIQ9t}HEykBvwxw@{tqv$TK7dPiMyxz?Jc+JcU?(Zzo~E0Z#qMKhkY+= z*HpzXFEkEcWZzx{))~?yd+DyUoXLsVjOI7kAkyqT? z~=)r`R5D6P&JgUfx!tp8!i>>q@SGvoQ+ne2^=v*jkR? 
z2xgzMw2AQvj6=`OB5bWDcnFSTJXveSK7uD9aByV}0|6_3nqcuUtG}K2haR{3cN6~% z45M-tx6mKYbDw^KMR*kBfwsAjVO}SAE4+a5<~vq;mf(7L1LK)<*6fP}55dZZ&{7;P zTJa|YSHEKQUnlaFFcEJ^aCq?nBa2w8E)p0KU%Rw z@YDk9Vb&A>vYXcUCy1Yc+EO%IeA%!jZO<4~qKGiE=*B0PcdnV+XWoA@Py z<8U0~=I>hZFu~0bJdE(pOz&p)d4jJ&0psWnYxZvm?to#84{WjGKNCC!k76AEkrn@k z;6Zo+NV*UQ6)XunFVo(^kBR;4avM@!&^R z{6m7Tz_S=Pe`>`Mf)BvY5N?>L$-K4GZF_~_)UVP8bu-6_f9iRw{}l00Lah?dw$O?% z5WEw1VtnmUEB*t)_3#A76L%EQ1-YmHh% zq9$P&MKx?VWW@o3Q<;^U3l0(g-`}vtKSTWJF2LnJJZHt71kX6qhFPwQ`2YPkYy2VN zKMpIGqb1f}w&I@=ya6_0-2S;0CkP&bJs4MiYsDWE?1yI&UedZS{n_j(l!r{VkV&cV z^W~fo;R8%ZxNTcUi_0&`#B|FF_3{C~NW8qFxvA-No2JWanu=z|%054hRRd;#FG#(D zg5%h=5ooju1od`G(c79;?NFfW{Y&R$b#l0 z5fb`UOH7eh={iKGV49*z*CEP5x+y4)&Maz@msark=_!gq7j0N_YIF)oa>@}pvsnd{4G*!3CL^9%6yr#^RfRi?_ z3Vyn6swmObsj{hfRTZ3M167~bY&htmo1zISolbKF=?;4XZmP9ruhT_ONkgh2)ig~s zOZd$>w4m9K*JXYgy=ad~npc&~Ieda*?5gJ4STq`?(CikLau6Zlu1OQng$ipAl3GnEY0gkOBJI=vWZaZv_^6IkVup$CZTHGHlj0k zGi!!_>J1W>(EaxJncr*XJ8OJHQ{LGr?_fQcJ-p#$O7sXE{-K|oukgqL^ihiae!}lz zKlc`wag!y)AaVQx<4An=o1p5>|6e252aYaXIZ)l7{}K1bTj7Vf{2#*o`3$jrIjQ;> zUv|ANy`JN)Uq9*v;p~~o)kgixL22J#$tDBnC-kxzO1^Pav%`7PP^Yc?;qa;~n{C~;FcH~fR?l5C=W z2CT_S8x~&Zg5qm{X~vq8YO{5yN^z`}RdMqD?Zx2CA$5 zvS(L;q@TaP%-!iD$-AqA-sk)MmAgKh7x?o&ckgi@?bZn}RSP!R>ZjnZw_EVOii(#Siz0uo+MUSjz657ni~Mg( z>_mjWRALW*k587{b?f;c&tIul z5K09P*8&5pgyk#KEAU@9^FPZAria@Hw_zOp);z-2CV~^N7vsc67Fz_j!4ZU;qcIjg zM(|V<>)%PkzRvoer0Zc-4XWl#vbdjM9hw+tI#_&+;8xg!@&2189U*utJdW`jqipm$ z1h0b^5pEtk#Nx{YPaa|YSBQW3rgUEre-cWysM?WN*s$`#Dd1CS$9VT|SiFW{A3Th3 z^TzjCtP<>9!XD-p;@9Wc_1_@=GI$kNJ2A`Rc7ms2U^T+YVsp-#rwQH*w_*J5%Y|nL z|Cr$2uovThtz+>^1ed`PjMo)=ccOny@F4sU*gcY@DB6XWD} zSo|M?kH8*`xn34m;Xm3r4|fV4$N2nnEWV3i5njZ2{0kO05j+J!9m3vUvG_qJxrFig z-?8`!g8vKKF@Ej?7N-avgNG4ru5T!Q3qiN-rv&?6FBsI#4H5sPPS*bl@ejbOxLVCp z7Qaof4g&#%PpoJ09|`usZ5WT=l>AeI&%$1eFKuO`g+l8dZVMbi7@UR|0>b8%>3!^~ z&1BUluJR+ z1D{PC>7vg(+Mp;NDJIBT_tM!iPVSzyJf zS5kw83@KDulEoNZ9FNfyk|KqOSBa`|V#;w4qUto8BI>j?MbT(KiWYIkFr3P0hl~BB z_UzE42+gKS60IB*Lo{(v7HQ&O&>1wS#b|3NjMfO6K_|?L(&2(}g`TY{J58vntft6F 
zYDjlhm@(Rf8V%9oP~#Te!B9}4{b;6hqM9PoF*Mc5py?5sL9-NkURqoxLP+fhNzQ>; zl0_?8ga5*W%r8`iZL-5=cusa&ckemxSU z8V`sO+SOJJUrJM^+KdInjur(XEPDvSqKX7dm)e^uWFb;qD5Ne);T&uQdo-)ew0e mB419oLC`3Kzz{SVg|C4G{66~N>(;^78ijkoAe?Iy?)o3V#q(GI diff --git a/hsa/gfx942/pa/pa_bf16_pertokenInt8_gqa16_2tg_4w.co b/hsa/gfx942/pa/pa_bf16_pertokenInt8_gqa16_2tg_4w.co index e7b9362d6094259009a2901e5a310f20ca1218fc..5753abe74e8cc77729799e4d484e902ff25612d1 100755 GIT binary patch delta 1841 zcmZoT%edeY;{*++1*sFYmeuD3Fn|G!W)Ohz8G#fV5Empt_ytX9G=$4Affvev>SF?O ziP2X8(KgwiQBaAn9h;jN^+a`kG5OR91Tg;p#pG8f5W@Ih&EH1`%#Hww1=I<|F#cB$ z^pgRJPj-;Di3e*4glY&y(hv#N5R1^j&(N^nhN1J48Y3fvEr?cRWU#T_(+p-iBC!)8 zY#WB%ljUtRCkNOVNT@TIUQ{n-V9)^48bDeTNNWP=$p_SoCfA88OP4b+fYktLuo@r@ zRs*Cb2bd~tekX3lY*WwOx}Uv?W&Z)zmi-kBJ^Mkf7le9O5XrlOQ11$2^R6J&yMjpG z6@+?MkP65lPwIcI5_`m+Rp$?VIaJhwQ1<7iIdy7rKjVHtY4A$2D?j1mOqfMJlBg6hpn+4rVSn%eB4c-=PjGH#U z48*SvWas2KhiRJ~!Y6RBfJ)EF7vdWk4JHRAlxUb3SeU1oC7Bpmq#2}|rJ0(WrY0pO zTO?W}nj4vp)}WDaLq z7{L@68k<@`6&M;gnoT~Cq__D%k^&P`PQc{E6uC*M0$lJaDrvG*s+sJACWzG!k|4AK utfm5SCmRY1Pi{z+V0w@=c~+_&(}d*7w}4bb%4DfDJI)VKSFZ*dEe`+~`SPUz delta 1358 zcmZ2*iLv1<;{**RhUAG_%jz%qGJpY$W)Ohz8G#fV5MPLg@GsP((ghGMg8@`MOdk_a zoEUvj{geF}1(gWfvAKy+Pn6@of{z*l!~g%2KSrCq%?tSO%OXMNNRF}g5qRO2@`D)pLjh=o2w+Om?b`QweDwcV%cBC+Oofb zp=bYpB_wBTks-|)n`LD^m`QQR<_3i=Yyx|jpmtLj1cIBFX_%8_KQQc+HcRPDW0UFp z)wCZNeqdbA+C+`8Fd;Q8%zV+LP|UX!+FQ%JCRn5oBaaZSQxi$o*F{30ZNmB5d}140>|Wn zq((-g$)A!+G)yg$49rrJQw`HBlMPMH4a}2G(<}@u(ozfzj8Y9Pl8q9Rlhe#oH774* zl$tD%&SuQc@FNVGyP*^hm}X~?aYPjthl-m_ewZxoC4eTY3za>RgR08{D*h%BReTYM z%f!KOp?5oy?dnH+ez202jRcil4kI-AvY@0b;j90)#%m523;2WJ5vW$qE?~Ob!W?tupMG aJ|s-80#XkWC-2Iz;|xfGC|wOSTOI(CdRVLg diff --git a/hsa/gfx942/pa/pa_bf16_pertokenInt8_gqa8_1tg_4w_mtp_msk0.co b/hsa/gfx942/pa/pa_bf16_pertokenInt8_gqa8_1tg_4w_mtp_msk0.co index ff65790c0fa4b306d767564ba0a7c5e6d0c4a62f..17fd9d0118d88765fd6fe06f514a6c062bf780b4 100755 GIT binary patch delta 5132 zcmds*ZA@EL7{|}K-10UC-KcR97=t+l2IRKAec8N7sk|u0S&_oFutAX$wqS&0zRVC_ 
z3*ypbm_sa?3(=rtOTe+68AwEKO_t~fV}S5sA7%oHF`Kf4#1IWD;<@d)w3q0`4}R&I zG|$uD^E~I=r_brX?VW$fUfX1ca+sXwKFS;K8qS75ddL=NgD-(ZW65*G6pD{Hm9{OE z23iuSL+hCTSV!y8t8icT?t9=~cqKJGkkT*qf!R5s28c6<`$Z4+r-_g5jn0g%J;CH} zFjbwY4!E_!RClIU!>wFrReI1rDY2T))H=A8S6iJfab!Cp)CUV*qXo{0f>X5MOt>Hg zbc+J`a4{D`P$bzb2!x{9zMx+c;k!b<0KP_q-7D@s?cqpj*9TGi23b^p|= zj<%|cXjMmB)%_2xItu^2Rjkdp$f)&-rtC9_%VlxfAwSDInQ))cHc|sR>)yEa>?=^M zU93I+J&SAwGq&1Cy&@ooOL|^sd!ux`Tw6ZG!a?<>BW9e$qScjaXx5y>U*C-4S>sVl zh3dH`gYM8|IUjjMY0woO;j-`&5R=({v0{+;J;H#{14Ip;EWWG0&XoO-25Rs^g?b*+ z9-qP9mw%3BGp)TMzBhURV(;sN@Uc|Kx)|YuJt1YhqY0m)Xj4Y=(Wz@MQXN0}=x{i2 z#v9A8M6)#Fvu;a_sxJCl{IRy;jj$$utBGDsJg3?#o$0IwO6b%NU^xX=PCwiO1M0~{7pGVE!KE}-`g8B|7r-9K`Rs-8LJrSaEzRae-e7I!+~#JJ z#pG^uy^z7S48Xu8u)Za!3oNT8zYj5dy4Yz(@Ne%xwkBIQ$&uwMfuVd znb7^;r-Avta6t0v$Vkbi?ZN5 z$~)$ie3A0*9r-^|9SveGh+Qr=o3ud6RDmlO0>BU!gAxU=41 zk&_A5(>jCEA}?&<>?M>MOUmTyF>+<{!Ul7xd_9E9i}9xItK>POyfbgnTS5Kl6DHTW z3y9s{GF@hbzd>d=+jr9h5BeM?s7>6y4QKM5H$n6M_?pn$7;g^QKNEbn2xsEWzF$r7 bd1l7!%QF*eJLWr2Y{JyBP~JPznwftCHQcnL delta 3733 zcmc(iZ%i9y9LJyM+R_${D`99Uu#GMug#o!<>9v@a3<}+x!VC`fCuF1)kQ#`60fxBE z7S>_o2*Iu zeDCx8e)s(HyZhdAoqV8PSXYl)h?i~emRt=bOc014JOYDs2t=A8@-j4EFCT-+FmRYa z0_BN^kCTV;nd30a{NHPEgYbjwyrw--{vw#0cQph4HFK1YARRDAnLSOjA8+Iky6vLI zA^^6xnT-|~vt}eM;l@T&39v7g^Z^h)al&*+(1%3b+-xcpbg8IMoiZ`O`a)(Rkj0$I zJt)mTejdr)&b6qFKj@;oc2LcCsv~?q7~!`qyGq$8P*w_8aP}$47(FdaV*g!5^@M#S zw{_qpje#Oi9jUa0n+OPXl)l))7>bqE5v-4MOC`6}`4gLiywGFm>sJq^p*LlHDfH%Z zo6wsdAEcu<*TO0ECgT29U3hsj!i)Vm5^=rbdI9jys-ygoAwdtg-gXTDUeBx#PjN<1 zF`@V#7^&M(Lp^ul*s`KrE?7UVB^GJ!nWm4Pl%0l7m#U1LnMpgnrAd#Sj;fTjkPG|f zG7R%+vXHX1ulFP}@}u{6Tysv79@i)vVV+Q8vrEz$Rxo~Xis~q@~qI=`Lp5>WDE?iM_WNcYQ%pT-2 zuN0}C(AJ~-v`b?=m!0&=ygm(HVG7Q!a`EXulsz`Wz7-8OHoK-X+`;=_D*KMG6|b@_ zdtsP!eiu+w$@QKU&CK{dxB$7XpZ99>>+bCs>1%x&jKq}n{onpf{wDP2fbh(h-Vajh z&$V#s{ptVPpf!8$MR>7C*km!`xx0G{YS1u$4i7W&=`eR=`>rCRh;B0&V8hH}Hr!I= zQ)IEFJa)_u2O!0oXxMA9JKIRQ-QQvDtn^#$4$|SbcRERDXFEw!9ag8G^7;Lpj*epH zLug_bXs8SIwxQQkG(0VhTCntmoIi*7$|Ky2`1{A@@nwkXs^r{>c(7Cw>`4dA3T?UN 
zK@BMApH?|Pjrh_@Id4Zi?oPgghDdL1E$H^Ob#%8kc$x8Du*?5kBm N(lVNOLpl)z{{pP&_YVL7 diff --git a/hsa/gfx942/pa/pa_bf16_pertokenInt8_gqa8_1tg_4w_mtp_msk1.co b/hsa/gfx942/pa/pa_bf16_pertokenInt8_gqa8_1tg_4w_mtp_msk1.co index f875317d421ad3fd7c82b56932cf1dfb5b9d7882..080a1ea8b95569a435d3b42c33872432737d9d90 100755 GIT binary patch delta 5847 zcmds*U2GIp6vyw~-Pvsm%F-_3!vbv;Xv;!oXJ$W&gWcV3w zsA*`sY@r&6OQvxVH6jp%q}0k_wrXW+BvFG8K$_T~CJiyc4}4HDky5R9X71hHsXFlB z%QDIK{QJA-%-nO&ozv~@f1jGXNJR?`lg}J3KGi*527!DL3#`hoK&EE&nW$68CtlFn z4%H15W@3Tk4EGp^7>bHTfUZ&U&{e-KS_@if&dl4?Yr>uPqYB+UP~azYtm|lzUtDP9Lz#V&C2W!gO%Z7iI4H@px8ZbcDe2m9 zy0(8y*M`%zrKD@a>DvB>bgS0>ce(k9s`TN(ff0PnBc8+_Wx}9v2 zXU%EcBn9g-`@Ph7)Xcpzou>6g4bn&H?O|NogXV(GwhKkWHXo^=Yg(r>T^ruz}cauuyn^rAT z9KNKVFMg#4sW{5{v{B6c+Gg<<-AZ=NujZs_*AyqO>!<#YQa}DHDs}c$nwI+9lzzIi zDjjdngjP%bs#_6BE;nhU&&|~~QfK^4E4-}V3Q^mPcPxU|`Oq2fExMaG`u|jJoHff# z@=3s}n_w@bW8qEk`9hBNxZ7MU9OG%DTU~8Vx3jgS`B_i1r`gSL+nQS(47-hXwH66) z!zJD7@qz_dTk(McHggqg0W|u8mwGM4+%t<8U|v<7>J7je%m+)g+@tbtFm2Zkl;MD5 zvjU#Pe4s_^UypguRxRI#`Il{Kiez*L=9R}~(M>P_FXDlL14$;UKFr74wLD7rPAz{O z^Zr@)Ifc2sC&gcW{(E>};84l{lB1a4JgViNVQx|#5`$MU=iW^qMSP2SRGmR$@Oz}t z^DUrBOfW_Kru?*kJ%!q(`wR2j+gkrj#mxfFf1%|z%(=^2z8v$)YpLsxe2Vdab=Cu% zm`~o&-pGsjtzWc!Bj#_+Xt{c*W&y+MjL{&jKRP@@mjzTVR74`G9rFfs>Xdrz#+=`( zk#JNHClc|eg7hfdgzoEg!=C^kyyp0P|x_TF%M`gN6)1&JoF^ zE+V&VqXoq18u(VXgZ6Nsy}6~ey^(R#Wl33%CG8Io2inc}31!?tBHfIK$Y8mX$Ur%b zrIQY_SUQ3MhR6~I`F1N$b}()d=U{^gS;8co-5hzc!{HTNV=TN zo#7m0mYl;$l4HA_9yf6=@g?H?TtWsCGiIHMrD7c{xf{#+i7)GB$j#UiA6ZK_;2>$R z!4l%Y1w=BsivoEE|WqH=&-gg!KCozCo8K_DLOd2~767_?Y*#UQr)(oGNf*jk&M_+Tj0{zfqJ;uP#KCHuZNEr{s_gcBb!Ax Ws7@U~b*k737iS&AkT)n0g1-TsY!H(G delta 4287 zcmchbZA@EL7{|}Kv;`a)B~IxO-B<*XF>>G9-p-CeTe=rFA*8&h%T_4hW-xTgoO4FY zSXqc-1TW@RH)9vFB__)jbGNY}^Ujw%ZfH312$eNo zQeOhV{JfZGa*H!Iy0p8#zMu@)e=K_yfbKw`V7;o>Yr3(qpj_4Enr?0`5aV(~k}ok| zww_N=Vqf82+Yed2N@kBq=`cl=L!e)oFJ06^;_;=oAX|5rL(%5lC0rqs1lfNNTPnzCL z97@BRGo0$p^?#Pfn+iF$aW!?{-t=!8GZh`b7gaQWwnW|Uxwir3QB$w7rBBsI+{fHU 
zfRZim^Gq2#L%Rq1YZ4}yEx)mRo3!#7%X5p2%X>SkO-A|<Ee>E3=cxPdBUb3V= zI``|zs_3Nk`?=-uq(bgH_RIZw66~Kb7fE-bislO*nFAJbWCXe8{eZ=Y=ki1>Xn}&v zd}6RJeyTqkR>>RAJ(KZ3A2>0y?Wz3KqztV{cTy6gSBy?dFjp>};Gy-Jc^PgAdHlvZ zh8oaEr0%D&+`_Kg3nMbPBFzyQo;U>M#OGHG-Ja@O$B@-(zq<@xs88GqF}5)-B^cN( z7fx<6)W{-ZqAN$-`H+W2uiTC*nnie84yy04iyttqkwF`rFgh;h%_DR%2QS@yOVZ?>>UCvbhObl+s+6f zwj~tmaJD}!o`eOGwHanb4z%JAps6({xnzRb%OuZ79E_5D9pXc^B;SB|Rwc;=#KYxD zAxX_JFEY1@OnMF_&F&%j^N3G2k-QD(@5Y~^Q}Hgu6DP?0!-xltCPhS#_M*UUxWDUO_G-(&a3+` z_^;;JQQ-CrSx|+z?RSzti};hXByU7~E0q*_umy2nuFg~EccQ@F)p{UV^cv>%B=12y z>LU5Oi0>#Sc?|IcP4Zz_r=61t=t?qh78OixBl%^-uQ!vN)4nrOSpalNw_b!7BDCM< z1D!3c?VWpAyQfB%UK&Z-!`iWQcv&o2yBAA`$B(6-!#mJE4wvyYeoUDfCoW?IMoow$ zPiB~E588s^XdX*}!N)NU&WWkZq3`CcW)Nk0KZhmb(Kqu8IG1I;4wS(89k`5(a6WvY ztia*kSeFMk&wBMs;yABvmEVaa$I|-7cHV{0aQblXoQKhm^SP=~0`Ku@avt zkV}j{sQ$_RjDkvp?bzJJC?=}&i^-=>Ab|1zFDAb_fe^<3YW_YlV0HvZETB#xhVj38 zpq~s#eDVS@n|QE>K&XaLBn^>J4Y3Ff{0t5IZ5TQ)sWCD#*n((9Mg|+(J+lZ#?C31vd~bI3*Q=6NdNc>TFqLA`<1 zA^dmKegc3e2*0n@h~_r(s~+ zo?w~WV7o;z^mogCP#n9r*!0_YGVJGLY2EMM0c1DYwAnN=>}TB^YIlQR*iEjJo3>fN zDT9TPb#tW~LG?R;YF{vKmht<-!~!aTCkq5NG8#;t6j-8RVqjsOW|m}PWRYf&YL;ed zZkn2um~4?~k!WsYl9HHYXk?sXV3w*mc_E|Jl0L{wcCP;re&RPhBME)xgChT_SOg0@>axiaJ=CZ*=Y8<-oK z!k9*8mN2Hdp%IMf=IjPzx;Q&dE)>+={6AQciD`}FAWByP&6WoM D8r1X% delta 1333 zcmaE`h4H|8#t9ls4j~h@memW`Gk^h%W)Ohz8G#fV5DNrC_yUEfbOD6R@PQA?fa+rc za*5Fg)j!#vQBaAn9h;jN#Y8#&EBL4}F#P{N`G}a!>v@rEODBnbw7I(%ltdQ5X&r^htJ$(q=a!U^txlOKCVfuprrPNI2-&9AQzM{kvs9C@kDtZ2E0H z8TN~@wC;EB0J0lx+H4vb_H%9)bi6@&9ILtfV`1dntm;X!UW3gZJ}K;zHv~5_8cj9| zDbX;sNHQ=>NlrCPvrINLH8(I%HchiIut-ZWFfd9rv`98eOioTSPt}~fkWp%KKm?mH zJA;iMH19$w9x%<$u)zjZTpTJcFXBgAS#0vt zkV}j{sQ$_RjDkvp?bzJJ*d(g+i^-=>Ab|1zFDAb_fe^<3YW_YlV0HvZETB#xhVj38 zpq~s#d@_%?O+8pcAXGyTl7>*IhA@N%utAYf@hFHm*w9!cb~r*kKSRTQ8-~tHYK)8w zwjf%Ok-^4xPcxY92xbdFIEfGm8;0GJYXa%X z3&caE%NZEJ@<19a52V5JKzg!*gs(6bd7wTCfnQ8Pbpk*a!W3`bAu)v!9B6{jKodj? zG(l*f31SO0L1>@}A_bZtG|&Xm0!`g5Dr)^f1na?(w>PSmyB77(7iSygv50g_tmffL}C&2vph6CkKofdn712~dSK`*+L! 
z+{Kc1f{X@}rTj}YObjf{)69}gj4aX&Qq9s#%}rC25|b?wEfUR*Oi~h)42_Ia49rqB zCog1_ntZ{Zquv18Vt}>_7K4LXzC%#*cnvN%(sB5 zKal`tXoE=*;S6T6Gqgm)g}{u(Ad-oLVL{R4rhsj(PR`B@If+TBIq?SO#%3_4k)r{O zX>M!`V>&ro!kBL6Mlhy}qv7V-KxZbV1@@C2gXJba2o~Uix0iw@{|Yt}5-5haK_D1H pf8d$y6k^6C5Ingm#E$7e@Z?=U>O{!oUmV{4nn3`__!k>|cN*6%53>hGWAOO|J z1Y{DU52}B%Kck=$VLLWAF*b>E{8#W%V_^9If3ktN&14%@1C0hBA1QSP(~Ih*3=9o^ zeo`7hwg!kD5Fn)qWNU)hK|xZJ6J(So_emIQgZRYixd7DjPeNOsfS$<)vNDqk)HvB; zcGaj!Y_5^4VdVI)gyh@=J(8RoN}^MJiEySW+=&bHB_Q z+mH!g6-N2Z3)Mgl9>gK6!F%|3(|%xj17q9GSM`Tf2;blQ(_)Z^unOzJ-!1zOFIIFC zWHg$r6kMWVYLR4MmXe%mm}Z%5Xlia?o@|?}n)wz`@dGgo40^~$ zIYT9CLXkxo7#4%LOdJd|@+NnMY;$#Vc4Wv&OiIm(H!!n+GmT8(Ofw5}m;y&3!q#E4k!bvj|s>n zMjurFWPe6MCBk-WZepww)%nHbQzsC>`2QD^U!6b*<9{`O9~m$^0wfktClJH%I z1|&XtgOW`>SVJIGLlBaNP^gA5ga)udkx=m{h&b5LSR{5hLOnl2!+slv&P!^Hj10CQ zT9J{##&%CLnC%E=3qUxD5D6QG-IMvnH77fW8%U@#m|j#bWnj<%(i%Wo6G&?U>B$eI zlqTDm7)zHkFo4wnX|Ng~4ORoBCnrcNP2OjsEsRCYWCImdW}x`y4H8or!Qmwc4KG2Y z@DhZEmms$A5`>1AAX0bcCExZJw;U$O^UV=#BB}k3%3ed(IUZDiS$`?mC0aJ($ zCn&5YAJCDQEFiP+&drggqv;M3YQ&UxDy*fyTlSZ4K4`l` zjL~3nRY-}3iGhWAnpu*Gkwuz8s#%(;xoK)rVzNb|MWVTpNlIdpp^Q98jS^yB52TZXutguHF7l(>h=%R`%LB;2wsfQ?IXE+mxs@?*s-XaamaE8l5 zSnLcBLf}GRh98(@XW)oO6<-75G69<{6_bC3ZVPZRHfG34OiIm(H!wGHgfWdQU13af zBXbzj$qA;`(7@RZChBZ%0b{xu7;g%9U}E~?GWlSH+~kBv0WNsMDPnR}q?u3wv>jCd sZC6cTpL{6NjHw`E@~=ocCWXk!R#A3L4pEb6=;||0ROX>00000 delta 1673 zcmaE{m+`??#t9ls3E>m9meqf8U;qOc%^(2bGXg0#ApQ{y;r}Q`r3)Zj1`Zx51FDY+ z$R$P}RR3guMnNUQc5H59tPcJ3HC1+)`3Y(gn@c2X7&-nc zAvsY%h6E=@$dKm56EcLIs3D7RVt}j+=PShM%ymSSp|eBVWM z^BlDjpi>8JxEAPZ@SgqMv>%wdz}R^6QvJamwv10UZ?yn9auA2A3hU|LE&I=IKIyna zjL~RvQ$&e|sYQ~3SxR!MVVY&Kp{cond9rDmg@Hv{ih+Sqs-Z=)QDSm(nt7_`KPc=84f@TFlaHw04cB78GhKnL?E;{m||y8&_NYff{MREQ?CmZKjMq3-U2Fa zki_7QBI68|xDkLN%-{zVXNX1>UjyPYaWMQSp3D@rEx_5>kRc~ADK#hFz|7ng#xydq zfHBR?onTByC%9TSXBU{Llerm;>1tp&*&teDvq7{36Vn%m$%%r(lVfA0x!|RI@Z_dg zGog$Uh?6oxAoL9G$=*a>9hxgZ0Y9sW3@|PIij3IR0K diff --git a/hsa/gfx942/pa/pa_fp16_noquant_gqa8_1tg_4w_mtp_msk0.co 
b/hsa/gfx942/pa/pa_fp16_noquant_gqa8_1tg_4w_mtp_msk0.co index 3d4c113a331e146926841e7907edf155b9f62102..88ce4e89728a6d45f3efd7f6ee8c47df828b94b5 100755 GIT binary patch delta 6964 zcmeI1ZA@EL7{|}K6bg)u_HIbGb{TFpD+BcQy(C3P3&dcCz!)-cZmoqZT9hRaqM5V- z8neup%3%v}G0ylhUxIGAYA_P}K@;_hL0pU<#KdoWaFN6q8(zeF?|Dw?gV1Q9R{Gm*$URjOt{2H%9)t*)hehS9}A#2+;;V`jq_YtRxjK z6EP4Ii61eMrE(KVy6(vvMu;RynQSMliXj6gY7K7MY?!3iacEzH_N3b+#=9<3b$C#R z&r*j6b@=Vn;XxffOC286;Ww$ngF1YcI((Ko{QsfO@ov+E*+eX6(?qq2-0Y@*eB4}S zmb9^<#JsJaL-PMcUS%b6e_JVQ7q6z`2t7*tgk@{Ev``j4QA49|_E?Ip7AI2rQC;c< z@fpkI0xiW9HMgXg6*r0tR?tUFvRsnvdepXicx5z&r2RtvUJ~1Cv_#u!FSFuL#1lh3 zSTuNreC2M`vMsl4v*r)2jHiahgZEe6#g!ftn>(K1v~6wsZB473|Dkf;TSP18RjG1H zH>_}`f!?s%6WTWAvspY5d6hn1Xd7B?8=7r*e^&Vn)6B}fnyN@{8^tMG^8KMf|A<;+}5t@2_;@W>;?tn%AcOzSxv)&s(1RDi$X!!`o zP2C|EjC;xSs(DxDg%rZ%yHz(|D7{*dKZTgu^d53;^&s)C?Jr{J1Eq6x&6+oZF(GSp aMKd}YJOIP)B7dyyFSg}V`Z|_}!00bNSGhv~ delta 4288 zcmeI0Z)h839LJw0Nt4T3=T$e|Lf4j>&T1#SyCjzjw9qz97&2lvf>0(~o3uq(VPSKi zqnIN$h}uc)%^tYw+84^eHysaXUkq;-8xm&76mG#4c$5u8g_^XwTPYglMn>qXuHK#n8!-j7*r?nN- zIne!7ps}l>p~<-@RECBbBoFueU(Dw3lJcm&4tYMAxRSc=6Y=;U=#1Tj=}h+pO=M z{irZ^GT^cN;b--I&;2jz&uHInv#bGnW3jEtV_7qv(PeaCNsr8vb-dfZVLzk)4x;zZ z-(1LgJ)} zF^&_)d8sIj7K;# z!&Sn+yGV)I)+nEH^UIW=hzQ5!8jSgIOKrH_uoWT>ESJ+5c2r#Qo3V?Ye4S}sKH*X{U+OHOW z)=^)dL6t6m7eFUnjZ4kkwnr!8+bL)Ya7q(N%P@YE2Kj1|hp&iw;P;^4)kC54zJ#Ux#g?;o&aCP5eG(^8tQ zkWKXyDP-gHjs(e;ye*}}aeAZJOl3@zII=`c$k2O^C6fuDz-I2ti$VB|&nK^`JOf+R zIs@58`py8K^VMG%n0R8HyhKBdsNK6@ F=f8@fQyKsO diff --git a/hsa/gfx942/pa/pa_fp16_noquant_gqa8_1tg_4w_mtp_msk1.co b/hsa/gfx942/pa/pa_fp16_noquant_gqa8_1tg_4w_mtp_msk1.co index d5bc2999b84c0298bc70fab46321275af7a5d560..40e8ef5309e1671517226887894259a3e0b0cbe5 100755 GIT binary patch delta 7913 zcmeI1e{54#6vyxF#(v4h_GGNEg5B1+4mRxjwYH|iu0Iw6IK~{vFJ+zE0%m9>Gt+@< z9h(ATtb0f?#wcQlCPvWbFXCd=kjY;%5h5}22Z=`IAD2iL94PkPckXK+m`JC8z9ro~ z{hV{|z3;ozd-nPcoyR<_s5#ih$l zb`8piiaW_*%&- zm?cm06|FK$lVxTpCRCUABUFP%&eUu-A+nF;UhPQy#-8M(w$iNRAcf89SS;Qgu51^= 
zr6|NtLVQd#%M-8lVIl0@pFn6|LJ@@l6`3>@8Bme=Clwh`kx5gL0Tr1WRAfL!CQU^q zO-1JaP;uh0d9>UtS<20$b!N%?Ml}&BT3)JXX-{ftCt9`*D!Z&ric_nI7_C!9{9KJ6 zwcH$@+$aqntyiPHM=bdV3q}&TL)nS_@{^Y5a*R?`)XcmTZ8#`juz`L?5t5?4#UhN2 z+Y9@z4kbUrQS#a=ucE#`_=obir+EpB|Qf-Lx~$=+iUp(_hGqJ)oa) zCT4af+6yDosXqO*d^l_r;>`PWqtd4^vCF62T-cc8wn@V-+CHeCB_}8kE#@CD)e$6;BP%UpiZ`2{V_si^9Om^p- zZ>NS6=ceQZmsZzdNBrS=xX&&)T+W_`PJwgvIJz5pM5ox@8G6VSa)q3n&=u-rIljx$ z&|NDX#tY)sGMpD_-%7u^SI608$VIXD^t=qXS=RGP;I{R8UJpFmtLHA@XVrj+=JkW) z$~_t)MhkFzo1Sj~exOIsyMTMv>iMI=*0TNrzW#L$5hI@nmW#q~>-k*Zr#{g0JAiw0G^N$`7lUKZMSX$@d_t8( zb9#YKCNxBhmB3GZspso}@4BexAxzfKMFCYq&D#qJmU9{+Mn7;Nbp?bE0e|a9ef;yl zqd)8UA>h{{Q+me{aGd=^&))_9gesNhJORARObm^m1YTF5=cj?A*(%>iGrk9h*P>7O z3HaPadOit!)lxn0P@edi@(}XH2g>nB5mszKy`j$T-Yp#8BvMmwIKgxTgeGb_1(uqe zkH$Kje2`L~pT@F|Af4CYcQw<1W|jsByiZAsC@Zr;w;v){H|wG%?-r;jh-m?IW!A}R zH0XBISa;AtO@9+LIi91jTxx4#hN0;M82i-g``ly%!8=zwH0g;NyyZ!X) z_$KX-#0N!fvY=m^feQ#|%f=Qlw5_juhg48fBULlv$9opxNo=}S6d$nSr3KpKp9xulV<2$W*VZ5vY&(EjuSiGbRm&RKvaHYvw72i>Tm*r2?sO1+{eJ@yX Te*C*+Qi0d5I<}d}Oc?zQ&_#4z delta 4938 zcmeI0e@q)?7{~8!g_fpsF!@m{ZqNWLKfNElYnx27a+Hv{04`)<8#tha%+O?HnIS4k zNe5=pZK<4E5{;2)<`0OXm&`38mOn^bHn)K~;|yk+p*Cs^iHRi4i1*`p?|Avg{ds># z(!Sq*p6~m<&->gxT-w=X+vI{R<)bEdysE0^E(lq(iWNE6?u})x%ax6_#oJE7xg~d>d$f=$*)5-+dckI~YATQF zv*g4FEmpi2wWqc7lLEcNzjY`beq)El>SsKz2j%k2O(i?)wY2Yn+BLJMZ#m>6!^%I( z)sJFM*>$JIYL6SEpptz1t~CoMY>J5xXxTEM6?mC0j2VZ4pY1b{CiL^z3ICHgKLP%(qbAq( ze+~l6mHKxq6dp$tUPVM7Q^4OD)jy;6*EL}$ejh|cA1?rR=@}TokM;B0iOEreX<|-$ zw-Y_j;{0>q?OPK168OSt-2Xjr_j#OWfxq$Y=K3q}PY8^?iBI?kc*`Wt^R$8~iB93X z5cs7TobLwiTL+HzuYtgoZ}18Fw_ZDOT*mo6@SnMe^Cy7!T*mn!;B#BnPxL~7zJgDP z19x1*`7rQObOHz`foE^y{&T?RmvKI+^8}Il4G+8x6OR9Z^ACad?9zl0ybSz+3+D^K z3-xPf_-_KwZm6is;KX+j@a5wR{RI5)0-Ub`_j_^Pqy7XzIfyKup4dZPOwhcai1+sO z$B(g0m_d@IIgqsI4uUKV)WSD48mmPD=o)Y=i>l2j4p@zg zG@&zdVdK(qf+!$EfHS;}ep8AlAFf%95XX+R^1V-=>OUzt3-)|ESVLA))C!fJs3RMk z#&20g>8o|5u(MOp@1V|NeS9x3y|bPasLtZ_{(ACp%2ASjQ5`QmoW81#zS6X_fvox$ DUf9e_ diff --git 
a/hsa/gfx942/pa/pa_fp16_pertokenFp8_gqa16_1tg_4w_mtp_msk0.co b/hsa/gfx942/pa/pa_fp16_pertokenFp8_gqa16_1tg_4w_mtp_msk0.co index 4d59e8f3174c3009f9ec89879f6669687c9d652d..f35baeefd4b719f65f713fb2790deb1ede760708 100755 GIT binary patch delta 4163 zcmds4eN0nV6o2>9+R_v$0fT5~EXiiOfWDS~u|2e<6(Ksrj4c{b#;3;B%zY@xpcW{G zIYi7`c8O?$SvKcziTKzaiOrt=vBWJQrT+0AWc$mP zwCD8v&N=s-dw=JquPkP;tRQYPsq1+Z9zKch`?d=w?ieQ`Ic zR8JHVCwk2%j>NN@K}N3n8e9kXI5Vdyqf6+dV&5>YQ^F~_ONbKvEN5Rb0P`NfNZ3>X zW*)(gu&ENv_&OtIN#82kqG3}NnDKXYm_<9gbSG0S1#BS!ugwYAMgn#u0@A67z)&CD z^Z`H>iP8cnrYP1a>Hax-O zb9oa*Nj_LgK3GaVSYjWVtEx7ouvW7%^;&9|pzF{GQ*4oWjdI?-w{LMDJ!gYT$+^L) zDlMD0#>iGYKMvo8Z1~A)#jM<^7xXEL8L!)Xi_(`r?hq)vVdCdEO$WQqoUfP`9nA&{ zsg^DrhaWysueR+am1@ryF^eoYkE4QE@rEdKj47jpmD(<0LpRpjnZrywCG>J>K}Mt0 zH9I&p!EGwjYhf=$y}Q5>ZnAt>ai8$9iV-2rg{}G&VC7P`>r$R+0+(tH$ME=y&r%`9b*nw`vbKxQ_k+k%erxX1BR*PDMrrQXR?t!b9R>8K#qEHiilUlVf~e1~frim5Z08Ge`s?`U|;U<2TM zm((fDU)>At8o9|ElUlHtyZqgo&oGJkyyIZEj-C9~Xm~3p&g4`; zX85j=#~WrgfO7umF2HCNuX}uXi_8zM0DtlsJ>c3i&icVr!)8mI@t>Pi8PC5m0J;^o z98=z$$xkw{5Af$kK%Wl2s#oWY$p16lr4aC00$A7p@K?VCm0CVl2YaLy@ic_K2MkQ# zelIZ%pPJ?|4f#JUptqiBvA3aV_1^MN{GYvTS^>QKr^DMwH@F*J z9#=zcO`W&KTjOy!>uYN5j)HpH-C$!+gJsBf5bARt?jbK3L>MKhqfVllgD|^nVnBRb zNj$lds-vjNgiX})i9O_#l-VyYv0p`(ak`v0o`TB>+=K3maOKX~KuM`5a6huNV7TgH zGTcJoQnW>ct@wZc+^3Jg3~CqQE5FPIDe5x{t| z5!Q#3qkRPSA?P}|P)%(^sJg)J@DSCtzQ9dXI^d*b)#ECbRfnfQ zRy{5+QSEm3taW|RMXWfloXp|2%S+gUE(dlvB(7|Cdi`>WptFRij*=45Lv(Ny)AAa) zG&!*|7;q8?r=6C=3Y;!^4Qve%2Uox?TU{=DA+Z(~1{`ur-h$aUkE2*#+3ld^(mW2o zyhforAUk-2c0MBvt05ezNAE=7YQyZCd=vU20^J$b{`f&yo8xi<-H$*Iup(PKTn8tb V(Vlj=7Ut? 
zrbVPbP6{d$R69L+?e}% zYS1|1l}j9r7gj-u?Z0z~G1y;P-dWNkcQYgB0&g+$QMN~pQ~Qj_%{YV%kLU1n8Rqe0 zfxR9OMrN^9JkMK}GVUu&w=oPKi&^RvyN=lLxFx9AL1OpzS_JcKo3QbQb3XZeLd&=qItG`nwI1u`~HRkVd&K)>=OFt z=AMWWHH*f16G%%!fv{avT~S#4!o!%tmQQ{D*2RP$tqXGJImG6}bG1JS-2zve2hZcR zdGP#wp{200=wVzYT$jfKp9dNlxvHQ?UcN`My8^v|E=JxbmWKj5Ej1L|8&OZqL2Os>&q`2hUF3dfW##`$SwO^i?$^m3{6>HQdrk)hoaVB7fqP zrkzaUSHXqpqWPQ&;FdUB9?>KzRT@&_0~j|6$|`tPH%yA7SI2ZvDjvMHW!iJmNK9}1 zynIR@7fo9AVb@dwi5&oI>9>=k0Ab<$KY8h6Leni%rE#vT_>H?6MSO44u84f7O34~f z$R5IjENEv-G?5?O!cu8{1Du9R|J}Guh^tyS_F}dc)=Ksj5Gc@cG!PXeXTn;k`8|!1 zPbwf(fgaQkvT2Z{E&m2L$=1-aha4zRa1Luwwi{E$jP!7 zhHZ$`wlON2TK}@7+?B&d7?u_;M* zZZGQ%yIXy%ueF6`9TCnKb~J~>t=@=T7z7Kx6Get{(-!)yLo7;)JAY)uUs2eHn+a@wis=Z2U&H+h+&F7qb|J1(`2OAO zRez)Nrlr1N9=FOn_sarSbLC_ITPcPVfmOP*OXS6Rwi&z!+)2t0N?JGz>}CvX>m zn?B8sUP56WpHg6CIDjwhLc_ZqtdnVL-W+M$T+^lMP z)4m+%NjoeHd8mW3uIlUXaDF<=gAEst^Nr|eZ4yCsAHE=2j#DkI1=Ux z1?yCkW7FwePSw}NrOCO2Ue&?vcBm`({Xu6vUBc(|sFmi*wTd-K@)zv N3oV9=_+S^B_b+nh`56EJ diff --git a/hsa/gfx942/pa/pa_fp16_pertokenFp8_gqa16_1tg_4w_mtp_msk1.co b/hsa/gfx942/pa/pa_fp16_pertokenFp8_gqa16_1tg_4w_mtp_msk1.co index b5c49bd732b6d6dd446e8e953b554071c780aa62..d98dae9987e39ee483e5280d3f3762699bee6fd8 100755 GIT binary patch delta 5688 zcmds*e{2-T6~}kRXP=F)v7H#%z$G~&S2zuhZ})a@_f$LF-Vb9JilF&X2u}Q?HZ7^s zIEqnH34G^Y)Tvt4bRm;cw>5H(^jkPw|7`dayt4PT5&RHKt2&rh z|B$>m~JaJPCPsUpxN=wEID1( z=^S&qtvm;!O7pS#=Ybjj_5ipgmHTXN!M)4Vs(s!#^yJd2wXkaAz$_j~W|M9CZRVT( zGJs#p%`S!S4Tf_w9iiD(-IQxi|7IfJa@`B7fZ;w^T=Yb8LBhte%n~+-5AMo5gLiae z5jbM(d1L9dlgG=ttqp7f=EUAC>!%IR6;ymTIglW&G6BDYZ-*NNJY$|cK9p6!*60v; z+i7g>ahSU=x}0P3p8=RLl~Y+V(=r|@1vTcQ^Rb*;PrH`)BzVDzh8@7Q?8Cd#3hpd2 z?sqSJOOlHhTK~=inUC<-=RkXj6?cPDCw$A9Rm0X@mEiZ8B}O!11>0c0%j)_nD0dd2 zxdjF9EXu0jI6$MlAbzv0cRfN{Z=BEGI@a{fx8Cx7Tz2(=xq$>}pQ-x65?z5?7RcSthZi+LpLSnbdJMmjXVvTa;)(&=yF=e zW})Z;UUc<5?5>i%n!3Z^CQI55Z-;+}qADG2t>4mGwN_P<+gsa2$=B}ncidwf0xJv1 z2Yo?s3bW|&N3blW<)c+kc(1P7aW{Bw#M5cmXRdC`s> zruY~VF>dUBh0lJMVt1VPpQQe#^z1e~P5t9&ADQhti^~0z;y9Wlcxuc}Od09pa|%b$ 
z6@rDMJia@1XykA!E?`{wK93cO+tC(+J(qaAf#M+QCD?VI$N0+-emR8!bQt5t&7br5 z8Hzt$q?|tLZy8P*cG8cje*%^Kjm);3$72*vA)VmzN**7j_&9os;5Wa)<3Cb7f({XU zA;ja4DBh1|2;RMw$DdPt0)f9{-2FI@OHyBvox(;W68u7($7?8#pjLt>{>0-tirr`* z!7T@P`~byk(Imk`ukd&$#R9rQ@YL^k9HY1jRb0fl{L|EXd(RJ0+=8|c+@9{m#;;P` zgn9`cxyl!xr1%0lOz?gOU;IOggXk>AjawHLzf5uYDn4zV`r9jce@^NL2&Yht^h>Y| zOrUL-V9koTjeJxMjlu_wqgFIL#^Wf(9yCL6=Pn*^r??z}%NQTu%j2gg?nWZUjn7W- z_y-i9F?jz^ssFWK^8Q~?e?Hnr%C(z3HYq-p-e>z%-l6{SDL(!W)bBu7Nb@acdA!Mi zE5LlggDU<3kH-8$XKPzWXR|LLN2%$Ls?798MKASwHQC2ZNwrO%Zkw8H$NIGZiw!Kq zh85-&WigZrh;=rLqR8CddV4xi5+zy!w-%wM6js?Bk{DrLQI%C1AZh{I)T25}6C=zN zBT>78*XN6|2%p5NNs4AiXe!$uNe{CweVW~{Pm(oOArfF|KCiu1KA&Ht0oanMiFcUJ z7Hd{Q3M(OpqV+5|tg(IdMe3O;du7&-EUBz8P7BjCS<%_@WwoBIS=J?m1w<4!gA*sD3t`5(u#A{9&Dy@XKB{oe~bPWhnJ>`fxj&{bH1zqF=G);aBYy z#r(R)+6>9|>H>O%Rnz+iIrRE~X1#tDJ_3OU6(nIz0sF<)gY+a+ tZy%;V!Vc3PcDo8uVG`bnUQWUlU>Hp%;W`jR;4?hno0bf5)cYA+^FI?1uk-)_ delta 5893 zcmchbe^3fL29`rcy~Isn*-Q{W(Jt_wFDN(^iaaOr?<%a7c$~VIotV zA-M&?I4zx$UeY&dZAS`YX-aL#I$fOF9I4DSX>AHlZ0wA7POEKNaf%5vME|hwd-uLe zUh)qcXLQEdm*?}m&#(9U?%dtksl1`9d41)u{Qc+dJ7Ts@LV^5aPoXgVrvUIHhuAMs zo}HBjt?Zd`6Z#TGpt$H`UI_!NEB%;t;arfcDw zJ&AXk?yJfx@RiS#KNh*cu}*_cS0K$`_yL=WEIUu!rvbFw09eCwL!7r^`>2h0YeNz21fUwbK93 z`??Z;TD4*EooJJ&g15>i+h#<*;eM_w_ETp z`Z>XWaCb)V9R;2Q?`-jbSx<_0>hr+f74Ua{Z^w@{?=cU4r+2Ip*iQRO{{W{DRG-P<=`{m7%1(T?oN`1nY!zWcy$ju`z#Rf zdwM}xwX)Bd=mpiw;c&pd`&=$ShwlNFGdKvI_H^*}*u}npbLv%4UIPB?%$@@OxZRSU zSdu$Xo{{<1rQT$I_#9~W^jaV?Z!B^OSHo>SXZm3{8gLFnC_`AD>q!yh0}cG)QD`}b z{LtIWmVBzoln^bvW8}*GJ@_(;ZGek^)gOrf=p6n#X!rC9Zs7o?a^YWRy|s=n!!5U- zf2UK{f#+WO>nq(!G6Z%va*O%9vESME3_SP=?#7ny!m?7hKQDbZ9P3x`X-`6NuSH$1 z6L<+``<<4jU|p8eJOK0kPU~4H<^u`+>=N8luIRghYGXr_xZ7wD6)mVWD7*EbzPm9P zl$u1nL5ejr?ADs@P4t73CFEO+AoztT6uS(I)>lVEu#kAF<@BWQ%+x&;#%q4*7S0ppsAm-*d)pm;FO+fP&bOgC>oPwiWf`7fL; zxFFp76ziys;O7VU-I?R_3%#hH;CDxP{F(Hj`GpBIN$~o&c&t#|i@>`W>r*`5OmQ_5 z2`;Xmy*GILZHj{nl+#7+9ewGAyXkw>{vvvvWE)z_;{l5IqH6@dy_Uzv zDYj7QzcKFJ%HzLK+>B}o9uM>QGR3W^li=p`FshxxHEn-_$CVUsLY)L3YUl9|imTBG 
z!7~SV+(z*@x`1)b=m{P_OL5`zy!{8%KJ`=H{u4Kwc@>8nw0Ue(yz2M7{WWTzJH^}I zr1ov7ABS7{$^{;ODI1Ru2832Lc@-XyhlQ5duBMiqAw$vWQmxdZbQ!D+x=TG2WL8OW zmmytdR$X#^RZU`6Lv{9ra=22Z^@JiSDH25Up9Qdk7s zGK-)pG8>?(2idl`Y8q@-VcD(CFe};Mb=6=!Ym&?w)k0A=S51{zJ({ktV4CR)6Gvd_ zv}hIc)q@I-8&8X0mvlBvT{c;ny6(zKH|HH9^A43!)~GRmB@EeQr5S3JWj6FGHi@w4 zo{YIeRarqsL}oo3l~t?=Q*?_rWw#zvi?AG~sj&!VRAi$vtIUY=#Cz~-5VWG>GjQV) z_vdGHZ3b$eO3`1GQ}kErHF@Z_GjKI}Y!HY|o0}N*M0I{K`P2ynF#i9=m6aT+MGg^TeHevG)EBvV! z8nr@fiVJ_W><2}ydy7rKjVHr?4%XKF?j1mOqfMJlBg6it&5SN4EclYi<^+!&tc*>Y zCH?TL1KBzGob9yD2Eh*OET9s1azRuhqrv1)Q6(BC1{UUNW=SSS7HI~lW@)D8rm0DZ z$rgzgiRMNoDTzsjM#d=yW~rK!7cxps7KmpvW@lIu1}(Lr6c3nYXE1O?6&Ht!Z{!*pcPgFv;u2@ sRaBF81;r;TBuFqd#7?$Kuwycan_LB?6yhiEO0eUc5D#(lN}##&09j+~EdT%j delta 1355 zcmX?boN>V+#t9ls3t}f~Evql^U;qOc%^(2bGXg0#ATEf4@C&NYXb6{K0xOgO)yD+n z5~HsGqHVH2qo5LDJ2p2l>WOmvSMX6|VEF%k@&jp`$qU2{G#Y$-q|_NqFRGU^Ff{o2 zNofGt8X$H+fRrYXtqEcW1xZadP*9xADPf`w;uEh&X>*l?7PEvgSL=TECYJqDtS$R1 z7<%^qS3+{e5*gB*v6)wvgP9a}Y&K9ZVH4d(ghj6riD9w96c`pof2kD~VYXz1MVEsRo8t3dE&D;?;of4?Z{x|Z zUx2lBzk3Ie-DuNh)5x&DZS!6?6BZJ~dB^56p;nBW|M?taW^CJhG=O9SlqPd&PXij_ zz&=?Zwvo|j@}$@j4O5FG1GALmRKqmOWJ6PP1M_6lGz$ZZv=jpaqf|qSWTV97KkI4%YTDs`5Km4b= z>Yh=TD8O5(7Jc`z+BY%LQ`Ho6PSf$!_`rvcc4?G|iZ z*t)PCEp`g;7I=%E1we7ZR&l{raluxY<=lubEHLnYW+x=m04?>>ap@2^D#;ITCD~e+ zWosqbTK_FuE6LWnEL$td*7`qW+qd<D`CV~qzyoP?N z(Raf_F@<{T`7G*;7oBT(^B$H%*YULmVXE1}pI@a)V!JUlk1|RUQP-X_gPux^2Hhyy z$r}ZC+eLOf(>0NfT+p5BsH-Px6xtuZTy!ynqzOAz$_a_{zl5V7AF=)Tqe$Hx%A6U)q#p@^SZzt2Liz>buT&c3Xs~sQS zh)Z%|HCdcDr_171^!$~XyOKrz`pwnwD)QadcUNSQzuty_wHA)T$psU9ESHvl<;A)} z*bm*}=Y_Dlly!4~1O5oha0lES{sX>%uOkwElnZm=0K>M2BOb=v?)G<-31e`B*tY>@ z$66v}KY$pEVol(WchdRwgui_U?7liD65eu5V^sSb;fG=hP_HsfkcOKF6_C7)5WY9%sxZe1 z|Jv!Ks{&3DzU@RhKTh}-Wxqyi!=28MhNkZ{Qnlv^A9_EXzeM=DJN$nj{M>!%Z{iB! 
zZPs-DcbP8+CNShmZ+Jq!rLuuNC{EbmWGv(d(Qu?Ax}RaZp%kno&|MvHYtUaC&>$12 z)}X(Z)nKj9qrs40V_7ffBdljT=ha|Sme5)Yh_@UZtI?=TQo&@U}nLi6SR-nT%)6?E3Gtri6j{-Y;* zxP*w-Cyfvd9619Yt6>i5jQdOTW~rj~z$`pdZAZb^9%(iVKQ;0NdZjx|8OHX&G`x|q zBg^aB#t@cixQGmWy_ETnhXvO>OYMPciqmCjP|{^!FWggvFw?OxRr~wHq$rD{*oUFi z_L(5~Ut2Dt>6NoO%^sQ>S)kwqIyvqowbaf@m(5A>nK1)e{dAQe&Rfu5lUChhw9!}7 zLzHo?Ja40a=^|}};08>Sk)OZ?MKV82io~V={oGbyw~8Rc9c@Ewk%YZxp=)#xlANmsqu>=y_mONQyHT*zz+uP<7i!=j zznfz_3b50GS+>=;3EiF!h2>lx1-dlR7)tPUj}g7KYgD;b`}{aYCcZrJR`axs-XhMz_*KVwHIsI~#pU zS*ugYb+Ybu(EaLRSI--Rk+=E%6%BKJR%j&%zg{S}!73rcKtritdB>|1ZZgnak;TVZ l-Fkxv+ddqRM>=pBblM?HbTgv3xR+z z=ol*u0EAO2IC zw9mJn=Xu_{=bpPK?Hk)?x^u)7E63Kp`EkX`NXtS9#Ft`0f%FxKG)H&`eX>6o*4j=v z3?!x^f%33t%tQHX1DrN*;$v_<@YBNCOY@`rPLR6DtOWelvMAq+^p>(Hd!gYKw)EJ+ z<6|#iwujiV?S+kS{2|t~y|4+6TURVAmUOeoR&FnBf#VhLHWiDU9W7&4O9}6wgcTVH zt5Cvw>4bdH%`;%nH7kUmTC{T^V5(C)CGE}7Az3rv0^3k&WrJJoMHRq#wPGUx3k17B zur|Tk1k1)+3HD;q%d_7BUpBrwsXwPA_MbhUX;)6G4fr;UVby!$BSog zLFHO9%C(?!E&o=o1(j>bDA$6@wfqm|c31v)xq~IEFuhU_c!$`5jd@%#e#O9=%g&S4 zSh~`zYAhFPwLva6zY^-z%MH_gMz+&7(K^}WnT%woX*WQg+heS+S>>|%BR{w`Ni zGo<@u40>-F;a%?MfPR=TD~7nP#Y_%1Ud4Ug@a6M75>v~I4a{d|c_cRGr0=3E8;Qc4 z7-+t3(47+lEldvisv%utwhcpQ(ycFXxy8hKb``MY?|<*%9;9dClAP?Fg`Bzbs8P4< zftC&v>)$d;{_r?UTPqGhwx1ed3l5&)V%x0xw=rt-t-0qO^zvexm4Rr^(GEl-=iP0| z(Y|Rs;yxaGCd9laJG(0;+I}NX!*f}+Olfpw`(w&LP z(n;`C0lc34b|#GJMskV6q=|bxveu9+wZgU2jN*U!c;D-A2eij0SHTVCq@8xHp~56V zud%mNYn(1;dpOiehv<-tAlpJ=oN%<+sdg)S6wZndSHZkUa~N$1AY+znE+|TB`3%HM zpW^cnA8ypfS0esnjh54h|8A2+yq=7 z1H`-g6(ZX$h`+s45)suKMGo^i6-gq7_=vos>O6wDM_x$fClPP!QjsLiA#UE2$%W&m zkmJNBnGOIhBc3|1<<}5Db4tr^BHowK@;ithI-})(h+iWh7kFlAFWuiL;g`GGgh^6Z zF6g?Xkg3Dw4!oh?m{h@|B2h zexl{VK9BIq1&xy>A%2mM-+>$_3RQ>b1e*{yF4poM#0PLK-;el}Ia+=Q@qVk8ABWPT zlM9L|t>Y|87?VqvPDcFr3h^sTwDGvOnTO?pU^ss44VaAhDbNuLw|A^2oc0V!BWm}# z?24r7T#6)IK1EV>q$2B_xFY?OLP?+Bi6|bRl*MowcZf7192v)JB`r#aLzOhH22)PA zvV@yf18ThrB}l3c1rRu;oElz75eg+Jr8tM*udM4Jd=YgCms?%JO{mgGD|HCL$`VAt 
z=S2%UaJv%gAW3C02jvbZ0dBv#LM@>LkZxMORMj1;Nr$&q2@cdLXH4Q=#)uT56>bbayDen?FpMSHxXXwTO^HGWUv*r+dF5r8#V WcWr!~Xou?Kr$t-7DE?aj&i@BKTmw@8 delta 4352 zcmc(ie{2(F7{}lDdi_C}P{*cQGo)h*-RSV{dTno`C~H|6tmv}f0yslqKZtQe{?ND> zNw?XN#(-8&Qxp>+OW6LHIC2S{Iz3EGG>9e`Vj$7bm-%K zw7BkCSj?9(-oDmWcMS-ft$7)MzAanas}#LT)opEVpQ3%L-n!K-=67@A_VOw%HrJX& zGb4RlUS=tK!7(V?2WUA+r{rF6K%Vx_9%HGAF+3L&Qu>i8Xljm@aYj1hDYClU6zYGk zHc0v1J}8Dawu?u+x=(ZWVqDDpMvJFz(^9;qkFNOs_JFK-tNr@vfx>(l@?3Wim;9()OhU!yC(yPc9%Z#JaDZ-qAk`I2Q&-jG&we|UemAIMjw3+u)y z)A#@9=ok&JN(YF#J9&~tC*UaSslA=K31ox9h&#yon+p1 z>rSCIc>3?bhMxUJw#vV!QnG3V-jos>xhW3m{R1U7Qx><3pO=n^4lCubh;KF>7c-|v zbLlGhE+rlM@G)~sr`#>kw`W8e`>4|ny;AA$8q0n7_v0OHvX%{(>N|7JIUhSyCM`Af z+$uY>WwGmdYk?2Skg_jQ{Kh_0&#AUEqd#~TS3G`wW@pYi3)~sWb8(+p4?K&c^MCTt z_1>*QjZik`UXA$UfgHU6^02b@dP-I;!Mk&EA-Ovja@(9RV|saVOf^i<(%2WBmfVqY zSmb~gavxWkJrP`H^c6M}hhq|i8qtKnyKaSeL#18ZU6dj7RqaEZ=ko;l9uiV9#BThd>=GP)FtR=ai z^CU?4$Uqbo`0gsu&ua%G&yo4tQ2yjIB=1DLZ-RyHkc;<(q8Q#UQAl|FXE2NWc~sz%?`#tCwU3t?O&06 z8RDl--{nya3f#Fw76dT=iR9}LPvlA7jQHs9ByU69bpvyBuXmzAoYDl2f*!=j7LxoG z#F++?4!~NvvXHh&koKX5oTlql1xLC#WE-~V#x(?E)x_Ym^L;TWzk3- zqI_K=gCx72!3k`L@vAhcbY__tPG&it!^t7Ws0;7`Oa(rQWrUxZox@Xd3~QXMk-@nf z%krqqA6c(;RvPqWLWkn?PUUW!Vu%x;Rq)lPi3 z)AiYbI1CowM0fvYz5aAKtl8!`T$;Tchvn4G`Yg2>K0%GF$wri;GykNXv=`_}!<%8b OvqfJuP)}(Pg8u+zaxP&2 diff --git a/hsa/gfx942/pa/pa_fp16_pertokenFp8_gqa8_2tg_4w.co b/hsa/gfx942/pa/pa_fp16_pertokenFp8_gqa8_2tg_4w.co index 0156dee7341a907ced8b9cd33a8efeeed7ae5142..9353d04b4023b60c2084a2dd8844bd12c5c45522 100755 GIT binary patch delta 1871 zcmZoz#kgQS;{**Rg^-C_%j&<_F@OP#W)Ohz8G#fV5dR2(@P8Db(oik~2R}p{rjH58 zB}N}q|73qgK_$X=Y;Ix{6V>^}e;*kzI|3vYP$v+>_+LHH zPX;7Dd4ZTsJXk{@R6{6|hDfM}ScC?ChKBt%44s$M7#SICL9`+xgN^N;W-!|kiJb^x z+c4~&EHAA&`Hq-@ggS%iMfFk!1`Qys0i-p7v?h?A+@Pa0Sx-e^a+19DB6(R5>SaM}UKWIU zSrEy~f>19DQo+js(s;d`u$e>VCnEv>DsA?YlVUZoWkUFK$VKgDJr#2%eE!@#LCt~H zq5XH$egQlhW+=LTlc$n0NIT;Z8nVz``>OpYO{l2*iH76o3=T@ zL4$?y?dDDwg6ekw)vjRPe8#(ni3L;wPmb_wWHgw3%dbSk#K6Kl%`C~p$Rf=k)hx}_ 
z+%z>QG1(%~BGKH)BqcG)(8xH&z${gB@NEVIWjK`L0n_XZEw-rQ;!yD^ z+Nk16Q1J>h^}0~;H*u)yEui8S;b4X^Ty`Fa#l*qzqhPXDz&1-KQ%8oJ#H7@mcmp$6 zLm1P@%oN5nb8&?+-He@JOc#^M`GUHW3j_@|mj%9JWcp$^`C*XUqnIeie+3^k28RFtCm#{BncN|6pwZytBc;w@dQrWUfuX_A zPf7#G)&Q{s0;Dv7Y)ue5C`fAZ1@TaA5SvK#n;j%Pm?g3~TKBUzvFtZvY1v=F(6j%) z60-F P1lf=nb6DfUlxkWkv3CNIS*AjJf=n_RDNKA=jn-3F5_ygD|&(db~2Is3b5 zKQP?D*qEh>!f-gCN3vU#HisDk!{NzaO2c7;ImvcI!r_ip3yb3E-!1z=Vd36l({JO+ zu>Toz>wfnRAiL3~&8CrI|HsXZ_B%+A<1(jxEQ}vFo4S*%*I=`S*B-XXKLQ#VjV4zG zmS~t-BpH~cB&QmtStc8rnj4rWo2FS9Sfr&G7#O7*S|l4KCMTzvr)o}K$S5`0Ac&)W zjSo2A!U-NQi=E+tHCzbH5QmC$XrqcNLB+42sn>;y2gIPNw}6WGgn${saM^hv783_U zMBe06LEEex9i13*5|dJM;tkBqEnrL|6C)VY%-j^lbhR{rF`XTaCl?6n0I|Vlv5;4c zOc6Gd8$;zLD})Jf!3!_H$yQ-zk|&^L*$HUbW&tb1ChH1{PhJov!F0lJ@~$vDrV9Vb XzkpOmz+|g%JI)&c5GOAOnkx?gsntq` diff --git a/hsa/gfx942/pa/pa_fp16_pertokenFp8_gqa8_2tg_4w_hp.co b/hsa/gfx942/pa/pa_fp16_pertokenFp8_gqa8_2tg_4w_hp.co index ddbc60a6b520ecf8a224872cf5997d419088b871..0c310c38955398dc085d8786b62ed32776946dae 100755 GIT binary patch delta 1900 zcmbQRn{mYf#t9lsD`F;UEvq+hX8;2j%^(2bGXg0#AU24E@C_iCs;Q~kz2tf5Q z0hz?;gX*8`&nT!w*pAIjjC`UxznFaL1OgcU|6=m169{4aujcO~17=5n!~*IBVi^Cc z2l~l?#3vUh+r)!41VS~0B58<(YKTQ>;Ad#qZ^O`eNsW<_!4^a-GBViM?r8?I9g*0H z5Vj4&?#b`9H7Bo8Hjq$fFukZ=%D|ujq&0xFCXm(y(vufRD^2zjHtDuA1yPFqCM=}xctI}pU{WO-zYm~N_*fJsfIrO6SpNZY(6qTQh`24#$ z!MuZIvVisuhxXr1`x$^K41_CLnuccl0+VyX<{LWr{R~Xcpg5LdH97cq2*>Z{epfzS#WKW(Qu~pir7Dr$23TgM$bMQ2kU7 zg6el{HtzzlJyc#k%!xDr&n08PCvR9qt+ zRlNmN+$9>!aE8l5SnLc>(%?d1h98(@XQ)6kXctJFiG#tQa`LQ*Z2?ZE77RIwNvS#U z24*g9Fs6~2F^p;E;tXXP8k#u5L|sh{;iATtaHgs2<^_=fOiTvulMAEeCJV#}aKS6P z$jMqUW~b>9MLU6{Z&xhTerX+h-VRWWu<0a24*0jYxM$y%{?oDU#=WjG8pOdbF! 
Cf%Z}W delta 1422 zcmZ2+fN{cZ#t9ls0#OsSmeo&jVE_Xd%^(2bGXg0#Af6Eh;m;^Tr3)Zjh73+91FDY+ z$R$P}RR3guMnNUQc5H59o|*W0QEY0lc-lmW5?z*ekoimpe#4pAhMCsX!54W5)D&}Bm=XQ zEi4Q||{Azk?>e3nb3O z!2m6NCYwiZ3~+R`WXMTOO3jHkFf%uUF^xRz{E7gWwN25$mDr(Qe5z&E^P9lI5Q!Ra*#?9XCr*25 zX}sQ4+Wbn|oC&XAedHuKO-z^(ejI91X|&l&Lmj^#d$eRY9cKP)+Rp$?O(2}k)-;6U zw914)oZc}5#_6d)Lpe@QIS`0b8D|Mj#pyp=_JiWpy~U>A#*<hnwpfDY>{Y@Xl`VZl9*&@WSnAPmZ~{BirauSnLbK(swT+CriBQqBm)566B$}}`JfQcF!8Cg#D5lq>U|303pNWV*mgE delta 1347 zcmaEGm+`<2#t9ls2QnsVEvugp%m4;3nn3`vt zkV}+4h_=c8jDkvp?bzJJ$S2D2U%^L>f#LuE$s06mCO3#1Xf*ivNU1ZJUQ{n-U}*63 zlhOdPH9+iu04Yr%TNA_%3X+=qK}KovJxzhhzr>ZbL1M(}Gte>EoFU=BEaA@6x}Uv? zWxonr%l-<6p8fxokem`=M!Zute~^h}BGC;>oAoRYv2XTJH(?YIWQIC`lCao(Pt%+v zM{G9Gk>QlN|Fda7Fz~=wk*$fcfB?ox!sZ`lKN-mgjBgwQZ~sss9Dr%UZu1dm(%b+H zizn=x_xPGHD&GCsvL6&E?kzU`Hl7Uouduf6ckckQ8*SQb8X5M#*evM3gQS3-Y@nkw zSkEFN`8t|Bl5vaa@xPG8!3;CU43p(J-}0GB8U?PBl!kOg1z%H!x2& zO|vktNJ}v=FiJJFNH$7LPEIpV)ttPLQEKvq95!Qi2AddYriM~HV49s_hcl|UI8mxJ(=j6Iv%TWo@%`wQys|NlZ%3i8rt?bA>UD zOdMfM3o|Du)6m4s48}AxpRAZ|IJrj9WOG>dD@LXX!IKZ>$W2bj72txGVX2d=a?OMo u+MxEQL20hZhjPuB7}6&H%C%$KkT%&W&yHz9`sAuSJ5GTNh?@04!{h;qzFYDD diff --git a/hsa/gfx942/pa/pa_fp16_pertokenInt8_gqa16_1tg_4w_mtp_msk0.co b/hsa/gfx942/pa/pa_fp16_pertokenInt8_gqa16_1tg_4w_mtp_msk0.co index bd8d6a9888eddb1f26571985c344db480a779b6c..9b8020fc58664286330256925b024b3d0db8a7d0 100755 GIT binary patch delta 4162 zcmds4YfKbZ6rOuwS=cP>A}CVVL`YkgLhZhH(6wck<)JmzHf^IR&9Vr_MyzPGFG1O5 zX{c!{GRAAtHmTK=RndY&92&O_O&`(*TQoFTQy-W%jfs+oCTb}yy)%1#JM=GtsgjmvgQ%|-Y@r?cDlp}44H^VHbG63h69b+n0WaMXu!aP@8V}G>&Af|x z?_3E0)KWp_0m`*B(jw~advt$HcTuO9%Q*#%ey(Y*o3fvEZ=)y=mL4oiu`I>1qcD&8 zWUX!9n-nFwU@5v_DY{_UQD|a|YD850oS{(0LAyS zODh_a<$7jD??l?)S#I0Sst=!thga2o1*DD|m^Usfkc%eOMIId~P8Rk(eGV|8BVoAJXfx5yb&eXF zgbF8yF?R7Q&wX~o1J1w?ae~sBG z=H&ME34bT?D_>OAlnM0+u)!0D1dS%}B)THN`upJL2<$?cM=MF$`JBMR=&S(u-q%=!zV#Rr-B`kEAw1m0b9$R8(z0Yj-I;0fkc8?TibxK88 z%B<2FxHKtf>_cRmQ3m6NDa4oxifH&j2*eeXR1Bs(LZCF(x!XHimOt*S8aA1^ 
zXU=yX``x+Uoy-KU>ixg!JMzHax%iWo+%CkBpE@!rp-TpoAznH1dMEL`;mC38WEgWH zff_WAL`9QF^Mz+%R?a{75Iykj> zaCMHj;*Y9n={uQrnPK+J6&-r{sJ=~3@Wjb$h5N>3QTG=e3mT?rX)gMjF6ji{!XOL? zt&5^>M2V(|V~aRV4F$rsDgT3b!@!tFVe1YZFX}wP1LKp?^W0#Gh3C*yL3lyvvB$tO z^XV9PzT(xKZ_Lmyn?1SGyOV@5t53vv*Iuw|RtPh0O_C!C&s>-E{W&Z*5$}f1xX}If^342uyK4qVKSPfM=X-N&} zLG(*EHEk@?(CZLq5Qk3B0H_kbKQpdLQmQnhlzmVzQ&6hl?hj#H91YIX!BH{k+U9AG zMI)(+`XA;`>SH8Xt3DbuO@Wkh0*vXBBN>JOZeuV&Y!cx#byd}IVvb?{ttnCbmaJJ6 zF|qZP5Vq5Cby+NI6mF!xd?94K(YtY;i;g!3?+} z^|Wftd{_h@XQ9(8l?TK{kUIL|&SQ8&NL8OflU~0icXIjjshXDmN50z#`&J{j)czG{ z+pvic#-Bj*lVaN|m8Zm6z?-%!t>@IpR^yw>P{pnIctBqiK

%{)oDEw4<69n$4D(YPjAJ}j>0x#8QJJO?azT`zhokL7Dj zg&S+@%=JaJW|lK?wQN0a;_Ek>OqMz`Uu&tZt*z(k77IO)iBtMeV#cK{^o@xQGeTnm zW5eb?G&keUaq{Xl^~(vF2}~}oCh$4u6hthZ!dr1CfxmtPuA=a>_#}Z>)r79Yr;QXg z;hO|L^i3EE8M`U$z}fu@9NZNNcTm`dpCfSA%}Dq#g-38Jfkz)97NBqe9#r6p?&XoZ zf1>a`6zRWA{q2v$k5KjWu~4s4SB*B+1o7&lG zV_DSGj4KPsHnO}|=n)2|qt@JT=7pc~)! aqm^h?A6`C)R=`fYYY?r35&Y#KTJ$&cfF-5? diff --git a/hsa/gfx942/pa/pa_fp16_pertokenInt8_gqa16_1tg_4w_mtp_msk1.co b/hsa/gfx942/pa/pa_fp16_pertokenInt8_gqa16_1tg_4w_mtp_msk1.co index dfdb2a80e9012701d8026ec5794d424d06352878..c68accb11e716be26d99651a9d3ba43062350fd2 100755 GIT binary patch delta 5689 zcmds*eM}t38OC>ZfdhIGhs|elp@Cu>T7!4*cJFqpb9{SWHVHylft1u~_y{JpN^C4S z!nLUHfTOi7q7tp!aTG_wA8Kiww9u=wLB%~9IjW=9j!7+*Mzq1IqqLSP+pVkE7|ETz zn^`=LxTF6X$(r|`-#hQTGxIQjFI})kE?7sZ0P|Y)&r~nq7~_kKj4knz0cglF^a9L9 zdhAK2G1FQ;D9-;u< zr3raVB<^7jee4B**=UUAfbndc8cf9MeQ|p-_AtNEJ}s`+Y$be&%1ATnD-7d9qYsUK zH2Tq~_3UbXte0UD9E}N%#so)W?8dGhPHVASb6TxKamzqPoY!8qZ&M~o*X|T{prp?oI!_V{Hw_t(01Ev zmMwd7`o#a$sCt?Gd~pSh`qDMJc`B=X=1w$jG`l(K`H`oNiSNi7if=xM;(pH$JpD}E z5&M2aeO9UsYolYcvGE&o8Cv8eoA%zfj{!%ld0VLUheg1?m6_fOWy6E%JUf>e^M25m zqrct2RspU4ge~u#TT2#x<}0Z!9GgCICd~?C!#2>Q)l4qGb>rJhzt{^JK&<=Kbo;MO zG(4VLe)?84ZnR1a+=U*8`wc9_@-LlFYhZnT6zt2^BGHW4`a8pxlUF|nK!|*KqP z)+fJ6-#YU2%3GfwL1immxgL!h?Gsn;T}8WE&e0Au9M+$B9h^;hD8}P#c>9zc$okM( zu*H)4DX*Q?FMR;KmEfqJe+~ToAvN?!ZtmbWl3sqlFr}B#b9^S{0V5lCRluZ*8A6c!B zonzlF0GssYzp&?5WaZ?Xs73EP#vbGJ^B=Lp7QHQ-o3H{Nt}5jAR*Agq?s9dA4!Mip z?&^}<()Nz_7v*-j-R%%N+dBk@vy*piuhx!(N8xlOm&I9_F*sAm71xaw8Ncr@eHAA{ z3p0C%)+?~y^TJZD5!GOQ3JOJBap~fE8v8L`gzX52$AdI}4&$aLX}uHcQ%}+QZmbu4 zi`MsJ-M)|3hp^uC-9!MtDB|)<$2;hxH*nHESYC{_aO!~*nZWo4+=_7BsbA5ve~)o} zaxD{njP*YrHl;-HSFF#%31hXR50v{4#y#+gv0C40dUj6YRxC{Ad$eAFb^d)?|0>qc zL$w60c6pY@62?d19s^&xP2;CA9*4&b?72Z>^p_F(vM_t$48nCxvN z{~6Ygz?-OCsi&L9cQM`q%O6Gf*I^pxv&I4zrXFrJa0{gII*iZ3egk{{P2-IiKMyAi zJo*lepTT%Nykg)j=V-hW;|R=u3}NfvXdK443aSQ{E0cfjlLPrN#thtJ;EeXfXF{3N z7%#%(2JW-b_;(l|gfj-d^)(v*8RIqx>r322j2Eh9 
z+++HRK=(0XON==~K;6a*H^@$-Brjr&A<*cDj@RNl;`Shvb7&kYp@N zyr0Yyd6Dcx6#Ql=iew+6OBI%;k(EW2Po8UctLT@>g^Ix-X;^ZYtx00Q!Gx2yFS#A0 zO-VHmQwj)VIVt2O%ekFqmo8Ny%efpX*@jD!eYiE3ERh3r`GREouAsT2uv;+O^z$OQ za#a#ZvK*4ll@-%$Zl6Ss${h%jHDumNwjnzd(vKW+le)6YX@;^N)eR@iFZ%>?f^xu3 zwxl3u;(aJimE5x;g~-Z^ERiEt{5)w`3CiTYR6anKQ=Lv?Fkiq=uEV!YkjW9NE*~kO zD&`%iesc|EM$(VZVSZQnBy^3*TlC3_xrWch>lZ7y4ixd1!8X+uK)rG~Rm!r=+r#1UaHAHaGR%%*N3OP*c8?ZaBNfH={I%w`MQ1kJmuZzwG8;WFU#QHd0vEx3 z`L5Q*Cpyq00buONivRU0XGh|DyS!MR$5g{wj^~ z`bF?j-WzeyXl~2xHTsLkrdy6*%NDI>AIXM)9`0a@sjAW9BioTZ9NrclW{R@$v+u00 zaxzI{?uni%ARAp99j+_P9y3 z?Kg5S9&sx4F%LMD|Lm2;@4bmco&kpY_mX3k57ij_Atd#}=?de!<0`mo@0A6u_vOcb z>!6Cv2CE9s7H3tUlij`4@Ei>RcKv+*^H&x>TN6qC_S!ZIr&Z9h`%1QGwM|t29J%RhrK}-K^wrF)s1itldTHs;xN4vl}m&1GgaHW~s4^A|);Tq%g zOf7&HWsosHeh<9s?7}O|cGj5g)8NFtV6XX?PryI#xaf1WwXZ!;7W#+v&V=4R4~{q% z`p0*hsfXF=N^>g34%L|LE7%ar&eS@Rg!RFOh5OF488aMUon5)s$D48uS?^uKrwd2n z35;&1!|=p?Xa`*m={+20*>9Bv-Ln@iY+|2k_Porl|Re*IE&az7ed)Pgkx%~_~ z<}y9i+(9=8!uRXA7Xp&HnU|A&{lb9OCrApf^hpCMuMQ-6KG-j)eZk(oz5%7bH9rpg zFtw7a;#|xzI9$&)c1$%`|H~}=G?q*kQ#ViJ_hUTyy#-&(dI;kwcnO8`r5>c|D5f*e z+rTxZlX04U3e#8D(s+_Adz8k1Nc=a@_)8e)hiH5h@=k>k5nReQZowmZW zs7Bw(TPAW4(^nyAL<1W$`q*1E{kR=`XDRq3O@ANLZ^Gjiz3v}0oyK$$Ubg7%E6ZPH zkZt)Trg_+MFQTvXl->!;ykQ6LvgojfrjKK~8xC9a#^V@Y4#!bcUwAJ~Z^ZNj zJZsT)1)6>y(@6-HA-ek$n*J%K=b&KGV;|7;UORZNMNfW2(*~xm!<`nrYo4Y*!1Q5w z+@hyErFVO)IxCRLRx>$xc^UUkI>8L~Ztfr4Bt{bp>l-u;3gp`$iz7jCm{q;xT!P|I|LIV2@w=y5sV6|LJAf% zyI>(Ilg!9hCB6w&C&7Y(kS4Yyl5{~OO6m#f2|GkgQLwKV(TT4pDI}W6%cL|>lt?|I ztdrVB&CZM_jE9}xBnuLWCh>OW5FaFgLxN5Mhe9F=9FjFs8d^qjgtUacEUF5);7~k9 z_Dc$eh=UZ02~4`Y`;rp0J#>wzb_^Ju2HF5_WfyV1nd`NO96qMAgZtA`vBI{`h`wGYfL?(n_waCP%)U thQ3vt;$_srk)WhGb;=Tn6@NcWQhd~DVR-?xpd}=kf=Kt7A2!a3r diff --git a/hsa/gfx942/pa/pa_fp16_pertokenInt8_gqa16_2tg_4w.co b/hsa/gfx942/pa/pa_fp16_pertokenInt8_gqa16_2tg_4w.co index 852461e4bb127604a2c5e71de0def2a8d9ad4098..47a6566bd6bb40d3abd206bd42580211c62f2543 100755 GIT binary patch delta 1826 
zcmdT_O=uHA6rR~OiLHOxf<$bwZN&VDKbxOT_OPj`g27svR4TTjB)iE5g@O?(dP^6erYAYtXYxrSmnu51;!W@PSf7!HOs|+W1>;n3sYNhF1=qF( z`_Y^xqKB&u|6uoducci#Yy^hXiN3B5E6-|y|89fnqbuQW%(mAZ)ml~e+_3a$_O%#a zw!tJ36uhxU~m*^KIoS?^!k+o$%y;d(_g zdYh<-kHzyyyKbA;5)TM3iTA}4(w5XfVy4sbEUa7*?08A-x@cWz%n8N#Gc|KrcXS$; zF2vI%%)xSc#uw&EDB6&W_E3%XoBteP>!#PO~1VcG4qWIwjJ_LH!9&(`C z6ur19>@m@S^fT>wS2GW9Gp>=^Z*Nb~w{th!RRYa_$hGHR8X>a_=_Tlq-V5h=G%d-> zbea=FLX!rgTT}=&3j(1CM+*YnFx+BKshm8(Fl$MuW)DLl=j`74MMGJpY$W)Ohz8G#fV5Ie*}_zv}`bOD6RZ~#if^f3YX z#OQls6^O~%}tDYq8$GfeAE~i{{NpWAY(Imfw+N2gO87tI)mv&^->0g20uS3 z4Iofvz5*?Hkr;} zP5Xi22gc>BP1Fbr6H>#%92gc(f2kH0_T+?x^CULK_FpagLE+)vV$*Nq$*{kGwROLH z2aw%p(`M7iuz%a;x1J>|q{K5YoOf*g6KTb`IWNGCg>l>Ftsx{Epfnj6Q9wf;uup!F z*vM!!IVq_`!_*?lz$_&>)iBL6+0fM7z&zPB&BDMUEycjVDAmv+*(fnNIn6v(bMit) zsmTp#Y{u*i65+614WW6!6gz{BBdWMKR6JyIV6wQE0h+8XRQ5>@sxAwtI8PF)_#zOO ziG#tRe)6W|?Up8H3^|EOsX6fm=FV;~rje;BjA`!d3}qS`Ia$J(1_s8HFQn*gzL288 z#N^;RxiC#`vOu~37rgw6pRAQ`Cb$7woNb7QxFGSWqvWcwT=oAT*Q5P!xri)yQxQ{Z$U4V5$_O-8xIoZNiipkx9II(JYJ4m& zrd~u=T_o7{$~jB(y&(?pPnhsJLkuP6eEG+@9hL5it(lF?IGtc=<9YhbS_01*oi#Pb zn{mIgsOg@M6J~7pG4C=@11^*BbM_G)4>CuYLBJ`|<(j3~>x8U@N~lD)?6K>3znDYQ z&x|H3o;RT?=j4NIi~W<9B(}RHojJB8Y>nS4Bac#ISz}@Ap`UBVQRSW0`!mUkRdW{HJL}#8<1`5FSh2cHAo7n(aCU1h?9L{H zANqkl4IG4IU_}FaOKH+<>N0lFl(~y^8@mi9gS*3Z(Cjk1OcdSe>d;a8PSWTuV^6_H z0yE_>)w|S*PYZBJ7h?*rAvymr<{LKg#hCXv<@J@Azf>dV&6v**7kD?abvUuGsbL@H z&v$H;il7Da6RmPStcMq14Iht4*;jFWa9m=9$nXCaPRtEQCjjtceyT^#k70g#Q~d{+ zzc?eWKaKg&CvyH%;nE9IKx(G^PJ_6C_*vd?0rP)#n&s>Ng87CL z2(-~Hf8j)5wY=eF;Zl(b9L~VCO8A*qN1BYF$JOEPX`^VJE&}x!l2%)t1Wo!4mNJ_n zEGfY~^)?CG>dhF^CY!Xb&Q`w%vpO?X4|=@f&N|9i8=7 delta 3685 zcmchaZ%i9y9LJxh+|fd&o*PP6o2~EyYpKii`k!^AP$-dXj1Y$1-Y_UM%s||WnRI_* zfwiVH3sf)UjLR05nuSC&W}YcH2}d>)Unt33V#XU46B04_Vn*c!%;8=8JZq0Pa+It| z`+V>7Jik7_eDA)!>*6Yv{EbSPH9GbK%Z*sH90K`8M!;kaflP~KZo%`-%JFn|8A#Gt z;5y9~>u^1P6uwyb|9hZe`0*ZN(+dglBAA)=bpX**nGoYxyCuyZPEV&Fy`<4?RJBzB zu(81#^*(-4&zc9@+6)%p{K4`L0E6xAhH6Py%ete(V3o90)}fGrzZE)`;agmfOvjlb 
ze!|$N|4x?>wZoJcq2gi`ToyOXc`Y#ew%$UTorEZ~|14C~2cJ4piJq@GK{K2N@4sCO z((FBmCd{h$pXTQVbm*~Vjxt_d9~LD)Gw(1pT%aHRhuidH)-L(+{lA6!;a@4hkIxRx zP!(6!PUl@ zdd@X0qQhxDM30Uo;WJIlg(CfWX*MnI(mRkAKD6J8n9f709ppzwzeN7-C7PMtiPZ2D zW-Cg)rLGU*q#iDyxq)Tcm{`lj<~+Rvu_?@$*0Yi^{+)FwQkH8sm{>IXl*gPi{{5JP-<$Yq_cLL=1pS^GLP$v;9gXhezoWsrihtX zp!MhXWK%R-QecV@JNdh6?{Io38Zloj>X@I&jc>a*9>m^L*93`<+`;^k&e*t`%g@&( z1?C6&AFAs{z6IwjG5so8Vv>IMGtUCsw#2`pu8BNLO#dwEDgvu$d&UUC`=7%T(fn>CmcV@QRIWf)Da`vft-peK>eGyP`f&sE z|qPKI85!H0BMa2vZNo*=^%SZl=S1U zyF8?<(MHxf9CcpMd7{PHb?OXH?e3fU(G6FiCR#4k9fbRYMK5GZqvUU%MOgDf*Y0I} oyo~<_>S>|A2D&uM7U5hCtkrx}E!>eu(*a>k9#aQ|nuD6f=~9LF(vXKpw{`gcec~$mIq(f zO?Jm)6o&1xP8~8|n z&-t0%*YlNhiRw22&e^=cl3V2fi;bNrC%bKnYJRFKD4w2>?GfpaqdB>*-_ zc9Uf7lC?|rYzxJYu3W$7FaU}~vWi5qibS$JSx5@ps%-|Yc2TEjX*Y@k#t!ia=o00{ z+fcT)IoaA!wzhxD)`qgR&B@k=vbFsO*+vTfJzIfXdz46;>x6HZ&pMVS*r_xl|IQPG zObwC9HKi*1NsAo{SvgO`WcdnY;a5h!ar3aZ_I|qvqz6oX_Z!<&v%WmJwvp#r$A!I* zCC{5r)Op0n^#7TnH9p(YC`_}fl0MQ>iI1Ds)!gq8jl622+Ao_wAOUzQ1)gxh7wbyB++76!I=nTZdIgu=?t{?AP@SZ?4N9#)&;C3E5- zO0}jab&AG4%Z*7pMZ96e%+wkZNv2pgEH))g&pLV2#Lv8Ljhatp#)y@tbKxS)NePUugIr|3VCU4Z@vE2MJDCp?1U@ixW>&N_E8Z z+<-LFdrjTsV(#;-lH8GU? 
z0NLz1;AEZ6O`(16kUPXu%>Gc5gQEA7uI4=cUAQ_nQx4Oj*Y~Uc?eV92LSO+0M=??D z3dLAJWFF7Pyi%{wKzyc_dFdj2-zk$Ll6K-|`**MBTuMu7z!Kcf$f zD3=y+txwOtM11;+o?k^g^CLa~3GpN93bf!9;@boze~p9RP+%rUzjc2h-t&7xD|?F+ znFZLt)^i);1K;WS`_Ib1;S{+wJbe9jQC)^p0^`@3mwqmiOwKCY}f015igIr^?~y!&=t`0 zi->3K((}uR4?nBtw0yKjqyc;qh!^yQBuh6~09db**h~eCMI9vT0^v|obGU(W`HK^7 z335rFUvpWS#*}ii*kx%FyQKj?cKrc2a-AgQK(3>-1W)GhI7&e{s@&N@xg=k8>44^X z9GaMAoLEro(FFcdOeq)bLjsB>@zN;LAHbAxP{?H*G@h4^I$hcn9=A4ykJ4Pf4`)X^ zU3d!G;lR5_(;lp)SrSi1d)#<0>5@`yhEf&_7=N+6Ra@naXIK{R6+^l3iW#R9?>6Hu z)?A-k(*^>(Bb^;OS_N|mqRSgQR|5+k!e#HSiruJzu5{3f9*=5dm&CGz&_#gi*zO?Q XN?fXr^~rXsE_Oq!_1<^eyMN@?B`jI) z+t2en@AKZb@6+}Up0{RttqB*ES-Y?FL_94)p#R}3V6rcPPHj40f#S{Nwbe)i%V;D} z9d(~|s9s(LAD;97J(PC%(Id{K(>m2%pzplD2B>ger`mxun5TFiJ4-o+=dYO$!0_;a zR_K=p3t3m&nl;W+;Qqe!4FKAztDOrqy-?RRHO?|km+88;)+tvt^YRNtPh?~FTa#GH zDi@2ESqi^%bgK4tT5X~`)NNp|I_w&=4Q8f^ZS>!0P|`DtC{z0=&#+2yNz}^l6zV@w z898*^FEjJfeAhs-s$i1?*wX| z@@{az{0Zm@-=txla&)=5S|zcv!cNsKQGTpG{U8Qtn^JC>s}BtT@2QoEeD%syW0+pC zO~x?Q*8fbKIWtOm=9*PzHxDwRCD$`k)f}*z=`HIYDA!sOw&Fu~M`!0{+hokno%+B8 ze7L!5kok5fS09+J3HWei&t6+e@@|K!dx1w~tBo8LNvZs>+Gfh*^2~seIJDbJNf!Cz z<-Ky|N>3`y!>g3Ctb3lhsngyr`C98qrQrnappLnerrt%C2k;fNO*v^Td~${{HT$Pb z#?)LqTW9LzYE!eHVJ=y7JSp8(%`o*21<#o}s@btA9@xYbZ@E1>Hv6XJI5tZEjdNyA zXqcurhDFG@|IXD3rfcF(u+vhfrb7{yyn4Gs)jh%E^QQLEI`*5!m&z-I{if$9bxeYT zv@-kaPc5k`7u@TBzSMQr>_TXI^u>O6CKGGnPRJ(SHpUvcCSQzmds%PH z-6XM6(?*u%8#yV)uaCu=yp0Rw(l|Z`@l&hF z{HG9iEg`vN@Hj}6k%15@C>~evEaHU?Wd8Gr-+qqd8xj9}opFmex&`rpBf5xV48V32 z@b4OBy6QxH|7MaW5Wn*p$qysWj$7vh;um&|$=CaTfda>Nj|ph95Al`*B>x8SQ>RJ( z1L9XdBl&g2XCEi|ui8tkza4;HGVmuV7@9>|_b=k4c;Yvb8;@LeaO5wN*P#3i%nB`t-v9L|(Dx7y7=m3?s<~`3fvOp#YY=N5s+- z@?segJy=!tfN zZpU-Mk;Nhs&gJf9ktJgt~%oi+5 z?O6x^fK*yaZm)w+=i@g$y(0N#9rP5qjN=SkQRDCoSdu&H;N!^|^{|L?MU#9zTtW>* bldo&nd;{*++6=4&#merRyGJpY$W)Ohz8G#fV5LX04_!Y&dbOD6R(7^*`K=m;J zxy0y$>Ywb-D5yl(j?GPsVxl^~n0)F40vP}QV)CmK2x0uM=IkWgnZy{KNwz@P!7HGs4xkk$m!lM8f}Cht=bn4BiAEM3mP09FU2 z!RmlCSRIg_?4WM2`H8p(vrRZ>>wfkomi=p3TJ~2k^y~-uKoII>K_o8=LcJ`A&C7yN 
zFAE}hSrF=FK`MB;K^m`@6E<7ONHP)duhQmua#5@%p-c#W4!Nk^yiX+@uRk|Cs5h`W zg#T{Z&j3taAiR>LX=ujl<^tVHynY3yVNjedW0`!%WQ)nlze6})fmzmW^At1uX&4x{ zCs-ygu-&2<`nzR6D30A*Z2E0H8TRwBwC;EB0J0lx+H4vb_Oos-wYxzu>?Y5Xo3`1& zDT9TPb@NO&g6ekw)xKcfY~weFi3L;wPreb*$Y?M*DzHSu#K6Kl%`C~p$Rf=k)hx}_ z+%z>QG1(%~BGKH)BqcG)(8xH&z${gB@xDr%615LdyR6HgTRlNmN+#?dq5QfVx0I`@j7%GY1@W3lbDp66K`N{ z=mKLJnH$2G=7x?irkjg7jOpTHG`Ubvck}UJMJA>a$H|OAqLcrJN^-$#u;9r`VP--v zpcUE+Xay(1Gx>F>?Bw<^Nluv1da&MoVMR zX#m+8Aa+22lqQg^31SBYNlm^W9;yvu6RCc)gG2?h#A%M!{p?LF`_ouj_E#|U?EkNX zWc>~q;;r8#TggO<^^+YWls0dZk75-tVuD&tuFp3IsF7^9!Q>2|j?Hs4Pq4_${oS-5 z7;Ipi%+f?zFl^qSPqJH-Hm4Z@!{N+d3d3QOCCOGpg2Bh;2#ezE-!1z=LE+wF({JO+ zuwRU&b-#NDklkq0X4A;9pL4UJ;| ziH4~~l7U%Da;jmPWwN2Exq*4IX_|$BMOuo1fl;cVMY2(1a&nq^s^;W{j8c;~gmBcy z_(Ah6l;Q!?>TXUCQ)FVgVK;eU zi0I_`5t3Z+QY&Ecp$Ib}k3xvG9)S>g2jArG2-(T%k&>J+q4i+Bagj<)KLRH=McQ$` Z2!g0BfEvm<`8iN|L@-2jB}|l&0RTnURFnV! From aff40475ddb9e6d264c1ee8bf07a026cdf4b3b4d Mon Sep 17 00:00:00 2001 From: "Fang.Che" Date: Tue, 19 May 2026 10:15:43 +0000 Subject: [PATCH 5/6] using min_v rebase --- hsa/gfx950/pa/pa_bf16_noquant_gqa16_1tg_4w.co | Bin 22808 -> 24384 bytes hsa/gfx950/pa/pa_bf16_noquant_gqa8_1tg_4w.co | Bin 23240 -> 24824 bytes .../pa/pa_bf16_noquant_gqa8_1tg_4w.co.orig | Bin 0 -> 24200 bytes ...pa_bf16_noquant_gqa8_1tg_4w.co.poc_kl_merg | Bin 0 -> 24184 bytes .../pa_bf16_noquant_gqa8_1tg_4w_mtp_msk0.co | Bin 69640 -> 74336 bytes .../pa_bf16_noquant_gqa8_1tg_4w_mtp_msk1.co | Bin 78816 -> 83520 bytes ..._bf16_pertokenFp8_gqa16_1tg_4w_mtp_msk0.co | Bin 163176 -> 165200 bytes ..._bf16_pertokenFp8_gqa16_1tg_4w_mtp_msk1.co | Bin 193032 -> 195048 bytes .../pa/pa_bf16_pertokenFp8_gqa16_2tg_4w.co | Bin 24032 -> 25016 bytes ...a_bf16_pertokenFp8_gqa8_1tg_4w_mtp_msk0.co | Bin 72280 -> 75192 bytes ...a_bf16_pertokenFp8_gqa8_1tg_4w_mtp_msk1.co | Bin 81424 -> 84336 bytes .../pa/pa_bf16_pertokenFp8_gqa8_2tg_4w.co | Bin 20840 -> 21824 bytes .../pa/pa_bf16_pertokenFp8_gqa8_2tg_4w_hp.co | Bin 23032 -> 24016 bytes 
.../pa/pa_bf16_pertokenFp8_gqa8_2tg_4w_uhp.co | Bin 26792 -> 27776 bytes ...bf16_pertokenInt8_gqa16_1tg_4w_mtp_msk0.co | Bin 175496 -> 177520 bytes ...bf16_pertokenInt8_gqa16_1tg_4w_mtp_msk1.co | Bin 205360 -> 207376 bytes .../pa/pa_bf16_pertokenInt8_gqa16_2tg_4w.co | Bin 25264 -> 26248 bytes ..._bf16_pertokenInt8_gqa8_1tg_4w_mtp_msk0.co | Bin 76680 -> 79592 bytes ..._bf16_pertokenInt8_gqa8_1tg_4w_mtp_msk1.co | Bin 85824 -> 88736 bytes .../pa/pa_bf16_pertokenInt8_gqa8_2tg_4w.co | Bin 21488 -> 22472 bytes hsa/gfx950/pa/pa_fp16_noquant_gqa16_1tg_4w.co | Bin 20520 -> 22096 bytes hsa/gfx950/pa/pa_fp16_noquant_gqa8_1tg_4w.co | Bin 21920 -> 23504 bytes .../pa_fp16_noquant_gqa8_1tg_4w_mtp_msk0.co | Bin 61104 -> 65800 bytes .../pa_fp16_noquant_gqa8_1tg_4w_mtp_msk1.co | Bin 70280 -> 74984 bytes ..._fp16_pertokenFp8_gqa16_1tg_4w_mtp_msk0.co | Bin 161736 -> 163760 bytes ..._fp16_pertokenFp8_gqa16_1tg_4w_mtp_msk1.co | Bin 191592 -> 193608 bytes .../pa/pa_fp16_pertokenFp8_gqa16_2tg_4w.co | Bin 23888 -> 24872 bytes ...a_fp16_pertokenFp8_gqa8_1tg_4w_mtp_msk0.co | Bin 71440 -> 74352 bytes ...a_fp16_pertokenFp8_gqa8_1tg_4w_mtp_msk1.co | Bin 80584 -> 83496 bytes .../pa/pa_fp16_pertokenFp8_gqa8_2tg_4w.co | Bin 20784 -> 21768 bytes .../pa/pa_fp16_pertokenFp8_gqa8_2tg_4w_hp.co | Bin 22976 -> 23960 bytes .../pa/pa_fp16_pertokenFp8_gqa8_2tg_4w_uhp.co | Bin 26736 -> 27720 bytes ...fp16_pertokenInt8_gqa16_1tg_4w_mtp_msk0.co | Bin 174056 -> 176080 bytes ...fp16_pertokenInt8_gqa16_1tg_4w_mtp_msk1.co | Bin 203920 -> 205936 bytes .../pa/pa_fp16_pertokenInt8_gqa16_2tg_4w.co | Bin 25120 -> 26104 bytes ..._fp16_pertokenInt8_gqa8_1tg_4w_mtp_msk0.co | Bin 75840 -> 78752 bytes ..._fp16_pertokenInt8_gqa8_1tg_4w_mtp_msk1.co | Bin 84984 -> 87896 bytes .../pa/pa_fp16_pertokenInt8_gqa8_2tg_4w.co | Bin 21432 -> 22416 bytes 38 files changed, 0 insertions(+), 0 deletions(-) create mode 100755 hsa/gfx950/pa/pa_bf16_noquant_gqa8_1tg_4w.co.orig create mode 100755 
hsa/gfx950/pa/pa_bf16_noquant_gqa8_1tg_4w.co.poc_kl_merg diff --git a/hsa/gfx950/pa/pa_bf16_noquant_gqa16_1tg_4w.co b/hsa/gfx950/pa/pa_bf16_noquant_gqa16_1tg_4w.co index 77f8a99fbae069e2d5cbfd69108d9f2afa2db033..f6a11f006826cbcc7af02474222440b574f948c2 100755 GIT binary patch delta 2345 zcmbQSiSfWb#t9ls2cjoxEvv6^WdH*h%^(2bGXg0#Ag+jj@GB}%=>iCsp@SXDfa+rc za*5Fg)j!#vQBaAn9h;jNvxH>+GfdhCG``_K!!$7M(qz*oWFyodW+T?H`HL_kbA2G= z|6fdgbpk<*|JD3`WWelDkXS&SKp5kH^*}!vxOgN~JPINXRuhZF4o9fxXK2`O!_aw2 zjggVT7DOvDGT7MeX$G?$5o`suL=b0Jvz3d@Br6x&X?`{W4gR)*O#wC{4S}|zO_M)} zDan~$R4Zj*P-if`s1BqxfV2jX)&$a;Kzg!(cn~2a3F2O2*iD%%psd6OHON3&V)Fy> zMn+hu2qJ}wAT(42(LzNK8Y+TFp(2PBDuOf#l^97PLPcrvJV^t>A)_?;oumft(2<#J zAjJj?AsrnFKF*#>}O%!93{of0SmCvWCTi&2FfaojhjQ2S>WL^nvj6m z3@ssP@csYQv>#YFfiWZN<|Libgaj(hkjo<##@fxXrlSc7QR-tRBo(&5zgqVH|Ft>J z?w#=FfFK9P`WYe6Iu=UtfN6Gy4hK|maj1BO4yw2kRJ_IkRa_S;-h*bo1yuY)Dwv@S z*X0bA@QH;Bffeynr@RB-kfc yhs#bDkC5bq39Sd~^@&hnnh`miCsVFNFe0oBI@ zj=nnJ2Fmm!14dJYaH^gvaDv5(=9i zNla&)EGsQLIZ8TU@+oPL$yzcBlY?ZWxY#&b_p>*#>~CV(TqeWJfv$0KkeU*s!{$;o z77k4925Nkbznk_mFf@R1J?S24gz9SiqQW#)dGa zi-E=F*l1@aCLjCBim`H&55x*^!4p={;N>~Iq*(aiZf$65HvX|&W@=d Zc=9YDRS`1zR-7H@ix7x=*8mNZ2LRRSki`H1 diff --git a/hsa/gfx950/pa/pa_bf16_noquant_gqa8_1tg_4w.co b/hsa/gfx950/pa/pa_bf16_noquant_gqa8_1tg_4w.co index e6d915f04dbf049ba9c766a3cd48ad8a7e94e1b1..b9c97f8944395419573b43408003d50592438ae1 100644 GIT binary patch delta 2436 zcmX@HmGQ?x#t9ls6|obwmeupPGk^h%W)Ohz8G#fV5DP>>_ySd^bOD6R@PQS|fa+rc za*5Fg)j!#vQBaAn9h;jNvxH>+GfdhCG``_K!!$7M(qz*oWFyodW+T?H`HS#Rrusm} z|G$|0>I8xq|Eu}?$bi|QAhCctfiTAZ>VbYTaPdf}coak&tR@zT9ga}X&(N^nhN1J4 z8Y3fvEr?cRWU#T_(+p-iBG?LQi6G9dW-Aw)Nmeel)BJ1%8vJbqn*wY^8Uk%an$Jy|Krjx!(z;*A2RpUwfkomi-Ewb7dUZvFKM~ zcJo&~945TqY-oYi1SQsozgzY{-dyN7TVk_7lmlb^ zoIq&MK`BT`vNK$SzdO zx>-8IMP1>dE{-OX9bz>$JH$#bF+FgYd{Izja$dX?7d-t1Po5NSCKONtaZ*4Cgx*1_a9* zZiY!@kp&E_`dsTRqU@@xu3F;4dzy@h2na|-_rX5hhg-U(Te^joS}v>Jch2qZ;U>9B z$d9YLFH_a|x=){Tx^L(9y{Et1_imVPTOepORxP2w404ueSRTR3^4b4JGfT#yteRxQ 
z-vW|HGC_>CR$AQ$Q_LC1Y^H}|-X;_E0I`S`IT9T$qaAdCkYZXU|I!n8&7)Z9TD>7Kwv*khLd({9^35JvU$WUd)5Xcd_m9F<%OH*xpco*p@t=*kSzj!p1@> z<|yleC<~Gm>I{H2b@LWB+(<~`NoSKM(Bd?&_C~|rP|Mw0_L!Yff6H5Bdi#@Rr#rUB z8L~UwZf{G=V^OcuW4|pLjCrBP1NLw!Z%n7s*q}WuRWTav9xmoRG^jK-Xb(*)Mx))s<-CWLDvb@= z!)g_yQSRaHIgPxFR+ZKU?P9Hp(P(#Zu8nuGL8Y}pySPHdXtcYyI6YY$Dyvc9C7N9&T}m{QQ4+Or##P{PKhsy1_gsuX)FkBXzK8qz>S+ zkJsuU9m2y+uF&fAAZ#CohaQ8lbEFRL8>xc_hT&k3%NcVA(xb3z7%qB@!owqV@W@CV zJU$EuyO*}4hv2DU*!LKMXGiK_-$)%iKMV)E=Uf6hvQ)3(wg)boEwe< z{A+*7vt(=Z)vQ^CbjYIaE{n5ypW_yd* z-{cL)?7rYSug8u-zzHIVg`6?_ZIS3| z=3K6W!@-r|nW^?$>SsD({TQ8oO|#SP@>R{Uhaqbt zph$>}?IhFC1$luaIy1<|PAy4xW{fL{Z%A$FlIrGm;K?A>*<_q!TilV6v~*07rpU)& zDJv7jDN_;!&y`-0Mf;2(KNzx=4LGMmvp+HYY(327LWCT4KCF_|}A|a)sVBU~{eS>4L zl$MI(q)CYs+1H83TDxtLxG1@3+aj!kzHcP;MhiK5Qmhxq7Gg7I*@OTLMYkw?LX>rF z#AetEx>M_H0VC8KH6IgAoe)R}Lb--)etlm7Toe$^6rZnW7zS^;hO-)7rdTZq0XeVF z6c_Ev?Gmf2+l0BZJ2Gy7zRb3Wmb^Ss%+F6`NtO408`r}~ z>z1B1k__<<=?+o;zu9a77@mIR+@I2iDAdP^F!{7`D%fRE6l~C!#jG#c*`hdpyxbSs zzOFnQ+RujeMRKUKR$ACjBuSbl_~I`LMRAEd+bLvZW=UjkC-y}m&vg=w7Vx)#g7!S< zjL#kqQ=+}|Umx4|LrEx$gC|K60y2t%D=rF+aY-;j85|2H#0`>A5tlX)f@8Ohlo%5w zCFKb+v0Tcc<6@aOF_47+O}nkWd>j1#T5T{+OX>rO`qJ`^FlMqITuSoU*d<9`xiBA& zQKu6{y*`09Nn1hrFJ!bCB%3V{FCxamf`Bl8b_XAS`TtW=B8n3xB%*~|x^(13r=Gml zSwc>BP9WW#lEDV2g(Mlrt&3X(TYMgjuZgt3Zd@V|BWa@ub*6E*$LlRRamj3bqQO!k zE}cCgaUS$_4I1e@*ytJ&8{)bZh|A)-l^O~9WW;NUR9{cAp@HJEWirbC{Q{W^?MwwU z0h$1(0Zs!n=3f<*AjvaUU_zFv*y~)sC8MJpI-1Yz<_}#RN;!hr?xcL!^cRfyV%TpA8 z`Ygq*`zUUEzD>mWkYMv6VQVn7H#C%+U{1h%FiN(Dylf32SWeVump6-B;I^|lgP(&d9pPrVp;oW zA|EmH36#@&7M8Q0rt+McJT=Hm# zTtD#%ilt9eoV*wGU`|c8QN6PLRDRhD9r*#UGk`zDEB5S``*~?YQp9rkqr2sL;}aB3 zPgAVe3wqGMOB<=)Rr{&j{KD@15ZG0MKg8*Ko{{@^>58O?3He;=zNaRfvZUnN}knK5Pmh z9zAMmLOgcN6h=I8!W03-K9_l5OfwwuN^+{(@zMb;qU#lko>wXQUOS)_b(2r$RuYJE zz$TCsF{Z)!E9ye0b08eV{+5N3Vv3e)h+dY@u~M8XQk<8fIG+^B^$V;N7m5^X@%W}R zgCAO6M2cm-8>|!;ixh1siglz!uCKRJT!P0mC6<)0om1A)e&R7%H_~I7b0@`lcTrrh zjbiN%iZ|R#(e`tS^}jfv#bcXtM^ZFFyf$Ns0AUG?J={AhO&Y1P&M~oa%4HJ^E_Z3G 
zAU>*1sm&duk5Hy){SUzM5lkf6WQGox2aViB&f?<~TBz z$DKDV7hI6skUWsQkbIE*kgmM3QNVt69BzZ-sy*p=Q6NP`>ULb2DPWvgG;KHB&udRQ zU!rm+$RT8tE;y$_{H0006 zn;_p*{CmiM84p7~T>KW~zm7*BA1VHW5s&5Gx8R)8XmYDEGIFcNjmxdd%F3-8KR&lA zH#fH`KR>ssurPPsk}X{#T+_rCg#yIH%R&Lp#qDQdyrG;tLgnqpDQ@bfxcN=0clWzg zZ_61f-}*Mi?e9?Rc#rCJyifIZen92-d_?iSk15{&8P(hIA=P`}uT=gYUkL?C=*O-X zi)7sWa*^D>o#!xadFRx_N2vUl$0fQ4N z)qC~>Du3=Hiu*pM_@AFqy?ei;de8rr$~(U*Dp-)TY#;@Sfw+j|(yRH!~#5Gp9zhk^Ihs8Cx2Eg(>7S}A`aSg1$QC`d9nj2VLgKGtp z+gMyv&*GW}@QZQGG8Wh1S^~?BEUsD3;+hpKuDON9HMrJ5y_GDk`7w)Y9N-V*7#E9c za4mvm4~uJjEUpQ#xMmfLYjCZCdLb6qgjrnE4E`{ViL$r`*D_d+vAAX}i)+@gxaM{i z*Wg+Q_3mJCO`OFwxb_*XxCYlxjv>Z1xYkmOYjEu~8gUJ--9{4EDA#=0&(Vl$aNRhP zxCYmoe~-8Z*Rw;3YjEBB{~@kXuIts}8eHe6&*@Q!YmhHMfBy_|4RRhA8`q>*TyucM zHOP~Qa1A@i;+jJ&t~t!&nxibPK^_J5jeg-za$-{o}k1@;Ioc#@}H7IB$bI4#qEP{0;Vx z^ESxipxzjNL&o@>K{T%u%c^uSJ z<8M%p^ESxipq?6kgL<5|K^_O=5;gt?^*C>XJPzuq@i(Z)c^l+$P*07&K|RjfAdiE3 zWBd&~AHLYlCa{b=4kGe5crI|>26-HmtMNA& zH*nqtc^nVusqr@$H*nqtc^uSJ<8QE^oVP(9r$7D%`^R}3$ z$9Ws%amM(Yp+A=z<8LnhxzrecL!V2H@i+ZHml`U6gV#jP+aQl4!ZlWnzd=3D+aQmF zdTRU)>T%u%c^uSJ<8M%p^ESxipq?6kgL<5|K^_O=3pM@*^*C>XJPzuq@i(Z)c^l+$ zP*07&K|RjfAdiE3WBkof;+pRce{=WWD1WmMo+Vlqk*@Y}ZC$mJrCox{i6u3zYpW*N zwQv*EwOZOOt)%N^($<6Dalrpx$?ITQ$>(5MjmJs1A*_lgiN%n#Sn{&A!+uALCD3ZQ zK9I0j0~;-K0yBzjMiV)5^0L(eNkJX*Ob*=F`XcT(ICAoGlwW_nEO(;(B`P_$Z|KzU)fDGY;TMU_qxE|7&oBfCC5a*hb7I2v%M=LeYP{)XF8+t z3`agzmAxpv_n`ECM#J}#D4(g|o{-@_GvUu3(qlWwvj+MM;ecXq#399gh@*-<5GT_6 z9tPv;@p(C3Ta|kdu&i9W;^5cE)uYQ!jM&-);H{5)?jop3y2T^{ium4pjf3vUu)hPe(zWzffKhxKL6UzUm zum3R0-|p)_g7W{}4X;^@eDCZ~_LijYiT?!g#&GwF^o~D9d+)0u}>6v~&Ff z>~-aF7~k8PKKBNR#|GQeO5?SE?7gfP6F0w=JFtGB z=Nxz+9_aZ7)(`Yt1M3HRo`Lmh&oS`&_=Sc(_viQbk;K6sXh*r15zA^EILZg&zVZG& z5a*5c199DW{~n0r#(Fhwd+h!W_sy~UyZmm9+B-4nYwXzl{Tq9?W$gY=-?XCzmk96TTY-nHfiwrwTuAdE&4;uA(n3hJkQPC@0n%bfHb`}l>LD$G#5vIl zct5YAu2HC%-xx<(1-!#l0q-zXz&lJ8@D5W2yu(xh?=V%sJ4_Yu4im=sq9#LrzfA1) zepzb_|C&7d^V7i6G-Ibl#4U&KRr$a!EC?; zA+5}&6~94&`U&9lnQr>LnGre-->?;TQEZjvy5_Ulq5$ud1GWMZd|$M+D-&GfZ-t&> 
zE(KD`5d6r$cUgT7D>pLy>}v^@@$em5RWKzH8l8Gq0(<#y!(lU0Ln(R8~&+R#m&F!`Ekh*Ld-xIsN?L)p_`3k(Ka0TjdYl zQ-og@X`HQ;5^@QX@4T|7{0b(o?WO;HCeL2dQ@@(YuL812`J2n+wab)JLh6~^_M@Ki z$FV46lU?jP#k}@r$f$iks1@XD-wz_@a3tm>X3zRCeENu({oys{fU_k)*k6>#qQo4H zc$_gOF~_{?VzRcgDd+~VHx!M)ztrP_e<(4#BTe{SD1x6+iaA|?0kEbyiX`N8c|-Qf zx!2C%*;&{AfM=&qpUJbcuC3(Rng#P&wyLtanoH-*<=UWyef!p&8L;l@mUd`trFQYBaZEnWrJI7k4i1QuW%4TJL9e__blg}k7!tMEd z=UC44c|EshVloLb=$qH`>jOsvQ{?r${~Rxe`V56VzaDYq*CXaeJ_epggm^tGn_hBm&v7}_^RZK^rdaz1rkD?v vc#q)y=jYFlLB{*8v_Fd%9Qg6$rzP4~o<|HoQnPQkg=+1kkp9d2%?1Aly7tl} literal 0 HcmV?d00001 diff --git a/hsa/gfx950/pa/pa_bf16_noquant_gqa8_1tg_4w.co.poc_kl_merg b/hsa/gfx950/pa/pa_bf16_noquant_gqa8_1tg_4w.co.poc_kl_merg new file mode 100755 index 0000000000000000000000000000000000000000..6a5809ec4161c5f85a57200572a5017bea6b595a GIT binary patch literal 24184 zcmeHP4RBONw(hwJNyzU^5-?!USzl@xqB|xmVDt^{gx?^O5U7Y~xjX+qA(NSLG9Xyi za5GFIi!5NkpXXX{5oK3hb84OXlB_2lvR-| z_?u61Nd}0~)lA~ zF-KV!M46B*P^Sm1u3fOW{zgLLPdl33{uYO+$rA~B0xkD!*<*4wW7GpRZg&J^wwAEN<@uE_5OO*KdrUz`v*+Puzcz)u-WE@k z>@l@8-r>pmjcKhTu*PF+Y7DxcqN+CM8n4$AX`?2OgdJ{fl%-#ugSMnZKs5CZY z535v+#<+*O=QZ#yT2xvavWqn;Mq}N@`BvV=dX?6O?BXgFqp|Md($r+JtF$&`7yT+m zW8KA-sV;_9S{t&9>r{*e>>|5jJ=)?5`1t?sxJW&u^UD)n=!Wv3yyhKGjMl-X(K>+3 zK3=OY>JT1nb_SYKgRp%B9(oPJ&e1w}V6+Y%8i9j7PDj+`PmRK^5xD3z3XhJ~!DFL! 
z@Z<;_>|WlI8iKteu{?R&kVFV6#&$}=L$r0H18iGTkb#P>~4vvq+!P1o% z#-Mv74itgl@i#~7;Phx6ygd>J&CzgbDx4jO1LYXJH(CcDjMl+NBXO{Jd39)@-4anN_!G`uy*Bg-4=`H-E^5Zd6yt(;D1vQ*3lj6)8Xe38(au&u@8 zYxV@AHgDrPkK2Ypz)2#A^j6NX1w*&3aX|CF+Z-L5JX$nUg z*E*t}RNEIiN1ii9Jh#C*v{xHDV9Yd{-2}9jk2ey6{^3zt_+ojjC(_aw3U=_7JS?bHepp+W2CHt)TQHp&fI46r zER6TCFgfb;G!O3FVGd$*co zyvYTH3%MUd_Qz?tBmte}JjsRH?|6ag%N+W|$uU5JH3axWm!3CSdY1+W{jNJxGHxS0t0%|g0fN>4XPFbT4kJPUD}bBFn3Iorpd=( zE-e+sY188Q&q@xqWqD1avcsHOxB(ja+4u2# zmz0R&lqvBf+24uBTC;75xFoS;+Y+pUzHcOT1~WOF!i)yI@qOA6s*vfrK~SmS)w>`qTCnS zzP2n2+RuXaMRK^aMq1oXBuQEzcw;XKg)xad*D0iBWJ+XTC-y}m&vz0{I^gdBh4k~F zGcju-Oo{f+e|=*Ak0qfr2A(8I@XII)&X_1P#3aE0WpFGQ5jRLec}&_s2#(!0Qf!D9 z7njA!q^Qt!kEc=a4E@SW0xSgWx_%@ zMy*y9b-Fm(B&_*mzm(Ccm#o%YyoeYI^8LcXxgC7`<^NA{u_#WO6ps{a>C%#uojUS* zXEEvSoJ4v$CA}3+3rR9eSQj%3*4P3VUn6OM!>~*s2GT|m>WmZah}D_3;owAOu+cUk*2lD~5Ld*sw`wHllL4&y(_X%V=v@;#h z2xtVH0XPHDkav}TdY;K|%&YLv$eZbx+gr}sTfy49k+rvS4B9KuHcl@nY%~@WH_j-S z+$gtqGi&b_*4|GBZfBt5k;mgEuw%!aI=+GuXm2XCR|@T2)=0Fso}Ek*A}Iz;2P^^1 z0xSj01%&!bw17~5NinUzq=eRAQcCMf^t8UD3=r%~jDTQYaup!hPpJR|`zf<%{gf(N zKgCSzH;HYs-p>*gH*cbN_ZEs zfBqcBt@|l%d!bFl`4DIGA#SbLx7XK~8DUPqd@xAX`rIrHAy|&rWtD}+EpS`d4C$LQ zFm3BC6dOc}tCAFN6$P?E6a+}=kTM`;LCS`d3n_m?*h2kCSB!2R=-52av3a0l^FTKy z^I#UtgE=q{s$d>mC(i>Nn+G~J4|HrE=*DCo=;6G)0?x}aI4=$IJkYUupkwnuH^O&lF$cLmcEUHCt;lXp*%hlOlJVNW~{&*iC>gox$z z$K!d38Bd{{-m|cr^$e9~?*loUn^PB1{oDgop8sMz&kcH7@QYZury~#N%+#8Mh~?tP zJLLMwPf;v+hT_zHpa*kms+H=M9-#8eUhK&8gPk7yAzra(x7^Q5>k}fD%O2k?*BhRq zXnclZ`99Et{$1KY^{zTV<)#;R=LNv70{kJ)-1Ds5ze`soL@dvG{8_nv&Qlbto}qZ% zKG1{yU3x3kGasPx>t6&pj9aNaA!ZS=>wGe4^p{MO&L2EzY(zYC$haEu@L^*U;*lf9 z0OGM@#%9Fh$BjY6lP8TKKWCc zq8zaCCq#^CaQ=$g!0BuV2eH4UfrOZ(wBO3&q7E#Tq=m zX<_g~%S%X+tapQj;!=^KHA%6S6wCE>7K+R8n5IP&^0jl?I@(V>Cd)>8Ec5Q7xZrMz zi?&g$*+KD!`zcy~L9y z>6H+lRcan41!<)*%Vq9ti_@81S?!GBF>Oz_!MXlgkG=Is!U5<2bOJg7U4Sk?H=rBP z1Ly(t0(t>`fIdKbbu#9tJ{WUWABwrE569fqM`E7pV=-^_@tCjrq}>TaF8=6r zE+KX@Nk*qwBBPs3k?33+GHA;|wZ=7PMd=rH6zh`Ax}E;<7Fzr;L{_Y@t2{6{e_A$GBY!$a^l3C 
z%AA~>%DlXs%7TKNb<4JNiEvF5UlQ^W+xa1t-}f=a2R^0v;1^VH$46A}p}$f2zke;{C!im@ zUMiGv_bY{R|8{9z9CszdAwju^x&~yhZiy{haDOd4|fLeuv`TcPT#mKGnPL z9Mya7Ln?p%V~YDfrT8CTP`&%VqIxg zUS30M7X5C~;`FXL!iAW|0{3=|Rpzmn2IGG$FJLjvA|BJg+8gCHET*}E#Wc81K)IF0 zG<7VdsRzFp!>nL24Xz`w+`wX*l`N)N#bTOUSWJWK4Ai@o#WX)c8b!FAhcVjAUo5BoV5F%7O6M-$WF zTJ!%RropxB#l$qY=KcQ=( z7SkXHgL-eXnC6cxra8-Enm@6a200pxY2ITo&7WCJ^8t%#{=#A!gMuw>*t9rqPf@ z*S&v_!0M6GSe`AO90eKrGUxWM&%8|Fh*q`$?$lst`jki(qHOSv! z?4rio;2NCsHOSwfo*Hk1{p5TN@;3wVHrPMT*C2m`dTP84_K)*5$lqY>qQ=``|2SWR z{0-`j^EPChw;AVcWS(f8xB33_HcGw*`5TNi)OZ`z<9rSBH>jt^+n^rjYmmP|JvH72 z^*CRH{0-`<@iwT(`5NSJFeXvsZBUQ%HOSwfo*Hk1dYrF8{s#5bcpKE?d=2t9s5j2r z4D%WA_lUPS@^x?C24e-z*C2mmhyN?`HhixHmXW_fMBWC^1J2hVe}i&0-Ued^&etG+ z;|4u7-Ued^&etG+gL-Pb4fd1sHOSu##M@y1IA4SO%|N^j_K)*5$lnaa+hG4VUxWP3 zIB#?D&!Wb8o8do;8s}~3v#4?2X5eR07t7n=HIef*$lr)?jaB1qP>=I9$lsuz8gGMo zoUcLt2KCf<8`R@`4e~drr^ef$9_MS2zrol-jkiHP&etG+gL-Pb4eD{e2KgJ*Q{!z= zkMlLi-=N+&Z!_GO=KI6j?D#w7Z5G3`L-P{S)jpxEt41=nOK>SMCnt1m)g-zWZ-Tm3 zbGx~fbiG2_dht4T_`fUp94sq&94xExH>oy+)v*LI>l0>kZsvB_-)J`bTg}(|<7SJ0 zqj{cxR*}_UBuBe1YZ6Ej>X2Wu<384xaDTzk?#oer{q?fkf%2EBT#s^#MV7-O`X0k8 zRQ>~$&zmR9T_}HbH_@=YFizat0()UxfcBT|llh*N6bH`ss*Lt|&Pbo*jKy;s`B+u< zp7h;w()YOx-#emwj)HqYM*5tDKW|8l?GVow=yQXEioFnr75g8KDfT>^Ozm?Rik}a| zeptM&D)$^zeYJKbDniMttUYK*4n?pXcJauY8V+W##i*EGy&x@i{>*dcW5A zocMO;gz}jc&Ix7AgL6Rn%m~ZMm;>j;=V|d?_lu5Sn9__2O4ZXegPgK_#;ug2x$_F+;;^8eWluT_kGuk6L_CrRBC{|Vxak?s%a8-I-TURPxtGuR&2k;XS@=lloQ+sflG zzLzz1?hO%-4Yh}r#%urBdsl}OKgxU9;W|H942kzo<$hHxt3B^P`C!jEuzs-T8+acc z?70Tk5B59*>j!&|f%R(7FYx+!PD7vf^ZWZ~;$SzlqujrUWi|dA<%99wc>f-Z@5cJU zcy7FZ55{j}y&A7Qet(Di=J@?xeiugV9hlTLcKrVSt-Z@Ket)O$*o@!b2Yxr@V(#yQ zt%vcxtlYbaWi{>}<%4nlSU(uokN4lfIDV`jjN8Zh!8m=aSL5>Weg<#y!@GL$4qrJT z->2)B2=CrofQPJxG!N2zNDCk>gtQ3KVn{WRmO#1z(o#rPNVSmaAT5K$Ini=>AFsT& zK`39?5JOoxyt7me?<|$WJ4@y8&QdwNvs4c6ES1AMOXcv+62|zVCQW{yOziVMS!)`V zw<_Kz6BX~F#V64EcptlsnN+?}Vb`j#OH8Eoi9U91IaJ=Ju!}3~+Vg0Admp>I3aI=p zh22JGhwo$Wwr1zzw+7(7?A+-&+5SxUuEQpLHjtg|k7s9>iL)L&JwXJ)T);RXt<0ts 
zzd3>Wap3fsZtA?56*vvwtQB`rY?bBO@YyU;fOp6NTLB5aAKKcL0j}}4N=Gq=0x4k# ze&pYKtUjBS8yJ4^jRebhLUytGbS6K<%B+|IYhw_;@_L;fzWu7VS{LYN&NNk+D)h#y z1EHYL6S0LIQGXD=(_?E1*`j{h#0$mR`2Y>q&{7HYADBOz~Nz|(SdOT=|`Ah5Rihag$)aYf~V-aEU>Suw}$@>F`S zbyd#3rs5jcY;RRXmDgQSG1F67<(dg!oAqAf!H?t&@Pk+7;g?CQ@cmll58g|JAIRq_ zrG#9< zcZqrJ&5%+1UQjE@)xH-*OuH=j!@7J}}#@XaFIf}88AYnRY}4IkhUBA#!V zkOXeeGYt86mU%UwhrEooIJdbOpYI%Nm?F-1Y%7bE`E>w4=lrE4g}6PR?;OjRKCkEY zOiZRg27U8-etqDmXNtU@_n+hCP@kr-=hq{S{CdRP$j89*i12Q-EaUzy{JU2imCBH} z^eJBrWgHiNy-a;ii+vY7_5E*dhsuiIY{;W69s~aPyyM^N4pTAx);Q$(_-UE^dM4xc zWF4zm#gH4YVxQ(oEyz(;!Rk}M?0=gQ&gn4Ez!R6Jfa_xn!Sa6Cw`w&8S=c}T>QTw3Cd{z literal 0 HcmV?d00001 diff --git a/hsa/gfx950/pa/pa_bf16_noquant_gqa8_1tg_4w_mtp_msk0.co b/hsa/gfx950/pa/pa_bf16_noquant_gqa8_1tg_4w_mtp_msk0.co index c3f4d68d84bda58b8449c11a5fa8c3cfefa5fe39..9c08b4df5f59bc108c9ce724b156cbfb1cb5f368 100755 GIT binary patch delta 9352 zcmeI2UrZcD9LHzY`vX@v?zXfZND;4_+;P%!|A3<&3T0_(BIi+|rS(iprSTs`YDynW z%3TjI^+9RxQV!{Z*d`5)k*brlr6D~Y)RO3f@}x$SCdQ=clZi$WP|w-9nY~-GL|>X5 z$!-Gk+t1AWew*Jnzn$5=JvXDf_l54fle<@a!F4q{tA{{3SPLxV6iCz{aU1OqDBGdz zFwkqH2A#+K&v|ry@_A^r|91=&hm(9&1|;ra_puC^B&`?^zRw5w*!JM|ShA*ks_KMw zJUt2)?gS5lbi_WM9;5aRY$q*=pU|h_MzPPG7w7#!Mn>22tkX~J^?|nWnq-qZ8jxW3y zW@<67#k>~tTEe?WtPz-TVeZ1*g}G~xOFHb2g;YG_bf|j9sZsTeQ^h~hg=L+1~jmF42|L9jNd7)V)`1rxI#-ud zP+RB}3QFlvyK`DMxN}N-0{PDQP!>7?&Y{v_tlFYKr$d;Z>ft#UxJzn`(<`GSMxjPLJj)k3cjq7}2Adnoh0k zHp`Qpz@5$RHc{mCt9rEXT24D{*;Cvy3vbzTnlr1p?IzTt-A|Glyk=le4BD!vkg)Jp zJprD<`%PUD6HKc)gt5o+?7->0iuDJ+0sr>J?tW30$)dzs$@jrq!E5vBIl2 zr+H!}x2Zf?gg2ET{3_+r{w-+Wt4>a z^OT3*QSwWax4xz1S1A|QjsHyIQD6k!>jEs$3Aa8bhWzg?<>gmoM2uz1BgA8_?`O*Y zTFWuXx=(rCk5c@+<9FH-xs?a8`48nactZYBOcG`UzpV>UPI-T|GNX@E9w?Q$gZOjN zj;Wn;2jSDwDOUt$11YZw-iUg{HZah4{KUXfuUGUl)93S0(=Gbh0C$t8fl+UhkC{y# zFEjVBo7W*H#n%>KG5l_C6G%d<`({T8 z+{rzHKl@3&VTQX*v+VmGTk5hIHuARSY|3pF+4jCc8n+{5EQ1xr3|>#3v~EoW%b>t5 jRiuuW!KaIts*MBreDG1)Qdf@zbk+hiMRGyBnXqcLP=(|yn;O+&4<@uQ9UA`$aY zR7%MW0B%4k=`h7#*I)WAp-Mf)B*pj!V zz(3cLH>|naajdM)#IVi3%rC 
z;Up@y#Pi+|PNKp|RBVaOogqUV5o^D#$y@TIXtj!@XJjuX=hYJ;Sq%4AY4=lgBe;P) zJvxlZzV@Hi^4q{yMUK|rS)H^HI_!Mbk)55rGB4O)mAyyaOIK_3RZ$~re^{Ne?vSrF zBP1NzObT;t*w{B5ATz_7ulMc$rYlI$6(;SjuqNQ|9otKLfmo%{S35RZKT**JaILU9f08Kp|67pj%|s=(0)g2jKH4DQKf;B6}V z-5b@i{Z(M;PjW0xmF~G!t<+Zqjz|r~DJxhczx}$fS+vhFm(PT7FV3Z3gF_9NI>*W= z@VS>i0{qY^FAoDR?)37c^deI4#&y4vl*C(Z~+6rPe1D}b(|M~f4Sew%fClP zKb8aC$+uwr-Etssn*Sh7^gUcY_OXaO@aAKTl{&B)eNo%WP#d8{5X*OqS8OntlRkBBrs-@h+Xwc+9;+ysJ}ZIp|`R zIib1f5?ixOT0-wqk%5Ei&@j>+*s?r;E>FfR-96n;DyH$ya4H~ hG4%~NB`iti!40@a7?aHf$JUxFwq33@pA6%+zX69a?;8LB diff --git a/hsa/gfx950/pa/pa_bf16_noquant_gqa8_1tg_4w_mtp_msk1.co b/hsa/gfx950/pa/pa_bf16_noquant_gqa8_1tg_4w_mtp_msk1.co index 7d99faec52dee02aa62e6d135dbd48ce1be1320d..68f81e24c6afaed1826628c086c7d4a4ddf2ff7c 100755 GIT binary patch delta 9922 zcmeI2Yitx%6oBuHyDY3F>`rO{yFOT3x2=LZkM5&vrQO|b@Kp;Igbf60XcSWdAsDqW z-F3^Fs9*|{bWvjnPzgU2F%v+rVTl?aiHVi?15`+VAo7DK@<>b}uJ_KKJG-};Mt@*z zlbK|5&wTgXbMKs!GxzMwjQ(mnxy?RM$DfSuXxN{s@NtM7v_&rc6cIWN|6EPQepixh zHx<>;WjhEkj{leAV7&Sy=l1^d99o7O6g&mA^8_}ZE}%iZjb|cn2@xS(6{$+A_g3zz z?sjMM{b=|^WI4+FYP0#RAiq|dRfC=N+Re%*_|y`vFvfTExtDSU-;#EZE-xc&X`g#J zSEzfc!(;H3z>`I;>#Q!5pDyXoLg{=2y?)Baaj2QJ^Eeb~9`4uW){>m^b&-?f{Y~-6 z&Zc`iJ?8`!&zQxD9YNy8gK)8gma0zAM68@`j2@`}% zm>^uj1mO})!nJS6EL_ALX1*n^G4m~P74@xpX~-;Gy4Os?MLcL~vMgL`k!YSOAE$)AN=TKrqpB&jAZ++M6EuV{BAM`zHo#HuElnDsmf>t|WnHWgk8=#MIE?YOv91sXAHb7St0ldzl=$WzLr*GuK(BTntyMkDz= zGqsVQt(S6Br@G|NjA>P)RtBn5W81^;38o#sPBKv2f7Pkj$jkbphkRTMmsWw{r4%?g&!2j01eF{x0y&%`D#!d<%a2UVeN` zPEyE$T;mK4>sk)f@oKp`LSF$tJ#Im#fJe5o`e%X9d7I@IfuBD@<}W+0LPyUBWe|&h z0UsG+`3&7*4z&0qmRABlRJ;Ah`D4Hj ze8ckPz<(HLzl+o7cc8UDu>*QwK3sLG%C;^~KI7Wu0$jo9QO6L&6u@E&O$EmWYgsC;GKo=nf!-klM2I;WYu%C+Y z5T%lmpi@XvV+<4|e=0~#NU}&TNeTqwpdrVM>0~JwrsF~qRggjn!$P<EQs9`xQQo~A6rmIxM1a*oWkI`$E+hjUSk&LA& za)4gG5(@dznq+%p*ZM8pn^mXVceL8i)$)9WFBg@#1y17|vZh?G#07*jd{dT&Z_Lv6 i+-I2c>89M6#68UW7v`E|+>S2F^^&%uC3jrr?*AJ!{Lw4` delta 5068 zcmeI0e@q)?7{}lDXelN80Vl}_qLQXaA;PYA*V5{O^e0Oe87vbWnb{!lhXy7yWHU3N 
zq$__3$=E3+8e_99%QT2Jr=pQiNKCdEe`T48A?R?Mj{n4%5Cc)XAJ2Q`z5bzz|J)xo zX`kml&-1+R_nuy!-gR-@zIxww!NZ*7CL69MvK1JqFJe)dc8FA3tnxaDx6yV;PeUCg z;6NSoFYBOQI*iL*|GbBs#8Yg^61upsgr*iGv0}RF_TI7Fc{IOppbO<5bdBXw;Gc8J zzf_!Mgbuu2hsJK!4Kiqx=@=TGE)95-mX`U>$T&fBjv~O_QhIP(~gQG zri!Oc6qEqbC#1mCD_fzY0l} zMJr-bx?fQ*k9QyYe=LxB_aDZB^G>Ng7KY@`-c_fyFV;C_78Gi<)Wz!ABW>mZh+2zreLb;Y|x7$9snaWA>cav}L zW8QMfXWn_$X60*Gy<1o+_{FtQOVvAtsR#6t@P$*kQ;=9I_tVYXv&k(t3PoC|rOIu= zp5R7)C|q|cL!nTywZ=fzmNEtkMOvz@?&9W+R4%wN#}wmIajxuf+=>%{K6r)!vrDrM zB)&)UeZc2uY3>64M<>l2fpbsLJODh~pg9R68i9b@RL~Cm${{0FbB+PO(L;0nd>rVW zUT6egg#0zV5V*4cIS5>9)}Qi7#Tf9+>xAg`0`S$mc8$CVyygWxk9=MQ-a1H#Za+{> z0y)qv(?auF+<_`z))UF+b>Od=1l@37W3~&%CSdzvJ~g1U#7?P;vhP zK0Zxzr`9kBvR$UR{#flm16i6s0{I^n8J*VmZ-Bt~3SHm>-uDsBTY;xOq4^=;$3LTa z7w}Qj`aM|N-+{`$qzeY1VC`#~p9Ov`e+F7Y68Ow*I)4Iq>JH7Pfd>+L&yDD12$XFY zNONX^4>W3y5&RVRbS2F{2cB`z{2SoI`ge!A$x6#4P)!&74E*53G+zgv;c0$S{S$;K zLD{zSr6=*n2~Lcofu7#}ffGC*h#7921D6YjNdo8RB7}NlhU@2ogob#-jfVw;#tj+` zhk=SdUUd`d&fZ9fWO$ne!}UfCmk)ZOPD$_+*C#d;Hxe>vfHQJKVUoamW5!}}k%;g{ z6Mhkv;5m*buHYkQCj@w-DY1O67z~kIpWi6+djo{V{6qvF5`Bb*;$&xKUCm9XjgRwW zJ3byGs|g}U8ZU?uvV_P7Nv;s}ld}_oM&pHGkO&rHzPuYHUxz3tojB3m?(2Dexc?=o zw7hQal`yVlm~2J*QWQ6pX34*4b?N0O_Omqy^(&~xquW(S+7-iorpA-*j^P&OTTl88 S)s8l%msPvon68TBNB;(sMp97# diff --git a/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa16_1tg_4w_mtp_msk0.co b/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa16_1tg_4w_mtp_msk0.co index d94d8805123110f88b2844de2cde46e75686fff1..1a8b13e1171fac0c6e9ba48f6e60fadd766327a1 100755 GIT binary patch delta 4434 zcmds5T}%{L6u$Se?6PDj3xbMJU5L@uNHhD#j}CMf2E=Lp z0Ev^mjKq<6ZWT<*(%(Y}!@lHvT2P9GWrM-HW!ca%eg|J;eD_ewej0sa0wL zgBR(yfOMH^l~{sjZxCjueu;vs;aCKPo9*l1n7<{Iq2LOcW1H>Oa4gUo&XD;;5Rz!H z=Y}6bP$1h(2r76k_;;aV9Fv@B-mLyMV$BUQ9LR9%o1-aw~>HWs9YBPi5tFDHAh zg=3t(DtWf=y8j~p0>F1YfNc@BMc5W&TZ}E|sCrY!MX0ZG#BgQlHtWyUR>=}JNuykK z`U0IOoH&b%)ZP);jCn}UmvUwa$F?kv<@ROUIQ!h2<_5${9|P`L*yibO zAy7`gIWhf7Si<*H`Tk`Ja~sp=>*QC;=wx4@3@Z1gdGCKl%ed5Gtc>y{v^g{F2`kB^ 
zEM@5f^fDkhVq2x9yD{EHe@5>D(thEaK)y+@GJMgwV_W)!_A^x`j#_8uoXbB)`w`bR z&?c0x_%=o-UhqraSbM(Y2q5Fg%X68R)CHeHKxQFr3nJn(R0z7%)ImG=q0p1n%4CsbnJ)2Tv zDzX(R_ue}HvP^HsFk^||o)*<`zm40F?$qgQdFF|o9qTj*bQ~oMH&<1~bNyfZm zIL{(d9Gr=)t1_dFa*AKw0gVhzxlvdn<4Eq2SsCUvaR#*{;?C&NON_fBW18r{3g=oA zD(_A7#tHuY+s2NMTf^}2|4tK`htP}6r}2z7H2X!|5C)5=u_K2yv*I2r$%UwWP&ZB6 z2fgA~Y3Rc9^ubIA>=l>!^@FhBYqFK63$dmF`C-uX9(mLt#->;cU`Dn448ouJLVk#u z_+%;20;<+ba5Bm!{Lt%KK9BIuMG6RxuNISq!6^YN2|v1Z+*JV;gg;xYrnQw`k;16Us^MC+{)()%O}E{-;LtD!;V}rJ?>tKHP)~QC$=N9k%gDgVCE- V528-O z{4p@f&gJ|DXjkqdbRf5e{Uf&|Np(CW#47?aenLh!`Zfu?fbGqbC9%W#<;mb>`fkx? zfobW+)@!VBV=-i5+g>@XE>hDs%WkASz8v5ws$Km1Ru-TS`ZkIRWm-XDx1l^4o?fBn zi;%3A?(I*MXt!V2SeJu5lGz@#9k*3ce7ZcuZ|@f34qKP4gW|g&JwDho?EEx|9?t)n zo!;FbqtkZ|fG+lvomLqQ+YsBJgp=#J!DAo*=)9?`boKLm)aB%KiJY$UsZ(}kM4XF7tuph9-SBfDAKLX`y+Sa9H$671^J}o0{vyH7SNrLNlQq%S=#oV4zM%{9& zGyfqo%?L8*m!mSh{eq0|aEQo6&-ZRB;xU z>j4)*;j4>59*Gj~eQTuN-Q3+OV3i~DTwV=Ou7z#|a^C~Ep?VOr|8@=+4uBgfc)J`8 z;HRXx$8A<{*ZV;(RHYS0CeWQ8KY^f3eAs!=p(Ks6t1H(4bE*oqbF#02Fdf0YFrB6+ zpp3Km;8DO`_#D*9xgFEM2Gfo##D)?|mkcGMoxF}aw*rh1ez`;{piw+fiMjRA0yT#h zf;=ii##EOj9n)Y6H(v#Wp@cQCO#G;zX$eCSS{_h=K`zw;N9E+R(p}1Vj=&1wzh95v zOL&RArWBOtS1DgB-YTdu2EFlOmqrW~)QP8J3`LJ&-o8!BEtq>B6$5{C*MT=$5)M>j ze)B*yRSfoEKCw^AeV98F3>?IKb|gMusNaS+YEQ&(P*e!>i-)AV4|7jK{%Op!XQcUO zF+X}$%0EN5eo8@IN*L)FKCq!yI_mE*4;M=LB<4*EVi5iJ3+7i6@@GkYFA(a-U5j|5 z?vS*=D&d`x?>2HIMNCsr-J7L68}pXiQk{IvYtKaav!YfB-l$JFV8Z-ftF%A`<~wSn z{0qdPlJe5bUOyc1>$bY7h8JHt(C~smYtrM;kO4q2}iUEy~{x=>#Z&WJ}0#1!lBV&wBpds8>3FM7NBS{qt)vNolIE>OH%YMAO$PdYMici< z2b>4c>t%*JVVTNHe&aMVcRHatGmP)U_&1PN&FF*B48zULYeBdTwzn{YXdQZ!xr5e% Kx0sR;EdB@4?q&f1 diff --git a/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa16_1tg_4w_mtp_msk1.co b/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa16_1tg_4w_mtp_msk1.co index 4c2875e3af814bded2a613ee14293af113f913b3..206020d612fba7086f6ee8c04615011b9d4eba3c 100755 GIT binary patch delta 6555 zcmds+e^3Hf*@oS~2CcOT}kMBDY9~{G+e$kb48Xq}CmLi>B0zL0A z6oasKn`T^lF$mzNThGJ~{L_!1$_D!!Y7z}q)lPA{Pm=J7DQ=&!hLeHkssdGsy8?G5 
z(yi5}7PzbXv%TDjlYtdnc0%sY_JO?aTWa0hgE;IOY zDm10^Cu?j*d#t6D0P zs{_@E&X&2!Nu1c~Y=@l7&{X9?1I`?d&6O4n@RPcb87pdu0nhXXO7pQ zCL4#tgFQ$+zt}dgI(mO=QH5tRnI(78_&HOUOn25SwuukLFI5NJ`wDDq&ElOhDSqDn zW>Wmi-H+gzRi6q5iW-E;zO3HCgMESL0}pZ8;-dcSgWHMR6L==j!)38UooFjAvZ+!d zTPzl)ook$FdD(u99hve2XotFH`PM=^Pv-#fRGn)^9FCKX$@Idx_rBispF3x^n+wya zSXs&abIsY_SCU>`V>f^Le5ez#cDuc!gWXr4cE5C@Slf9EHKFvDo-+J!;Y>zF9qid? zmqV@J)?QKZ&FlFYPnXPU#tSl4b~%_na&yod>9w!fJpeQ^<_V;_dav3Yf6~GHa@Cm? zQNz>M`?7i~bl8uOC(iU?KU=6hw;7cnwR?E8UD5i;r;5{qZxyJA{!zwfcD{&i6%~-@ z00}~dp9I-+&OQj4ek(|8xnr}}H~~ZNp%J8x-q?%om}h*LFZrO(JOi{w|_tv zkQTEpxxTF48a*P* zD%m4)RNJ)~74mjZb#gei)S#ohzUMu#s-XMfmn$BezqYh7b#J}&8PGKW7{ zJ&WOQw|npAr)0>3s~6~R01>15%@Gyq`7{CC+$wD z{u218O3P(&0k3t0afd@|{|zqVUrrpx7{#@opW%MnvqHw-jB$RUHu`7WisB!VU!A94 zWyS}dyI8D$kI7&243XzfUtoCy@cmEZZFT7Z;Oj#ymw`7m8$y+(HSKtAJoD0D@Vt1( zZGMk~Um{?67(y5DrZ$#;8~EFgv3xV|k&P_xr}pa%A#Q0?pxD3KLWb}n@PQ31KLY&x zZ5#Lt;KgIC{R!YFUYX8G{&zvqm9NDh?*kv`o9@7Ie*&J{#qx3BgG>UBy9RvZC#?M^ z`o}oObJd4g#T?_#^W3fDEUyOM_9n|000E&^@hCT_rF=Ysr*Dm0RM_K*?@KJNC zEc?U2Qwv!8ovV_GXr!0>0_C#J>Z+#?RXS5%{qn%l{61W!&6btI`}O zT350T{{_C&+*`}O$oSg_&&eBD`%2(Wb89X8THqZC)84xOi$PKREE~uTJn?&$w*bHR zI?LODH|}NmW59C<86M|WgW^lCG79b~;HNZ}ZvvirhvjkLADw3T4&X=sZ3xy*>@_~d zdCv7bt2hJ>;!>8s2E6%WHo*|^*fo|91Ap@t%TEI*U;p*=%#lA-xOwfBk;Gs%<|6zkF8+&65#VY zSsrm1{qtNd#wyyuA@F^ce;xSvR+g^=zGoZDzXLpcndMu6xBc3j8ogQL+6IcjJ*>lS z;9Cx|`~dKCudw_maPbY6tH6)F1Dw9iPlDoyXIO{#fRCBOLF?qkfV<4$u=rOBh=_|h z@<-52??3TSJm}|QYoGi^tkdK7$skIRs0Tz>IO3zCEGSfTi8Bc85g6kGQ!7B?>tNpFnK|S+YbW7y<)gOGP0Z zv?91%aWA!SM`<*-*K1kGBHa|X;tztY#}~0g5}8H_OVrKd5@=ixxu}~rB7sPD*tCtu zI=w-WRxAc1bSFF^pGf_}5h{u<58VmTZKV@~;V{^WUOz2Z%vUKY0?jOjy)=awRXlO< z^9rE|6}>K+&g=8ibR;hAP6{bBgyf0Rbl#9m+wev`bhjlTOxu^d9syEFKH5>dQ&N0% zGlO0?-AsSbMMKC^h<4|5%hc8<3YI8Iw058Dr$zXb5N(4*pmBW>4;5vX9OmNY@XPKX z?L!tLv|?HI(#*0Fq&>@_FwHDSU36dlZXYe!FNWwS`K2gL=l8qmOa$Em-H8rG^3#3w zha@XRRG=XgAxLv5qDbpe@;6se{Iq=~?4dc7XegwWPv9po9vsVT$l=eG(0_akUdsG5 
zho!3eV&fsf)sLIcl#6`k=^Xw{hQEUE#`WWwhAVg}9v#m-N!F4>S4^XB&NOu;%s8p+-!PQy*@f?p{sQ5wDugIcIS?Iw9{@>t9|=MtOr&!h*J^)W^Q5&iZd8-S z`@#Qhbx9Gfxs~mysf7TG^h00V>vjSNi4C6WFHdB0R*(lXbuZLC%;m;C?YRd#Ou4P@ z`MNePH-WVWT2ZGSUbf9+H10s6C+l5#1W#bSXJv7bVOC-juU^~umJ3Dn4>nf1XnWP) zl079?eNKDd$*zBTfnhec4e6~fjk@$bFW)&suiq{qyu?FF)F(4x59u%C*1adXi;Zjd z;aZeE@S?YD*I1$5XG@)Sb-cgXV+MPq4EIftIOePt>6u?uu(9$bRo@@WAcqPQ}k(e-Cx}Jl*hPu5S1dm`TQ1-)jhQe&%n1N z{o?rcLi_1kQNBc*=d{0E`@I|4{ZAsp$D>h~fk#kdxzRBP`FQ+uo|JCPE5fIdF>2~x^ zw%g6q3~Tw3);wemcSRdE&)n?%N`*1_B5sZZtKNl9KD3YD0Ti>cuT=;x_#%l5x4bvD_nQkBF~*4WO+64fz<_DOS&I; zWt8PA@OAguLIu^vK*4EkAL6@GzA1iBf_n8AsD~|V0N$Hq`P0B>J^6U$!#K0IZBZvbC@mbLE&-s(7l(#-!mP#oUMI=lz`-8PnA z0DhyB<(GlCFbO#BW8g>jvi6hKV~XRs!IxP@iGA`s*YzsPeZc$QWchsHnO>GJ1b*}s z%Y%fQ6%^)g+Jr%|^AhW@40!(l%U1$_ZiwaUf#jW<}{0=%o6O_G60YcJ5#mUEttVSpF*Tj(@NTjsrh@jpZkS4~((g`IXCat&=9F^Ec~y9~?f< z+XAihufW|?3i>DT5g%(m4%|DBMlIOZVSKxO38KBs-xS+61X9E|U!?uup-2=QK z$=WXlUi=u#mjHir70VMYyMLZ*+Q=&E!C~K1EdL7d_-2-W9eCbZ0#YaU4DfY-W9_#B z&m5cLumcp+cd-t8fWM`&{3pPD2U&g?c=HjK>%cFc0KNul?E%G)e#<(X0-is^@&Vw> z&$0Yb>o*53=FGp5w6FaOA5R4ou4#Sa#-_CaRSAJ8#}z__xDo^^C^40YqMwQZNgz~- zQCrzB6RIYuK{Q6FKcLt&WoGsZ3c*5xrt|ydB!nn6b zC>f`=aU%L-i6}8h22v!mD2g?a{Ykr;>pIYC9gNNr;Ux1>KoY{dYzl@esE z=7gQZsiw8T5Uov$ge2k@l}J5-Sd=sxNF+(|l3ygXn{K2^NeYq(k{lufl++lRj}%Rk zdZdJ;q##{TP{`PVfe1+#4ANNzReGXwR3tfMQ6*w9>L()(#_6WWLXZ?82cpD4mSt!z zwN_S>#3C99kUl~oI#)F)kQ|DTAXO?NJ#s~glD$)8x)VxBAfqxPkhn^WZn=_(NnFZV zepN`2BGiDN%u1D2GAmV$le1H!0x3d`i=?HH5F+V9VuGAZDCj4dLkeAls6f9y4G}3q zEg~w}rBGZX3lvJyZ&gGLl6oS+B&jl@Mo9aSD9sUx#{gBd diff --git a/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa16_2tg_4w.co b/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa16_2tg_4w.co index 048095868df2975ab02539a9d420ed1fedf8008c..844fd03871ab903d1f4361fc07b37f589cac3ca8 100755 GIT binary patch delta 1748 zcmaE`n{mfs#t9ls7I71`meq53GJpY$W)Ohz8G#fV5OYLB_#D-!bOD6R@Ph@)fa+rc za*5Fg)j!#vQBaAn9h;jNy98zaGfdhCG``_K!!$7M(qz*oWFyodW+T?HSwy&(xjvBb z|1T!LI)PBe|7!j|GGKNjNGzaEAeQmJdZ3>SNSvRcVZRMS=Or~pMh06Dt;on=W4osr 
z%yvX#CxY0!nyp-HCRw@IPSdjyXwbJ6Y%;JBX)v@EZTiLJQzsArwKIhA|Kv4dLUN`T z)k+x{)EP`Kssm{aAguwUHG#Ayke_&!DaFc6Sly1zE^;-j^?ghT9|_Pu z<|fYimQwk`L|82_#?-~w>Yx5<+7C>nU@XnrMDJMRgeOm8V{N7`aj_O+ z2#mFYze6V0w9JW%H9IRIHpR!kTK0ot&b`H^-^P<+KL=~;e)kR_yV0i2rjcQP)8;}4 zLly#wVDkc(9juH^n-`it}ir ziYr0IE6~*ILd9h=P}N&N#YN)43}LwJ0uYOdgMp)ZGE?L>D`#_OhMdHt)SP$&b2CF2 z)5yda#xyr|g)$8dEzDt}ZU&~43k7vI%SS0PF>!cKJ{T=GIUz=X3to#vPp*nFldXVO zDizR5rUO=)fE+Q|P*8aCgBS^>is;F|V(geSVkTR~+A&GQPOgfz`g5DkFd1tuVCoeKlzWo5>S=GVC$Ub>QMtg8$OKhl zUd)X7!%N>ZIchkvurM~Tfs7Uf!-An``=Cb85kPC_#jIYim9993|N?fCK|DE z%>9cnZF7;83Nw;P6|9Q3zgzZ$Omc6r>9_G@*#Cp2b-#NDklkq0X4A;9ziKn5iy_P8 zCgW|31w2_9t2PVz8M16n3VOy_9}xfzVJO7|rr8-nY*EF=q0f@!K!C+86`Buy}De#x!#=f-(&a zEa9TAhUSwC1$8%nk5y!1GH{)27$Z7aKS7cUp1{H<2PK#ZEr6!C1<>@Dz%f}mL3Z-? t1W8Vq(0Z`mcL_>N0TGjh674u0A|Y-qfEvm6)tnTA2GFVgsAgA;An2<1#T~G)lNz zDLTytC%3GMnk*SkVm7V3b3-LNKA3Dq6UU+nA115&L>8BzOD4ixy}Q2Ga`@rED1kL; ze$PF>-~0aV?epH#yPG~sj(Co$^!6r~v}$qePG?9|DQpB=J_9r`545Q3jsS zVSvXGD;$T%^N+zSn|2Qrg6B-uI2gJhPS3`{aXAJ;^ni&rvAJ|E%h#GOSWnaVT|7s@entTuIA_o=5ZKbAX<8}TVcGcHE5ObUWwb?o81QE<$GGJ5`RBn z1sc3{r3^yglVULhv~QwM#&t_@hlIcC^9JZ4Z@_oFiq2#o@@0i6+RWO0IiUq2*qhyq zd$zzh|7)J9&^1-|G63bkHB~Og3NcoQu~LkcV&u7?jW4uNT4m7^3iVj#3Ff8rejzPH z3SKfSyb6v8Qcp8pr+LXb&3K*W|8|{vd7*q)(57+5hznLiblF-CH46D!-#CgEhDpXs zq?VMu_R{g!^0sK5tEd+0eXyF)s6)|`T0QgaVn`6Tu>7qdLu)OUA-OMCB3HC?6>?Ek zqFmSi+7NQxb!f{)xoCay#l^57Nwzag57N7UFr(`ib{-b-A^Hva5D;#nquwtxm9O>1 zNO%(!*K3yY(UNg=q~X{mvdkG0#yAT^^rJ`NJ!V=H{2akZMg_pz!Y(dD@Mk0p%%)Fm zpa0pwvi{-JE4d=LV5v^yI_Kwlrcu|2VUo{$D#mad#lJEj&-R^0b8Ssp=8R%z_UXmN z8Tpa@)66dmDMQ<$U-yQ#3i+smG3MY#@w0y;-#4%PXuGx$79;WoFGjZqD|Gjq&!y`bohjjM zgkjCRZK<`b_~z^+7li!i42!ylOSS&du*Z!U8FtAn=Q_SWd-{{GmT!x;#%^ZdA1OPF zr>}22I~na`f+0GvH^*C(*W0&ES14tTz|Dy&F{zjiOR&Izmk0c^LGXn zMv7wp*KmS;eQ5%Ke$3B5tL8b(cQ31dAM=Mt)b*z@-#o15KbH;}kOsE?sZLDDhiPEj zPip=m=8NQ!soWLJTbI@UjCrm|{Y%VZe%1&~tmDIsBeFrPk?DdWfvj+1cn$4y&tCYOAS+ z0BC=*$<^^(cPH;PRh}Qpg{6c}eE$^r+5)#{j4CghtdZX>&~38#5XvSi7C2~Jlpdpg?0L;&r#EEXXqd4u~TGUQX delta 1952 
zcma)-e@I(b6vywqc`?tSE{0@3bP1Y1ZLwRIU;ap&y2Y623e{r7wVhDw>TsK2{%~xR z(xfhtB2t|fnm<}H8Pf@a&Ur;sD0%CKoltOYbi*+UE-+XjxUv;GW|Y0Wci&51|JeeM zd+z6)`@QGf^YU);F0;ABOdUX*&wNxj7jEAJ;!hrdUYbK;C=#Cb@|-1%cJ&w%DFDeb zYS48^nEs!2bUp5b<<|e+gN(vu1A7}xr*4C*Dco&btGrVh<39wqQl1{bCoM5PO6?U( z4DWQ_Bo9qmgiS#+?h2kDUk6hN->V$d^ST(X8)f({O_cutOz`-t7pw%CyAe+BuOb(1 zZ!tK#Z;AZ_Qnx;66$ES~QNtmU-ruY>?#qty07S&+zGhTW!Rgh#MN1E)LKRx}={>aw zF56ruL$y6n5E^6fiK9!_>THzH3ARN&lb+WAzpjb#ZI=c6l4s0w3Glx`-27miEHu}N z3F3>kt7O4@fI)(zWBdk=H1-nPD-#fh+AYGy%Yt)WLrRWk5NflNe>y)aJc2rnezI69 ze~l?9fAMurBW9f#KBmQeeH-kXNS!&c^glwxG0}PZmzrN(Y%3t|(%! z?W$G8`ZN8tIi+{BmHS$q_LE-%C1mTHCTM))alZChHeWkQuYM}!mzu3DA(hlLL?dXO zFRecHILhawoP>4{ zNym`c?qe2zoe;wYO<}H={wL{BB91!n_s44fEahNE&8?J=98>d`CyovJ2LD2{+ zULSQlI;ln?c* z`4r`mC*rSBp1H1$pQHTVw3>gJn}&f7FxzU!Epe&?p}T7SE#({YYW@S|=bnhaPr1#c zK8fEcpN9|}DwO(*I=U;>3*OC55i5MxcWbL` zHrN#Ea42SdKnZB|$_lm?pRDT)xaC;8yP-+m!sT@eW;nOAEzsJM$4aojQK2n?dK%z# z`0UhlcF!I{$?^=IE68Ni1PFRuJq4Qml<--~~5 zhNk$-9DM#M<=QKccX6=6kfhgOQoa_;;*}gU$5({lVi|rR32bxFiUtjFLk(;$erS}= XKct8k$KS6J7WL6ij>IcCXo28gRgd`6 diff --git a/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa8_1tg_4w_mtp_msk1.co b/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa8_1tg_4w_mtp_msk1.co index b82ace218d6d52850c52b1e4222db030f0660190..6c6d75514b720d81bdb987cbe16bf8f38e2cafa7 100755 GIT binary patch delta 5468 zcmds*Yitx%6vywKeHGZivR$CTTDHq0bX)7r%)VM@%WikOU{q8rr6fgO8c`chfhzKbUsq?sZmg$%22SXM% zY0rOt=iGblnKM1>{4tZ&24vb&eT+HYD$Q@&P*Se*ez}b zLnpX4Ag-R-EoQJ@H4}NJh!p=a9JxcaZY*C6M`{)?s#Nq+nYC>!Uj|3KOIj;sJ_)2m z4t#Q<20{>#Z8Zd3WN5RZV-M(+vOd1KF~RjVCL+7)IY;X0$mFz-bEW9Wl=K~H(Z=$Z zaL!gZA~^ESs=)c04*~Fkz5#^jNT&L@SI$gL<*Z*6mMJZIU_j!y??9O|Hg?*P%Uu7FKij77FQGZp& z%vVyys>&aJ_XJaZhQV3)yYmor%rdMA_0Nu4`0wwgMd<-nenRH-!|vEP?a>^2$C^;4 z_STp>=Bt-F_WQr3-ZAM%{ zgsx8yQ&N0Yk70c8>PrKAr@{_1T9F$xQ9sN<)A!6XJ$C>3JDOOaPa}2rj>kaG47Tgoj;Mp+bYi|~(*v0XW z$P~-`G~KX7wPmNu(9l&s-ZwnX_96H2^@bxlTWFo+skxaE<@S+rc4UHnoL#gC z+Z(r{>@YQkk#^8QN%2)ZTLt&=)Ih=k(?)j~&$YTI`FqRp_g!r4inW2kV|qRf^OK{v 
z2Xp&Eef(3H`yzTC#(Y7YB6!Uk$BtQ}7O4B$Kz5lHs)(gH{@2BN{u<_2UQoZ_6Ysm2 zcjX9?t(1Q~b|i{tQAjuD_BRxv{mNp#pi@JN*n@d^)IJ9=?^vsk|4P2(feoZGddHA* zX#+deOq%mF=7*1JND&t>-@9MWf5UvOdNR#fzO8puKH?YHAuD^BnF(3T17%QuB zN|+6_t0&f+F3eruX-E+>Fkkbd#?^bJzO%7`!=n-e@PeUVv;~TYV(z%A=g(pO!8JW^ z!~D!GJzoix{5DW&QX({OCtlEH*Yl4sKPSC4dGM3Qd;@;_M10#Y-#JYmzZ>&4yhrcY zj~#`2JwJl^P=lW5G5=@2p40MsE2<0>@M|t|pi@$q(_{l+rb==rLvS~rWRswC<*L^^ zmoscjizbskEPb&!OJpz{Cz1&;Lxj zl7lX8Oox2rJaiyHcB8{Vax}U*7{D7dEYnP+kI`aVXtD!CGvvT*T+7DNJflSf13|Jy zD6VZBqhpw|bTLPQ(e7F@OloGCh6IWAC$)58A2|;j3y|{tp=M=SlT^N+PLSQ$1Vb{j z$pp#lXOfx>#7L~aflUasGcd5A5zeKkf;aab2kXa^wihC~LmUh`+h0{5kM<_j9)!8S fB~Goab delta 2540 zcma)-ZA@EL7{|}O?WF}F)TYaPVGIh4ij&;l-WH0L32k|qnIItqmqxRNWs4!YkR>vh z7Ff$FVT#1TTsB{`#ne(+EsW(3=MIi3Uj|vYO zip0>4k7?o8?p=cQ6f0bRB2QQ=E)@e$CTNRfib|$Fqjbvr!qD1wxsO zIeRN5B3e(CP-0J+{IyoxZxk9|n6l?5Ba)(48}WVMt4Go`LsY7b$#Ji*-`9(zJCrbR z=7O*{HzczAzEXvSK{h5%RymB+9dR}It=x6-v^*a-h?xi0X*%u|Hygft)Vi@bNn4IS zrdgk221JqvgLz}Xq-)? zn=A2|z)Y8n!r>L`<&7t>0MsZ^4m|{ zY$ACZ)b~_t|7&peE#S5xOq5aHzY7wLC$}dMiURLBOmc0#2|13E{1d1TJft%QJk~+h zUsR@HWI~B|$ix+OYC^5=k^E=imT{7=0MGlBP__IF7R0sAYTgY6PPdnr!?3b5Ta&7AvQ?=H zCk|af4u_sQAc^K33wI~oc64k kv*JxMrxRR!ppL4f$6fJXl<`(sJfmJ78_VPS>#05e0w|_My8r+H diff --git a/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa8_2tg_4w.co b/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa8_2tg_4w.co index 92b5d718cf509a0e1b995ad91cf4e96e979645c8..f33746320f7c546e7b5df28a3ae8f8cef2a3e31c 100755 GIT binary patch delta 1753 zcmaE{i1ENG#t9ls2LdN*Evuhl!vF>_nn3`__!k>|cN*6%53>hGWAOO|J z1Y{DU52}B%Kck=$VLLWAF?I>c{AZZ74`_VDe}-vb+NH^+Psm27LCi+1VY7&EDRX@w zhR#cBjEoGnAX<@;!Nzt^ zGnnm+dUcQsqN*i5o=v7P2;BhcV)E7%lZBhnCPE86sn$)`>r0BUClL$9hW&3hn_6xql&FD5?%2G*MuUa%?dFpX#3)zVJkL3a!*GiasQg3#9x#)g;f6Ir z48#(Lihr1F=p*iR22EBMDr*q~X21yxsCY{VTnNmV2O^m`7-r;6p5(L5%F)qvt zkV}j{sQ$_RjDkvp?bzJJ*d@qukYUokgACL5ZPpMjWu7c3E<1UWj>6MWZN=$~PpyhLdW z$9qPoG9Gb|GKI-x>)$eNrLu+7TS*_WK nuuN78F=I0Do*Wfo$MnE^@+=^A!DsTV5Iar>Ux=FJK*Qt#>9nyE diff --git a/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa8_2tg_4w_hp.co 
b/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa8_2tg_4w_hp.co index 4a4ece3da5b01012cf816b7829943d6b3d7e505c..2b04347bcb3cd194fa056bba7d3fcb5dd5deaffe 100755 GIT binary patch delta 1794 zcmeydneoDI#t9ls0Z|jRmepTzVE_Xd%^(2bGXg0#Aifa>;om4jr3)Zj1`SRq1FDY+ z$R$P}RR3guMnNUQc5H59>=Kmu&oF5p(D;V`4Aa20OOs8Xkd07-n2lJ&W)b0B=K4U! z|G$|0>I6a=|Eu}?$bi|AAhCctfmp`>>VbYTAaQ<%hW$1SotM-Y85wLrv?3#ejqRRh zFxwG{od{y@YPNE*nPlZ+JI&8Vpuyi(uqnVsq#@8&wCNX>U97k{Rmzl!usWs9yJVzT>t&b_J`$jR z%uTKrlPBy}#m%xx1pTx*K*fQz{^akb{lL@-#^x+d^o}h~c(Nomww`Jd^cgU@Ds8sX z0mjy&ze6In<{A?8)8-4tHmrijf4A%h#h81GO}~vN!~XY^Wi8DaUu+Jw+)7ju*0Nc} z0aSj~g&5^K7CU(G7_RYwmX%P72TZdwT(CwJ7l(?!m|PelUVj8lRu?K;l7gzs0xCWs z0#)1@DsF(L-VZ8%1WkMwNSuj-0b1fuW)IyM;OOYYkdv5{niFqeW^Mvw8W}snm}cgN zP^O`Qr3*~d#Sx~~(7?hOuE5D~Ge=ke6Vnx!$p;05C+9{;bHS^Zu*s7m%!EA3!H#3_ z2#3%+I3`C&$WHzqA;}37S`XH%6RE=VBYbjDq#dV3Bt&fi)KKop+ao18JD{S6VWNx- E0ESP^iU0rr delta 808 zcmcbxoAJkH#t9ls6=4&#mev1pU;qOc%^(2bGXg0#5HlFUXDCLc3!q#EUMK^qj|s>n zMjurFWPe6MCBk-WZer{b&l(6U+Vw%q{yX7<%?kPLNjGyhu8PY4RcaZIcx&STZ z0i9*HImGlEBhbV)mdPKiw+Jf!Y1t1l*}cW4-^P<+|NY5ZZOj=TY<_FAm1*-K`%@f1 zcCQDJo#efZr(VPp7Qzsk2TZXuEU-is7l(=~XrYQLLB*$_sn>;y&q+X4Zvhp*5dvm7 z!(|~Xc7})qxDc4(2PWAW-k^!^0*NzmFfbHPz7?@8z|z^2Atx~@H7DM{)YSsUG%|oQ zOj;K^ArX0ip)gjxVi zxDy}&2X-vOWJ5vW$roZIm6>G=j5IQ+4){b*SC`8p^pxN>O9MiEu diff --git a/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa8_2tg_4w_uhp.co b/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa8_2tg_4w_uhp.co index 77fa651c27cc1829e9bc82069a8ddda2961211a9..03a96abb4ce17847212ba007d27876bb1f4b3fea 100755 GIT binary patch delta 1762 zcmZ2+k+I<>;{**RhRlgt%jz!#Gk^h%W)Ohz8G#fV5MM}z@GrEY(ghGMg8>(m0oBI@ z!+slv&P!^Hj10CQT9J{##&%CL znC*zfP6V-cHCwsZOtNyZo#tmF(BN+?*c4zR(hz7X+VqRbr%oUMYG(-J|H&Uzh2%^x zs+BS@s56*eR0q-;Kw1MxYXWIaAU#RVX!1ER1p=xjACOVn%qOnLT<_1*x}Uv?Wxobn z%l-<6p8X&z1)<&)MDnH})SH6T^(HIWpOby0lnA(ZlA6-yJ}DC>0?IZYkdff57iLCy zMu7ftHuuD_vp>YX*}}krvHteYrv1RQ3C5~yP4tc|PI#guF0v-~ zN!e{?G5W?xARvGd_Jn=&9cvTD`qzI3d#p`TOV~WamY|=YG53u_@#fE#{h)|*Z?WmO 
z@nqP4hP8FSdk2u+XwzoX$guy(=A#~4iAV@LHXHb`a4m;UKc9fP>ibH0xJF?9n27h%Ps@4m^c_N zv`#*iyv@qh(vTr1F)1}C-oV1l8OAg+F@Z5H%q*cyLlZLt7}L<)d9sgS%4WILSBy*- zf+s5`i%ynLm*j$1AE}d_(#?b%+938iq(SHdoRhiJWhXCBm*j*Atq1G9m#)P0A#E~K gh8^bvh>IBtpoVfzPS23!49I|(08uWV&dJCC019x^;Q#;t delta 719 zcmZp;$++So;{**Rjns)+%j$plGk^h%W)Ohz8G#fV5dTPo@P9O-(ghGM0|y_J0oBI@ zB=UnI~UWmz^x6VKBK&Tw(H|D80>3 z#1)tYW^=dhXK!NJe~z_fe+5I&{>dHENqG=L(%wG z%YKka?kzU`Hl7Uo_pr9^ckckQ8*SQb8X5NA*&OS$m1(n)|1J*3JDcBz1KEe7{CMiu zL_mWRO7Vbcc7_H=RB>^rxQ7<1xDr&n1WmmzRD4A~s(K5kxJ3$>AqwW173TK$(WdW{xnXfyLy58HST<1Wh)7%Xr1e z^uvF0VwT*bYymEKdPDAEo?IM~A?41Ln5U($ZovEiLH$W4gsTaXQQ94>C!{%$C4p zwv={bj%noABF8*b!nsVNQ)ASzjLk{9X$bSjf-W-=aqRB;y~6R&o%_>E zn)mwozW2W0cYW`rcg%$0emmq0RP3j^Bf4IFPAqGq zW+1G@3ijiy4ujhC& z@G*Aq$z=zGpp=t25ZFumnt0tgMX%!YV@)M)duxfiboUy&*0;7a#s7vq&9|;J-9LtW z+qBE^o-5!45%K7<%$FSR1K^l&NsM}4kuaegRm&AbU1ox2XMB)R z`<2WR;W;)i0aXAL{he%{9Gz&^RYO>{z5|Q`~lTo6kzHAaltcB!gKG% zdPt2wEx7~b$v>e=K0fAW*q@XCl*jprote)_#OJd(+w$mH5bd@B)$0%Crcv7_D~;|& zeX5+JV*!TCWO2a0)BYl0d~rc$aWkto+CQ>40;Zq3>aJHxCC1)zB5CuH?%v;&M5Izn z%wF~t>PJM$>Ftzb`CkeNcgk?SCyt6#jwlp#j-QyDew3)ac7_ODKTBkIn8&8>!*QDA z)M(8&6TfeiQ}a~8rjt^4kBHX$o+Q=xN|_ZxU92CL3QK#EU_&Bh_|y#(x89p9^&rvM zBJ1d9TjU-6V2C~X!UuAXUfU-3=oh;jFsJ@rfZ>iW)OVPjI#cU=DOK2gUw(aCli^az zc3RSdE$Kjp;vWNF%~b8RZy%TO9U2}`#~;d!mhoUBu_H52qPw}NN0(4=2vVPqO3qfU zn(3@wIF?2>JOyXa`Wdhns&1-emdF0=pCO{hXF|!1xcZJ{BtLt1vb29nlsm=g)ert> zKI4nHd{Xbr&Brkp$;=y|9ILy3qYc>4m{uxP|_+M%sxh`gQ~Vph5fgA_w#iyn~-Q zut?yg8uZ)~^I4d;9OIs6Ca+vxRfCq5lZ?}z!@SNZ=2pyCE#MVzY}JJqYNk27gn76o zX38s-n73Dn`FhN|rX>)~Qw;`AO|@tD{{TCjXq&nKKoIlxdNB`S-ZRbqOUxakV*7oV z&%Pq&hqXna3k#+>m@%(AC=TSnyelB) zeH=$KK=nDYbw4`dHJQwyrs~ZNHS0`8{eJ|GO*Ok?ObS%85dR<19D>W4S>H%pT2%so4~ZR%9+0 zY)w{!uyc{wEd;ZeiUn#evpI1Dr^Q`ZjCHHKLddze%q1jhu{njESt=yzG`a-?o7LhX zzdL}6iG{j_^!NdE3!?R5@Ap3eyuTxUvuOs$(tSMN%G@PG)E)A wLquarD%sbG=Aq_J@{dlmNYi^L`Wx2rOAkBHOmguzXij`ySCpTEd>jb=12qr9m;e9( delta 2408 zcma);eM}o=9LMi@+R{>9x^>{tvB{7j&bPMo4c8zqEih*si;Ih95z%d-3|;?N7RI*F 
zVKIa)+HuCtj4=%hGx#=gg@+;GvW-k*$W&BBam<)$j241bNdTQb&-HnZUNv!-T%X^4 zp5OERJ>TD5t~d9yzHLe$EP%S+6B~P){Xxi)gN@t@>5>C@NFuzO&R*77Y`)j~AuDV2V!zE#w=sKo*e-{md9JVzMGcB_QkoS`JZ^Vjn4 z!OX<3J91UXE7${tsQ()c|L2BOK4_XWOf9yExE<2DLDRyY@!l;dOq;PnnQbTu?){6O z&L^@azPLM4;=JdJ+^-h{B9W;KxL$E>=0u;SRjh2o@ln@HuA`hd0QuSecEMCRtKxHW zCWO=-5p~D5PB0*x-gZ#UNBsni>cB(7XEmuR*<0B!?Haz#o0|LyZVB;M$Dj5RD>>-q z1@JTwcJ18*(f1>uDUEMVoj!GRNP< zGFz5pnc{9NV>~yMzW(+3fJmg6&oP6I5{p!fDeJrHunuybq^ zbsmEEr3w(S~--A&#xzUU3pekA3DE536gq#PUAD#UU;09s)xIxuks7Bd590O?VOE9THk-vcp z8kT03yP`2Fx)Kw=l8vhJpcTTUdAS7c#VwUMUIm>ncVQ+Vk}|AMlNriB)$7r0Ex=vz zJqllto)gk8(iJbq~!*qO1@gU)N-9+ z@{(IVa-~qqjc#9NB)Xq+Q;8JFKTgVn_ew#tJW)<3@=9bR1&>i4tXJ|UDIcs+@?DgN z6AbL5JnK??KCa(HCjy=E6CBq{`Hcfg9;Cc6A^#NR?uatKhjQ0fO5R5j6aGNGlz zbU|K&veh3_UTstIamsx+r9l3Y zXJjTFyd?#mYj5wTOmhvqz>C7AA&u-88}C zD7Uj*kDcf?lgegmg)HH*$?GgOR@veKYfQpskt6=|raPn$L;Kn}dOb%!eO9nf9CCyCUstXQ`NA*@&UJ^{!>|~>aW?cZ89zN2()Z)B Ll{iNeOO#M%k&T7@TpnmHGPey14oMk#i?b1WvNWC zWaOb;rCqrWW^gF5mC5x4x^kT$zqL=PDt((xK8a?(WYdQn-$Ap%j#Q;CZ_`-kA;)$! 
z8|q3|YP^t1y|y|L6R02T(B;rPxmA-t?x>3f`s8{RH?fq0aUn%8T*k;Fmimocd&g=PX8JGQH9jdi( zIN8#Tl+jwtz~|Nu+X{bwC!NE`sDH1ur86&A)L4Xvy+5Wjiho?t-p|$Ht^fV*LbiU& z@glpRy(_`kpv`&ZPG?Rl;PKAD%Yh9{Znv!~_jos!Ukdy<@Dh`AE6X=;uvyfj;{Kk` z&kVOYGPO;|SvQw`LPb4FV%t!@m1o}&_Ed)3rZ{eOO**rp^c%|9?mPF>Z1>{EJZ~>J zyUdvFtq)|5R#?rSHXusX-af1Lk-^T(S6a>-T&k{F&eo#Lnr=ILCVwHrqO^3Mv&x}3 zj`leU-=CQq@~0kH)R2pBZ?Vcj)iF?j8Z$*7SUmt#Jm@K;OdfVyT_IdSQ=t;PETZbO zGo3lD6f*9+@DpeLxC=J*Xg?}I%8AjBtv0PqePU^*Wh75Y%=Gfv(GSpFTONK8;2?PZ zaggW@>mc|77(q@ZW(v7we*F%plQ+;Mqz3cRtLQ#@%unzIWlrN7^g0%J4bW7+3Uq>?FRHZU%3C)+^sn5 zSj??Z8=pkuRue`SG*zrzQ)*GRWgY5eKl<-N?zi?bU>8%?)gE+=cka8@nbS%mx8#Ll z`j*J*k$x28t)A`pejHqmWc6F>0*vuJsIB4nP3HVxIFP+P+GprHdbmc0+B_N+13^8g63bv^Y6AAM{V?7)#~mBR%Q!BpUsVau5?kO zFK%MDEUu#NDz@rB3mJIrrEtzFwpK^ms_ONhb>_4x$dsqc=_y}t*WLv^NF95Jt+R}Y zhrPW-O?<(2@@n8&_Bf~Rxx@DIr&f-!ELyKR$Jt+6UKo4_j2OqA>U1F&M9K3%gwG?e z*!7iX5@p&anz^qsUF4bfzM^?G@X_sawwkmSczKNGGVrD~x{x%r5uy-Gqzg{~x8D^obFe^k9E#5O&SMY(nIr4fUi78^C{qyIR0G!G5j|uE}YN> z^T+>ykDM`)E-ckYfoIxPnwJ1SaFOOKfIEI`awGpEps2iRDs;gM{F*}-Ol1&w`E3*F z!Y1Ii|3>qzz&mfzycOyBd1mk~t!M{_h9$aSD)$3Fea|gO17CHTwm$-#*+ui$fsZA> zPAg6l#ltlJ8SrZ!ntuemp`PX=z-OMJ`R{;FZZx=g{(lC=%|_bc2JogGG@k~}7_Blp zWHg3n>@U*xHti1`c-RjlDV}7CK;bZYWd^DQUYVsFm>S^SCur^l&isq!^}tvDhUN`A zPx4IL`?TUIaG1V8^ButLMsLlHejj*roVI@f_}C{jPXfO=g}K>(9IgixmuieZ9gz&L zl0X-9fneqY@QGHMp9MZzK=TiQZ#G)4$I_WEIW-1l;u~&HoDg(mfmc zobUi`f1B0o=b4E(t>E+q@Juj4^REGaZyU`Y1m3ut=Bt3W?4|iS;G;h>4#R9|9Vp~3 z+93vfEJO1I@M~|<{CmI~PSJc9@bd$}$@xDIit7*Qg1MoCz^DE}2l@%{z9E|T1FyVH z^EZLZf1r8tG$Ayfs4IahL9g-Q1oq*`**$#8&V}Ofe$L5CPi~QHV=s1Ra7Y)E?ah)P z`T=`g5f_M}%R@4YLf8~V56LV_eiF^=3X@Y5{c%#L81a#c#8{N<((4vU3a=RPBuQqU zaUQs@-jGbvd800p&Ks9VI-kefr4*7$I?3%PWk_BhIZ#O!$aXh598A$?TV9a^(Jqd0zflm~<^92qay|BauT2N8_YbPltuDF$wL> zAt^>`4*4Tyh=`dZ6gT%1_PEI=!jg-WAC@C#j<8=;AFbkcvK;=ONj9^RTU#(j{y9>X xy}Xi>ii5K8a0r$g>oA`!c5)IMEYEIqa*wgo<=Gx>9d>3fYpbg|TTsod{y!p(xRC$= delta 4530 zcma)=du$Zf6^G}}KE`$&yx5z>X)zefF>9xoo!Ob$8S13$>;p(Fa41C%jfz1OCB$vq 
zcB`b6u*R{A<+!S08-}JvP=gXeiM3%+rlH^!m7_e$!$l}qjVf6vRS9(x7aWB&F6una zz3c0LT1)FW`<-*o<9F|jMmun&G&fp05Mpx+_cpwenVex6`ww54fb*3>*2Qmqrpi1H zm|8|(JKQiP0P%R8o$dcw2kWzI(fpeKUBl=?Zmb8tkzRWheVY#J=pqof^Y0 zcKYy`UZ2r9y0h5qU(1G2_WbVxWf!IkQ%CM{r#6J4oX^VD7ht#k8|SekYc#~S*3r3g zC5!(&-JQ3y;ITJYdoX+KdgHN6Z~N_m*A!=~yS+y9?S4)l{3Dvl2CjSAdcV2wEGj`p z^~a}uwhV66k>=ZF>~9oXh8H||dh_O8=rUJ)wb~uOUB6I)ocZq{Gk7l&yyi8;UMn|O zEk(cPSg8p2ZjRKk7m-;TwL%I~m@P4MI9R5CR#4!n08wBU(g$WxCzbcPXY9Q1eS!1Z znwY1aMVDuZVrRWxZp>T=A-(N8sDJTGpLOT$8d#sL<+MIw^PvMM$=L$!s6t`0>a6|E z5~6Mi+xLhqz^BNVdb$Sbr$0qki4-TN(7ck8_=Bz#L-#lb-aZRCzpcWST6U}i_i4}H z>7O*gfwU0Ac}Mt9y$iO2nMB|UDm)U@vcqzC?EsXKj~wRYVaeS6SU-eyk(S<7#ohdyL> ze8?_$?hDt|&6x`hvV$J8b&T!j%!wk;)I6{Kh%(3S^Q=Ib6VJi(3NEG2m1EXd*blnG z-+hkBF=sxfc@1#y6NOmYS_*tDPICqLz=MvFAzn9*V>-eFImR02KgW#DN%T1I-X=PJ zEAZSZnm+^l-SxA%ReuLKI(|Ca!7x2|!ILx}0G?>0`EP+ww$uC-;4O0$eZ%G%hGTYq zNG~`83v$CWe+T&ZZkk^PuI!=t-+N3Hw?{VjM49D@hx1$Th=zDo0A=O}PY z;1tb+z-P|Vd=c>U`-Hpo8^F1-_%w1AOM1#fkow-hJTsXp#FyiMQY-;NN(|F)(Hh1J7-w`R{>;yfl9k zcyCF8JN3_jBONR_>?-GhPgK$TD)44^FGxezfFGUH&?mr0B|6V-)|sDUa^IpI9!CI< zS+JVs^MLoRqxt>71Dk1H3w&lf&D~daj(IZeZU&iPIV`yHJiQnvVb%Z_)fc;4OD(eg$}Yv15=%TkMw*w#dUwEz`Red(4ciG3}eTe7}8DB+g4< zM#WeP%y3fVv8hBvYzhJ~BT^hwB|>6lS-@14NkBY`DKDxH%~+Lrp5Smw!YOzmI;%BY zPDt{&H9<(?#C%kfp&YMU1r)hNhwl8Ohs`MkyIGh5W`9s)`>z2iy}%Ap4nQ4 zRTC8rXAm_3waC{s!0(!nt{ZU6vs+Ps({N$nu^Owi4-muwX}j|vdR;a7qC>a z)iDl`t_=#95P9j9Ixv^*hs~wA}ShiR~#{Fn~1dp$YVH}{z2@;^HnmMn|vxW6o zKc(nD33-;5kl#$o^y49qT;3LT9}aB+_je>gxP5K; Kntr3+Q}jN48 z|6=m169{GeujcO~17=5p!~*IBVj2Id2l~l?#Q7N-_S-OYUQ%OZWUvL%ii`|4wtJev zY)2$^B8a`K*~-Ocl9h|?G(8)E27Oz>CIcIh218rXre91xbpioUJ3|=%PhKM?Bxib2 zt(1X5ox${?I*`@?(i%Wo6G&?U>B#~fij(h(84^~ev{_Hwin;zbSL=TECYJqmtS$R1 z7<%@D>=cChQxM6Yf>3`7QrDlXV2@6&kWwP-V#UonrCgZ_tJ|@8i`*a9`h83Y9|_Pu z<|fYi=2A5w!e2_8<20tR)wlj?+7C>nU|h}GMDJMRgeOm8W6e{axL9*B z2F9Ah-ysugvn+{=wRbj?*c6+8wd@DQoO_E+zl|ru{tVXE{q7w=cB4(3O(VnpO`8`w z7qSpYFq;KDEZ7(~ZT9pds1#TbC`|^Ym(2yi7dYz`!lC6Ul;Q!?>i0|Ud!AL8^je~43HV!Giwd18XxWP?NjE_f{xKiMnMOxB|T zVy{O6gxfs&6WoM 
D>?_C8 delta 769 zcmeA;%edhv;{**RgZPPB%jy}t7$5*f3qZJxK#C2-jDheOYEbC{D3^f+%7E%)0&pHWbWupOJ57`p^H4l+#IcaUM)zReoKz08xB+Q?3RWMeRSnz+K`Nnv`MStN9r z1>Ct>_p>*#>{nrJ*TJwFleSGZP+-}t;HbemIl$nG$o*eU`xzJ-z*v#B3B}aSd`2!H6F0GOiQBB4&5Rgw5|dJM;tkAA3}H+oLl+p++}IV$G&FRA zi@F&aZ=RLp$Hc_oHJLF{bn^ccNiKNmikYmGY9>?wO>70w#5aL?^6M1Y$?d6cNm6#M_CqGKH;}n3nmZ1P@DCcDNG)c||sOV~#C?f*^c@3*b diff --git a/hsa/gfx950/pa/pa_bf16_pertokenInt8_gqa8_1tg_4w_mtp_msk0.co b/hsa/gfx950/pa/pa_bf16_pertokenInt8_gqa8_1tg_4w_mtp_msk0.co index 91a31c9482b993d65bfd71c4e145806a0b8a17e8..54f2dd5d5dc600e983d62f5741198b4bda8a65ed 100755 GIT binary patch delta 4881 zcmds*e@q)?7{}lDdIyvt<3|~|AY zO3`5);x2NU{WG^Ymq@nB&CzJX5wk?&k7S!H)&#R=;g3ZzAu%D!V(Q)Xy_Um2ju|B| z5_q2bywCIg;CuaY$B7^Dd%xjBXUBd+`x*h`YOO7B%x&$Ik6L~cn-=&O45@nzq zLka2=FPGAX`k7Xkt5;ux3&Lx8#c^X&xcTcSjZV48|J182#k6X#^jWM+KWZL z7SKEi25wbA2%KWv2m$Gg^hvpULEb6mpZ9I^kf*kJoWn*khu-4M4Vp+DZFUv}=drfq z`98E}JB%|kg?T0Rn-wnuU;y@;1|e1ou~LXtLaY)Zs5Blx%^dd^s#2BQpe=fhetPD z$0-|@I#B-d3&-3k+@Q8qK`SnMb33L|#|6r0g8JZokQeS?>H9f6xZGlClXvA(#ENw< zSuD=AMNsRTzgC4>EjD$v&NzAqJ$paIi;6Xc$alz{fX~MJ`KA*>-b=ng_5z;3xH@uD zRr#FyQUVf;d*Ct`^OUKTGl8-&7ipSK!*4r+Jm+JJAQx#p2_G`w$~s-Dp+v6&;4OaV zP$sWglkl)0k-CZgXAjH$hgmP>W8v~2)Tn&enOHE&1wO(FhJGeSO{^9Z%Y@AAzz7%U z98@!C&NntC9xjWS?21yo^C?%Oh!yW@4OMX5(=kh~l&kW6NK6_C0?^MF$rXQhV3!XYx@I%@lcU{f5 z1zVfQrrdKWJL9!0+s@9#x*tff(QgI8N`&Vc`88n$3ugRDU9j1yNcP71xezakCL7rM znMwm|=Dz>+Pce-K8hf&CqVmg?< zq~wK&&n)74#3QZB`f9{W>y*3}@ncm|LM2BV(SUAILkr>{kSvvwy@=0rDEYI9-*6`0 z|K-sa5g&O^CSnxUe+>;>N$o`<{fJ-emXhR;A;ixwsy~f*_>*L*lw3f3;*64iDqd0` z9ZaSvcN&rE(n0i&vf(SlZ-kZn7UJJ6s{a{rmsvUgJ;aNOAuukS>UT77zDC(_SiHnC zKy(*-_i=cQ@_M}BQ0M*whxS=4-mR&$0i~t}i(FuGG}g-0V&5*Ojz+tjdTk~-^;qQ@ zY>oySqUJ`oOwBcxMuAe|%4W0MEmzc7Qk$^IYnZ(?^2F9!ue_YaYjvPDi@io(L!eef zZPu;Hw(T~#)@B!$*!NV6y|er1fwAEyU>k--*R#J=Kx1Z9{zp?at2aPL?v9qkZ*PY& z5yN6x3Hzo2KB|@3W%k2Ouz(#mKt1+{k-ckxby^z9bR;nruYg*%sZwZ~MFX!$6G8Ad Dhpa~c delta 1948 zcma)-e@q)?7{}lDdPk4BvPwFg5fIOfG;U(=dhNQ{;Ao3XW`K~fL?p7!L28`ZKM3jO 
zLK(J%Ai>%c<#?U7G@xmZ_X0`G9$h=rS;BBLP`Q z{m*q|J!*pwa{s*t9)_7p_7+Gj+yave=!E0TNBb%g^4nm3!G99Smu zPw@I)hC=UmR--$e?Kp69jY4!)*!~!skgdbCY^URL6pYJgy^6y^bw7nt&RRUdPtj<6 zf10&YB%SQ!lmaeXUl*cq$~nk5HnxUk0EV=$jULWdL8JL~EO4_|(xBtwA)w zPvW6qn5LA(LUIOWecgEJ=!XzpZ>drC%_zy71z&92rl<`OU+9=FK7uZby=|9n-^mo& zKht&AqBUL7f15$sSF$WBwLj4-TJUmsq0I5qy@LG%Oo{zdFZJmMSiB0 z!9h#WWw$D4Fm;Twe_g5D4VM?}BB6Q*C%^ekZ*=D7)e06H4+FVvRA09!UEd9d@c6Gv zx0SgO8%{3zpyS$R!F+Fd!8}f7SM&yFZuZ%5)v^fLT9>1~~ zm(@~+>rv%aif=uDS2zErwJRAMOEEa|V;0WM?t|l26mGsKB_DusD!?cYz}TG9!cXsEcTJR#_;t4Z9OL#=k|A~z3n}oMd7(^YF{V$Nj{Q2Sp013jYdd)mZ z_-u*(nxVg8)=v>0x?<)ZtJ4^;g7lU-F|SRnp!Tkre?j=rteM{-+)<+ch47^V=99<~ z{=FT7`r=X>BoR7fUhuj)rC1ORNC6&ZVogm)z`4_Bde8L;JTDYNk%X>B(Ga-2?w~;h zAs>3(c|nudXb6I0h=ig$WN7)ukWrWSG(M*YV(QMk%XgTF6>NwFmzbxaBRmnfy@F9# z2nKzG3U05<2!lZn2}M`H5V!+}y#^ILJm~3a^_+gSzjtm^fE^SS-6OSlVEwMB@yg?* zVGs0H=EyupUWxl`(s2*0mVWoZN6HQOkp!QHd!-sLnJ>v%M zV@cLgQ(TAF^C4Jd(awS5@LX}}IOsnwhL4SdBeD#{*#pIFacnj_JH~rWm*=(=b_-n~ zb)IboLTyF2kifdL0$FAW6t^0VKcOOfO4h>hsJ_KxCs!P@zCg__aeWYt?fbCxzs5`QlHO7|I6~(QrDQ2si6@Nml*;5k6 zIU{hK_oj>IIj&Z{2S7D&T&))6B2g|9WsN9nM9GKKc7CzBT+1z5X+@ppg?ZdB)4GNH zxK7xvO9&mHSCDd=aGj0p z={SbU7FF`%o5$1XGL2I`5l%xiwm`Ea6s?Tuxlfe2D0A6~u5b%ElC zBW>L81)1r!(~#XZIX%51%^sb`&4Btgqzs%hFI!RWjxK&|K@A$%T&!8s*{v50+~LRB z@wLP7l8i6i&O4gbjlL^(?DDf*@0c{Y_+IAcsdQ|ROBv?#j}wB_G%~%~4C?8{pxfQ6 z4I2A-C{_T!)$^kVPeTUn@7MI_&u+ajR8R?T=}_OX8+nwk3T;1LpZCf=e90p$ine^U zoZ^u@V}9G9RUv6vVmcaoz(9_ElIwlL<7@zhFOF!9=)p&ajFxQ=6N1z} zGR}62PY)9x#vBIT!|ogM8lZ0~L+ySMC9XKozmE@Jj1BEBgK4d6Oh((-Jg##y`Qw|B z1+vHo(?-;M4(7&5+=6*3r1pOU^Ky@xyD;ylkrhs{`mw<@X+aa_AgZ{^%6iPlTGV_C z=EjYgKl;SF3-j_X2$7_i|3hrp-8InwKsV;8ZED_!xhoS!dg*F)!#;TxI1O z>DL7E!Jj>9Ln?W<5zgU$lxaS=bxv)*hIzr)YJMB@)-TmO^JXI-xX!5g-`GD@uD(#u zGn@mhf2kKtm;J=8aY@ZhnCq^nc^T&B>uO$&`J0pCJ20=gEBR-42Jm8o_YZZTm6&h& zN6nit@1SHwF^aD_#7{oxEmZS1%m-edz@_{jU_)%p1cS)qn4fm5`F_k36>5GIbF)p& zPhh^zqvmJCL#K46V=L5#@9~0r>(rcOH`X*@eR631iRf&;?hAU$wt#$K=E)fO8IWT64=i; zabTJbC^>>IQXOr#k(z0T!%ci^14=rdotzIHXdvY?zD9YOM=YOlG>~YFwSi=2Xcx)M 
zI0K66_9?!BS~vf!sC@{ngbyHPW|PNP!NoJo g6Essjq2z`p=%HNUk0ftOax{{h8-kVp0f8im7XSbN delta 2535 zcma)-e{2(F7{}kc{)A1^%`)Z>#_V8L9LwGHR=QCK-L)GN6owNsY#1nzX_iRLf@Y}g zYHJoVbnYe{xR9xYs3CLm4pzsq8xzzhCa@47L=z3f5E5_(i~?iv?)tn}-tmVvY2WAl zJkRre-sgFHZQo(b*gi|An;l#A!Qumv6vrU#MONg{w@9N_jZ5HvkzTi_!_dlXFd&cp zpLvikuVHdr|6K#em?OFNW2kHN7}`B5*E_#|Xxp5G@-`Y7^{+?DvqcFd4t9M}Lf*M< z5f0DY!^$V3^JTV;lh5WC%3bbK9DKlSaxAbX6kE)!ILvY7b=0BAlMlPFFu#SBCmKs} zFW+O9I~ESwZ7jIQqApcrBMukmvAE0qmc@Dfc1%Gis=x0u#|#zOb#WSV1v?l}T5Yla zeJ7)W*|x_JxB1`kw;|;YBUkL| zm%{~N{OgJzbX~cZZ^u!o$;{qi@Rgb~tlHTT6tKU3c`}-s9^_OpZsh z)4e;bRPXAs`kv1<!Sf%0*-NbIWxqSLDTk@jJN-R}Su*tbW6V$KHly7?n{n^C&rCy;P7=_v7Y8zfDb&A{;eV2SAZY?kPvNE>$id<`sUpZgc88Htu*fhp8qnCsICA4q5FJMCkZ9%e(SqXyCqC-A~!G+zMR`J=(p^%sL<|BM18ZO zUj;n>8qL=O&;Ezzn;E^n4Rx6H2*cY93DG$;e+_tH4b5Y~n@efF3;31?Y2E|;d}I;r z_!J!Z9-8+9@2I5t*TCaX(cGi`g|c>(f`44n*>N>2*JMMeC{2=U>zSTNC{&J`H@~p8 zxl!;0t1@N~OfDdj0M6@?2o(gw^m>edkQg8Vl1LOp$%y5pDjouOZ-7(Hh*p{Bd_;}s zC9e@tP9%6y3K65&3UiOEO2q_W@@NDX1B z&qwkCT$vHnL{iz~^BQI#L{2G|b9#wNwS7@A2JZ<5Nez*! zB9jnB&M>_pq9z6cygZe&edzw@n5zt%;*x9wQ#zYmdrC@HG%(&=_m=e6!yQbo=d+SA gm9y?(a=3x1WX}hax3tw!o2=NNuF=|L>jtLyUomz{lK=n! 
diff --git a/hsa/gfx950/pa/pa_bf16_pertokenInt8_gqa8_2tg_4w.co b/hsa/gfx950/pa/pa_bf16_pertokenInt8_gqa8_2tg_4w.co index 8642f9d4b84f2d9b68d01095004cf667cafe1091..4113d925b368c69cf837dcb99b184f7e39091e34 100755 GIT binary patch delta 1745 zcmeycobkkZ#t9ls9w8I8meotxGk^h%W)Ohz8G#fV5K9C?_!5PvbOD6R@PZG@fa+rc za*5Fg)j!#vQBaAn9h;jNy98zaGfdhCG``_K!!$7M(qz*oWFyodW+T?HSwy&$xjvBb z|1T!LI)PBe|7!j|GGKNjNGzaEAeQmJdZ3>SNSvRcVZRMS=Or~pMh06Dt;on=W4osr z%yvX#CxY0!nyp-HCRw@IPV=)7Xz;fcYznXuX$Z6xZTiLJQzsArwKIhA|KurRLUN`T z)k+x{)EP`Kssm{aAguwUHG#Ayke>WOMse~{F+;-2lqR2Z<=d1*z*-Rxl*A_1k`Q*AXCFyZ^DG| zjR5^)ZgRVrJOOutB5U(jC4&9|ssqK=w9N-pPq5U_{N1!4m`cGoou!H1F~$i`p2Wu3 zT`ht>+-#r=jImRHheV8hH6rLASd48lJHnzg{ddcLP^7uH*!0_YGVJGLY2EMM0c1DY zwAnN=>}O@#{LJbKp+pTda>wQZy9^d4p!!=*#Hd%=e9mU$%^cleOgAG(7}LebbaJ7f?q>Y}MJ6T*`^gsrE!!TEkSpY7&qmCL0P0PyP@j!PF5rnJd_i$s%a77m(5jp4=5|$2lVyq=tcE ICD3en0Ms|d+yDRo delta 774 zcmX@Hp7FzS#t9ls1pyPameo60Gk^h%W)Ohz8G#fV5Igun_ztHY|o0}NB1UU{eOxkylVcNdU8p5T_lLf_PCpYOTOwJQGnEc2>Z}S^* z4Q2s(j@JF`O)UEbSX%a1F!b!7?4YXzR28?`LPm^fa*^Y<$sg2NHeWE{VVP{AyoH0E z395`o9Hb0rn3>&X7F98(&5taOu}sq0A@ccm(|!hq1~BGjX@VQNxkMkY~v-vQGdn@8pcqH2TZdw%&RXrYQLLB)H})aydUFGQiL zw}6V@2?8^O;j#-rEG7;Hhuq1pg0@*X8XGg@BqpWi#2c6ynZcMwhH$2t5nO?*u?b9p zv$6T)LP6cl>>-LwOb*tQ9fL$C+lNVV!IPNpb+3;A diff --git a/hsa/gfx950/pa/pa_fp16_noquant_gqa16_1tg_4w.co b/hsa/gfx950/pa/pa_fp16_noquant_gqa16_1tg_4w.co index ffa2f8f171757e2163baa2a9eecd81e6e50d2f32..00fc842731ad2479baf6550e8623c46a85fccbfe 100755 GIT binary patch delta 2350 zcmZ3nfbqf_#t9ls7lI~gEvxUaVE_Xd%^(2bGXg0#Anx#o@H_HR=>iCsp#n<7^f3YX z#OQls6^O~%}tD1LNfmuChY?n-|(Md8klxzvgs4D5o!>#5o_4|MVOJfK9KSM zFDAb_fgr~JYW_YlV0I`-ETB#xjPbvEpq~s}JQ6A%1rZ0UiA7?EBh>RVH0-xw=)9!H z$jD#|q7@k#Y;5;5gV~MGnigf2htiqS_4RH0%=VkJy}3Jh>(&5aW66KrcB}=z^le<_F@9 zjIdAKA|hZ$QxY(r zp`|1ZzV*MG_5({NFb-tdyhsxfV52DsmXfai-E3!w2$<27gqf05*w+4T*}wkx=6e=( z0-F;&9T@8+ptUfpvgH9Y*cndP!bBjnI8?ks2UT1ND!vCzy)IOo2hDs7sQ89hFhd(m zf(U0Yi=81S3@!v_EC!KG91I=#lfAsRxjGq`G2|pBrRKyNm^m52m`0|qFs7NK8;t2> 
zU<_lrIy%Fc&Mr=q^L#Wn=lMu6F?HBXz9=X%d7hsX7rY(fKlzZKncxp-v*ZWV0~}C= uT$9!PB{^ZD>%pS2{>n@%0wy>4+i`9PggCzduKYPrxk3;`bPY_DkpTej&TUx$ delta 730 zcmcbxhH=FL#t9lsD|{wuEvt_)V*mpf%^(2bGXg0#AWrav@Dnmo=>iCsVF5@H2tf5Q z0hz?;gX*8`&nT!w*pAIjj9Ee)2N@>qJIF9?-)0UGM&`+D#bqbI5)YUhCE+o7mxRLR zM-tN+C(BC9PL7fen0!jwW3rZv!sH+sDXwMgt^3)VSoU9H-drZLmU(T(DhH!RpT@K)3l#~p#hA~GjBeoi^B}Df1A_Hu$rO7rvIm9zu}+FT=sPWETGWZ z+~M!QSpUWY#6|!fFq55Oi6ue|#1e;!$7rF7D?!C)psCk|ia$Uz-vTP$5)Ni)gGmtK z3}&%2I0V3jz>LKpl8J*MA#-w3;5JuFb0>zJ#H7@mcmq>MOBmD8#T>>obu@!9EzK=p zOcx_V7}LqrU~_JeGZRyc*<`~Ixycto1i0Wy%X9Lp5Hq0*(3Exon(_?TCL4vCFyg%^(2bGXg0#Abt@7;lC(Br3)Zj1_^E`1FDY+ z$R$P}RR3guMnNUQc5H59%o39M&oF5p(D;V`4Aa20OOs8Xkd07-n2lJ&<}boOnd$=> z|Nmn0s}l%f{IBNkBLilKg2V#q1i~2qs|WhYz{Mk>;!zNBu$ovTb~r*kKSRTQ8-~tH zYK)8wwjf%Ok-^4xPcxY9h+r$IC4xA+nyp-HCRw@IPV=)7Xz;fcYznXuX$Z6xZJK;X zK}pW^qFN~fgF1uhMRg#p0i-p6v?h?&1k#fih)YaX5Vs;mS%J8b7vFr3K>C~giMGeks-r1d77jFaiPOC`I)2!?humU z0eJ-;@H$d5TtZx}``MdV_It5x4wC9;g9Y1Yf=ov*GCKaa5w-gb9G@7pHWbWupOJ57_)>p4l+#IcaUM)zResWKba;k6_=g-NIYP2kc7wNO%e)| z|Cx$y{vgrFI9XI$c5;w(z~n>H9+Q=16ecI>icL<>q5mlJW$nJK*hhrfEmtk zSqO`rVU9Ok2+V-EkDcL2D604x5SNLAVMpHNS7F-%99@kVauSnLbK(t5EnQ$tLl;vR z)6~)t#Zj*OT47-!QzuK`ocJlBv1bJx)H$vl|HWi%ze!iik6POMAH_tw5q zU++j~Cc)A*aR_97*q6>s5u4wimUs89!o@uonM6O6zr{*%=Kzy;_s1HreMDgcapw?| z_dF&wsC=04kpjnk#x(Zh#s0fNG9!XBD{h7XVWn(jfEZqy#CAu~PPmUMmCq*+M#Z^< z(eV6(q9b`A>`d`uZPFj!le)#m;!X)|4zzyfJ`I2yq*vVIEWjHbL<=JBLEMA52XnOX zCz!k!aWCRt#J!jg#+{=~-iNpkaUbHoaaMk@u|8%nGjfOF%*Zu{Gb2~gndNL_lhKrl z*NircJZLy2@+O@UN(04EahgzZBvf2Us5lZTP7^9l6Dsb%3Kh3QsJI5V8a6ZGbXVPSOE@X(;utWM`D{CtLAOKveaHg8@tTQO z8Fx3Fc-L>k5g23S}f?$W8a z!$_+*chD+|0#~()DJ%c3)Bj(x@`|k>D+d)>iA0Jx^oS}_%uFliOvAT0uc9q4t4{-_ ztkgxUTxI1|jjZffWF=0#KdPYPK(BdxQW3`|IQzaY2u!#d4)El2B5kcNk? 
zh1VNsbDHo^&r?z;X9<5UuIFzNe%H93FAzRj(*GlcCx8{aU$Vdw86drl4fW4ggxgS;3Db3)B+6oY(dS zU_aqc{i<>|wtS@FQWZAT);E0HK+RyJgPIYZ_Vx1uotKYvw$lz>t+Yd+-H&ES6jqK1-K2*<7^N%wg-~0O zhNu}4I;q(eZKY;ZU0LA$0--{8H{F^L>IzcokE$K`P#66_P-nVape>+oGtf>q9|#HU zKxUjg%>CTVveKG-;g@|fpL5jEw|AsXxhEIovgBo+EP2W2uuY!J1=;MT{B`aSdumIr S*+9j%+=Ggrw&k-M_x%N2MSorZ delta 4443 zcmdUzU1$_n6vywKth>pYhV0ajNKz{4q=})`nc2<9G*ZoOc2WFV#H1#~?+*fgBmqGY z&4!pQK?KtbtG2#X2&FGoF2)zhR){FR)CAhMpwLtzZ9y^MMx*PUnRCaTyASqpIxu_B z{hf2}{qNy&c4mhLJlCfShHC`*@9%0y6LXWuyx53JtvRNHm)@BfYo75KgwA2?u`nuJ z3!B zRV4MM)@d@a^m`nP<7_obPE{Wh&?_O{7ubQbHFREM&bRjkcH- zDbbyoOGijryHfG2&wfIO(U=yh)~b-79s{=}rMDqp3GYAPSEDQQ|5NF&s5 z-ggVg;o}{cY+h5h$mPW?e@pu6U0z(tt1S$sn-)mPy4q#J$tvT>peMCw_eQs$)LBf%4qeM@RxWD;N=_U2&1eJpirw>DWwzI4MhBorVDG8?BjBpy zO5QcJ4JZrb=fthFGD zvb&Bf-az*KBo*@_T~t@&fuMK({R6>8FYN@hwvTVQzP4lJ#)QjjD|x%;2h+XtqzkOn36-zz?RJyb}21txo}8l`{N$)`{!+B8dJd0Whb8oKJ?JZXMwLDGxpE9 zUVtNGeHGjD7I=sz+tixDl$&E3GwZ@3W-8GLGwZ?%GqnagHmb38<>p|N z1wKbtN0v%f4dGy0F& zda-MfS$ov=msR>Je*H-HlavpyC}QXrBR91cOG4R`)PDSRQQ1=~wg(nnoVtfs_=kXA J!^R32y#r3-{`&v` diff --git a/hsa/gfx950/pa/pa_fp16_noquant_gqa8_1tg_4w_mtp_msk1.co b/hsa/gfx950/pa/pa_fp16_noquant_gqa8_1tg_4w_mtp_msk1.co index c28403522018f434f31c9bf25f7d53b024255e32..fe1bc50a145f51c59614b519536aee0631884fc3 100755 GIT binary patch delta 10002 zcmeI2e{2&~9Khc@x|Pkr+RFwErnp+Nt^(d&*I%U5t?dR;-1spN5MfgYkZgzvghp|i#~&Dw#7Izc5DMZ7iHV}9NcaO6jD~13kr;?3EQ46z-F@G+Z+6jth--3} zH23Z2`+eW{?)%;M-uJrO^PXw=ys6*8ZLWIVc_ebY03+p~XJk>oB83)`PjZH)Px{#N zHq8tzGJ^o)xc@i~#>ZQ*)%Ndm$T2*?SEkYaLE=7|Mg!_O91Fb02l(i;z_e)m!LkFh zW?6bu-RR6@<-)w;>c2*9_5yFS(o8-_0r}_RbHX6 zWnI>lIPF-zq*CXrfv?fHYV%B%{8Yxi6h%`3wD(dq#;8s?S78*WJJYSo4H-F7-L8B& z-Mt_j*uNlLH!we7i9S+ijR}FWs90AX8|9k2tV_t^ko&vp9SBvU-b>Y;98wqBN+u-S zLAZl(hsw#qSK)LG;WdQU5MHD5)-LN>oOTlKB-}~3vy+SOteVxF!!rp-PR}GXIX#n5 zQP1(SRkmC%m8j)fEQw%Fmn53HON(uW-2cXQ9KX4I3QZZGl`?X(5Q+Hx1vMZ{eg(mr%8l&d&W`nciA-VGfQgeqTqvqoIvB#a=WG_5< zrzdslj(pAMoW@-hoh7bj~dR@Xiw_gsC<3b{4#F&YGR_zze8B5F$`JEG) 
z+!^&Zq3e(QqB&*y?M%D*t9&bYxzyO}^5j2ucDz^q^n4~^Mzu}wYO@!V-|k6Gm-h{Q z^{`3#U;fEeg}5F^0vnYtnWID%&8Y2z3{#}(z=w~r+y=a41Dz+1Hy>-IkYzu2cE zRTu`o`+a5ptm6+*Y&e(&iTD?A$6G8fQUhj2^@mtq2K>M=md^%0a#!ct{7z6T`d(M4 zf}1)z!SZ_G$t2640N(!z%U1#qjPc)&)%ne6&zJ0g4j9n(HOqGbzmd5D)gT6Z^HtV< zAMpAgS$+VxFY-I9I0OpIEtVexE|6DEvX4{1D@$4a8StOgH+(W9z6GA-H2XDFH3W(g zUQlUJzX#Zxf|q zKb=mhZJ-8{+oKzJBz?MYCanHOeOQB^MuPZNE-s=5zR^uDNovv;;gUksvCHM7j$KVY zdZDhM;G?Vbdg&#(yp8&>V3RI}yn5gTpM=)0X?C}7*}gGeWOc;n@a~f%Q}InaiM)Q6 z_|1an73=Sr%}Cg+*uy*g+Mki5LOWk6RP63gykf-_MHGIRc*%^*6N6T48 zY;4rVwQE_L*3?I7iAk%?#DKA~G^r8lt7#e&jgJ~Nq9F#;CT)Zi@6Mci;SL`(@ymR0 zlimBjzjMw#XU@#tot>HK{Cy+&DW_1d=GEHc@%Af7d(jcu^*JVkg}mD>IYq`>`LR!< z2#wl6K%79P|0WLN)7x-K|S(c;?6ddJy?;< z4uZZ~kxVzV)~NQi_X|ohUfhC`7q>hipudFX{*o5F=p=KeHs9V~(vBBv$h=0IxAd3n zz>Dc-M`iu8knM($RzpY|37Om_zt`NobyxxmgV}}!wfOGZ zb;5-@^^VrP|8D_g<7?^v4_iP2wt&fewYD=}VD8Hlcc~*g2UZkmsvV#xJioL9e5C$a zw+*WUPfJ#lclM}ya*w$?>#zjp2eaXMwf=?rQsMKB>O+TL$~SLU$t}Rw*jS!7&ofs|dmI5wPldV8`_kdg8e+VX$`ZY zg(04o0&jVX=eGbq@;l>t{#r0xoni(}c!1w`g6CU-_rAmPM&OZ?JZ}R&V_3f%Yx%9n z{~;gH3jy{Cp6>>pS~>#FApyMfGv0qc@YyeTJ`DVF{9E4eHW zTwq8O=Ya3~gXdoY-!Z524r=}e3`dYAC`P{k?-O`F2YgiI`NO#f96|wk9+J7Y0>2-3 z#atfL+xUA++zn3{g{cN$Yn-(F(gCTmLQpDw= zIbsnnNJWa$rpN&wJq}rtXeVSTLNm*9K!z~l8=}(dqM2oHnAStq&B=JK@p8~j)5$@0 z9CDO3Q;x}W2_+b#zKYjJ8?VSwg%+%MXu*olyCfqK>JW;CP*+Ejr+e2kJ!xBsGhHou z#@aXIIV`ka$^5t;ug!E-;q|N8BQr}TQH8zY6g)zkf=6n8Tc)uJZ_NBkoNW}ot1nRD jz?H(Y)=bEO@3J`y^=q(?EpE%4aF7hcVAv&SD`E6Ek{(sY diff --git a/hsa/gfx950/pa/pa_fp16_pertokenFp8_gqa16_1tg_4w_mtp_msk0.co b/hsa/gfx950/pa/pa_fp16_pertokenFp8_gqa16_1tg_4w_mtp_msk0.co index 233ffd5e7126883c6e3d90b684df5eeb9a568a15..914085792ed96c7e37aaffe9cb0f4cb05f8ee294 100755 GIT binary patch delta 4436 zcmds5eM}Q)7=PavXelXL`5Fi+5T}f(X-f+Q57*L)pTTa5Q!8qjCd{x|9aN``)>6ji z7OmXimT58*kugUW^&+Rlcr4o@*)oShj9E70mdp@0=Q3ja%EsQiz6Tut-07cQQl8)a z-skh~d7k9xzpFbns*B_yW5eP6&Ok>s0>ULpU=VKs#6p9AUG>rPt~R;!Mb*H%1OkYi zpryu+*wd@vv|QyFYzX!wWsQQ?i`ac|6ts)d6`~F%QAxoxDlJIcQ@)wLc3Lyv1TI{p 
z)&kyRZ05rRmshbfQ$Iz4m*MCm6lye7!cluuFiXVi1uooZsDh)8=1`WvPX$4q0`Fh7 zLkLO*nF9e;dZ9_gjtST+;NGSZ7u8zgDsA6L8G==%(?Xl5l;Gyl)X*aoY&5t@%r$V7 z4tkPiTCdpa0dN586$h4OSe9Yw#L|f+9r3)Ppu*nt*kkCLOug<$T{Eu@#qz`SJDD#i zSYgMx^hxgxwj*l}Qqpzj<+2sqO5*75`Fh$odr0#Z3iEpa{ibKOg6Z*rU67qOM==EJ z&a-Q#W}g_r_S-U1^|~t~iPgBD$pa z0P9=%WsHJ7`}ajl9P6%3kBQOF3emGt&(rR~e%d$m9o@5me`Z>LPn_-$o^qk>`)RLN zP2004yYyZK!HUSznvV%-tvKFsJx_m>05_+WR_QI}_XcHha4B{1deIgG6|nE$&q`!5!U^ z9CK%$*vzbZ@a(g)zh-QnC%Lr+%>_vc)Qt2zEq zZ8d%OR9BOi%+;o#F34;#Kz+`#kuWb5C$;^tDe|_j;O>;c;y9%@N7g#$o`D)tnaDn< z=V}(B2H3s@#-a)>x4je{fLz226*oMW?L=$RIP9W7K4z#y8jEa5_YJPYi~cgRp#=^; zBb(lL?PD!xj6fUGt;&f`N*A-U6{=aA!lZ=NEF@t_re|qZ$E$264v$8sl4Cp;S;^d% z@8N7+qUY}TaGYQc52;HZ&xY#sZ)I>>N1z31zQ9{r)%NG`M5rtxn254vI!^?Fu~R2G_<52+1>xgs##BYHp6~-z za(*k}Ba;f)LHISj6ZLWYz2v~oOgsRfnef?lay~+M%Ow3Hgzx)RuHQ-c^viPov~U{$ zTCjA|BfUTj!t3OZ`V!$!Jum045`N$x5sdx$h43Ad^oI#=pdpSw;Tja~3J5ehP300SDsV{9Y&A=;sGx{IlgTp1E-teW z7W;*ZKNs~(wI3BGrME|4Lb^XHhjq-0el&x*0oXJL$VCOAfc zwS+XH;A1?9%ovjy!VKi+%2x9wv~=1bap z-shhCJ?DE~({}vt_|S5E?@n0e?c06cH&X*C;UX)g6>llPFL45I!23PYwJDrN6@@WH z&Iz|qat@!T_k$$e|JPtU!Ntw0E7Ym!E7ZU=eK6(f)3>z&?o;aWwEY!|tIrK^ejNW) z8(=DeV2WMZav#vn)a`U<>Wl1espT^D$@l=L?38mMIi2p_$@5HhEKM$B=Q0Z7z?*b} zeizRSbo$Zj?7DCp;()tt(`mH5J1Cr453;_G{PC!OKmAqrF(c!6ePK^#2& zC%s88s2b?Pp$!W3*^+skZOzXKGJE~@llFZSr&R>Fz1@7=VehhcP+S+J8%Kj|X6cxW zp3At&t{?EoX+z~O=wg4Yw8`m-7J+${Fqn?YgP;@8o?El@bR8FQsr&go1+8*x;`b+o zosQ7k-GZr1)HVcO1?;^edWdvqK!Ju{3aD6**9zdva?&@Oe$*@DUAHA9)~-hFexZuk zMd=IwA#>or3oyY~El0F%QT;Kx8 zdUwUo37W)n?WIzO4-0w_my_3$`A z*Cs%<0(ou&J52m|IXaPmRx**Wblf%+dJIeuezQsn2y=Ln3Ys)%g}RnyKPOZnGpbIO z&S*3qohbwSM1IYHIpXI-7!^MezT;sv7)9AmIHe#TkkBqP@)0ZoKF>S&TL~{3aaB@- zi&DN7^SXZFYc6`}#863Hcqz(+C>!&$g;H+Cy!|;b@I`hVc%y&Afhx?Yw<4)xa2WFk zwNmcJynlm%4=^9R7@N=6KZZBt{jnPq6~Ns6p_KPx{)m?H)0kiVMas`(Zl9I%uY_A4 zrKGAijPx=-Fn>Tg>K`y~&y?~x%rhT~LF6smhm!Jc$bUfcdjMZQ=32oUswQcHzc8Qt zMEI$SUh!g@k{Z!Vc?#zJ_oO-*n0wDf_#shkH{R&paKMatidR~o2y;)RlwT4!R8ccM z%%v8HeC86$Q~$;h&ujTErv-;*hj|YU4JE~;B(ylKB+Sn@kkH~Z<1o+Qh)}1o7*k%C 
zB;=bSCGs4mJR!lySI#rqY(&Cgbdqz<5-~$)&E{}n%C|Ijg}*I^JWj5Mi^clt}P2a?WHT3D#m;3B8`wgmkU&A%HVA z%;q+jqn;uEYZ{pHHfTvQoCrS!Lt}Wo!SwlH1~c9UQ=y@enQeoG@NOfc_VKH>nJE|6 L_GTvNgZh5}xj|wT diff --git a/hsa/gfx950/pa/pa_fp16_pertokenFp8_gqa16_1tg_4w_mtp_msk1.co b/hsa/gfx950/pa/pa_fp16_pertokenFp8_gqa16_1tg_4w_mtp_msk1.co index fff0311b6f2e828836625bf59260ff6acbc1a0aa..44055defdb34bf0b87957739a97288aa094613a1 100755 GIT binary patch delta 6573 zcmds+e{d969l-a!OL9k&l1my2G~q`wIAB5Dz1zLp!wcTtuLjz|RMKi|fS_rm^{Nru zYCAD|IWGOdq)0i*Q9+tg2I!RO$Slp$PRbFDQ|yl+w9GV96%j_w5Y!Bo3Mm@j?tR}T zFaK=vFE<14o6q<6d*8R4AHzc>r-n;*EW@WpU%2C->OPG*<3m@@ZGOoivJ@Ho0_d^B zRJdVn&6{zfw}1eChV^v(z(4bKR9{?zs;`2yu1BT7$S?UEfwyRwl`t4Ee3CE2F`kE2DMYO+j~ZeO-BK zL$ETrv2IRk3@6*&4Un@Djb|Dn0(_LU=n=hW1N%3?4 zJDuVW?YJ9HuliqygGIH%SUP8PaA!LB^WbVO=PK&V-T5q$JAzLKJGdNnXtP(lifpRX z$rg)+8NS(_`Bvjzj2+n%d(bv5d2f56o#)B@#8cy&=Q$itEJK;f<%YxgOKsJf}AgP%$Ce6p!fU^og!6S-Gy$Nr~CnD@y84EmmpC#%}b#F z^CNT)z40lsJ>-cI(nH0l`d0KtcKrfmH$>zDBvIDA7AN)eBKsu0>u^iCJ|v=f&a&+2 z#Ep2kY$i8CPY2OXyXj*4*<7yu@zrH0J6Mgn@ak*Rx!cBj0MDQ+*A`Uitl2f1&KaH2 z3({E5UXa6j-IJ)$Y4=bkSEH>89oFC8GYw;M1&-G`4#lj=?|0lZL%-Xpx12#TF5bE< zKlxXGcnzJp7hhkAN2Mymeue%xt$S;+k7V^lBD`A={Nsg2}Y z=gFT^^9v`(JB^=XCO&gb(a9zL!17w)eGg378q$}5KO14W47`4YDX5kkrI$(-bW3KYXzCM!6ulR7-a@*TkYHn4mT@PS8I zeh~PIDLZ=9;40_jj(@~D^npX-RhGX4ymDgGO!6V{#x!gHC*VELvwRqMGl|cH`Zp+k z`XcM_Ki~uVSw71g1t)jpb(U8GzoxT%5%9s^T0Ecs4p3BoU@1&N0^Wp8!BU2R&l|Cj zDcl8I9%A`@zzZ(2d=oPBli~Uot4M%D{TR!C0(|q7TaW^NcDpH9PR|49?_v3F;7RpX zR`CibLi1VvI`FPqmcI#HUe59X;7g_?J`LP0=k2ZY{{$3GpT7cktzr2n@X`F< zS}vah4>q&*MaHi;$gmG@X1L0ggW}}ljDo8MUaGQuDe&XZu-pTjKg;qu;5&}8yx!!h zlRKMX73;vEXCKQq0KajVcz86!Z_U zcg<(*uVJ%)C)ZfZDjW_Ymy_d`vwSx2BVLxz2mT9%Z2{SI>?+{MuSK&LFXCP` zAaRLJ-+3^x(Hr0+Ao^lL7(_lAi&Ihdc&O-+EYT}lqAXapKA%i&0~59pFQtMvt{4U? 
z$t>`3iust8P7vY(gdnBSdOQ(N45&v4P%8L&+J+!YmZ&H+F0rMe5Dm}>LOib093D@M zM)OEsYT=Q+B2A~rep-(|YKb1+M-8GPb@TE(jqCMVo8pzCbgRe?o9#cSCN@e+oK`G` zWqJnQh=-@?q5(^c$GkvA4^Pt(w*aM*NDCJI0a~RPh|tVpBu-O^aWSrfpCtJ61(ABh zFij`|(9jP0AiiegNCQXAy zK#^!**(cDzej!Th@q2xiD23@6`28`u@qWdcXA*(N^+&C7^~a+Ur{;%W_VBc4S@h6B zko^kHEC;L^l_MUSS&msp9uOk5;D9L5jsiZBW)8?c+J0D&=y^3MQj~6dARM+r#C$Y_ z!pG@3D_-liDt<35LJ7oZ`%1)5b0~2^)PFFHAHdjsI{WS={N-Z$rx5prEFQtWiX{AL zB$18~>xE0_x3rvW( F{{vR6zS{r* delta 4553 zcma)=e{dA#8OQg1?=G8PC6|yY6e$qXCc!lB-tO)0rK!#R1T{lLWoWdS5>sk%nyIle zicH8sxG1GSL(XPptiyrSp^c?9YqL6KIM8&Ejz;QGWg;rlcAAor$s7UGXqoPh=Y5Zt zf7;As_Q~gYp6~PHeedRSZ%md(8#( zVcFZ7o@(00W#_Pdxfks;I_~ZDnH5i=fG;hqeF@KD}o1&tB}EmtW&sg@YBX$UhglkN8VGI&TR1-Etg7py)=V9Py65Z;?q8J&jD17 zjHTx{`|LH)s0*cduWYShyN6Ey&SwYtBK^zm^ey@p4|2x8j?6J1%6ZJHk5Om2sVzd4 zJpL6=S~u_V;BjO=Q*DLhH!<3Pmd-CTr}OjQT}J2cMY{hy+RDgIw%M}x&k|XCBj%wF zGWMTNtgv z%j4GYmyt2>P!-ZQk6ZT_ZoB!DGibc1I9luOp{mlk?)E+CF0fZ{a2fU&lP>Myn+t9j zy8~B#$-V#@{=&yo`Qex7tM9;d*3tJO(;7YJwUvIn#C&!&mNCA3mFPKBZ(M^bSk0BU zRedy-%G!!#*4!=ZtWT7f_A9{JjJ?^7z1TW$Ywk-G=IjvenvX5NbrW_Xb9xxhIak4T zd3m$_QQYn^+YaDT-t2f6A1(3PZ%U^BB5p>>mdD^73Ld4-ljmfQ{T`S9xz#awuIH~T zuL3^PmXEcquK{n4uv`Pa^$y2KQg0L-O9hG?V-;}uCZYR*Z~8hLzaIF71j`=*u6_GP zZq?sR63Cl2`1^s+x{5Sw0QgKhMUL?t@bep3{vvSUrj7pI=1Gp{K7X4{I7SlOOzJ!Z zysd{K$9M<$rfn?$0QjMuET05kZRzLtZ^0|zXdhw|t^?n(pXEi)Rp7bSVU||{?|GT! 
zOMuHKD0k~G2S>+g>Trx8;cf@i*$lktpAQ0r!t!TBfv@~3%g2Dv zxGSMLZv*EfH{RX9<(>dXeUs}T8UFx&F2?d{;F;Ad{|xw>53sxd+loB5>H7>%a^>I< zx)}#|8}RlgSzZIY(A_!dlM4WE9%tjf0sO=;%U3!)$#d0vSjT;kaK=3(THt%Y>)kV= z{9)jSjH&Uegyl~IuXisEEiefD+%s(a^T6f(EKeQ=$H1>yhXK6s z6_y_d{_<-qKMmaf7R%29?{Y5%q<+bgh##AqJ)r z4JE)7Vk)768WRW&1gI&=8lh^G#wuc*P%S|NB2hv`Np)z_sw@gB!9tvt6Gc@839Gdz zn1P5yS_=qKLPaI0K!B)*h?p3Phe(G&Y(~O9){QlL7%6E62#Ptqr?fCcD-JLmKH#4$EY(QaC~Sk)lB|s1y&A=4By9 zdY8iz=~9*jq97~Qhd1|ZWlbgfkt1Q!j~q)#Nr(sv^uU6WOv(j=AyO`=MoBp(OfP~G zkVv^;n65Y&Q%UcNAdn^$Ng;D83R!k?ouY+FL`0+#Ax$AvlW9My5FmS1#R%y~l@erp zRiWEfwIJEHr9djHk*EkGt8wdACEW#Sfoxc~qF diff --git a/hsa/gfx950/pa/pa_fp16_pertokenFp8_gqa16_2tg_4w.co b/hsa/gfx950/pa/pa_fp16_pertokenFp8_gqa16_2tg_4w.co index ad0841696b8dbb372f855dbad8ec72bcf76c86c4..9110f6e8007432096f7eac05f1b2116a60b10859 100755 GIT binary patch delta 1744 zcmcbxi*dyv#t9lsD`F>VEvv8aU;qOc%^(2bGXg0#Ag+jl@GGiN=>iCsp@S95fa+rc za*5Fg)j!#vQBaAn9h;jNy98zaGfdhCG``_K!!$7M(qz*oWFyodW+T?HSwy&(xjvBb z|1T!LI)PBe|7!j|GGKNjNGzaEAeQmJdZ3>SNSvRcVZRMS=Or~pMh06Dt;on=W4osr z%yvX#CxY0!nyp-HCRw@IPSdjyXwbJ6Y%;JBX)v@EZTiLJQzsArwKIhA|Kv4dLUN`T z)k+x{)EP`Kssm{aAguwUHG#Ayke+_&UQi_$Cu(}-(4xJ`$jR z%uSvzCQsO{pvV&3Y^CyriLhFw&3@`)Z1qonHSGtcQZSZgZK8Lqal(@)v9Y#Om$+Cf zF$Bh1!QUYhYhLEW#af(|5S!xTUoHDVG3VZ5({JO+u%Cmqb-#NDklkq0X4A;9ziIPC z2SXMDiD2^qmmRE(O`9vd2r2~@1WJ?r)TeDu@NeL#7YKnCqfm+mOtUjAu}2jbhl=ZH zqlzm*#b=SZ*##gL69+>@)#RdxZC1_}CJZ@=NvS#U2Igj# zFs6}-6O3tYW(H*%8d|u)MBNNrCKn3oZtjm%WMZoDm~0p=H~B)e02jO#iJJT>+D!5V zv{HEitz;x%mC0mXLGj4}F%nEKq9$j>*fGtBp1cZ3b;L}56=TQwBL?E!l|Xal0Yccw Apa1{> delta 772 zcmZ2+i1ETM#t9ls7a}KWEvsMP!T<&^nn3`vt zkV}j{sQ$_RjDkvp?bzJJ*d@qukYUokgACL5ZPpO(Wu6>rB|CYNmBHj`;tG?Sg7h}C zNXRe?v~#xZXK!NJe}tuFe+5I&{>d^1N>x59nPHvH#t)pMjwPj1RIjp_sb4&VYp(Xrd7t z$K1aN(>71CQej3ise)C}_IJyEkV)<>HvKl94EukuwC;EB0J0lx+H4vb_E&AzbTMR^ zyvTUlVgpZB#;VPReugZYn}Wnj4FAq=H>z%)BUi7l$QI8;1E3sqbRDxQI+UKgf5 z1y#KTRJqd zfhAnj)zEEnp`h;O^jJkErUfpO7e-#>=g)y&>uJ^cgM?4 uR!@-RgbA$&>y1lLVmc5$xhcVpb3+6~Z2{C!&dJY#$`v9ZqAOvdj0^yPA+Uw9Bp8lG#_ 
zaQs6c#E{_Kn>82)++3`{0O_6?;IU^N?&R>-0}Xz1w88HlGm*;B6YhernN)`??xOH2 zzPG=y1ArnNn9vw7 zk#sufz6|5q2tjG^%)$!qy?*pB?d2lp`bEHq(Ft#WCaS&{NrmyMol@$%wJ^hOW&Tq~ zgzvR(`RV<+N!?6Wx29W*iPi1re{)?osO#Cha4il}T5)|X!f@j4ijc$PQ-E2NM44Uh zu=oTyNS*-9UHDnUm$=v-2)dUD>@M_pMN`?^{4%`QNao>PR=B+Tll#oTjLjMe<=c@m zFugRhUvk*MVBfxKQH7Ng0Vy3iq@aoRd1_=E7+AGLWYf}#!3AiZiV*CwZc{(4W(=%c zx-ATBYr0h)ou`g&2Ls!JaFcH=Roq&QFr0XTf$5~#1~vpg>T?lddqFv|0H@!UOYtF` z)~=kS=e(74@WMsdd+QsVUhr#0>S1D#l;fv$&@@~txxXICjLk|3B*yA-N}@{H8lIo? zwg!jBBVzYIR8Aa!fC{a9$jr@Wv5x~7eQqTL?H5E{8d=&o1%>^Uv9qK+=j{xdu8Qk5 zJp?w+HdJhXsa< z?EG({j_$YCcK{GY{L)L={0QO$8}dIyeD9g;{L}@Lf%*@!`Omo~1!Q32@9d5lz9|Ef z^V$4$#E(yB^EVOy{)YS?5ii$eUx{B4A5>t#w0@{RP{&+lcEPJ$6PJUaD|Wa5yF@kF zEZ|7zi(N-LtmYu1nm}=6G4mD<`SUO z!i_+2Gh56}^%=SjL95vzjArrs8WC%AIMWF}r%-0KIfXf_KBsW*wkEq(DDyakT)Vl> z20C8Y#R6yH{z~7D@KlHa=q^xg#UTrxKY$6MSYwV?=lJTV{lf)cWZgV*YzVwW+mg! zcUW9oKg0b6X}jCb%Vs2NZ$$(-+I8j^*W)4ppXPsen0S#(q!E&COCzLKH9BrnuEb_| z$NF~F9h2&7kVmiy<$&){kreuUAEBFyY*S3)5v8~KgGB7fvf82+Rd#jWnrn#aZ_I>%4 z^5pHe8p_GRtF4ghQ-e}B$p<9C5Yyanh55k4jR3ypHUhwWBu9ECrJh_a=c#FF$^74MOIth7hc|Ql%XDk_G$UIss2$N{Y26Bhm;=~Db`E(|AaOMKPqkjkfeP75iOsg z{P7a~Matj4q}5MRe*V0czbX%7Ko3r>Y8&(Up&r~{*7Dy|e%G{?|B>>)l<4nJe%h=} z;y&fA5Q64ntG{U@^1OD!d-4!*(pO(tu7$I)a7z$G4j(xdIppeec5J{fg-(CCO@$t( zPlaxG0eYMTK`7`^1;LPu!j^DQHFNnrZpyk^{UHg)7khsdU?1-h}vK^idG6~8*E3`O~SHVr>g`5FV$+3(8a@s Fe*xAq;%EQ> diff --git a/hsa/gfx950/pa/pa_fp16_pertokenFp8_gqa8_1tg_4w_mtp_msk1.co b/hsa/gfx950/pa/pa_fp16_pertokenFp8_gqa8_1tg_4w_mtp_msk1.co index 11efebcf9d11bf33c52e30eda6a19c679a4e85d3..0828e434f668754fdb2d7715a34c31eb062539b4 100755 GIT binary patch delta 5474 zcmds*eQZ-z6u|F&{TPh7m99Vt<7)$9U7hdswd-15VePttvW(fl9LPsU(9F2RCB#Sw z?v2tZ#-Wo(D1OAuEaFITK&}iFvWL+GG|>zaB7Y#MlSHDBEP)|kGT+C&uY3FFL1IYZ zCEGdsopaB*=bigb*LP;b+}mmCT1fSVKXmTryB9zpUqk|%@+Xj~RptR~U!#|;svC$~ zu!7f754aAm7goT@#l|^M0`}XBM#1g@DSZ1V*sI7$f_vZ2+1sabQ`?0q$C()i@;k&G z;N$?;07RywLrh|wF6j{dUOpc!oAMFGzW_(?P%GOCSHaQJRk0#PUoEqSw!(FA)cNAd 
zBAFL}c98?`o-2hA1Z6oF0xo!RhoXnZ=uNWz?T$c%+Z~7m_m*?E_Gg2W6Ex>&XM$4_ zcc@rfVLi^d5{?Ro>+G|JrAVPAR&20isix7=V*+zTq8j7QfB{z*~LWA?GEc}mm6QXp4l^>ru@u*uiQ9G9p z+OoD+AEIUT}E5^>htEv~6C4Vaw{hoMrCmyGc=Q+tMWW0rwmbFPS>Tr8^~k zJNGuX9f;S^SAnCH!6`xUVA>3?p-|Y+RT!Lq938CTtklr}`fK@#Cud~Un)n%c>bT0c zW)4XfkE@GjgT4y!AdPAdxGciuwQ1gJ%2bub)2ep;XGV1)+rV0j5NW>5kKE2otV!0# zVq!f+*S_AI=5O80_O2-d>+1UZCWF(ccFn=3i?82KigM@3(8{ywhnCqS^ac8a`Y#Tm z*6sa9r$IjDLhXTGl)k^#EHu7%kTRQ)wbN!b*pFZvxS*^-%Tg0el=Ep6`*?}ziTM|- z%7nPjD=Gfa6m_Uw$juz2Y#nMsU+fb4^p~v<(sF6A%`4zvvt+8u-k~O3Ea^sFe;C#! zJtB<-eA!55sGZM!42Igk&y5zNhfVz`8h#3Se)m9LxRdQ|LtR5#jf#@mH76e}yy<~+ zOGlET+&41P`X=c|+IjTWKf4SkQf8737Qpkzj^sn*HR=?Ln^Eicn@qP^G%rVdtUvfPpq$F-k2gpmQwz$*x<_SMIjxS-*`n)v|nAA(=8fOlut09GcL}4 z%p+U${>qKyK@Mn6>J2CHf^BLh&3GE~J%==;DCaOw_UQRNz97z@jFA?lq}39@x)ESq8K;ZG5?9gPRRk zXVe<3(xK`wQ7il&qPo2yqE`4x1iB_fBGA?0D0cApYbvnHL>Nu=(Q;f~&dku!2=Q=x zNC?J9R}$*=k{nE>mYE5ANe(yb_F`W*?bdv27!twFFeEM;t|1OA!_x%2D_ByU+ZSR< zG=G$YU>Vv^XoMykc1J7Rn6hpkxmBzuLKEt%AzNcZEV*kQpI=$FSSsJcgh@0uLX*sF zG)yvk+!0OnRuW%NWi=}>c5lBAE~KdL1u3T=mgjbp_n3jyMn9~u+g?)pZ(F3wVRNd? 
w4`-%M`{8uTwkS2^hl{CGi&Dh_xH$LL64j{{s7~($;Pl*P^`Qa*N`Vmk2WeT7kN^Mx delta 2537 zcma);e@t6d6vyv-?V}ZNtE8LR1ch$I1{u7*{zfdN^fAc%FDqYp~tp#7@CBU9 z|Kvfw6k+bN{r4I;#*F2er%=zt6nbkys<+N;zkFvxK7p=GR31Qbq$nZB!QNYhJzHc} zN;DNn(@nf|aDSo1+RO0O{ll!Zd{2u-HYH?J%qTA!;&MAWB}Hf$@XXC?MX86M&v zH%j?CN6kMm;O=O$;pwIl2KRfF57r*wtUFRM8KDmCyw<-&b)ao zBO~?PUXv!9ONmmaQEGc=)K-*=%c|n;c;)HJhmriHAtCSXQtacEot4Lte3MCDV40J6 zE>NP?C=Kylc&^%RWNzX=#7`7vA5-Ro23)$QmoTsq`AOOT-epf;rF}0 z<=7M}t%?^h->ZO;z6vwzFUCKCv@P|ZkBO(?vN z=FJfAt4jag5byKAQ>O`0M`it^;PAb?-hog8cuOPA&j7a^ruo~zOE>5Y0B?Jdj=!W% zqsW97U!@&aw5bUNU#IyOz<>Xc=GTCm&(nMXxUrY!OTd@k)%Krp{sN9>6WzKs;O%Ke zJ>d>5%!E3yjx@sxeDE^O?*snLl+M%VUjmNS4GXw|pP$teG$ROn?mEpM1>XN1%?|*d z|AFQWjJCcB_5MaXS|MRZdyHj#F9BalSEf5+z(?=Z44roazm-GtbHMu>w$hFdz)`q^ z<`;p_@1psqz=tYm?oc12tQiF($eX%1(&hEPw4tnBdn zd7y&Bsjnfr1QJ{0B1JfAonfF(zr#<`#HgR-2`)ibb9+cVf;U7uA&7obk5FB$)0$dR zuuvlK;jGcsT)f_Rmk=UqF1Hwx9C_~X zLm|e_vcdf1$uLuv9VCCJ%9GP!#*;S;^I`bA^%zRN5oQXK_lwLncDOtl5}5#dh)>3p ZG5ee=eOuZf=zAq7+YEQQg4Hq^^f!SwKlcCt diff --git a/hsa/gfx950/pa/pa_fp16_pertokenFp8_gqa8_2tg_4w.co b/hsa/gfx950/pa/pa_fp16_pertokenFp8_gqa8_2tg_4w.co index fbf8ff45ddd15697e8ea53a562d2f4a117892935..ba6315d5187abebf05ed778990e0d04acf4e7f22 100755 GIT binary patch delta 1744 zcmdn6h_Pc8;{*++j=+gp%jy$s7{CBVGYCNVj6jMFh!gxE{DeGIx&XpuSO8K80#JQS zKqfKzp!z5KGYTpZwqtV>W0#=Je}+l>fW|lcXP5@2U7BqAglvQw#B9VGHj4$( z{{O|~S0@n4_+QQ6M+VG}1c?RI3B)q~R}b`)0g3Z7H0-xw=)9!H$jD#|q7@k#Y;5;5 zgV~Nq>_iZISF@Fi%_J)q+i89_0uBDQf=vN7A`OAIqD{Y;eCh-Opmv5Z{+~QWOi0f3 zqFN~fgF1uhMRg#p0i-p6v?h?&1k#fa$S6)eDrQJnnbPEa&U~9y#3h*Pi#b~Nvp2Eq zw_<78U%}9`A7rl})USd_eielJRgk)VWd(b7vX7J!VOJ||?v*lSBA{;b0T~5EZcCgf+{N1!4m`cIeoTZ80F~$i`p2Wr& zuNFZcZg$WC#@M63Ln6kG8WQvmJjOPdw6G{1|J|}56l?A+HvKl94Eygfx9)fE0J0lx z+H4vb_P^a6YPpqAq6QkdWAgzU4Hm|?n_oH*qg-k8K4&hD`YS%5@)H4gz)W_AAJzyl z5K9~?E})Gnt^^f-gQi{=DjpGos@?)Bz9a<95QfXn1F@Jm7!vX(NBL~Ca&$6f$Vp5} z&51WKGk1kCjZ92nOfz#Q7}M3#490YJw47WZr~||Xo0s{%Vq{9NnJnlpH+h4<02jO# z@tb_g-%RoXv{JbMtz-;fmC0mXLGj530TN6X{3d$^*fBNuPwoOz1p$*!1=w*u2!J?w 
IInZ2r01tY|7XSbN delta 777 zcmeBJ#kgS+;{*++4Zah#meqGyFn|G!W)Ohz8G#fV5O;V%_#IiObOD6RP{9giK=m;J zxy0y$>Ywb-D5yl(j?GPsU4k4387A#J$S`f+W)0y|=E;KMvXeLIC``^1H<&!hTyOIm zaS3LD{p_v#*_&AQKVWXzU%}9`e{zA25>Qp#W(}EdjFS)9Z=0N;&a(M|{u$=UN0hd3 zyk~?e;}HibQPlo;Hm|OR|cL3RqHf=VI4EvvMervOp zY4aibdn}AkH!pPA$^`V&w#_;oCG7P;^(cS`%wuObVTmHlAPyDx&_Wegf{O1!Q?CmZ zZ-_uu4@nX13~vI!;=*v*c_0=O2SZ2JWTSv>R+i3g3^|EOsX6fmrfxY3d4B z;NoZiQ{d#}G`T=f2Z#+e*9E>}Wa_Y(d{97ma&53Q7d(}DO|oi+>>-kzFroEey*42#Oe?%6CxzH?&hUY_w*YD=_vGV1XSimetR3VE_Xd%^(2bGXg0#AYKp#;V&pdr3)Zjh6GM11FDY+ z$R$P}RR3guMnNUQc5H59>=Kmu&oF5p(D;V`4Aa20OOs8Xkd07-n2lJ&W)b0B=K4U! z|G$|0>I6a=|Eu}?$bi|AAhCctfmp`>>VbYTAaQ<%hW$1SotM-Y85wLrv?3#ejqRRh zFxwG{od{y@YPNE*nPlZ+JI&8Vpuyi(uqnVsq#@8&wCNX_&!DrL$bvJTT64s7n@5hw-S|vy=+!- z0F{4rAx8O*%?TbWxa-gOKnqJK#RI0<89rE}ii<Her*aBFuyY x${{Wn2#3%gI3_QOFk=!3pL{FAj_E-7WT{9yrUwy|qay7%10o@64g(F72LSDW#5({0 delta 829 zcmbQSoAJPA#t9ls4q+3umen6|U;qOc%^(2bGXg0#AU+WU;h!i%r3)Zj1`A#&1FDY+ z$R$P}RR3guMnNUQc5H59>=NWS$S`T&L56AjHfspyGEe@fEIav-w8CUR6@kfR;tG=& zsp)NgA}+x!u%EqkKYJ6){s+u0`zsiF_D^n*R@!_>I)rKRBl~TW9V}Qje~@cn;dsvo zm42hF1Ttu|pNa|7=0$3DtRl*Pn)Wj=G=TA4<|eo)KxJ{8Ids1H-k#irlJlVSh;$yaU686RxswcX0J`H}r8 z4j_B22aw(59mi8|;t35{D8&P&*%=O4qKb<{#T~Rz#g(AqThP?&LdEwapsKfkivI`! 
zGo0bF5EeT_MFLz1%uSGWQm)PXD3$})6EjWbT;0+A~Jx9>4?K*LqUjQd22>vt zkV}j{sQ$_RjDkvp?bzJJ*d-|QpJCEIpz#g=8K!}0mnNG&Ase9vF&nXl%_73N%=Lkc z|9>(0)d_?${#Wz&kpZ(KL1F=Q0C$UI@+{KwjavHta+!5(kG43@BYhb=)*LF4WlhvLnjE&D-H=iXw| zZ{x|Z{|syEe)kR_yV0i2rjcR)lg&>(wi1yPc5DvtVc}qWvN<$_80AWn&)HAgd?4I` zyFMibT8Kg^9x%<$aKRZ>TpTKHpp7c71QkDmrd}5+zMvRYy#-WUAOp-0hRZGkv6wg* zX0%S8l)TN#)zXq7Cow5CC*Huq+yurnGI546EzAv}OhXeh3mDVT!ep|KV9MraDX$or zW&}?TOp}}ZAx(e_UWcSk=1Mmc;%I}|%aI15e{fC?N;hNTNSoZ1ZpXAEZSpA~wIY2o RSB4#@1jMBb>w$*J0{~C_%v}Hg delta 753 zcmX?cgYm-!#t9lsA5tc2EvrA^&j1E6nn3`vt zkV}j{sQ$_RjDkvp?bzJJ*d@qukYUokgACL5ZPpOZWuE+1U3RjOhQZ`AafQi`qVzUD z5qDq~n9bd~pS_7?|2fu{{S^#7`zNoERsyP0*vutk!ZbO^f7@me`2-G*XG~D(59&&r zjWj}-fF^9)9HX7WA!7QgX+Hx)0~nuXZ9+9=l?l|8E#^Q|B>p0qa>)T|${*(h4n^Z% zE&D+xxwqK#+juhU-^1Fv-@OCKZnSB$X=K=cXLGI3R;JBC{<}CB?`-Cc0J0xNedDe_ z69ElQD8&P&*%=l%qKb<{#WS=}#g(AqQ_$4wLd8$yqpG)nibte?8NzVcWgr$42g8BJ z$yw>!tX!Oo8FCVnQgh-B%v~*DOd~@_7}MO<6v{L-HaCJX4J^$j`v|6N-k0%;k?DZ{ z9hxgZ0*BD=|Gtn!G66 fj`Kn?L~Q}oP|nHhIg*?X5PvX0l<&{tWMlvU9(K8N diff --git a/hsa/gfx950/pa/pa_fp16_pertokenInt8_gqa16_1tg_4w_mtp_msk0.co b/hsa/gfx950/pa/pa_fp16_pertokenInt8_gqa16_1tg_4w_mtp_msk0.co index 03e31ba690771eff9190e5ec5f558796204ad880..6a92da410f03619aeaa06b298583f6640ddb5916 100755 GIT binary patch delta 4463 zcmds*YfKzf6oBu&EW7Lq?(*u3wFRUp)K=MDc3Ij5cb5gU8jGg(fqg8dl7=)XX<|}n z1H-a3Eg_aNNr(6dn$pE673x@~t%M;Fkf>2wF{!AyZJLVqWooP~rsAF1JHYT~rv2GT zHs{QD&pG#=nR8AUCciU`{%UB+M5SFvSBB~aTM-a0Ndl923m_I+{2MtDSsxja%c_VO za2kk0`~)jCe#D<%598C7_h19CH^Dp&TKjQ$Y8o69r7z%mE5VgOEpRQMXm`@!lBR@4 zz5(?0yEX#;A6p|IBzo;mc4=}us(TquPoY4q=~X!GZlKJfzC~ccTGKW-?P&~{1#Scs z?*fM}xFH1jf=q+JmEYGO>Yh1zxuBnGSmSfGuJPp`+v+k=+w$WB+g(Z2j{M}n6ryTP zrDV?yaGEalCoHv}cfSdM2iVVhu=HZ-#j+5~LM&;&{}lxjw#Q!;MYpBI>W>&2d0jxm zKS!TT$x`sbfz#>niV1eiT!EBqsjPIizw%r(z4e(`I(7M&_Baai`v6_w&ronZPE|wJ zyh3pcmdYB?8b6;3@c4a-zoO^>-JP;RA-6fV*RBzqZHiP@YkP3e_G~@irSMFpqC||r3WsJ(BaEp(Fb?%k4)iTjMf_yBSn9R zK3bus=-Kg#L8XUBM+*M!$F*N*R8|N}RbN#K%bDWffjBnYcn}s_z7O;rLQ{x8bofx4hS}mrmV#fbw=4;X3yBVMP~Q 
z(izE7hqJ|UW(O{vepCi{_)1QUp*tl~#~0)1Pf{`!x?31~Tv-pCfUNJ9;&2tpnJYLA zr;?ae=`e=-MhnkC?KQ2+`sj3zLBw?#pyEbCfrl3upSnF;+dssq9phZh-G|JlD__Vb z@t)dz{AQ#+pUh+~wg7zynedY^HZALRkQdUEYJb2eul)(`Mj5P*R(gG8wsYAf=)gk8 z)COa@m|f@qWZX4SgC=yGp%OJgZgf9X9Q3_aXu~27hse}Nbv!HeMk=JUFK@>4{##~E z2mJmK8TjEVnK9h^C*dxnU!M_amrN$U2dWvKLc0vAnMy*Zj4#zD>Q$B#huh<-QedGo z%*mYlCcL7L`{~APe=KJ#ch#LAw}O23Ccm*Yj{?x`a_MwO`$dmzXOH!X+Pcw z@q!liv!@ZooQrQXDe0a4+_|V|O?;$s=^X(CCX0KRL)d9`l zvrNzy626KPzGr5ybWzoTu8p%y&{h-P=8DqC*fdT0+Bp*AUHJF9`9d69ID#qE&$L-cxknqw-CN_p8Y$7|9MMpA0m8YT+UAk zw{<`VGUq+gKH?Cpl0WJ%3D3@x^9zKJ;*YGk*Dr);%(I^)yrmoB{pVbNk%dJq@<9I) z-gi3ky1NsP6UEW9|+!A^MF$MQgK!bj`nyiedL z28;|bw?9C=bvBCy>@I)(jomwJIZjD+*t5NaW)~LCsJR}8M6LEBNzL`xh-$GqW@xcD zn@}8AQmsypw1uVEQiu)egq0I7Ijm6yrefKR%ALaR2}S delta 2406 zcma);e@t6d6vyAaw6t`EI#L83ZUq%OwweBTr6p?&lom!7p-UF0$+BNM1U4OuiE&86 z55X+NOx!azW&;rwGBf#+2i%Fk6GN0tWn>`)H;4S;%v44q5)vSw_rBiy+DDeSm$di1 z&pqdR?)SW=?c}e9dp8WtR#^7g`&k_|b4`#V7hAbUyErn+eK;Z>hIPi&Ol=7vVR&vVkcqfu(0vF@eD z(5_DKl{lYt7YKorCxrTxptzV)6=NtdG)lU9jbzmd#pPDaE#hpdKSuOf24jIBtY_@R z%qC1!Pl$eNDHPjlZ)t_p45QFvn=##9t(Q2iR(?ILtrtqlGi8MhJ61U18jOXf)`g7> zB5M;C+BZu~2By+~UTu(wOksofJ#P^wRcafh!t*$;^Sy7wDvcN!NP?Y!DpEk zcORP92$lnv!GJh?;G9NiuO-Z^hl>`J)d}i>@XPPlg$u_e#pcC7u4@HPxiRhro({2E zCOlV8Tv^a|1dagFe5w>8I0UNVh13grG4(AEfTD(-Tb!uwi@~->5=@ttBX&>5DR$Aw zzYmb{eu8E8uPHJM?O5j9j=_YqkCq!GBE>A9$B!V|vQKR3U(v{J-~HNSf<7&pI13Fx zocsX~0UJg`Z^GRP;@qqy78pfvw=yQmtV3@&vn_1&T0DEE#OW+{x)gT$cv9$aCeWjy zE#Mk}w`18M(Ly}nHOQ9;W*;@}UyQizk5{>UfK+Nek$Q4Pktz;Qsf;8vegyP^w4%iZ ziHI`aJtnp9Y1DEIoYNBH$oLA-KL)3D#Ca1yZFL}Gy*hcI6#O;pV!EdE_8DocYrglo%63p|O3YH>9HJurRceuG5Xuqn-$)l;f4 zpw?pm&!o8sz9xT4$grxD5kB(aEuaGZbQZ2?*~dh`2aSIYb3u*wRr=*b7o)uDIQ`#h zo=my7mwekru5>xnalajmFcR&dJeVs7%A1Gso?UWKqip2U4S%kJ zah^>h9ijzO73xucPdUe{`Ay0#>vEvH-JyKRsm{MgdD|6$`$t{>(2adf>Vgf*XTD@V zVBw-TIZemy3#z%9a$c|IJ194Qqv#)!m9prDbyIkB0bY0UEmO1+0wcc^fOs_IPN>^NK>Y zJ&#dWj-MSO&*As6V|siphTS>x*+}PXH_v)t%8r(2bFic3xbmIC-b6Qk?M42pa2~+9 zrjVlxK4X|;|8ZJE6EZ}0Z#q3qrdHL)#K>gZI1IwZ9Y(_l+R 
z+i|?}ga$0Jr`P0|hD18an2g4hy6TQ8^)y49WJ(gWPLt^vhe_?|SX*gJ8%=HKyL-?3 z9rK|N z+qDr*sGUiV$xFFJHpF#bXh4XoH^*wkh3W_QTDCgL-fY@$?yZZ24%S8LpWhI2B{tU2 zPCgzgOElM)CqLs8yIqZtvk6Tc>~dGh=Nq2pIF*ynt9aD#sNpe;$1ol<4TWg6%^V|D zh}soj8*=AYV)m4*zfBR+qqQ_rgGdGjyMIY&-)mIAnxsJ?B zOHi%Nf|p%gNWW2Q8(5#e_-s+>A8seJ_!zCP#YM^VFDoCmi4Vmu)!Ti4oLceo>u~W` z?@XomSA`$&Q>%Vcbg-!U{Oyjc*}?LT&`(1vx$M@W&g}9oZ0`yEIJAe$ihBL3l|?oU z8F+_J6sAX8Tr?;&?Q)suc;Wv1tKH8!<9G4$TrXMJOP;c0J`yLYa zaACS_dr9%pIXQ8!y)XSnrJVpxf#~mVPucArQFcM0?mDt-rm<=sUyIV0o+;sv6i!7e zx~uEBT@L;1vnf~UvGKgf*W5QPkzaXfwOtOTkAY&;lsK5&;5bj`I zsXq3u57itU@5q|15Vb#vA32lN?kzHw9zexNZybKpu4q&AjWg4(p@QS%JUTt2fB=m@ls@a zvU%N{nuq$zP&D)65@aVFEcyh>47BqJV@R;iVT}Hezv4DNT8?_1r8{osuR*tKI@iD` zXhaw6B*pY|&8_$7wJ76Sf_nJVpHAgIo6mlJI$d>jqn|mepT60VH9I4>qTS8j3dN{D zfTB*jM>)O^yS{`J<9{coFc}NK;z7rbN3BP`k@y6iyN6_7JHxp6ceIr+-up;S{BM;{ zL;NYbx7)l-j6@UPhxj_%m{{wr zWk%ay`3|SC;X8bv!$|#y?{U7`@E*^jWrq3z|6AMpg0Da>#-%EwvDndo;tPHRe}2GF zVvSC&zuf!-CjXjinNDuiEtb~+zp*ht*R&P`SE4LefbU#n8FA_j6Ni@~%lJC*MRx^S z4}7Gao!OXLEbj%Lh}W== zmx-g6nQ- z3w*Gh%@`r&dhgdEGAIMck>gVc#kNlpUzrx~i zCs+15>sSK|`VE#p4t($}mVX!c_z26}fhR7pJPy3?D&}LW+ zWEVt%kEK}N2>i~^S^jO{TVG-MlfW+?15VC=J2(m|EQ1!b3;2wyY@la=5B!?t2Z4v) zX8DW2*Ii(F{55d2f6O{`;IH3c`5E8}KD^1VcY!zM&W^e+0&mNm9p#M$*yLw8I6=(t zWey`A4sxxVw>;6>EQM7C%z!LQUj^fP$FD+Ax(IQlz zM#@y8R0x5fL^d;^MyMH~Y7E3QvJ;BvB}a}UkmeLwqoyK7xOh&`iW(#pE8!UFIVc24 z#X(6Vn-Y`*By%tju~dvj9Ml4&;9w+1dR7H0HdWNf{%8>&(dY)vE0Cj6{UQmW1_c^I zqdC+ltw$66qzFy+k(1FBFU_Gw!p8mz#}=L!CNlq==U7rqKMHYY7FIg^?tb`aq#r&i m*)%Kjea!iOcjic?V=3SGh0JwxobSmj^x!e&$!zmD9{NA+Ew(5C delta 4517 zcma)=dvH`&9mnsv`?$$=k!5FFsRSFs)^38NckgTO;_76d1X=#g-KpFHte&;MF z|7d2C`^)F|JKx{y?9R?!`J=z^s=p9n19$$U;b^{jj%BPL{A2?5zYH=TUh}ur=dwX- z1O43YgfS%#7;w%~`+v^C`Qm!Cr0##`(EHJeVAW~nnX%K%fw5wHXkyvxp~3PK%&D=& z2BsYT!eDs-?DgLo)L!tQyY-ISzsDA{OP3e>m$vF3E#2wz-Q^!FbN%jez^m;JqKGEU zqnrAv<%(Mm)}8bst~eQ~H>3ID=BLl%4|phYbSy{DTn}+M=&`q}D~6 z+q&(7r~1)5oaU5ba!0*kwzeDTT`vl5{rF287VD*_B*Z@F!3FA@pYHSE?%W=opV(Dx 
zj5f0oRNQqWuz2!nrCISdr&)daNXcW04F}ln>sZH}RHHugN*k?J@~wFD>Ok32!R>A^ zM}KR(^*Va%b+6TEOLcMT3AcV@ZXc&LpF}r`P#M6(0WyHyNGn}KeUxlunfwGkXPp3ObPlq=yA3#~o8qdzxPc9`wGs+(+w> zw((tb5#Ix&?K1k2&zD~14B{8J+qc}h3MH>KgRNn9%7@Fey6^K=`Zl_rWZP@3JD{?Q zwp^;T|AO{m3tMk)e>XDB<^_)>)mUe!y{yEtjsM1icHg1zvuz7Rhdx`^d1;_*N#b_D zxs+}<<~OXnk2L%?(PfhWi|TI|0ZGqXVGYrsbmbbL4PskJnJ4ESvOt=!DN6C8za z-|AqPL44pLniqg~chUUk!2RE#`BC797fkeJi{}}R`TSq$1E=6X*D;#^0r=1!nx6wc zyO-vF1AgAAm^f#Fx10Kv`J3=#a5Ntxv9@s&xcU;!tL>}6G2=STL%>IWL-V_UhtCl1 z2Ih77#mw+-TK0{)&;G2vgq{2hX9V*~JxcWJ%_+4;>&H%~iy;J^)+Z4l>o zvCx87K|k>D5jy^d!23FB{!`#n`Q^0ZMR1f>()<|kgR5!&D)6cp%_o5GSdjQF;C&e< zp6vfG;OJ>_9Ju9w0)P8{nqL9l^#IL30p9!=&As*~D2{ohm*RP52{`z@l!K`OUQ?j? zJ-~;4Msor9_$8Wu5jgXEnyVBI43iw(?^Ao^FxC*!Pp9aT%XBQ;F zpMW#VDhDj*Mc_SQnok3tSWwVq;4{vSi2gil_s=m?30lZy4}fD5Ihub8_{e&i-wB*| zc0>|{f!91l$KMNl=y7Km_I`D~uG=C5{e-t?0|Ks4O z4cG=LXczFzJG9V~!2Q3b`7^+K%QQa(eE%7m=SRR%`ZMj&fzQs+d<=N|6`H>ReETOf zKM#DL*EUFFt=5MSw#vn{HfYn!Tt;4sF+E$hKG4%Crjl_mV`2tVJ}Y6$t6~}gctIeh zs7RQqB8gR$B&P8g2}s8(I=FQB}D8YPA#u8CM6+yx*O$AfP@+oWz8B9$90ccds zksPU*T#3bIG#kgdLX^*EafBrD*i@1vM?8(|5z-RwLrjvMMM)&FIbw=JQ~@zjuEeTh z4yZX{Cm`R`Dd$w2L&^!b2r-+&=_H;^TN09ZGm^~Xo~2wg1HzK3Uls6^O~%}tD5f-?UZChY?n-|(Md8klxzvgs4D5o!>#5o_2iBHYVdAISLs z7n5I|Kq%vXHGdx&Fgp??7EmV;%lKbC&`$;=&d<=W--e;{k{Tl;gDr?wWMr_h-O~(a zJ0h_YLF`@4RxUP^tXyoT>DdT0=-Uc58Q6$47}|<9{bKT|69|CX8N&E~@)|KAIn#@3 zr3?(}45k;=fwTsY)&SC)Kw1+>Pd4ySoP1ZzkgzhP&3fWd%=N#yTKBUzvFxv7ZP{PJ z(6b+8ry$gyf=K=pg!)sEy8dJZdvx*)DJ8-#R@{73%9WY0x*eOZ$o*lh-^YaTkpTT; zZt{FFdBSc5MV8=ZEmadD{H3(HPGcHdee18L{lHWT#?`D%^o})7c=9AR)-v^pi?tME zV5~X(9Wt@D%aXWQjr`T;)E~W+yIf+TBIq?SOE~YT1 zk*PV1Y3^bKWf~eeIl-6)29}c(;`BBr#3?W_dH7CdOpu$rAVGi&UW>#}-j!e`xC2_L t?1+ch?*UXOKUpDBf@w$mWUE9wri_HiRX{2taq_N2JI)h{5M`@D$^f!!!Hoa_ delta 769 zcmexynsLDq#t9ls3*shfEvql^WB>yg%^(2bGXg0#ATEf8@C&NZXb6{K0t=J@)yD+n z5~HsGqHVH2qo5LDJ2p2lb_sGEWSF$?Aj7nMn>B=cnI|8$k)6zBYcP45xWeR3VS1Zc zBz%|!+__r!vp2EqS7B}0U%}9`fASeaC7>#W$$Cb1n@eQam^V){eZ)GsM)d;6L?)<` 
z12!Nfaho&L*_eSQZJQjRz_QuFQG<1Ifx#7#`@fp@GcYuOu_9{|im99Rj9fq_Zerue z{fjVdGm~u!vPnN!74QCP*$*A#*<-x1Z(Sl_YNSt(WcF&kzxO;&5`bfER!dh zZri-Tr-7Am)n>(@LYB>&!me@FuLy<)F_hu~)9ef?_Ne0GQ1K~RsNzacaS1f_x=?YE zOjPw2Q1P5NFhdwFy9mT$;$SGKo-CEP&C1!#ks&8BDK#hFz}&rPL9$x_ diff --git a/hsa/gfx950/pa/pa_fp16_pertokenInt8_gqa8_1tg_4w_mtp_msk0.co b/hsa/gfx950/pa/pa_fp16_pertokenInt8_gqa8_1tg_4w_mtp_msk0.co index e86513e2aba54d76335279111a969d85b6d6f1ff..cfcea28779f4a1b396873c731d9ecc528bc012e0 100755 GIT binary patch delta 4861 zcmds5e@qi+82-L%Pksy)N)a;1N>F%!IwH*H(B#S9F zY2W9b?|r}T-acQSeAk)F>c}7JP${M}e`-8Uhw>p1UQz_v;w2E+6#{QW<1Tsh#M6KW zM*!7fi>-^7G238{UU?1H4=-p6V_oXw@+MblzH@}Ug?fKZaq^QnPp1b%URtz7gWkO z*G;mz^%vE&6-#!O`}h@PUU9EixL4NWE{B7S9n6Vm|58fz8GJbX*+W0aZ)Wj*K<$6r z!eu0O=wgMNy|gspIydsd{rYdp3zyx!XyIn;NAcqD!vH6U*AyT>A-4c-l{&~Z_3^le z{D|xU9L}_Qzh!jmI@pfJG02Shi7SKEos%ZR%fv<=EN7 z&0jrfo;KFHRYj|wihG(LPi~-8_B6J%eZP`$=SHJAbM{~s)4lr|J+mAvZNExsFGdEg zvyln4(ARCcXD;b!G`>7MZArpu-?>KDE(c3nf^aP#-YTZ&0-PY8nlB5+*c0c1riBib-!$biv3#G#x8=hV>4cR8QC`ZLGCk#Rk4O zh|#^daa;Q_BlT^Kt&Ln%`u(39omxfTns@F@&C}T_j~fnMhdy@ixS~rlaZ9_il6E{u z+8LLT`gX?7UQ!f{jr_tZR>YrSq$hlds_d7D7>-@{`AEgpDGBA!a_wK7F9%aJrJet~ro{dq+5@Q-D{6@~_ zBfew-uSa}zn>_zH#Mjr#`6|SZl#75)j5eSF>wvIF0x-5jp>za7lwS zuyuv}PN&4YG|=+9yx=FqZ=RI%R}oJw$iIpBNwa+ZJBSYzKu|G%s=v{|lucgnrf`X+ zgODqHQV%cCPRas~bnV%9#7z(lo09N(1kEp2O03yVnI)F6+a+jkpd{$DR!NYuRw8J% zH4+F~8XPtWZX$#kXklfG#bHg-QXN4M(rgx|-GLG+tEfbR(;}4-Hk-5t;j~B{Su5?0 zQduJ*F+0F_=W-6HQ*)m mHJVVw!}<--jsa76*9KUNoi&9|3bCppd_#z5D#A~fLc@PC8bq%E delta 1943 zcma)7e@Giw9DncbVp6N_6lF!K$ugB-S9{4NX(~Y%6KU&8Es~X1?7EKHIp$mj{bROb zYL*n~T&I1|Yqd!Ja6(Jxm`@@T=cO>V;Xe~`V`D$qV02|a#8Kupon?2I_mW&n*#hDF zd7tm+`?+_Yd&$Lr;*v|;u#HJpjnqztg#rlF53PXLmOy3mRNhSbjrtl&$-ok+bdLF- zb7UU1!eZ-xUjs*AqQrCuTv@mSt}dWp>8It#iemB`V1B`S6v*wCm>ebexCOh*WCs0E zU4hyvEU2M|M~lx|(6p@%dyan3psJl+yxA0!jS)`fxv2a)7?KfRVa06~XBkCi!Bh50 z4&APrF&P=s{-)Zhn&F0;AcLoE{f5%}n-Lj+3);7ki)6WkY9a2cY=Ok8-C+82E61ki zd_Kf$o))Nheas5gqfzcDbf%uc%l$ta&`i%t>DL>XjH(T~ 
zj83y!JgX-b4Jg^WWP17GhytwQG?(|pR~u1Tb)w0H4Ekge_}y49r`e5fUNNAxQ68Cw zZ!if)O9cxft1MdX{{ruP^H*+qYzNEKT52Jk8>jce)tUtp*6Zt0S#_jA3@VQ*`-n6dlp}HpudVmcG*%yjO|x%p zhktWw0JD9?FjyY$evbZRQ?)=VMsQ(L&p%H1!5rR7_`o5(emCKHub%f1ercBm!s)J` zG`O6B!-PN3k(O%UIN^gqJ%5_;wmqr8DB69M@Se+*QLA$ObEGlRo815)MtH7M&kqy+ zT8@5<>L>L2RA5H%)O9`ozPbzprLn1R%xg;{xc0f8|Ag>=y{G4w2|tvh|CaE_s`W|S zBm5B_g1T(0`=rrPs~>npT{0#hG)f!0VItfdXae1*Ug+#T=@6QQ40I4E_*`}h-A*Tk z4lxbg&a|M}l^*lETtw>>1s{QSpTC(xmv5f}!|Ijof`?kzT|PHaIE1vZJ>a3c6x{-a z4u3`~iqu$PMdIKT_E9sj$wfEpv{ z`2u|@nx%RNbQj0TI!<1i&r1V#XqMs*xPytCr8x&|V2)U%4Yhu0mFh*M9(YnZC&HS4 E0lqEuU;qFB diff --git a/hsa/gfx950/pa/pa_fp16_pertokenInt8_gqa8_1tg_4w_mtp_msk1.co b/hsa/gfx950/pa/pa_fp16_pertokenInt8_gqa8_1tg_4w_mtp_msk1.co index c60a789b11faf6de81e909f87c15aa0c4328cd84..c5f4570b19f8f76bbe7d6677a93fc326778129a9 100755 GIT binary patch delta 5489 zcmds*e@qj16u`gVYmXw5fu-tT$&r4tvxFFV;YjIee%K9x^1>42s&9cpGZbEn>}`6vJLv`} zGg8nwVut6?d0_*bQ(&A!jKZ_l{7KMtQ4SxO1jkevh;koWIcsDdH!mW1Z9@y!<#b5x zVBjLR2}op7hZIA)xu`>ETVEtr&pl4?AvpPfXgHA92q%{|*5|AGW`%7!koO9lEPb&d zU*S0*B5~k@YfB*n<%(Pa0arfIuIg*1=q-wVp?!6L>slQsKem#~jjSr46Qwv?gf5>O zeL&P7$lHi=Ho!@N3R$0XUR}BefKuSRS}MzBvRo$1<+5BZOTineH*gU%#5) z^~?^*5;aRZ%`s^oI4mjoY^X-tlp1ZQM%({eBdHcC=i!YgZmPnPA_xM{UJRe$YbghH8o5gV9Ji@?Fj1)&m{`kYFB#*HA^SU|uxLY<$ zo6DKDtpANiXj$_rxn{I?@MOaZgt$!(fPn$mNmb8FAm|B_`rziFoov zrRk};58G`L2YR;?{HpoMrPe(=HAv~Y)P$zSn_}IM7iQ9Pd9k(T@;{KNs?v9<3E{#o z#KAud>yj5+cP`&Ho?2>0XFmZ;ZP!Tm4F45TIb!QA7V^x4bZ?tDc>97;QF6PQa}pya z$igK%#$%GwIK0z#$@dad@5Y>o6(qc3xRjiN%zZ#<#rzaeh#SL;t1 z|3U`VuK(!%MSSefl&_-AQr#>d@~xiR5a0iuo-anc^SZ{9{Vzp^8`Bav5$_q(4ycM3 z@pHfHdGcwG1&rR+^G(SAj{G!a>Z=8+`7OYlqc?=nfoB)!`P+!!3+Z_j@j{QDe}wo% zp`Ldm-q}Lw4X2P{xLnWs5pP|m=jRchNS3G-$0$#-h)git7XPpcp5=o98iZT6y%v7e z;q*``)qzyX$CP2!6~GRZ!{ySbKY-OB>&9w;Wsu5NF*q1q<) z2lhK~2!`=FF%7!#1v7M26{3t&`>2>QpC3~ng)fZ>xbd~KKA(E5LN1?mx^Xm?D#Mvs z#)UJpE}y1)JesdR7>v`^&?Pvms}pV2@D?P7-SL-e;L5CF{8+O#-c xC4;8;mKwMyezyiLAiQ;PYb~rG?$pKWWtmliW_UM9Iwfl1f~;-?IYNaH{0HN5jNkwO delta 2511 
zcma);eM}o=9LJyMdVLvVawBDkfegx)Ap*PJT}#oXqi+l+GGxHe5XOjXjUllizA%k; ztHmsaX~#tiaWSZ5(PU1?vwF9LvB7@m`+i%AtuiX`f#| zzu)isd%3pP?@yZ-Q|4p|)mwYK^kghr4uSF@Ct%dKK%o|eS7U$2no|n{$t-N(JnDbu zMYS|>H{9s>?;0o$$Fgk~z`)c6@WzzbWWRJz>s@`)OJH)!+XAHG;yx*XZAUS3=SY-z z>p+3X9c0Cyn+wI!k_uGUGDC@5@*~V`eO)W!|QG1P&=Nps3tO=diJwlmc-ndwqvlT@eU;no?)R~xVnc52M z7Q+hU8LOYprcuwK>t?ZaC{z5X>oeOrQ+fb-4_>p_Is7eRuF__)u%8iod#>4kUrMiI zqs6!mv32_uqB>!*-TEu7rmO?85g)x;ai(Z)DIqC}WF2i5<2uSv(XmG?Xh3YYz%q1W zNVYqNf3Q(!wjtm9J52w$@K-)~&&)i(Nt@WWK0UE~wN{Yl+ysrXCk7bXTjq3YsPpp2 zWYFm>`{xVk0Uuaz91t!3`j{E@%*qNKf38xvxTGnZi^9pQqE8nRlA=O}YL~CT>T6s+ z{xAc-R;C&?r>0$y9-q&LY$hxpFS=LZox`K%ahW&d#=6__Hx3_}>S+ZdXN)`-^HXcM z1M^UwF}@sg#%tsr%)hx$HJ;GDLF}N{B-CSG+op%A#-o@wH5vKim`574pBUnOGWB*n z%#A%q+%FJ^cJ^U@cbAbTF$d{RRq`h0ZEJK+VqW-+YUnT8+N?3q5Z9S%OscO|5Pj9i zzr?)cLnFV6dG9+${uAa4Lq!v$MgatoWnL54)x6jP_6&YMqkTa@VOHvVGb~8$ZGm~zOm(WK1^TF@KQ$_! a8JEJ^M=1)lpxnJMFT;aB*j`G75c~}Wj67BV diff --git a/hsa/gfx950/pa/pa_fp16_pertokenInt8_gqa8_2tg_4w.co b/hsa/gfx950/pa/pa_fp16_pertokenInt8_gqa8_2tg_4w.co index ec6ca3da31518d4214cd44c7c0433e57f5090dcb..8ce62e2e23293e8861c668ea610bc6ec6025dc84 100755 GIT binary patch delta 1767 zcmdn7oN>Z>#t9ls0wEK%meoJ7V*mpf%^(2bGXg0#Abt=4;Xf!qrJ-B~1%8M)Odk`F zON>6K{>lD~f=Yz#*xba}B`EWsVbVUJ@eThOrh#dfCYwGX8=(d<8?lDXBEqH2^?{84 ze=+&h34}8KSM&Fg0kb1PVgYpmv5f!K1N~$`;`|H^`)wFHFR3vyGT4G>MMef2+da)- zwj&Zd5yalrY~^Az$;!ocnxBn8gTJj{Q-F;~L!hl_(=R5UI)MPFogs|>Cr=R*k~6)i zR?5Jj&R}{`9Y|{cX$>H)38XcF^ke~9#mPs-3<)b!n*7d{Z?lTH2Xp;pj@JF`O)UGf zSX%a1F!byP*((V3t00nJ1)+Wwq^@6C!JeI5Bc(*x)ry<k{j47BMVNsg?yJbHp*4$fc`fWTJ_Vcl{?sxA1vKwvM zY#JH%vodXFv%W$oRRhi3v3Y`B1`88V{VykC)GKX%=NiRP-{J=?M4=Q9m}X~qVS_3z z4i$IMMip0rir+y~uL~7F5r?YY0xG^G49pOQ%Ps)1m^c_76imM5x6R7Q$dDl?F)1}C z-oVVs5ymt!HGnbAoXlZNH)A6h)5XYTa-pE^=I{QBOiT~#CL8*RPSy{SAlcj@XCvOjuHY|o0}NB1UU{eOxkylVcNdU8p5T_lLf_PCoj@fn4BkWFqz3xZ}S^* z4`u;*j@JF`O)UEbSX%a1F!b!7oS>@&R28>5LPm^f@+8M?lLa(bHh(bSVVNAFyoH0E z395`o9HdNPa-W&qW))R2rp-*2$5r{JUvC149EC^RhI-P2Idmw*=Y511uch z|Du|aw 
zIm)pz0Xdu8t}p?8wQX~Z*Ae#mH(t;{hEhCWnw?>X6{@&6RJ=k9Ra^-wz6MRbE>!$O z6smd)sQ8~CFhdwFy8y&u;$Yy&oxCY%o0X%n6GKj7Qff}TftisjjA>*DXPOzo6}TEZ z!xT6hyG)ZKhOSdoc|!+LUIsN7_MFaa)j8uOj36=o(`0Znoh&?MLaOKOvK1;r;% k2$Nu{@SVIW%#KOJZ}KZ3CE-6=E8LE=!yn?@l|Xal0TVf-t^fc4 From 8089c44c232d0b05ecf3a8a5b6987dd466a4d10e Mon Sep 17 00:00:00 2001 From: yihonglie Date: Thu, 21 May 2026 02:26:04 +0000 Subject: [PATCH 6/6] repro(pa-asm): standalone reproducer for fp8 PA OOB at bs=128,qlen=3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds op_tests/repros/ with a self-contained Python reproducer for the HIP illegal-memory access in pa_bf16_pertokenFp8_gqa8_1tg_4w_mtp_msk1.co (gfx950). Triggers exactly when batch_size==128 and qlen==3 on the fp8 per-token-quant ASM PA path. The bf16 sibling kernel (pa_bf16_noquant_gqa8_1tg_4w_mtp_msk1.co) is clean on the same shape. Minimal repro (≤5s, no concurrency required): AMD_SERIALIZE_KERNEL=3 HIP_LAUNCH_BLOCKING=1 \ python op_tests/repros/pa_asm_fp8_repeat_call.py \ --bs 128 --ctx 1024 --qlen 3 --kv-dtype fp8 --n-repeat 5 See op_tests/repros/README.md for the full sweep matrix and negative controls (bs±1, qlen±1, same total_qo via other factorings, bf16 KV). 
--- op_tests/repros/README.md | 127 +++++++ op_tests/repros/pa_asm_crash_repro.py | 423 ++++++++++++++++++++++ op_tests/repros/pa_asm_fp8_min_repro.py | 153 ++++++++ op_tests/repros/pa_asm_fp8_repeat_call.py | 149 ++++++++ op_tests/repros/pa_asm_fp8_seq_repro.py | 171 +++++++++ op_tests/repros/pa_asm_fp8_shape_sweep.py | 71 ++++ 6 files changed, 1094 insertions(+) create mode 100644 op_tests/repros/README.md create mode 100644 op_tests/repros/pa_asm_crash_repro.py create mode 100644 op_tests/repros/pa_asm_fp8_min_repro.py create mode 100644 op_tests/repros/pa_asm_fp8_repeat_call.py create mode 100644 op_tests/repros/pa_asm_fp8_seq_repro.py create mode 100644 op_tests/repros/pa_asm_fp8_shape_sweep.py diff --git a/op_tests/repros/README.md b/op_tests/repros/README.md new file mode 100644 index 0000000000..52977a4280 --- /dev/null +++ b/op_tests/repros/README.md @@ -0,0 +1,127 @@ +# aiter ASM PA crash — standalone reproducer + +A minimal aiter-only reproducer for the HIP illegal-memory crash observed in +production (Kimi-K2.5-MXFP4 + Eagle3 spec-decode on 8x MI355 / gfx950) when +ATOM uses ASM-force paged-attention. + +## TL;DR fingerprint + +| | value | +|---|---| +| **Kernel** | `pa_bf16_pertokenFp8_gqa8_1tg_4w_mtp_msk1.co` (gfx950) | +| **Trigger** | `batch_size == 128` AND `qlen == 3` (specifically — not a boundary, not `total_qo`) | +| **Shape** | GQA=8 (8 Q heads / 1 KV head), head_size=128, block_size=16 | +| **KV dtype** | **fp8 per-token quant only** — bf16 (`pa_bf16_noquant_gqa8_1tg_4w_mtp_msk1.co`) does NOT crash at the same shape | +| **Failure** | HIP illegal memory access, reported asynchronously. Surfaces at the next `hipModuleLaunchKernel` call (so call N+1 errors when call N was the offender). | +| **Min repeats to crash** | 2–3 invocations of the bad shape. Single call sometimes survives, 2nd or 3rd reliably trips it. | +| **Sequence dependency** | None — pure shape bug. Calling the bad shape in a fresh process triggers it. 
| +| **Concurrency dependency** | None — single-stream, `AMD_SERIALIZE_KERNEL=3 HIP_LAUNCH_BLOCKING=1` still crashes. | + +## How to reproduce in ≤30 seconds (inside the eagle3 container) + +```bash +cd /app/aiter-test # or wherever aiter is importable +AMD_SERIALIZE_KERNEL=3 HIP_LAUNCH_BLOCKING=1 \ + python op_tests/repros/pa_asm_fp8_repeat_call.py \ + --bs 128 --ctx 1024 --qlen 3 --kv-dtype fp8 --n-repeat 5 +``` + +Expected output on a buggy build: +``` +[aiter] LoadKernel: _ZN5aiter40pa_bf16_pertokenFp8_gqa8_1tg_4w_mtp_msk1E + hsaco: /app/aiter-test/hsa//gfx950/pa/pa_bf16_pertokenFp8_gqa8_1tg_4w_mtp_msk1.co +[AITER] /app/aiter-test/csrc/include/aiter_hip_common.h:244 + fail to call hipModuleLaunchKernel(...) ---> [HIP error](an illegal memory access) +Aborted (core dumped) +``` + +Negative controls (all pass on the same build): +```bash +# bf16 KV — same bs=128 qlen=3, different kernel, no crash: +python pa_asm_fp8_repeat_call.py --bs 128 --ctx 1024 --qlen 3 --kv-dtype bf16 + +# fp8 KV, bs off by one — no crash: +python pa_asm_fp8_repeat_call.py --bs 127 --ctx 1024 --qlen 3 --kv-dtype fp8 +python pa_asm_fp8_repeat_call.py --bs 129 --ctx 1024 --qlen 3 --kv-dtype fp8 + +# fp8 KV, qlen off by one — no crash: +python pa_asm_fp8_repeat_call.py --bs 128 --ctx 1024 --qlen 2 --kv-dtype fp8 +python pa_asm_fp8_repeat_call.py --bs 128 --ctx 1024 --qlen 4 --kv-dtype fp8 + +# same total_qo=384 via other (bs, qlen) — no crash: +python pa_asm_fp8_repeat_call.py --bs 192 --ctx 1024 --qlen 2 --kv-dtype fp8 # 192*2=384 +python pa_asm_fp8_repeat_call.py --bs 96 --ctx 1024 --qlen 4 --kv-dtype fp8 # 96*4=384 +``` + +## Sweep data (each cell = 5 repeated calls, fresh process, `AMD_SERIALIZE_KERNEL=3 HIP_LAUNCH_BLOCKING=1`) + +KV dtype = fp8, head_size=128, block_size=16, num_blocks=8192, GQA=8 (num_q_heads=8, num_kv_heads=1).
+ +qlen=3 sweep over batch_size (ctx_len=1024, fp8 KV): + +| bs | 32 | 64 | 96 | 124 | 125 | 126 | 127 | **128** | 129 | 130 | 144 | 192 | 256 | 512 | +|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----| +| result | OK | OK | OK | OK | OK | OK | OK | **CRASH** | OK | OK | OK | OK | OK | OK | + +bs=128 sweep over qlen (ctx_len=1024, fp8 KV): + +| qlen | 1 | 2 | **3** | 4 | 5 | 6 | 7 | 8 | +|------|---|---|---|---|---|---|---|---| +| result | OK | OK | **CRASH** | OK | OK | OK | OK | OK | + +bs=128, qlen=3, ctx_len sweep: + +| ctx_len | 128 | 1024 | 2048 | 4096 | 6724 | 8192 | 16384 | +|---------|---|---|---|---|---|---|---| +| result | CRASH | CRASH | CRASH | CRASH | CRASH | CRASH | CRASH | + +→ ctx_len has no effect on the outcome; bs=128 ∧ qlen=3 is the entire trigger. + +## What's in this directory + +| file | purpose | +|---|---| +| `pa_asm_fp8_repeat_call.py` | **The minimal reproducer.** Repeats one `(bs, ctx, qlen)` call N times. Use for bisection. | +| `pa_asm_fp8_min_repro.py` | Single-call variant (does NOT crash by itself — bug needs ≥2 calls). Useful for checking shapes that are individually safe. | +| `pa_asm_fp8_seq_repro.py` | Replays a fixed call sequence from the stress driver and lets you also `--repeat-only-bad` to confirm sequence-independence. | +| `pa_asm_crash_repro.py` | Original stress driver that mimics ATOM's call pattern (random shape mix, multi-stream). Useful for end-to-end "would this build crash under prod-like load?" | +| `pa_asm_fp8_shape_sweep.py` | Sweep wrapper (4 qlens × 9 ctx × 5 bs). Each cell forks a fresh process. | +| `README.md` | This file. | + +## Production correlation + +- ATOM ASM-force path calls `aiter.pa_fwd_asm` with exactly these shapes during + the Eagle3 spec-decode target step (Kimi MLA absorbed → 1 KV head, TP=8 → + 8 Q heads per rank → GQA=8). Draft tokens are 3 per step (eagle3 emits 3), + so `qlen=3`. +- The bench runs at concurrency=128.
When the scheduler packs 128 in-flight + requests into one step, the resulting `attn_metadata` is exactly + `batch_size=128, max_seqlen_q=3` → triggers this kernel. +- Production crash signature (`event.synchronize() → HIP illegal memory`) is + the same async-reported error this reproducer surfaces. +- Crash req# varies wildly in production (131, 857, ~900, ~3945) because it + depends on when the scheduler first assembles a step matching `bs=128 ∧ + qlen=3` — not on cumulative state. + +## Build versions used + +- aiter: `pr3211-on-main @ aff40475d` (also reproduces on `main @ ee28d47ac`) +- ROCm/HIP: per the `rocm/atom-dev:latest` container +- Target arch: gfx950 (MI355) +- Container: eagle3 (podman) + +## Suggested next steps for the ASM team + +1. Disassemble `pa_bf16_pertokenFp8_gqa8_1tg_4w_mtp_msk1.co` with + `roc-obj-extract` / `llvm-objdump -d`, then look at how `batch_size` + and `max_qlen=3` parameterize the kernarg block. The branch that's hit + only at `(bs=128, qlen=3)` is the suspect. +2. Compare with the bf16 sibling `pa_bf16_noquant_gqa8_1tg_4w_mtp_msk1.co` + (which does NOT crash on the same shape) to find the extra fp8 code path. +3. Confirm with `rocm-debug-agent`: run + ``` + AMD_LOG_LEVEL=4 ROCM_DEBUG_AGENT=on \ + AMD_SERIALIZE_KERNEL=3 HIP_LAUNCH_BLOCKING=1 \ + python pa_asm_fp8_repeat_call.py --bs 128 --ctx 1024 --qlen 3 --n-repeat 3 + ``` + to capture the wave dump, faulting PC, and offending V# descriptor. diff --git a/op_tests/repros/pa_asm_crash_repro.py b/op_tests/repros/pa_asm_crash_repro.py new file mode 100644 index 0000000000..6498ed4a00 --- /dev/null +++ b/op_tests/repros/pa_asm_crash_repro.py @@ -0,0 +1,423 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: MIT +# Standalone aiter reproducer for ASM paged-attention crash observed in ATOM +# (Kimi-K2.5-MXFP4 + Eagle3 spec-decode) at 30k req x 128 conc. +# +# Background: +# - Inside ATOM, attention_mha.py calls aiter.pa_fwd_asm via ASM-force path. 
+# At 30k req x 128 concurrency, this crashes with HIP illegal memory access, +# async-reported via event.synchronize(). Crash req# is highly variable +# (observed 131 / 857 / ~900 / ~3945 across runs). +# - The crash signature (wave dump) names kernels of the form +# pa_bf16_pertokenFp8_gqa8_1tg_4w_mtp_msk1.co (fp8 KV) +# pa_bf16_noquant_gqa8_1tg_4w_mtp_msk1.co (bf16 KV) +# so the offending kernel is the ASM PA backend, GQA ratio 8, with mtp. +# - The crash reproduces with both fp8 and bf16 KV cache. Forcing +# self.kv_scale to be a 131072-element array did NOT fix it. +# - Gluon-attention path under identical workload (30k x 128) is stable +# (30000/30000 PASS). So root cause is in the ASM PA backend, not in ATOM. +# +# This script is a self-contained aiter-only stress driver that: +# * matches ATOM's pa_fwd_asm call signature (incl. qo_indptr, K_QScale, +# V_QScale, max_qlen, high_precision=0). +# * uses GQA-8 (num_q_heads=8, num_kv_heads=1), block_size=16, head_dim=128 +# -> selects the same ASM .co binary as production. +# * mixes mtp qlen in {1,2,3,4} per iteration to exercise the same kernel +# variants the bench hits. +# * varies batch_size (mostly large, ~128) and ctx_len each iteration +# to imitate the request mix. +# * launches many calls on multiple CUDA streams without sync between them, +# because the bug is async-reported and a strict launch-and-sync loop may +# not race the way 128-conc inflight requests do. +# * periodically forces a sync, catches the HIP error, and prints the iter, +# batch shape, and current call params so the ASM team can inspect.
+# +# Usage: +# # bf16 KV (no quant), default: +# python pa_asm_crash_repro.py +# +# # fp8 KV (matches the wave-dumped kernel from crash note): +# python pa_asm_crash_repro.py --kv-dtype fp8 +# +# # tweak shape mix: +# python pa_asm_crash_repro.py --kv-dtype fp8 --max-iters 200000 \ +# --streams 16 --sync-every 32 + +import argparse +import os +import random +import sys +import time +import traceback +from typing import List, Optional, Tuple + +import torch + +import aiter +from aiter import dtypes +from aiter import pertoken_quant + + +# --------------------------------------------------------------------------- # +# KV cache helpers (same layout as test_pa_mtp.py and aiter PA convention) +# --------------------------------------------------------------------------- # +def make_kv_cache( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + dtype: torch.dtype, + device: str = "cuda", +) -> Tuple[torch.Tensor, torch.Tensor]: + """Allocate K/V cache in aiter PA layout. + + K: [num_blocks, num_kv_heads, head_size // x, block_size, x] + V: [num_blocks, num_kv_heads, head_size, block_size] + where x = 16 // dtype.itemsize. 
+ """ + x = 16 // dtype.itemsize + k_shape = (num_blocks, num_kv_heads, head_size // x, block_size, x) + v_shape = (num_blocks, num_kv_heads, head_size, block_size) + k_cache = torch.empty(k_shape, dtype=dtype, device=device).uniform_(-1, 1) + v_cache = torch.empty(v_shape, dtype=dtype, device=device).uniform_(-1, 1) + return k_cache, v_cache + + +def asm_v_shuffle(v_cache: torch.Tensor) -> torch.Tensor: + """ASM PA expects V re-shuffled to [B, KVH, block_size/x, head_size, x].""" + x = 16 // v_cache.element_size() + num_blocks, num_kv_heads, head_size, block_size = v_cache.shape + v = v_cache.view(num_blocks, num_kv_heads, head_size, block_size // x, x) + return v.permute(0, 1, 3, 2, 4).contiguous() + + +def pertoken_quant_kvcache_symm( + k_cache: torch.Tensor, + v_cache: torch.Tensor, + quant_dtype: torch.dtype, +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """Per-token symmetric quant of K/V to fp8 + scale arrays in ASM layout. + + Returns: + k_quant: same layout as K cache, in `quant_dtype` + v_quant: same layout as V cache, in `quant_dtype` + k_scale_asm: [num_blocks, num_kv_heads, block_size, 1] (ASM-friendly) + v_scale_asm: same shape + """ + num_blocks, num_kv_heads = k_cache.shape[0], k_cache.shape[1] + head_dim = v_cache.shape[2] + block_size = v_cache.shape[3] + + k_perm = ( + k_cache.permute(0, 1, 3, 2, 4) + .reshape(num_blocks, num_kv_heads, block_size, -1) + .contiguous() + ) + v_perm = ( + v_cache.permute(0, 1, 3, 2) + .reshape(num_blocks, num_kv_heads, block_size, -1) + .contiguous() + ) + + k_quant, k_scale_asm = pertoken_quant(k_perm, quant_dtype=quant_dtype) + v_quant, v_scale_asm = pertoken_quant(v_perm, quant_dtype=quant_dtype) + + quant_x = 16 // quant_dtype.itemsize + k_quant = ( + k_quant.view(num_blocks, num_kv_heads, block_size, head_dim // quant_x, quant_x) + .permute(0, 1, 3, 2, 4) + .contiguous() + ) + v_quant = ( + v_quant.view(num_blocks, num_kv_heads, block_size, head_dim) + .permute(0, 1, 3, 2) + 
.contiguous() + ) + return k_quant, v_quant, k_scale_asm, v_scale_asm + + +# --------------------------------------------------------------------------- # +# Iteration: pick a random request mix, build inputs, fire pa_fwd_asm +# --------------------------------------------------------------------------- # +def build_iter_inputs( + rng: random.Random, + num_kv_heads: int, + num_q_heads: int, + head_size: int, + block_size: int, + num_blocks: int, + dtype: torch.dtype, + device: str = "cuda", +): + """Construct one paged-attention call's inputs. + + Mirrors the production mix at 128 conc: + - batch_size: most often near 128, sometimes 32/64/96/256 to exercise + edge cases (matches a serving step where some seqs finished). + - ctx_len: random in [64, 8192], occasionally up to 16384. + - qlen: in {1, 2, 3, 4} (eagle3 draft / MTP draft tokens). + """ + # batch size mix: biased toward 128 (matches CONCURRENCY=128) + batch_size = rng.choice([32, 64, 96, 128, 128, 128, 128, 128, 192, 256]) + + # ctx_len mix: most short-to-medium, sometimes long + if rng.random() < 0.08: + ctx_len = rng.randint(8192, 16384) + elif rng.random() < 0.4: + ctx_len = rng.randint(64, 1024) + else: + ctx_len = rng.randint(1024, 8192) + ctx_len = max(ctx_len, block_size) + + # MTP qlen distribution: weight toward 3-4 since eagle3 emits 3 draft tokens + qlen = rng.choice([1, 2, 3, 3, 3, 4, 4]) + + max_num_blocks_per_seq = (16384 + block_size - 1) // block_size + num_blocks_per_seq = (ctx_len + block_size - 1) // block_size + + qo_indptr = torch.zeros(batch_size + 1, dtype=torch.int32, device=device) + seq_lens_qo = torch.full((batch_size,), qlen, dtype=torch.int32, device=device) + qo_indptr[1:] = torch.cumsum(seq_lens_qo, dim=0) + total_qo = int(qo_indptr[-1].item()) + max_qlen = qlen + + query = torch.empty( + (total_qo, num_q_heads, head_size), dtype=dtype, device=device + ).uniform_(-1, 1) + + seq_lens = torch.full( + (batch_size,), ctx_len, dtype=torch.int32, device=device + ) + + # 
block_tables: random page assignments per request, padded to + # max_num_blocks_per_seq. + block_tables = torch.zeros( + (batch_size, max_num_blocks_per_seq), dtype=torch.int32, device=device + ) + for i in range(batch_size): + idx = torch.randint( + 0, num_blocks, (num_blocks_per_seq,), dtype=torch.int32, device=device + ) + block_tables[i, :num_blocks_per_seq] = idx + + return dict( + query=query, + block_tables=block_tables, + seq_lens=seq_lens, + qo_indptr=qo_indptr, + max_qlen=max_qlen, + batch_size=batch_size, + ctx_len=ctx_len, + qlen=qlen, + total_qo=total_qo, + ) + + +def run_one( + iter_idx: int, + rng: random.Random, + cfg, + persistent, + stream: torch.cuda.Stream, + log_shape: bool = False, +) -> dict: + inp = build_iter_inputs( + rng, + cfg.num_kv_heads, + cfg.num_q_heads, + cfg.head_size, + cfg.block_size, + cfg.num_blocks, + cfg.compute_dtype, + ) + if log_shape: + print(f"[repro] iter={iter_idx:>6} bs={inp['batch_size']:>4} " + f"ctx={inp['ctx_len']:>5} qlen={inp['qlen']} " + f"total_qo={inp['total_qo']:>5} -> calling pa_fwd_asm", flush=True) + + if cfg.kv_dtype == "fp8": + k_cache = persistent["k_quant"] + v_cache_asm = persistent["v_quant_asm"] + k_scale = persistent["k_scale_asm"] + v_scale = persistent["v_scale_asm"] + else: + k_cache = persistent["k_cache"] + v_cache_asm = persistent["v_cache_asm"] + k_scale = None + v_scale = None + + with torch.cuda.stream(stream): + out = aiter.pa_fwd_asm( + inp["query"], + k_cache, + v_cache_asm, + inp["block_tables"], + inp["seq_lens"], + inp["block_tables"].stride(0), + max_qlen=inp["max_qlen"], + K_QScale=k_scale, + V_QScale=v_scale, + out_=None, + qo_indptr=inp["qo_indptr"], + high_precision=0, + ) + + # IMPORTANT: keep refs to *all* input tensors so the caching allocator + # cannot recycle their backing memory while the async kernel is still + # running. pa_fwd_asm uses raw HIP launches and may not properly mark + # the allocator's stream-use tracking on the input blocks. 
+ rec = dict( + iter=iter_idx, + batch_size=inp["batch_size"], + ctx_len=inp["ctx_len"], + qlen=inp["qlen"], + total_qo=inp["total_qo"], + out=out, + _query=inp["query"], + _block_tables=inp["block_tables"], + _seq_lens=inp["seq_lens"], + _qo_indptr=inp["qo_indptr"], + ) + return rec + + +# --------------------------------------------------------------------------- # +# main +# --------------------------------------------------------------------------- # +def main(): + p = argparse.ArgumentParser(description="ASM paged-attention crash repro") + p.add_argument("--kv-dtype", choices=["bf16", "fp8"], default="bf16", + help="KV cache dtype (both observed to crash in production)") + p.add_argument("--num-q-heads", type=int, default=8, + help="Q heads (production: 8/rank with TP=8)") + p.add_argument("--num-kv-heads", type=int, default=1, + help="KV heads (production: 1 after MLA absorb -> GQA=8)") + p.add_argument("--head-size", type=int, default=128) + p.add_argument("--block-size", type=int, default=16) + p.add_argument("--num-blocks", type=int, default=8192, + help="Size of KV cache pool") + p.add_argument("--max-iters", type=int, default=100000) + p.add_argument("--streams", type=int, default=8, + help="Number of concurrent CUDA streams") + p.add_argument("--sync-every", type=int, default=64, + help="Force device sync + check error every N iters") + p.add_argument("--seed", type=int, default=42) + p.add_argument("--device", default="cuda:0") + p.add_argument("--log-each-call", action="store_true", + help="Print every call's shape (for bisection)") + args = p.parse_args() + + torch.manual_seed(args.seed) + random.seed(args.seed) + torch.set_default_device(args.device) + + class Cfg: + pass + cfg = Cfg() + cfg.num_q_heads = args.num_q_heads + cfg.num_kv_heads = args.num_kv_heads + cfg.head_size = args.head_size + cfg.block_size = args.block_size + cfg.num_blocks = args.num_blocks + cfg.kv_dtype = args.kv_dtype + cfg.compute_dtype = torch.bfloat16 # query/output dtype 
+ + print(f"[repro] device={args.device}") + print(f"[repro] kv_dtype={cfg.kv_dtype} num_q_heads={cfg.num_q_heads} " + f"num_kv_heads={cfg.num_kv_heads} GQA={cfg.num_q_heads // cfg.num_kv_heads}") + print(f"[repro] head_size={cfg.head_size} block_size={cfg.block_size} " + f"num_blocks={cfg.num_blocks}") + print(f"[repro] max_iters={args.max_iters} streams={args.streams} " + f"sync_every={args.sync_every}") + + # --- allocate persistent KV cache (re-used across iterations, like a + # paged KV pool in production) + k_cache, v_cache = make_kv_cache( + cfg.num_blocks, cfg.block_size, cfg.num_kv_heads, cfg.head_size, + cfg.compute_dtype, device=args.device, + ) + persistent = { + "k_cache": k_cache, + "v_cache_asm": asm_v_shuffle(v_cache), + } + if cfg.kv_dtype == "fp8": + k_q, v_q, k_s_asm, v_s_asm = pertoken_quant_kvcache_symm( + k_cache, v_cache, quant_dtype=aiter.dtypes.fp8 + ) + persistent["k_quant"] = k_q + persistent["v_quant_asm"] = asm_v_shuffle(v_q) + persistent["k_scale_asm"] = k_s_asm + persistent["v_scale_asm"] = v_s_asm + + torch.cuda.synchronize() + print(f"[repro] KV cache allocated: K {k_cache.shape} {k_cache.dtype} " + f"V {v_cache.shape} {v_cache.dtype}") + if cfg.kv_dtype == "fp8": + print(f"[repro] quant K {persistent['k_quant'].shape} " + f"{persistent['k_quant'].dtype} " + f"K_QScale {persistent['k_scale_asm'].shape} " + f"{persistent['k_scale_asm'].dtype}") + + streams = [torch.cuda.Stream(device=args.device) for _ in range(args.streams)] + rng = random.Random(args.seed) + + t0 = time.time() + last_log_t = t0 + last_log_iter = 0 + keepalive: List[dict] = [] # keep refs so async kernels don't free inputs + + for i in range(args.max_iters): + s = streams[i % args.streams] + try: + rec = run_one(i, rng, cfg, persistent, s, log_shape=args.log_each_call) + keepalive.append(rec) + # bound memory: only hold last ~2 batches per stream + if len(keepalive) > args.streams * 4: + keepalive.pop(0) + except Exception as e: + print(f"\n[repro] !! 
exception at iter {i} (sync-launch): {e}") + traceback.print_exc() + return _report_crash(i, t0, args) + + if (i + 1) % args.sync_every == 0: + try: + torch.cuda.synchronize() + except Exception as e: + print(f"\n[repro] !! HIP error surfaced at sync after iter {i}: " + f"{type(e).__name__}: {e}") + traceback.print_exc() + # dump last batch shapes to help ASM team + recent = keepalive[-min(len(keepalive), 16):] + print(f"[repro] last {len(recent)} call shapes:") + for r in recent: + print(f" iter={r['iter']:>6} bs={r['batch_size']:>4} " + f"ctx={r['ctx_len']:>5} qlen={r['qlen']} " + f"total_qo={r['total_qo']:>5}") + return _report_crash(i, t0, args) + + now = time.time() + if now - last_log_t >= 5.0: + d_iter = (i + 1) - last_log_iter + d_t = now - last_log_t + ips = d_iter / d_t + print(f"[repro] iter {i + 1:>7}/{args.max_iters} " + f"{ips:>7.1f} iter/s elapsed={now - t0:>7.1f}s") + last_log_t = now + last_log_iter = i + 1 + + torch.cuda.synchronize() + dt = time.time() - t0 + print(f"\n[repro] DONE — {args.max_iters} iters OK in {dt:.1f}s " + f"({args.max_iters / dt:.1f} iter/s). No crash observed.") + return 0 + + +def _report_crash(iter_idx: int, t0: float, args) -> int: + dt = time.time() - t0 + print(f"\n[repro] CRASHED at iter {iter_idx} after {dt:.1f}s " + f"(kv_dtype={args.kv_dtype}, streams={args.streams})") + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/op_tests/repros/pa_asm_fp8_min_repro.py b/op_tests/repros/pa_asm_fp8_min_repro.py new file mode 100644 index 0000000000..f604d4f9c5 --- /dev/null +++ b/op_tests/repros/pa_asm_fp8_min_repro.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python3 +# Minimal aiter-only single-call reproducer for the fp8 ASM PA crash. +# +# This is *single call* of aiter.pa_fwd_asm — no streams, no loop, no race. +# Reproduces an HIP illegal memory access purely from one bad shape. 
+# +# Discovery path: +# The stress driver (pa_asm_crash_repro.py) crashed at iter 3 of seed=1 +# with: batch_size=128, ctx_len=6724, qlen=3, GQA=8, head_dim=128, +# block_size=16, fp8 KV per-token quant. +# This script isolates exactly that call. +# +# Usage: +# python pa_asm_fp8_min_repro.py +# +# Knobs (env var): +# PA_REPRO_PAD=1 pad block_tables to max_num_blocks_per_seq=1024 (default). +# PA_REPRO_PAD=0 use tight block_tables of (batch_size, num_blocks_per_seq). +# PA_REPRO_BS=N override batch_size (default 128). +# PA_REPRO_CTX=N override ctx_len (default 6724). +# PA_REPRO_QLEN=N override qlen (default 3). +# PA_REPRO_NBLOCKS=N override KV pool num_blocks (default 8192). +# +# Expected on a buggy build (current aiter ee28d47ac + PR3211 aff40475d): +# [AITER] hipModuleLaunchKernel failed -> HIP illegal memory access +# (kernel: pa_bf16_pertokenFp8_gqa8_1tg_4w_mtp_msk1.co) + +import os +import sys +import torch +import aiter +from aiter import pertoken_quant + + +def main(): + torch.manual_seed(0) + device = "cuda:0" + torch.set_default_device(device) + + # ---- shape (matches Kimi-K2.5 MLA-via-MHA, eagle3 MTP, TP=8 per-rank) ---- + head_size = 128 + block_size = 16 + num_q_heads = 8 # per-rank Q heads (TP=8 on 64 heads) + num_kv_heads = 1 # MLA absorbs to 1 latent KV head -> GQA=8 + + batch_size = int(os.environ.get("PA_REPRO_BS", 128)) + ctx_len = int(os.environ.get("PA_REPRO_CTX", 6724)) + qlen = int(os.environ.get("PA_REPRO_QLEN", 3)) + num_blocks = int(os.environ.get("PA_REPRO_NBLOCKS", 8192)) + pad_bt = bool(int(os.environ.get("PA_REPRO_PAD", "1"))) + + max_seq_len = 16384 + max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size # 1024 + num_blocks_per_seq = (ctx_len + block_size - 1) // block_size + + print(f"[min-repro] batch={batch_size} ctx_len={ctx_len} qlen={qlen} " + f"num_blocks={num_blocks} pad_block_tables={pad_bt}") + print(f"[min-repro] num_blocks_per_seq={num_blocks_per_seq} " + 
f"max_num_blocks_per_seq={max_num_blocks_per_seq}") + + # ---- allocate KV cache (bf16) and pertoken-quant it to fp8 ASM layout ---- + x = 16 // 2 # bf16 itemsize + k_cache = torch.empty( + (num_blocks, num_kv_heads, head_size // x, block_size, x), + dtype=torch.bfloat16, device=device, + ).uniform_(-1, 1) + v_cache = torch.empty( + (num_blocks, num_kv_heads, head_size, block_size), + dtype=torch.bfloat16, device=device, + ).uniform_(-1, 1) + + # pertoken quant -> fp8 + k_perm = ( + k_cache.permute(0, 1, 3, 2, 4) + .reshape(num_blocks, num_kv_heads, block_size, -1).contiguous() + ) + v_perm = ( + v_cache.permute(0, 1, 3, 2) + .reshape(num_blocks, num_kv_heads, block_size, -1).contiguous() + ) + k_q, k_scale_asm = pertoken_quant(k_perm, quant_dtype=aiter.dtypes.fp8) + v_q, v_scale_asm = pertoken_quant(v_perm, quant_dtype=aiter.dtypes.fp8) + quant_x = 16 // aiter.dtypes.fp8.itemsize + k_quant = ( + k_q.view(num_blocks, num_kv_heads, block_size, head_size // quant_x, quant_x) + .permute(0, 1, 3, 2, 4).contiguous() + ) + v_quant = ( + v_q.view(num_blocks, num_kv_heads, block_size, head_size) + .permute(0, 1, 3, 2).contiguous() + ) + # ASM V shuffle: [B, KVH, head_size, block_size] -> [B, KVH, block_size/x, head_size, x] + qx = 16 // v_quant.element_size() + v_quant_asm = ( + v_quant.view(num_blocks, num_kv_heads, head_size, block_size // qx, qx) + .permute(0, 1, 3, 2, 4).contiguous() + ) + + # ---- per-iter request inputs ---- + qo_indptr = torch.zeros(batch_size + 1, dtype=torch.int32, device=device) + seq_lens_qo = torch.full((batch_size,), qlen, dtype=torch.int32, device=device) + qo_indptr[1:] = torch.cumsum(seq_lens_qo, dim=0) + total_qo = int(qo_indptr[-1].item()) + query = torch.empty( + (total_qo, num_q_heads, head_size), dtype=torch.bfloat16, device=device, + ).uniform_(-1, 1) + seq_lens = torch.full((batch_size,), ctx_len, dtype=torch.int32, device=device) + + if pad_bt: + block_tables = torch.zeros( + (batch_size, max_num_blocks_per_seq), dtype=torch.int32, 
device=device, + ) + else: + block_tables = torch.zeros( + (batch_size, num_blocks_per_seq), dtype=torch.int32, device=device, + ) + for i in range(batch_size): + idx = torch.randint( + 0, num_blocks, (num_blocks_per_seq,), dtype=torch.int32, device=device, + ) + block_tables[i, :num_blocks_per_seq] = idx + + print(f"[min-repro] query={tuple(query.shape)} qo_indptr[-1]={total_qo}") + print(f"[min-repro] block_tables={tuple(block_tables.shape)} " + f"stride0={block_tables.stride(0)}") + print(f"[min-repro] k_quant={tuple(k_quant.shape)} " + f"v_quant_asm={tuple(v_quant_asm.shape)}") + print(f"[min-repro] K_QScale={tuple(k_scale_asm.shape)} {k_scale_asm.dtype}") + torch.cuda.synchronize() + + # ---- single call ---- + print(f"[min-repro] calling pa_fwd_asm ...", flush=True) + out = aiter.pa_fwd_asm( + query, + k_quant, + v_quant_asm, + block_tables, + seq_lens, + block_tables.stride(0), + max_qlen=qlen, + K_QScale=k_scale_asm, + V_QScale=v_scale_asm, + out_=None, + qo_indptr=qo_indptr, + high_precision=0, + ) + torch.cuda.synchronize() + print(f"[min-repro] OK -> out={tuple(out.shape)} {out.dtype}") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/op_tests/repros/pa_asm_fp8_repeat_call.py b/op_tests/repros/pa_asm_fp8_repeat_call.py new file mode 100644 index 0000000000..6327497ecd --- /dev/null +++ b/op_tests/repros/pa_asm_fp8_repeat_call.py @@ -0,0 +1,149 @@ +#!/usr/bin/env python3 +# Repeat one specific (bs, ctx, qlen) call N times and report when it crashes. +# All input tensors are kept in `history` so the caching allocator can't reuse +# their memory while async kernels are in flight. 
+ +import argparse +import sys +import torch +import aiter +from aiter import pertoken_quant + + +def build_kv(num_blocks, num_kv_heads, head_size, block_size, device): + x = 16 // 2 + k_cache = torch.empty( + (num_blocks, num_kv_heads, head_size // x, block_size, x), + dtype=torch.bfloat16, device=device, + ).uniform_(-1, 1) + v_cache = torch.empty( + (num_blocks, num_kv_heads, head_size, block_size), + dtype=torch.bfloat16, device=device, + ).uniform_(-1, 1) + k_perm = ( + k_cache.permute(0, 1, 3, 2, 4) + .reshape(num_blocks, num_kv_heads, block_size, -1).contiguous() + ) + v_perm = ( + v_cache.permute(0, 1, 3, 2) + .reshape(num_blocks, num_kv_heads, block_size, -1).contiguous() + ) + k_q, k_scale_asm = pertoken_quant(k_perm, quant_dtype=aiter.dtypes.fp8) + v_q, v_scale_asm = pertoken_quant(v_perm, quant_dtype=aiter.dtypes.fp8) + quant_x = 16 // aiter.dtypes.fp8.itemsize + k_quant = ( + k_q.view(num_blocks, num_kv_heads, block_size, head_size // quant_x, quant_x) + .permute(0, 1, 3, 2, 4).contiguous() + ) + v_quant = ( + v_q.view(num_blocks, num_kv_heads, block_size, head_size) + .permute(0, 1, 3, 2).contiguous() + ) + qx = 16 // v_quant.element_size() + v_quant_asm = ( + v_quant.view(num_blocks, num_kv_heads, head_size, block_size // qx, qx) + .permute(0, 1, 3, 2, 4).contiguous() + ) + return k_quant, v_quant_asm, k_scale_asm, v_scale_asm + + +def build_call(bs, ctx, qlen, num_blocks, num_kv_heads, num_q_heads, + head_size, block_size, device, seed=0): + torch.manual_seed(seed) + max_seq_len = 16384 + max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size + num_blocks_per_seq = (ctx + block_size - 1) // block_size + + qo_indptr = torch.zeros(bs + 1, dtype=torch.int32, device=device) + qo_indptr[1:] = torch.cumsum( + torch.full((bs,), qlen, dtype=torch.int32, device=device), dim=0) + total_qo = int(qo_indptr[-1].item()) + query = torch.empty( + (total_qo, num_q_heads, head_size), + dtype=torch.bfloat16, device=device, + ).uniform_(-1, 1) + seq_lens 
= torch.full((bs,), ctx, dtype=torch.int32, device=device) + block_tables = torch.zeros( + (bs, max_num_blocks_per_seq), dtype=torch.int32, device=device, + ) + for i in range(bs): + idx = torch.randint( + 0, num_blocks, (num_blocks_per_seq,), dtype=torch.int32, device=device, + ) + block_tables[i, :num_blocks_per_seq] = idx + return query, seq_lens, qo_indptr, block_tables + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--bs", type=int, default=128) + ap.add_argument("--ctx", type=int, default=6724) + ap.add_argument("--qlen", type=int, default=3) + ap.add_argument("--n-repeat", type=int, default=5) + ap.add_argument("--num-blocks", type=int, default=8192) + ap.add_argument("--kv-dtype", choices=["fp8", "bf16"], default="fp8") + args = ap.parse_args() + + device = "cuda:0" + torch.set_default_device(device) + head_size, block_size, num_q_heads, num_kv_heads = 128, 16, 8, 1 + + if args.kv_dtype == "fp8": + k_cache_for_call, v_cache_for_call, k_scale, v_scale = build_kv( + args.num_blocks, num_kv_heads, head_size, block_size, device, + ) + else: + # bf16 path: no quant scales, V still needs ASM shuffle + x = 16 // 2 + k_cache_for_call = torch.empty( + (args.num_blocks, num_kv_heads, head_size // x, block_size, x), + dtype=torch.bfloat16, device=device, + ).uniform_(-1, 1) + _v = torch.empty( + (args.num_blocks, num_kv_heads, head_size, block_size), + dtype=torch.bfloat16, device=device, + ).uniform_(-1, 1) + qx = 16 // _v.element_size() + v_cache_for_call = ( + _v.view(args.num_blocks, num_kv_heads, head_size, block_size // qx, qx) + .permute(0, 1, 3, 2, 4).contiguous() + ) + k_scale = v_scale = None + torch.cuda.synchronize() + + history = [] + for i in range(args.n_repeat): + try: + query, seq_lens, qo_indptr, block_tables = build_call( + args.bs, args.ctx, args.qlen, args.num_blocks, + num_kv_heads, num_q_heads, head_size, block_size, device, + seed=i + 1, + ) + out = aiter.pa_fwd_asm( + query, + k_cache_for_call, + v_cache_for_call, + 
block_tables, + seq_lens, + block_tables.stride(0), + max_qlen=args.qlen, + K_QScale=k_scale, + V_QScale=v_scale, + out_=None, + qo_indptr=qo_indptr, + high_precision=0, + ) + torch.cuda.synchronize() + history.append((query, seq_lens, qo_indptr, block_tables, out)) + except Exception as e: + print(f"CRASH at iter={i} bs={args.bs} ctx={args.ctx} qlen={args.qlen}: " + f"{type(e).__name__}: {e}", flush=True) + return 1 + + print(f"ALL OK — {args.n_repeat} calls of bs={args.bs} ctx={args.ctx} " + f"qlen={args.qlen}", flush=True) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/op_tests/repros/pa_asm_fp8_seq_repro.py b/op_tests/repros/pa_asm_fp8_seq_repro.py new file mode 100644 index 0000000000..35eb81c86c --- /dev/null +++ b/op_tests/repros/pa_asm_fp8_seq_repro.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 +# Clean fp8 ASM PA crash reproducer: exact sequence from stress driver +# (seed=1), no streams, no keepalive games. Every call holds refs locally +# in `history` so input tensors can't be freed under the kernel. +# +# Sequence (recorded from pa_asm_crash_repro.py --seed 1 --kv-dtype fp8): +# iter 0 bs= 96 ctx=1540 qlen=3 total_qo=288 +# iter 1 bs= 64 ctx=6361 qlen=3 total_qo=192 +# iter 2 bs=128 ctx= 919 qlen=3 total_qo=384 +# iter 3 bs=128 ctx=6724 qlen=3 total_qo=384 +# iter 4 bs=128 ctx= 168 qlen=3 total_qo=384 <- HIP error surfaces here +# +# Run with: +# AMD_SERIALIZE_KERNEL=3 HIP_LAUNCH_BLOCKING=1 \ +# python pa_asm_fp8_seq_repro.py +# +# If `--repeat-only-bad` is set, the script instead calls the iter-3 shape +# (bs=128, ctx=6724, qlen=3) repeatedly to test whether a single bad shape is +# enough vs whether the *sequence* matters. 
+ +import argparse +import os +import sys +import torch +import aiter +from aiter import pertoken_quant + + +def build_call(rng_seed, batch_size, ctx_len, qlen, num_kv_heads, num_q_heads, + head_size, block_size, num_blocks, device): + torch.manual_seed(rng_seed) + max_seq_len = 16384 + max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size + num_blocks_per_seq = (ctx_len + block_size - 1) // block_size + + qo_indptr = torch.zeros(batch_size + 1, dtype=torch.int32, device=device) + seq_lens_qo = torch.full((batch_size,), qlen, dtype=torch.int32, device=device) + qo_indptr[1:] = torch.cumsum(seq_lens_qo, dim=0) + total_qo = int(qo_indptr[-1].item()) + query = torch.empty( + (total_qo, num_q_heads, head_size), + dtype=torch.bfloat16, device=device, + ).uniform_(-1, 1) + seq_lens = torch.full((batch_size,), ctx_len, dtype=torch.int32, device=device) + block_tables = torch.zeros( + (batch_size, max_num_blocks_per_seq), dtype=torch.int32, device=device, + ) + for i in range(batch_size): + idx = torch.randint( + 0, num_blocks, (num_blocks_per_seq,), dtype=torch.int32, device=device, + ) + block_tables[i, :num_blocks_per_seq] = idx + return dict(query=query, seq_lens=seq_lens, qo_indptr=qo_indptr, + block_tables=block_tables, max_qlen=qlen, + bs=batch_size, ctx=ctx_len, qlen=qlen) + + +def build_kv(num_blocks, num_kv_heads, head_size, block_size, device): + x = 16 // 2 + k_cache = torch.empty( + (num_blocks, num_kv_heads, head_size // x, block_size, x), + dtype=torch.bfloat16, device=device, + ).uniform_(-1, 1) + v_cache = torch.empty( + (num_blocks, num_kv_heads, head_size, block_size), + dtype=torch.bfloat16, device=device, + ).uniform_(-1, 1) + k_perm = ( + k_cache.permute(0, 1, 3, 2, 4) + .reshape(num_blocks, num_kv_heads, block_size, -1).contiguous() + ) + v_perm = ( + v_cache.permute(0, 1, 3, 2) + .reshape(num_blocks, num_kv_heads, block_size, -1).contiguous() + ) + k_q, k_scale_asm = pertoken_quant(k_perm, quant_dtype=aiter.dtypes.fp8) + v_q, 
v_scale_asm = pertoken_quant(v_perm, quant_dtype=aiter.dtypes.fp8) + quant_x = 16 // aiter.dtypes.fp8.itemsize + k_quant = ( + k_q.view(num_blocks, num_kv_heads, block_size, head_size // quant_x, quant_x) + .permute(0, 1, 3, 2, 4).contiguous() + ) + v_quant = ( + v_q.view(num_blocks, num_kv_heads, block_size, head_size) + .permute(0, 1, 3, 2).contiguous() + ) + qx = 16 // v_quant.element_size() + v_quant_asm = ( + v_quant.view(num_blocks, num_kv_heads, head_size, block_size // qx, qx) + .permute(0, 1, 3, 2, 4).contiguous() + ) + return k_quant, v_quant_asm, k_scale_asm, v_scale_asm + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--repeat-only-bad", action="store_true", + help="Skip iters 0-2, repeat iter-3 shape N times.") + ap.add_argument("--n-repeat", type=int, default=20) + ap.add_argument("--num-blocks", type=int, default=8192) + args = ap.parse_args() + + device = "cuda:0" + torch.set_default_device(device) + + head_size, block_size, num_q_heads, num_kv_heads = 128, 16, 8, 1 + print(f"[seq-repro] KV pool: num_blocks={args.num_blocks} GQA=8 " + f"head_size={head_size} block_size={block_size}") + + k_quant, v_quant_asm, k_scale, v_scale = build_kv( + args.num_blocks, num_kv_heads, head_size, block_size, device, + ) + torch.cuda.synchronize() + + if args.repeat_only_bad: + seq = [(3 + i, 128, 6724, 3) for i in range(args.n_repeat)] + print(f"[seq-repro] mode: repeat-only-bad, " + f"{args.n_repeat}x (bs=128, ctx=6724, qlen=3)") + else: + seq = [ + (0, 96, 1540, 3), + (1, 64, 6361, 3), + (2, 128, 919, 3), + (3, 128, 6724, 3), + (4, 128, 168, 3), + (5, 128, 1024, 3), # extras to keep going + (6, 96, 4096, 4), + ] + + history = [] # holds all input tensors so they cannot be freed + for idx, (seed_for_call, bs, ctx, qlen) in enumerate(seq): + try: + call = build_call( + rng_seed=seed_for_call, + batch_size=bs, ctx_len=ctx, qlen=qlen, + num_kv_heads=num_kv_heads, num_q_heads=num_q_heads, + head_size=head_size, block_size=block_size, + 
num_blocks=args.num_blocks, device=device, + ) + print(f"[seq-repro] iter={idx} bs={call['bs']:>4} " + f"ctx={call['ctx']:>5} qlen={call['qlen']} " + f"-> pa_fwd_asm", flush=True) + out = aiter.pa_fwd_asm( + call["query"], + k_quant, + v_quant_asm, + call["block_tables"], + call["seq_lens"], + call["block_tables"].stride(0), + max_qlen=call["max_qlen"], + K_QScale=k_scale, + V_QScale=v_scale, + out_=None, + qo_indptr=call["qo_indptr"], + high_precision=0, + ) + call["out"] = out + history.append(call) + torch.cuda.synchronize() + print(f"[seq-repro] ok out={tuple(out.shape)}", flush=True) + except Exception as e: + print(f"[seq-repro] !! CRASH at iter={idx} bs={bs} ctx={ctx} " + f"qlen={qlen}: {type(e).__name__}: {e}") + return 1 + + print(f"\n[seq-repro] ALL OK — {len(seq)} calls completed.") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/op_tests/repros/pa_asm_fp8_shape_sweep.py b/op_tests/repros/pa_asm_fp8_shape_sweep.py new file mode 100644 index 0000000000..672cec6eb3 --- /dev/null +++ b/op_tests/repros/pa_asm_fp8_shape_sweep.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 +# Sweep shapes to find which trigger the fp8 ASM PA OOB. +# Each (bs, ctx, qlen) is tested in a fresh forked Python process. 
+ +import argparse +import os +import subprocess +import sys + + +def run_one(bs, ctx, qlen, n_repeat=5, num_blocks=8192): + cmd = [ + sys.executable, + os.path.join(os.path.dirname(__file__), "pa_asm_fp8_repeat_call.py"), + "--bs", str(bs), + "--ctx", str(ctx), + "--qlen", str(qlen), + "--n-repeat", str(n_repeat), + "--num-blocks", str(num_blocks), + ] + env = dict(os.environ) + env["AMD_SERIALIZE_KERNEL"] = "3" + env["HIP_LAUNCH_BLOCKING"] = "1" + try: + r = subprocess.run(cmd, env=env, timeout=60, + stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + except subprocess.TimeoutExpired: + return "TIMEOUT", "" + out = r.stdout.decode(errors="ignore") + if "ALL OK" in out: + return "OK", out + if "CRASH" in out or "HIP error" in out or "illegal memory" in out: + # find first crash iter from "CRASH at iter=N" + import re + m = re.search(r"CRASH at iter=(\d+)", out) + crash_iter = int(m.group(1)) if m else -1 + return f"CRASH@{crash_iter}", out + return "UNKNOWN", out + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--n-repeat", type=int, default=5) + args = ap.parse_args() + + print(f"# fp8 ASM PA shape sweep (each cell = {args.n_repeat} repeats " + f"of same call, fresh process)") + print(f"# OK = no crash. CRASH@k = launch error surfaced at call k " + f"(0-indexed; means call k-1 corrupted device).") + print() + + qlens = [1, 2, 3, 4] + ctx_lens = [128, 512, 1024, 2048, 4096, 6724, 8192, 12288, 16384] + batch_sizes = [16, 32, 64, 96, 128] + + for qlen in qlens: + print(f"## qlen={qlen}") + header = "ctx \\ bs |" + "".join(f"{b:>10} |" for b in batch_sizes) + print(header) + print("-" * len(header)) + for ctx in ctx_lens: + row = f"{ctx:>8} |" + for bs in batch_sizes: + tag, _ = run_one(bs, ctx, qlen, n_repeat=args.n_repeat) + row += f"{tag:>10} |" + print(row, flush=True) + print() + + +if __name__ == "__main__": + sys.exit(main())