From a75f4a4aabb9c1d742c7258ffcfbda3a57fe2d6f Mon Sep 17 00:00:00 2001 From: "Jim, Guo" Date: Mon, 11 May 2026 14:33:36 +0800 Subject: [PATCH 1/4] update for mi400: add asm csv & host code --- csrc/cpp_itfs/mha_bwd.cu | 16 +++++++++------- hsa/gfx1250/fmha_v3_bwd/fmha_bwd_dq_convert.csv | 2 ++ hsa/gfx1250/fmha_v3_bwd/fmha_bwd_dqdkdv.csv | 3 +++ hsa/gfx1250/fmha_v3_bwd/fmha_bwd_odo.csv | 2 ++ 4 files changed, 16 insertions(+), 7 deletions(-) create mode 100644 hsa/gfx1250/fmha_v3_bwd/fmha_bwd_dq_convert.csv create mode 100644 hsa/gfx1250/fmha_v3_bwd/fmha_bwd_dqdkdv.csv create mode 100644 hsa/gfx1250/fmha_v3_bwd/fmha_bwd_odo.csv diff --git a/csrc/cpp_itfs/mha_bwd.cu b/csrc/cpp_itfs/mha_bwd.cu index 69f55db0e4..2cf92d4845 100644 --- a/csrc/cpp_itfs/mha_bwd.cu +++ b/csrc/cpp_itfs/mha_bwd.cu @@ -257,7 +257,7 @@ float fmha_v3_bwd(mha_bwd_args a, const ck_tile::stream_config& s) std::string arch_id = get_gpu_arch(); if((!a.use_asm_v3) || (a.hdim_q % 8 != 0) || (a.hdim_v % 8 != 0) || (a.has_dbias) || (a.bias_type != 0) || (a.has_dropout) || (a.is_deterministic) || - ((arch_id != "gfx942") && (arch_id != "gfx950"))) + ((arch_id != "gfx942") && (arch_id != "gfx950") && (arch_id != "gfx1250"))) { return -1; } @@ -311,7 +311,7 @@ float fmha_v3_bwd(mha_bwd_args a, const ck_tile::stream_config& s) return &cfg_fmha_bwd_dq_shuffle; } } - else + else if (arch_id == "gfx942") { if(a.v3_atomic_fp32) { @@ -321,11 +321,13 @@ float fmha_v3_bwd(mha_bwd_args a, const ck_tile::stream_config& s) { return static_cast(nullptr); } + } else { + return &cfg_fmha_bwd_dq_convert; // gfx1250 only support atomic32=1 } }(); bool need_post_processing = - ((arch_id == "gfx950") && (a.hdim_q != 64)) || (a.v3_atomic_fp32 == 1); + ((arch_id == "gfx950") && (a.hdim_q != 64)) || (a.v3_atomic_fp32 == 1) || (arch_id == "gfx1250"); int mt = asm_mask_type(); @@ -449,7 +451,7 @@ float fmha_v3_bwd(mha_bwd_args a, const ck_tile::stream_config& s) auto pre_kernel_launch = [&]() { arg_size = sizeof(odo_args); - int bdx = 256; + int bdx = (arch_id == "gfx1250") ? 128 : 256; int gdx = (a.max_seqlen_q + ts_odo - 1) / ts_odo; int gdy = a.nhead_q; int gdz = a.batch; @@ -546,13 +548,13 @@ float fmha_v3_bwd(mha_bwd_args a, const ck_tile::stream_config& s) auto dqdkdv_kernel_launch = [&]() { arg_size = sizeof(dqdkdv_args); - int bdx = 256; + int bdx = (arch_id == "gfx1250") ? 128 : 256; int gdx = (a.max_seqlen_k + ts_kv - 1) / ts_kv; int gdy = a.nhead_q; int gdz = a.batch; if((mt == 1) || (mt == 2)) - { // causal + { // mask kb gdx = (gdx + 1) / 2; } @@ -587,7 +589,7 @@ float fmha_v3_bwd(mha_bwd_args a, const ck_tile::stream_config& s) auto post_kernel_launch = [&]() { arg_size = sizeof(post_args); - int bdx = 256; + int bdx = (arch_id == "gfx1250") ? 128 : 256; int gdx = (a.max_seqlen_q + ts_dq - 1) / ts_dq; int gdy = a.nhead_q; int gdz = a.batch; diff --git a/hsa/gfx1250/fmha_v3_bwd/fmha_bwd_dq_convert.csv b/hsa/gfx1250/fmha_v3_bwd/fmha_bwd_dq_convert.csv new file mode 100644 index 0000000000..d6368def22 --- /dev/null +++ b/hsa/gfx1250/fmha_v3_bwd/fmha_bwd_dq_convert.csv @@ -0,0 +1,2 @@ +dtype,hdim_q,hdim_v,mask,atomic32,pssk,pddv,mode,bf16_cvt,ts_qo,ts,knl_name,co_name +bf16,128,128,0,0,0,0,0,3,0,64,_ZN5aiter30fmha_bwd_hd128_dq_convert_bf16E,bwd_hd128_dq_convert_bf16.co diff --git a/hsa/gfx1250/fmha_v3_bwd/fmha_bwd_dqdkdv.csv b/hsa/gfx1250/fmha_v3_bwd/fmha_bwd_dqdkdv.csv new file mode 100644 index 0000000000..f17eacd60b --- /dev/null +++ b/hsa/gfx1250/fmha_v3_bwd/fmha_bwd_dqdkdv.csv @@ -0,0 +1,3 @@ +dtype,hdim_q,hdim_v,mask,atomic32,pssk,pddv,mode,bf16_cvt,ts_qo,ts,knl_name,co_name +bf16,128,128,0,1,1,1,0,3,32,128,_ZN5aiter28fmha_bwd_hd128_bf16_a32_psskE,bwd_hd128_bf16_a32_pssk.co +bf16,128,128,2,1,1,1,0,3,32,128,_ZN5aiter38fmha_bwd_hd128_bf16_causal_br_a32_psskE,bwd_hd128_bf16_causal_br_a32_pssk.co \ No newline at end of file diff --git a/hsa/gfx1250/fmha_v3_bwd/fmha_bwd_odo.csv b/hsa/gfx1250/fmha_v3_bwd/fmha_bwd_odo.csv new file mode 100644 index 0000000000..b36a5f6235 --- /dev/null +++ b/hsa/gfx1250/fmha_v3_bwd/fmha_bwd_odo.csv @@ -0,0 +1,2 @@ +dtype,hdim_q,hdim_v,mask,atomic32,pssk,pddv,mode,bf16_cvt,ts_qo,ts,knl_name,co_name +bf16,128,128,0,0,0,0,0,3,0,128,_ZN5aiter23fmha_bwd_hd128_odo_bf16E,bwd_hd128_odo_bf16.co From 01665115f626261c666264675746f6689c8b7826 Mon Sep 17 00:00:00 2001 From: "Jim, Guo" Date: Mon, 11 May 2026 14:47:27 +0800 Subject: [PATCH 2/4] update hd128 batch mode kernels --- .../fmha_v3_bwd/bwd_hd128_bf16_a32_pssk.co | Bin 0 -> 52816 bytes .../bwd_hd128_bf16_causal_br_a32_pssk.co | Bin 0 -> 61544 bytes .../fmha_v3_bwd/bwd_hd128_dq_convert_bf16.co | Bin 0 -> 6488 bytes hsa/gfx1250/fmha_v3_bwd/bwd_hd128_odo_bf16.co | Bin 0 -> 10696 bytes 4 files changed, 0 insertions(+), 0 deletions(-) create mode 100755 hsa/gfx1250/fmha_v3_bwd/bwd_hd128_bf16_a32_pssk.co create mode 100755 hsa/gfx1250/fmha_v3_bwd/bwd_hd128_bf16_causal_br_a32_pssk.co create mode 100755 hsa/gfx1250/fmha_v3_bwd/bwd_hd128_dq_convert_bf16.co create mode 100755 hsa/gfx1250/fmha_v3_bwd/bwd_hd128_odo_bf16.co diff --git a/hsa/gfx1250/fmha_v3_bwd/bwd_hd128_bf16_a32_pssk.co b/hsa/gfx1250/fmha_v3_bwd/bwd_hd128_bf16_a32_pssk.co new file mode 100755 index 0000000000000000000000000000000000000000..beda8958a9942fa54d9e8a2b48cbcc7a51ce3e7d GIT binary patch literal 52816 zcmeHw4R}=5wf3HK&I}1g5+H;m4rB-sgN6{oU-(f534%<7h^Wz<+ax4`Py?AJQ3E&6 zcoG8gldCDl$e%bORj##`wzj3!o5bqZa@*8WtHoAtn_6$nwYJ3;t#PWudrEZ0=Jbu`9n@zZ>PLG z5rcn6i8$dvHpRNx+z-30!W70>ew3|_X=U{RlM2S4LXp5a#i0s9j9^`i?~CU1tSbzS zqaw`{mi=^J^s*{URjzeB4f#WfQ}L>OtN*9Ig4#FK<5}G#ZbY2bbgM=~13H%MPpUBe zg<+=7*$P$PQ*+0{<^nFhA%9un%|dK_x@==bRb5&39hJ44Dy!?hc;A<@%WBuxJ-F@Z z?6UIuJIks|%F4?t>+1fawz90Eq-Im~W*90eYLQh^=PfI*e0+U%&Dyf+C$ekSt*fi7 z7hlS*+pwiF=F8c4l~vzaS#rmQO%-2Zmy)%2u3J}GJKT~J&b#C%Wu5P?-B4dSaHU;? z7QI!qFNa-?(Dof&ySYTRHt6LZ$m=`V>uYQ7^p@0BuHRU>slH_0hK9==zRZ{ z{4DSx1Rct%H>}?j^UZ$w6jFFRdt+Hc$-3&Y`jWeAYVWX$hULcwK8sCd8!La2S#tZz z*=6X3+T6T#8>`An*4|xFQdKb{H?L&vx*2my%4X)4cniI?DmJ6S8_G-S>dVUS_#XN=cto`3tJ!y#-Br1+wg!DhO?Q1K`{NdSYisB%R+aefg$o01}+<)26Lo*Dxb@c8)s#@2ae=+fcJyTx<5;z)H4!8AXc3b)6uOwBbV}vQ(GH=e_a&eR6!-MoywZ5)0zu(b^ z+Ph+?j`$d?`kg<>tFb7w0=up2W~6e?#0#-V#C5lf79vJnfFe2m1M_0t<}q8qwmT3A z*u*q(7Ki>oWQ((C#s?2tG|B~b$D=$#*xLwUKSuU(;%qk%C)C+q%7=c2@{>@G zhH{Xf0~ClqhT{mt_X16zi~NvZNbxAoBt*O)XaY}?j&d+wd|(1y!1{ppKnkupn(63r zibuO_4)Z&7&WdF%osWp^otkM-KgCH*u+U{;j)iUu3oR_Q&}*S-VVi}13y%>xy5ho) zYpUZuXZiJ5_=bh=U@{_p(3Kiveuuv=W|t7#yB-l@&YfawXI&r$pRo{if!GHU&=Hr$ zX+HDPmL}V+nh&FNS3uJeJ__8k$`-HrG?8`6&@^<(v{N>_CLk+ta-52hd0D4ym(lqf zQ_!S2R$Lm>(RgW$v69NLjnFj19-}pBhRnB79&{aO*c*L>v8#=aYi_i67Qk^-h#ZFt-S1P)0{rNIo54-7z>fV z5c$nUi@n+2BFoOrHO-kbn~j_t)0{D*IkpHsw_34zd8RpUUUTg2$h*zTn?K()7c2;t zXDmXwD38juFM_CYXU#Ir*|VExiC=Wi7QgJIc2hqG#C@F$o;<#2q1fFyQqz2+(4RJY zvTsqq=rD1z7~({yDSp@)(Bi1S0|9N6L^om&ipyRVX*SU!=FeQ}-rn`Fxa8G`C(#*q z$*TwPwM?Mo0^14=W%1SX@>fMHrGT*Gdx6jD@>lUamT4Vn%}TqMTII*v#8#{PLpa{U zx=l3AUof*hol7K?l+gTIq+7 z?y%CTUu?m)8M!-9Ce@MJqam%K{2lfL&rZbcu;O+hz0*qXMtWDI@?=|#TpeX=)DFYQ zeJ8wZ#8EqRmHM<#~Slxe;|(P`Jw%NkIkw# zikE$5P?`AuL;dtUAL{RW&e~6V)cNosj`<-z7m4Jzl>9!p=PdHiTKan`jY8f>`aQ}= z`fo_TZ>2-qtcloH7rAXh)4gNv7up;fbX%-V*uLPE=ONY8@OoD)YQgu!Tf4%jtzTi( z-?hTHt6_!Fuw{j@rE!JP=v%P}=Z)btVQ+@L1@`T*?|^+5?5!&d-+e10eFyfU{p#Eg zo|ic0-UTx}dr{}TRA+n+st;ry`e2{3;XAP(>3!sf^Z}&zTWQtKfX?5|tBwC|qBc^S zHME(ock=s$&*^jG`d4hv3Q-EwfTs8XzQ^}>#&=(DE{uP(DWI*A-^o?tWT&}mRa2m_ zut}r~(Ind7_eIE_&VYCT*9l>+l6pJx9)xs29)j$J?1Suw9OxVu7$>jONr9x@LIeWP zTM=tZGLu@z2gXYqQ065ElBG|wKe^Qu+d8jBIre~E`lJR@r7hK-+S)#*y)!Y8i2Ow# z0TJuAu^u3#vK&p~Addeqz)H`7}u2PR7!Q08R>GNez2KSS1c3CdCRof4QLZBy)1P+xziE8vp#12sjZ#?fZ97)eR3z-`9(@vYPc zWT$oq?dj=kpG=xq8>+Lv6#Au26 zF4mX3ST7*^g91}t=)g5(2NsbXSWI?&DN4x>tROqEitNB@vOgq*m+ZiLvI85)4n*I4 z+#dpUqWO%A(Qq>+lrkooj4nT8MmJ+lFXJo$1eb_$F~+$WlS>($CS#_bF{_&~ua|M* z!uTII-5_4-#H0pusl#3V#hwm-enE%UWb6)@tM<}06yJMuRiMzME43+p+IeGOwMo~U z)q$HN-V|6P(G;(C-W<3^rf&({D)H7pk;H%K^`ech7mGhD*NZm3UgUp*>qY!x1D~0F zcbnsZX!j2j&IY|R^XWg3P#)wDMoxNh+E=;P~#9oG&24UV0IefQ(@L(0$R@>e=u zYFcTo+?9>;r_1sW10g?zwhBzTENCqSmk_E|u%!1@W7DC*Re8_T00i+PJ22unm zhLl1oAXSiRh!;{1X@E3BOh^l42c#9!25E=vg&c(VAx9ubAx}Y$L7ssehdc-AhP((l z0qKF9guDWI4e|!07xEV5G~^w~8OT}42avx*1p2~aR836MlcGQUoQBE+s`7F|%F7Ka&rWr0vf=Jf?5p;BxDG4K zaWD2%nQl8(LVO?XiagKrj2UB&HODp=Oq^(@nQ6^8_0LrBcQF>Y8H-98 zF{k|d>zGs*k{!5)?7&j811n4*;;a0O)!mE@y^M`yho6`1z?wLGVa2?1sn_4e*s{6YnSlIa#1mFeT_W|y>ls@& zEOENXj(EcAd0VC4cpGEu=B>_{*zH}dur0S~sf0z7awig2EoN*e%bj?T?4z|*!lD_q zQm?v^vEd7~6Wftrhu4FuF{9 zW8JtG#R%K0BSvq*WykiaW9$~&NYck`v0Xy?_${_L(o?qBE+zf4Ew)jlPuOCMCp~S8 zErIk&TWq6ApS;C3hV;v~*v686#THv4=~K7Z#*u#27F!bO)3(^glRkZmEt&Kg&9)TM zXExhXNuS+}TdmONHrt$}&ug|#AbmlzZ6fJcH{-GheNnT`Mf&1q+a%JLG~3ciU)pS& zO!~5BTL$UZH`^{J-P3HFLi!EOwkt?q)r`yK_q(1FnLSTU5YDH8PXmtuj{(09{5tR% z;4{E)0lx)24m=M0F7Ug+nEbS!n3Q9OMgm8s96uBXj7#Z0GzvH><;0-`U_#2tLt}tr zQeHch2uw`rJ(L7YN;!Qf8JLXtRA4INoj@nzCjuuT-UW0aJ{_2j_zYkM;->(oAU?*E z;EVN)@s0E(`Y!P#`QkjuzDqr+zEK{hFWxiJm*8>vMtjnIV>}tYv7RZuL{Fw~oaahk zk|)bI-jnT1_T>0dJh{G9&n(|%o;g0JC(k#*Gv7DSv(T63xyI-6xP6m6*ZR^u`M$}X z>wFoW<-W^3D|}Nt1->ghD}Aln)SlKGwc`7L_uZ@&w*lL3(~9>1_x!6?+zxCn(~3KQ z9hF+~Uf|vhTJe71{*7AkLEu5e9|9giydUUC{9)i>#2*13LHy&u#}R)NcogwZ0-r>D zD?W#q6y13raO4Et*#?Z8q&xQjM_sNv+kpvFb!P`~%rxD(7nnFhckTx!&DNadqrT*Y&#dabWrly7MS7<3`>2Byh^jy7M`)@l?!M&G=8? z$nl!d35>f;GrEAI(lnzRm@rv0o(GP(LNmSxOuR}nUH~Rd*Nhi|$ul+M{{U0xYR310 z&IOwB1K`9(nsEZ?TA~^M1x#P289xMOcr@cjz$vRV+#A^5Wgy*pnQqvD_oeBE1K2iM zH%0*WT%j8=!1k+jBNo^(T{lJo_s-OfOMv_5>P8&!-~!#a6nJQnZj1u@m*_@3@bEI- zNB|!3=*DQ^JIY(%G{X+v6ORe$szsCR#=y5;?^aO#2Pv7sNd0leYbE}K-yv!x;|91Q(M%Rgn zj|T6XscSlM;~bCxX)F}*>%qC)Gfp^P0=@(^ z(GPN7!B~9~`6rS8GVo=fIu4DY6#okHUqSxQfjE#$w2{NDk82c+X8-lXFLTTUbY zH1gjDz8w-zV?Ff~-HSs%VYm>!98PCZn)->3LvsQ0bF*z0*$O$mFw~Fc3glPHwtTXA zIbERq)LcS+K7SSh!=UidXM{9{G(N7HYpD9cmPt0MAJRFYel!NAtr%#$#&EpNFr2^v8EZ&3TkR%^%61=8xq6V{n9d zVg58XQvNi*B!8M;lK)HapC9H=b13Cc^H1`p`6v1R1pW))pM>vX4C?LuvJjM0;@xPXnsaM^tr=UPkjn(4E3S88~M;@4qFTLC$Lrd zsCk`$57qZ1>buZtAJtdQ_musouwNZ!r+J`i&mVEmnc4#^4Lt_U5mkF&tD^P*y~;<; zA60u`YoPW3%}^hjTdMZJwu9OOYzy_Fd8exH%c$=)A@!x>(0o+ce+K)a5WAYED*G$2 z=eco_!skcN1Jrzs`bN!VmG7(Yod@6Wxtj8&`L6Q)Ieh2CH+&waeC3?j`b`V?7oUrX&p2_jK2f~O-?6iFEiul#4lGdfV>;JgL%%LWzlP71 zsf-;c!?}4fm!ZknD&de=t>)5H#xGFD)hHu;UQK1Rp^S{pOM>-ykFH~=2XKv=cT*X^ zL>bqhjPN-)m9ZCPWNj`F)&sF#st2%M&C#ih*HOkIlo38(r!u;!KX&SAsz2Vxca-V@ zELQV*D&q~5fg*&O+tb{g;!jXt?3~Q;=nKR(P#u7cYW`30|Bd*?h*w;I=KU0ZlKNri zlHl?GM%M-$A6TLI0>%Fd@z)|=aR=fI6#p9a!OrsF@eyaz@qsNA_iMy0L7bZF6VISH zKjLO>-YWY}&HssiP)sjk@)4u>0C5tEIf|Htn;(-giX#wLp_tzwW+`G6XCVGUF~<;- zzxf3jqj&`I8j5)nG1nnR@e1NP6muLg1)F~>V-(jQ4n#4(Ma(k9DDFYLh+=xFJv+C` zejuJizHh;oJ~YKoh&z$*X==yL$D}WDDf0av_%4R8;xNRo$oCA@f9DI*SMeJ1{~i3V zg}>rB#Kp+}1FHMZ-%5Y-`#t=Yz>l60(lv^%lkdS#z;8{g<`nO3?^67TxE#ftMod0p z6kj5KM=>#oDUMB-F^WSG@1vOiMa)vfC{9IOkYW-LQxSWuj8Qy`I3mTojhO2Yqj(qb zMv6&BOm*y98Kbxu@kxsL17emTMsYLZm=xot_M~VN)O{r zUt{cH>8Cg%abU$0i3^io4*XhTzvooOLwR!_8$^lrmPSd#iMuDyn+N7EM9Gu^%m6ueMo)d<}!*(8H?5Z4=RKFykFasM%A2u~g)YW5#Fy2566N3U(>+}OQOx1x=Zm&cS?0TYWc}%wD5sQ;39KNV|90x5U5fY8 zy=2MX@a~4>*Nj8w*CuTIzGx7oL_VQFu&$> z7WI2Se$6)K*PM>bui3`@n$rcUJQ}O$UcLH!z-^gd!w+d!s2|I)eo^J})Xcp2q2mCCuG{2FD0 z>lRUafTgOgbc`UsW?L1t2R>fq(}!QPt%2GDA08V}SMmw+Yqss6_P~e71eS)%;<6Y%Sq7Qbe^IQ*K+t|(t$=$t}1=g+SZhjw}W zjOWj-m|t_cK$X{rU&9vU*PM2R z`UUwl+n8T-Iw#aG$gkPP{F>A5P`@C*W*hTsP8TY_P=4JF9!t-!Pcjb5ugRbIHTe&P zUz0!aYw{lozb1d;*W^DWevP(Ce$5z}U$deyjy}JR@cbI{74W<@F2;+)uet2t`kQk;zlNXW*Ng-6Yxe2SuQ`AC{MuTt za?$xU>M8j(xYC`jPl`0^^|kn*51hlmAfoHTe_2 zCjX)EYw{<4P5wjT*JzvM*Nl<*HQOq37^@X$RqOZU`_yc!FJz3&ui4h(Wt7*{3(l`m zw&d50k@+>-s;E8Z=4YuUCe60H6 ztieWna1Hcu)?iDHkK+TYd$D%#+0!yJ~=K@Qu8W3!DpET@AUmi$83 zV57g8!?ItH!}j6Wu+cmG#OvrYLfRS1#e>hV*~a$(IGq*h7kq}zHogbI>EJy8I&Sb8 zHrx0f0H+I7d7;m+-QcpsnF|@^nriyprGeI9hp*WlYz;PXW-4EC=E2rr6DKEs#mNU- zgDp8J+6WAC((pCdu=QKB+=pXxe7`lzHRa?JeMda{j(G4LahxaeooH*#@o?8*%lD$Kb;rY9gDu~U zw$>gGcMZ0DKiXPQ1AX57n)|1U&fQ`0YmN)90UwlKbN?{E z=5%C!%{J!OoQ}+|*~a{u(~B#(=ZOpGZ9hqOVjrldFBlByvF~8>n&m$Hntl4M zS?|-Hh5cb-Sb7^*H{mUV^q_+QQ*1xHSFMgYg~*2 z^K16$&#yUu`26}GxCR@2(MW4efrImF_G5m{>B#(=+t2)((~ryx2uZ27sKM$92Z;zJ}AHD{$YO2>B#(=ZOpGZ9hqOVjrldF zBlBz6g8Z7(k@+>-m|t@`GQVaU^J`8==GSave$DBK{JI-FmY!dqWE}MQHTe_2CjX)E zYw{<4P5wjS*W^$9n*4{vuhBNiuNforYqs@Uv)qSYvroS@%YFDY`}AA0d_nm&jv@Iq zW3jwvYWX({F>8|`8C^^Uvv7i#jl6^{2FbQ{F-raem%$ZVU&BZ8YsP{3^&o4oCBNo!B)|TJ9)VwD{iNjAj6>(w+!n8&@%-Np=eXb+==0{+ z+^#A*{;>Ep#|77b56Z8(f0$o$Ix@dz8}n;UN9NaTV}8x)$ov|%Aiw5xWPZ&y=GUB# z%&*zT{F>8|`8C^^UvoMlzZU4n@HN!~y&q2g#IF+=2Yr4`{=~1ze<=K#{E1(a|B(1K z+9vrmV`P5Kwtj1t`|xY_>9=ON55H!geruL5D8I%rB)?{i%&*zjZ%uh0e$77p)|B_* z*X+}8O?i}GNBMR5Z-`5NZT)6;^!c^*n{Vh3@a%%<^K0I-H@ZjH3(-Bg{#p0v!a7IE zuNf~6zvi-odmW$i{c!k6e$6;AzhXD zYfeYz*KA{c&FRk;zaH-MYqVAJYsSI(wK2f+Yc3)(z8M!q78v7_(^KVCb_785hzX!O-))j0F@gxj&rJ}uE+kmt`>s}!3vDyu!eNs=59s8Z0Bs=ykeU0qc zf7B18{X~xfX^+rjK-%~7IFR=8>?J$)=R8gJosXrRAv^Zf{DAD%ewhMw41IPYH0xbM ztS3Z%HQD96gIJ>oTtjyGE+Fi{da{Q;iz0jIx=6Bzu5l!L=z2x9zCV0V_3-x}!uL!M zzyA`xCwgz#`up%b&%^JZ&>r^mUZc7fKxy@!I;GVbZ%V86+mu%OQ&U>)MNMh7&oiaf zp3an3`!`cs?axeUwf{1u)&9zqR(k?dTJ7yiX|+c$rPW@$lvaD@Qd;eQOKG*2Ev0{L zt&OC#`duMPtNmmtt@eqfwA%lb(rSNKN~`@_DXsQqrS$KqeC$_BX|=B^rPY3^lvewo zQd;eAs`hcD?+NYmB52k7T&(lJ(V_(xEvSTk( z7m)TU%>>e3q&Yy^Ycvl?`-qm39s7b-k)8JYBs=!XtXpfYPPo+c1MR{zqey{VD|1nAYgOso6|CT&g@ply>qsW z*f+Oj&fGbzBkrBMZEn}x-E#wTTLnHF+W#^7o#Q>v=^XrL{my;U?&q-?>pw8oCemJh zKqu|-2+W{;`+!+)##yvSAM}N#jQO-5AM^qN#2A}Q@jw^F12ZWem_zZvJc(}FUMxaVv2_zT(d&)(1Yt!C?5L9Yf+%fv6-=y;^o-P=%si$HZz(OFUMxawom2Q z9JX#igslS*Vej>mU5?Gxd-`M#y^lq9IX1%%Y$UrJo2|Xs$S%ib*nutTniIWFM6VOk zu~|!vj?Jcf9;Oa7UPkA`+z%IXKKznyjm_zDj7_9{fH2lNU5pvD=Z_ql8E4V{K5}ek z%%{D4+WHZ|-?}oNz{YbCfqnd2?pZNqiSx z#yzK(fzf+a|Nq~s`n}B>oAFKw#@NIJMyHE0gZ2xMV>9C{+8aQQ&5Zf94}ct-ff!?x zDIVyecwi>Q19RMrc@z)bUCOwe;-MD`pd6bSv7Xc#n;EMqUXIO-jTA4(X2w>Emt!+y zDaFgNnbAw}a%^TaDPE4vjGxZ2Im(-(W3$CmqW83;_q3y9^FP;fBq97=0c}->z+%l+ z;$&xF)v6|uFGQfQut}^FqDi!Mn&OL)J)HsZKqoF0<|?VTBkw^-2jn5hUdTSke#n8& zae;Aoz9K|YAPKJm1Om`o5o=2_lUm0I#!DLzc?q`UK(h2n_9wTRVq51@lw%LrrB7-g zRoYVRsjck^?VU~HAj*FjatQT#MAoNJ>VD)s3ONjU3~~hWCCKBDCpsquCdu;B1L?9n z=(0ZPW_s)7z+`Cy%KBskGNez2KSS2%I+Ua8GbJ!Z+NRj2pg#UiJz#X0Xqe`=RakZ>RU&f(PAVewF0*pkIOEVP=mQCz5~i0zEt2cuMG+&@d<0kY|IgjnISv-GHt^H!jhH1G)pc4&4#2i5TcH z(CPOhW5#M?B=nKc?a)V#*F+rjIOq=OahGXg6!cNhM?fEyrildT3D9GpCrsAF80cf5 z$3h=-g@zwa&+JKrJ`#H3RhmeGo&^08=t2K}0QG(6hO>~TYPL3eM}#I?|`g+2-TwfAbc3}p7? zLr;gEzgrX6LB9_AWa!u3ui*#qGkcap&w#%C0Zpucz5@E?&{sU9;XzksPXY8P&=$X(9AJfEY=&PYmg}(X;4KGk+_N;+^CG<64*2FE)Z-IUl z^jp5Fi6ZDl(6gWyeM1wsL%$vRH0ZZ~TNB05i=k&jFMd`NpM(B6=+mKpu1muY#%K1F zLeGI-`hq6bLSGAg2K2Q*&_o6F3h24eD}JQma+uk(4*E>!>wco)N1QWzs-Vw;UiC9g zd>;Deq0ff?`BNHR2+8cJhCT;+_3N701bq|qxzIQLTEmYpXZCoZ=Rx=WRui?*YoX7B zUi*6uFN|dN)I*;Sz5WlHxC{DS&=)|z>(3f~lr*!a0s2Dd4S&_d7U)}`Uk!cB-!we5 z%Is-`ehu`-4>jR~?t{Jvx-Xy!6KLu>uJIdBHIL9mGqB|nU2FwzkJs_TrW;S~7^{nI zz+K~Yu^rfYnJ(@H-j}9}9l*B9y4VTabA>K;0o$+A#cp87bR9pmy7AQBnY#FQ;QqO~ zxDR-6fiCU`9$KVhLz0cB{7ZE4Md0COy4V9e;?czez{gkVq8)hjCS5!TeDYRZbO4|F z7hOCAe7Zyzdx6Kwb+Hfl_4T^g4}9hhT^s;@t40?Gfye7~@i6eackAL1@VS4}#UsGZ zdvxIkc5l_iqrmUotBb?H7kBI8G2r*_*ToUwi3fD?CEyPq(#7Mzo&&mg0{GG+x;P3v z`Is*L1Nib2x_A=!%9nNVW#G@hs*9(9uYE%oUjhE|+q!rf_{Ouk_$u&MUAj01?0rEO zUjx4R16_O__|}hf@eSbbexi$KfTw?^i*EwoKBbFq0pEFD7vBc{>DRhA4m|T)U3>@l zm*4B+yTG%5(8aUB_y4Sm=YSvlRTuvW{J($GMJMp@AL^nD_)!4;fcJQQ{1dwVH@?0X zgHr;b0b)%U142z02O>?FKm;0?NE1-;WB@@%JC(_yc~9KX8QO4;9FHmbDKg*1GGI0ktnDxcy$ zaL|W`zf3#hB@CD=@Sl|}p0N@mt^8XoeD9+`4JEyn9bX6e*RBFBT|vQyO?On*&UY=o zA>UO{knhUP&Cbcrab-@guGzG{vbMxqR$sNLY-43fT}?@SRat#Wt#@NxNqO0(lDa#+ z-kREac$D8!wtmB=^{`i#R8+3LbA8E%P3vk(*45OOlvP)k)YO%DYirhRsIIJ=URPT_ zy}J6Yjnk0x`O5Noskkc2=FGz1-173uyt%XHmCv0~IcHu)`Rv?^^0LZ#Gb-lIn^`_{ zU2g7FRqOsJ+pon4^K@UE4eA(u1SXq(^RI+D8Z$z-)!}Ys&fZj0Un#OHHgBriyisJY z-*jhoRasq?u>Pg|`dX1)TT@Y1Una8aD;w&iciF}b<;bqAuB}0rR8*itM0R=2#*LMm z;DEB~%hm#`5X~t@QF8mr*<~B*D{FJ})@`gRD_MJYMM+h~jNH7Ewd-cgDJh$oTjH&& zyJP8)9kcJK5GDDytz3M=^|UF%C;X%SL+=Sqm7@|CX6nkosLnT)OR;ngt1xw5D_m|l z(&Iu5N;ufLo$YlQDY3@7B0#FL6AgI638@(8aR34 zP6!B?V^gG&TX95Gj0iKaCbGh*fG+wbo-jEw*Sqzdp|Ye(T$7m?T3k z1NhW)Cc1dPZ+(}w_S$>)-s_#c*W9%z|9Z`4a~p#HbkQShmQ3KbZTV5PYSXIf4Pz};VI1oehb{=A>L+o%u+Tigx+&^kdjwf}P{`%mP zqqth38v7_zKLDExIRD1n#RXpyV$%~P>&h!@OETA1)U2o#~R z9$GUs+sg59RrTuZz!rAqW-rn#`$uvHh^b$|p`>cf>h;k- z>{m}AFKAsyGHHNTRh85g-&tL=)=Ew`L-3vHZ?cPTTQs~~I@$f=GyS*yAoKRx3Y_*gU^=^| ztQeosvbE3P&N$Dwum;ef8>9^;5E^XQ{dP9ThdTYpU1pu&IeX&*%I8OFY6D9kY7H zi&zfRfTO`=bdBK(w^dL6sw7#mAxx9%O24!FxBg2m)^x~J)L(E9$WIJ^tiuSw?ol1+ zg-qCVS)FoQyiG3`OF0FeW1fiZY8fp=v_*i#xXr^hzwI8s-)|FFi_>uZSY(RRNb|7? zbdi3gIGsg0&`tVP;&cJ&Ko4PtIPIl+ph>zWPPYNO#){KNf#}oeQ-nJ5+koScA84XJ zi}caR3+x(&e1x#K5yF0$?Bm7hE+BT`=^mI*0y`AkB@`+z3!DCx)pzUX5U=mOUHwfhqB`PWQGr&B!GX>*uAk^6aZOUDCZONV9} zG)}P+<1BPpm}Q~c!U79D7J4l-Eo`&UXW?N&N9VXc?V76Hr!2p23r|}3zJ;G!7;EGH zIW5ev#R~C@&ZKDbCw!=wokDEsd_ag9w~I|3wf<-v147jLWA2N?KwTWGHJTTSoV2%{g-d`HXqUi+q&V zJ`Y9Zoi@!hr%!L5CVtZ~UA)#o+fC!_7x#9|dF;r%x#FIV5t`OG660yJCp6CU8}0ba zfV+IY!xaC~;lI`}?}F|Zo1ug2rTt*va@PIe2(ce{-(WvnguK>%uwy^iL+l5t@4X*> zF86~e-tK4;f9TMgjFu(?`=U(mtZ1&-<=0~Ac<}qRkrLh5wteD~<08c-TEy(Bi`-i} zza=g@{;f;t6LZn=eK@VekvPY;RKvFN+4i#IB8DU&?D$;cSi0;u&O=hxA#0ZGUS#DT zWfPmM{QJ?~w{)9mnmuRg&Y(7AyNEMELhncYe)M7IoGbEnUVBBJ)@8KsMBbff_nU;2 zA3bP?{7p-~A9A}TtNmsRY@3n26*;Mov^^SR4f(g)Ly@s8{p;3whtS^8Oj}e_8ThAphBtRsUOr zmVJtH&$o$Fu%9A(t`Hw68DsEQ$RAj7e}nwjJ~`wVSbhG4a$&pA^gTLTk1=M;@jx8) z^Ar0=c{Z!xNH53Apq%*s6aC%0KG8qQJ8geAPaO}Rpv_OPUnG*>BJ%tAuG6SLZRsB> z8A(2Z{2|pt{yXH4EID|aH4$@sp4%og-8<&%LYrZOZi}%A+twNCG z*7$7`Z6j^7hHa+vvHUz?KYc~qaLo2IAv{10Xo_ES_;tHH=Uh9trO9r**;H!kou%fe zU8R1bNsfzY@6>SpF~#d0qlB-siLkV#iH@bMnl{EJCZDjOuU{9&4Bcu|i!zpeEG|2t z>6-X&;Kyj&6el}Ib=_dj9rb3DUt2Ce`^&}A4s-ePCVxRelSmb!NwncS_YIUC9e!~i z&X>YmF7=(L`zA^|%Ka$2Plhu(@A3d+JLey z!Ji;~5_}1*CeEu1kjL(~OP?fvlC&k+lUjF<+1WA9KMwWtJ_91wZDZX}NVyzMVjtT7 z7Rr9~_W{}80;&5@_aMpvl!s6bqCAZ92+E@!m-;W2?WOutWqZ(N-%`!g)=Bu~v9vcEvtU)KFL)`dg2t}mL;x{fWdw_jXffjBTY;O62@_gr49HH~9k8dSvV9Vb!4%v! z;w#8;4Gl_mx*Bku`eo{X~ZEnVW9!8(Z__&Ypa2MlKJ&avKoO*pH4iYh1Vy=tzB`((U$^K0t3djyz zNp@f%*?~o5$A`j0c3?T#ft6$jR+0UFA-rS<){z}pPj(>2=FE5q^oj24T#SaBG0wv{ z&SZ4?7}L5Kvw9dYJ^J)@%yFV!jInOU1P`OrWK8!lW^^&;^f1nyJL-j|8^wzqxVghU z<$+G${JeHwZhpJgWZdI7m+zu;H9q&|a({tIR|HeM)Nzx4g-PeO75~p!~U^SH6mN~W!cJBCBT(A7%mpJF*%p$ra1|Nfd9D{V-?R6Y>tuR+~-E7|6wbGQw zUe_(=EnT;ow{{hp=l6IsME~oPAp+MY9B(b61%2M!VVv!li`$9&^7DBOGDQDtkRbxs zARJR?`gE4-P@J!06z-z}$Bi`?QpZg;_UqJhJ#IqeSI12j@=rO(<7SjKpEJIU`P*&N zo9B*dX__au!-2Wz-R;x;)8%#2v3o|#^cmAz9e2;z9JPC9%Z!;bTBGisxp`*i%zI|~ zXSU*MAZ$%rI?dU;?lNcZL-C;;L^*`=ILcv^CsB@|JcZJQ@*K+ZDBUPWQI6g9w2g9z zMsv1-5{(jz5{EJlB>}~W;zCJ7Nk_>*$wHZil7liA#f_4SvIHd`r2u6mN+C)SiU*|} zr4pqI#fwshQjgMrVxqL5Y(;5BX+znGvI}J&iVx)=$|01;Q4XU#iE;$xDU>dh=TM$U z=|(w@i}F6oDU{PFAESJVA}|&PN;FC=N*u~KlmrwfiVGzTB^@OL zB@1O5N)F0g6gNsP$`X`(lme8MD1|6RC?1q@luDE;6fa60NfD2GrUM>&l0B+3z#r%<|3otGthsSmwJv$U5qU~j9bYLzXq}cO|k=9$qsB2;8q0W?LsVZ zG3L7&3q6dur~LGF+*B8k9k`P0Ko8l06WiPQtny7+cm%awd>HQBNW)pH(LHx-T=fY$$Wap}q+5gjH88 zlzPJrjIC=HI$dN(JYm(WO;T_8GGpt8P0r|;EuF2fEwO1ygoT%8ClglAXRI&DPToiM z(OME=;glMwSKh=}|FxRrov1HBJfY{(6;k)kXEaMzBp)O@;t4%dc1YcO6QlXH9m(pv z^G~MeJc@hzq86ib8_q{Xn~fHnYfgL|LwE9*o%nB{PMiSd{05MCF+=1Yv)ODSTG)<9 zj~c@|xULxJF&A69-6lprAEE3~I0HeyNZE0&iG?1k>=)U@#n3NS_KR(J2G@3cq_Tsf z8U=lnW!KSu0`vsSuA}`#=!wdX_LHC|DLdM~1o|b)j`p3}Wq3da|;k z{S@damL1O(gGsj?kF)H=^^Jx;TG`S580ce^9qo^WK33V${y6C4lpXDlhdy4}(SAJi zcx6ZX6QED9>;~F*L3dep1MOc5{ZeH|`>D`Vl^yL*f<8&v(S92AG-XHomqEWw+0p)F z=#!Nl?OzW4a@f_e^Ur>x(}d>6aYM&K)aZ@4WZ8~8#%{FHzKf5670_ok+h|`$&1$yMK6cD$ zw$Z+gy0#gYROs`XZ8Vlq^P6onhK_~JHX6IAMa?!EGsog)8;w=e4b3(hBS&7djm9SG z#%3FhiDP*)_QuH{({=B0k>36IMB#h__yq7U@G$WE!0!W}1U?D;5%5RABfulTp8|gh zjLuEzj!rzhe*|zu;*tHaz}UpD{Ud=R6QAE72aHQRx_=CCOyaBi#{tJB_Uw-b#wWhJ zKLMD4_#|Kw;+;Sz;*){Nh<5>9h))HkB0de6hWN?A$%v26i))O@8`C% zOw7w}Ov;73~7< zTB8;10q$9+73~A=L;QZ=e#HBLKExjY9zgs-;6cPc0(=DVhk%C={}}Kw#J6HUL?`Ob zdx0Y+>drP`?4`PM2XN$Nx^pKm?h4)64jgl}?%V|&H$`{u0me_)o%?_ZvvlWvVA8d^ z(+70U*PRD|$%}O7L7?jf-T4SG^+w%!2$*)0?tBb5`AfRHc2<4fIBYN zjcDM`t8^m<*nW*}i~#PMsv8#p_srCdSm3@nx^XdZ|2*9o3G^-0jZwe@i*+LocrZ^l zMgt#Nt{Y>3hi=x5vB1Y})s5hLGcnq_FJmlpGnRW8E6I-gry>`lhwRX+$PV=8z z?=c%*|HQxvQ)_tr(`cXpJr?MI9w%@;{COwM3zBg@{rta|#`oS`BGx6o{^L;>8J$Ze zJs99))%l*dmIp^4=nTGxT7r8S;xaM$7#GLrTx@+$8ncA&wdfv8z^@y}c=veWd=dB} z(8T!2`&GuOqo_ZM`j>$(164b8Z$|OQP=5^duK-^Gc5z&N4`aitsDBmpuK`~Ril=)z zia&|^lc;|K_y&;L$Nus980&ga--G%$fp1#z>RyoI-$wo0sDB6e4v^YMyh-f?Ti!+e zyQqH;_+C&v-9yqi5yy*h!lgtsX0e<>GK~|pL-&~E=Vse9vK6pAH`tHvLCMdK;|?pT7vYT@sD7k09f6$L*?nSk*t+(#b~sgPaxYNB6j@f3W3{jrs@K9qdQ< zz^Z?+Eg>8A4|0L>Q}@U;u5Qd9Xx>r4cv#IPbMVoS{`l;uduioQ^Ahr>c?tQyfVt`>4Na9--_n!G3KYJIycD_WT#V%b@K6dV<@axrf>w*eYpzfL`UJ z<|S%-V5_I?0h+-+G-pxU1KU>G9$;Ir56x#(e_uv_uM6rgwL|k9W&bto^MdSZ{-f;2 zV9#;mB!&G)-!rNEG#VQXErvWG;4&bFmNbxi)c*7`H)B79a3A`ZQgOj8m@z^VPhI z&Xup?b2%5E%f53k%CQwWoEs)_4o${Z3WvlhHAkZyzd?>`k)!W?jdHXhN7{ykfqr~Q z*Fp3HxKhpMD93BaaUF8>o!e24UC5EKp)Al3#CoY8z&bVeqa3dz$2{cdI}fBBT{Irs z^b|E7AK@HN{QwrJc_QUFi5y5F)SQv#gcScgjm5S}9FMU;Ts`#x*r4W>6#rYq&qutP zW72$*;*ZieY+D#;|L=5dLHodRH4mluHxR!7@oG*=b5V+amBwIOS)hHynbbb8h2nmP zxP^#Qb4Z%MQk)NQ(>839W2feoG_R$Y9>nA#M$I#6u1hh85HokfLo!CqJ!uY1F~3L5 zBE+bLt1Zy{ze zV$__L=Gzq0L))`$lN<+{f0OUq@THxm=Djp0C*OBzJGMO}eQAzPzJGx4eE6!lG0oe_ z_Z0Pi+q2SF&6mml9r!PRznVMK9H0C@roL}`OZt=FAK|wUe)Q}DUE}CF`yu=U{8q+j zPVwQEPBrf)j(}p`MNBSY)I6Mc1B!`8Oi@g#j8SuQ;u9$5KM}JCF>0<(90SF~A*MWL zfs9e}cj6u><~_t*j~F$dCq9B=5)e}rQz~QB9G`d!iun^_79&Q@`H8!r7&mQCqBc?S z-A+8evV{7Zm^#rVo|e~k+E&F85Z^&@1=PR91%Wtv2BwhOPb{5C{)$T=Zbb13#F0>} z3x0Jm&9a}0V<7HC@eIVFkY76d8e$GeKgB%|x1#t5;#kNp3w|vz&p6fhFaH)OsChu<;k|2Me*ln*vJm!N!*`99q8 zEBM}u6uRH-%T0~CA*OT5;>ElAa#VEhOZf|5;<&*$E9LL`fc*#Ku*m-;@>gQ~4fIPy zTf7R*y9~B@AFN4{whTP#e{bcAwfGt<*W1(wj3JGU zn{yO;7>g9IMmfmO`@J10#I2AowF|zWz{R+d_!h;#DgXMHc5wekGB+gmFSdCc#AP(E* z^)U{gpW!%@+DD$j`57J`=4V*WpmFcV&#;a88J0uyGi+mihUI*fkFHhp9irMl;6|9A zfgfa7upfQLNPhHeAZ*Ofu$&d_7vN{u#{3M+?qI(FKf^ZWXIL&!essN|?=XoEF`1Wn zyp3^Ceun&spCSLD@H6C3{0#XIg`XjR;%CTzNc;@84P42}IL6TY4BN`H7^`R;6o(Vw zXV_L(z!;jJVOxurQJzyN&w1r%kQ-b{A#D%Pqxwp11o#=YRnqpr$E$pL@iT0zr|p3c zUmMU@@(J)WY}-oP10TLFSUwScCh}c5Jqw^mzAKM>S3b<|%4z;+;ye@JXQCyivP8t5 z13xpu;%7#l13!a)NPY(W2%DclKO{eceuT}>pdXT-K|jLcXV3@9&!7)s@iS;&@-t{Z zEPe*--la% z|M2{b7=WKq&%%V_XRxi3pJ5!FpD_mDXE;Y89oNV?Fh2ty@C5~QYy$`8XB-3YGaT=s zHjzj2Gc`#e_?c9TpJ5z2KZCrIpJ5z6Kf`hLbW9=7@c9|UVY|FO#^Lib9A{Gd$TK)U z!{fvJ49gjcuOJ>j@cazhn4e)eG(W>O=4V*WSNVGJGq45t8J1naegS@lZOqTGoE7XB z;Ahyz{0z(PV7~xA!#3t;ST0b0!Td}Yco2Gi<|yN!{0#XMKSTaQ;b+L7_!;sa3O_^s z#LtlbkoXyFo8)I0L-R9iE6-xAqH*ZQ&#b!?qSLqdcdccYX%B!Ic!oF?v*A zd+{@DtEBBYD?h`wdfJ||@-uAPO4}3RXCm)1NIoX={7mHenaJ}qfp;T_YubhTgUItU zk>_XVSm?pA5P5zk^8Czse|`q{yWj~{x)?7EKf}2L?O znut6<(|er@8jmi_>jUpMMV_CL@32JPZ_Ci3-fyD!o3KuUhXdhVBgSBIZ zy9N#70&Cz5%F*!nFbBeNfCK5p(Xfp<5S9ZR2>AuCL4)yT4ut&z97r#Y1~z&Jjd%gt zN06PtTu$H_8n!Wq#BxTkU*H)UwlRmqa)3jkb_37Qu#Gt+mh)A<;Ad#u;6jLVDPWXq z%FueI1Fb>Rcg>o?)}SHIh4L%TWw149h|?i|#pw*T294x2u#La~r_px}8rb@++0u)n z;rM=QwvbQo8Z^k$Z_SqT`i>L&Avq_;0O!=}85*|rTT`dkJ5KD=Z%v(E?>MnfzcqEp zC-RO{dM?x&G{{{=>t+DGXXR*MmwXXp;b3dfz_&cq8Z@Y{ z{35JDgRy9!wKjm}P}ZP`57J`=4V(A&Cjrn z`5Bf&^D}H?eum}H{0wXXeum}H{0!TepJ6#PKf^ZWXIKu+&#;a88J0uxGp5DQv<;D; zA%Eg$$R8-5xfzI`A%Eg$$bTsO4EYm3L;geJXRvLOpJ5Em&#Ya_8MF~!{%quzT{`Db$BBDOk|B<`ff0?2Ft+DY&d&Vc$)%izzmz8;rPHBFvI3&c)Kd;a}gFl!*PK%a0caPczl?j zVL3EE!#3t;SPsq4u#Nc{mP7M1um$)TmP7M1Y-4_g<0Ca|B(0@Y@6g~7(??jZ0omXOD}$g zefq80(u<#ApMGn$oL7DZZAgBGu}E_6);h$2HGHRq`{8gYz?ktU)86pW$5P1M)NQk^Btf!2HZ0YtTr3 zhVw{%W{nYopTYVOlAmE5IzPkP;`K2O|NSP83# z%+IhKnxA1C^D``m=4aT({0z&X`5D*({0z&X`5Cq`Kf`ipeuiz#&#)YtpJ5yGGc1SX zX9UKv@0v0Lz28Ls#LvVr4*L8I`4c}w{zKts$e;Kb@*fgEgKd-i3}a}1hHd@UZ0W_% zuus1=TYB*`?9*?}mh;Nbpbg2-Fox!5*w$}NonHJ5`}A8=rx!oNKK<6ziSRQKekQ`t zMEDuU!e(3K`I$az1V^5q=@gOYXCl9`jr9nk6Ln|gH@0=>dHIcPtn(oG8O964&v5Q4 z598VS8Q3L1!#FTM!#@4_8LkhXpZOfupppCx`a2|khW(hIVL3EE!`sjN49lVU8MZM$ z!}1r4p9%N*8EmWMXBY?PXN&=!pWz&V^E2K#rf; z4|Ez$j!r`Wn;o71X*4O4?UgYfmsv}%%OOoo8o~>C>~fq@p7)p2yR*Gfps@19`?W*8x#-y%z7Ho z<=maoL-Ei9>yS`9^uU@U6c0VHzDOL#{OM(8hu^=m^8>s$XyO@8`rXj2K<9=@$!);2 z4GWXiZ#<}V0G}uOwn-^R$-Zr2%B#SPZDlDw;Is{!k`DprZg?p9Ffez+v&lz*`5WF! z?jifOO)2k^ecMARr^vqT*_4mTzU?h}Pax38;P2jqWKN*eq5c)%E5N?| z-AUA+MEx7UH-J6B;Pr>zhW<9{-vPb@?90DBFluWjJy)%`B$Cy8RwS!6Zb(+^*N}V^ z@5#@_hlJ#pAYW_AFGIf0l79_(o+TfHoa1KyJdzsRb86Y}+z{1)WJ zmi#tkYJmLz0C~P8zXN%JCI1ofLQ8%Za;_!+C*(zz{2t`%E%{H77egL(LrmwA#fx{* z=Y)P|Q{XsV8KXI)fkiQ?&Nv`F^PB*ziYaxv$)2c9TtfE5)QJUTPh2puknD-26J5Z% zm}X}>up#DvGYi-f^NceG*c$Vu(?j;e=82VLPdqTuOZLQPCf1WZ@y&_!p0q$;qOs;) zH|mx&X-R|y4`!zj7Cy~L?+70w`!}>CLVCxSu<#{D&j%|~PC~ClJfZi&8mZSm&1n9< zCgp9iBc9Ovy&Y1oe~HojU`LAjoXl-%X_DvM%|^>yqorwHv%STzJK7EV-R;x;)9>;7 z{Wiz$87|p@z4YBS5iFmGw)JDm)B-S55>!CGozQ{<+Yj7q^&XPw^IlP3o`dZ2+HC#yHreI18S%gdvde3;_1oBFm)B<4fi3Es z6FE;ruFa8abL83_xgX~H-p}iP_(k2iHm6$GW=5xrF^zt671!(xH{&$=eO2glJ&d{Z zTdL6W1rXQRgg8bQ#Y0c0cwiRA19K=I=%#q!5{d^FP`tc0GZs<2yf!mdQM|l1Gd57X zyf!noQoOu2GkPdqUYi-c6fdvMj3&j)Ycu0N=CwI8Z|;5%^XSODIWljK%$w7@kK(iN zGUn4S10!73_~-tob5(z|S=VMflO*TOj7}G08vWjuyf!mVqu^4iSkrFeO5W;7{YUYi;JF|W;$d2{6242mF|EfBEtzkt>A1prA>t5~4}8b(rECC_6g*;=T@?D$M0l--)_!qO_yjkFpD8 zH_9HAy&dEIV#?0Y&Ah+K();|_;;xotCv8~4S_V(HEnInDn2Ygce zDeRMy(yIF%fxHg8!@Sn9rODxQY;%~7?KGw|wlt13wvINV#fXn@1#UJbOlYNTBs*rh(szgJP6%BS8t%z8)Y+hGQJOHI8_+fA#zmTNKzBgb zp*u!tA{u%$bXxy7daNc!Kpz3!4t>M~O~gWvh3ulA$L<9|b*mo+ez-UC`s8yB2C96?!W4(a=*DYa$JL8gyD0 zIW12Ulc7(BJ{J1qCnePPrq3cS3Z(6gW?K+h`EL^kwn=!wv?S8HM#^l8wOpif(?i5bvmK)(d~jA~8fK+l2hgq~BY ziP_L+L!StJ_MMuT3wJB{Zi-) z?$&S`Nbk;to(etp9!*>i{d(w=pkM!W4L<;s-n|5R8uTUiX<{k#rO+>fzVv<#WLkQ6 zKJ>}Z^Y?0E8T4h)FNePD0S!0g>D>j;)1enUq=^;KS3ti4`ie(2JV24&y%PGB&{uw2 z6SqLW1^QLcZ~3k!3ZWN5&wyU|15Mlp{Wj=VL%;3Enka%^1U(aa(a$vT73g1qehu`m zbZWu_-2*)fy60IUuYx`m`l?@P_;M+|yAt{| z=#{_L#8;tz75a4OUp=AWfspj>D(EwySG}%@_0ZQtp9y{a?=<{~Kzg?qdJc5&TbihW zUITp=^qN0vcwi*GyAJwn=yiY6#2wJ@fIbKM9e>fVsCIgHJ@mQI>;I;SjnFqjzZUw& zziar?D!sb_`gPD7KG8%Y^hW6Opf~z8VFFEE$2or8iRLI>Gy_{M(#0m=mQgwu=3aMV z>sVcE25z6Ai!H#`OLTEJ@ZJ<%Yz4MW(#1C5j>~ni9k}xtZMH(9OE|Ch)Oa zbf&3#pWdm9{lKUG zMHdeMJMPkj57@Oy7Y_oTxmy@l+K+Yd1n}g~bn#u_8=bm1 z4D5MU7vBTE`3qfqANck!b@2n>JHOJ!lfZX>t&1N5-#ej;9|7NgT^BzF{`q&hI08KN zmM(q*{Occe@l)XGKk4FUz>of-i>H7e|4kSF4*Y+A*F^{L(@%8K3H;2DalmstFZ_zm z{|&G2!o?|$&=5E;0;4gZCXB^InlO$g(7PX@1_TW5WmOd_&qHgzh^7Q@4*$B+TYX0@q2c1{2p|e+TXK}UP4Gm@VEQ!9U z$tK?Zae0N&xug34#%(*wVU!0VB{Bx`7;w&HA+!HM3pwZ_IV&=jXbzvoo_Yvs~%dR8_BET~Sl)Evc(q zU$U;ExVE~uuCk=AxW>D#wz#ZheR1vWUT<|x9X!g`maJa0el_eB#pM;Hx34ZY;*zSW;_BLBZ%y^8HB}Y0*VNXOT~k$c$GWRg^VN#7I;ptIOJ+>N-^{YIikz9# zW|hsHQZZvzdD-;r^0JbOSyReq&6-*^byaru6@hN|6Xf=5(ZU>I>t!Qtj2?xX*hhAh5|LR~ zQC}y$OV+I^Lv=+}O*Mw3yc`1}GRvyht*clM2jr?NDFs#{nk7b2eA}|=C2Q&`YNqC_ zT31<8TzY4Dab@|G?407#Ra0hQq;Ib+sVXkTP)^M*_SV*}T@*%0=Gt;mocrZv^KZOi zq1w=MLmF3%s?by#C0O|6C}m(&$DyjFSUR>r!C|-RY6{^ppl}2uZk_j(Iq3u%n z)p4lM4O{S^J6M5y4GP7p^F`qJ9?OpA3C6&`7fK6dpSPmc&V&A0RdkF)C8iG+7)2fpDmd3T=r3qLo?+4h);!^-p8(y1VP} zQ!pX@IHVPa3ipk7o^b*KTRG&y zMB{JXoA>6uH#;-CZ}Q=e;n&?RSEQaerH=ebT)fR7!rOOOIE|f47;g&^;J2B0$WmZ4 z-H6lu7#8?ohys^mY_!Q)bD*%yArXn!p_v}IK)?Z~bj(G&(V-D5^_Z`A8}HvR7t;<& zY!7d6du|}B(L;T7>+QcRg~eB`J)HBmK#$q0wJ=+y`D6~WG7qQuWFvBtuxUO5L=bS> zuI+E^d54h6%Th{-o02%H8k(9kKRIz;l#G~p=GbLXlC6xCq>?PFrunU*N($Ar zPAQp)X~GYW@tOFo+tlLnD6wO%L0ELWZg)WHZ;nawp zyw_L#=;EkyP0VPCsBWZ6-qJ_zy1o}PrfO2RsUoX8i+$vk^nEyb2npM$LL z6!guM`fRs2?jfJ%o5*K*FUjRw$me;X;LEvvzFgf>Kh!fhpFd!C+MU_@W&T|KveUT$ zqgQuk*9!ie-}mK&A8ZBMJ$C1crfehXE(hJ^Ux99?9k2u0`UcQ#IGyWc^y)x%Z6oTQ z%6G2x+JRLq*(uT;7P`AzPm-R2>7Jg}_E&m_R@4dO(}FM80si;|VaRrS#*cyAM0azL;1;cK*>0-!6;? zV}&OjVc$!|hK3afX*RwvvA~$Q{-VUpZPy%g zc7ZW-{Y8nH+pan0$pyyD^%o^(ZoB4~rxqA9*I$&Fx$T-`KDNM^x&ETW%x%{k^G6Ge znd>h~%-nX(F&|%G%v^s_V&=AMjhVfF3AqaO?DP734sPIG&opoF>#K5_Q@ZPd@2&vP zZyMMq9F%;jSk*JiI*^4vQf%6zpD~)U}vu z(6nU5S>{Sjomz3oY>jlvq_U(@Gm}p1kgUS}&ZrbiXffzlsiKZ#Vl<&ebqc8^m6A!S zn-ns_XdrF#mpG=OW`hfGEDqAHYs7M1Fp=^drq%E5_4$7PR)qz1p-V|13NgeE0 z1_w9D8=~Rx%josvSc=~*5S!oQp#=NHttC$+qNZCa5h3@5q?(8^EhEMyGfwy~<68z1 z4PB8eiHMeZ$SO&*Oc~f}($HZ_6a^-Ph_aqymkNX>=$14B7zb^qS!O=H{v?3kNv36*~AA8Ha7O-en1RzLF`BW5xbzj@@@e4A0qBQK8%xEAp!UjR2CPG zkM|RZ5$LNdkxByE>!C9J@;1ha;enR#w^5G?SCmcAX1Xi}R&bx-eKh3nrGn=R&Yzd_ zUj<}5)W6U7+5I+0z>D|fTz;N;vJ3c(2loPV3hG~(!*ygC3&(jE+BkQWqUYKj>Hudp o0WvxCAE#+Q@X&WI>Hz;g2F8zlXcy@?G@hiN=K3-Ua-6UKKhEypt;h9o^5!Pw+U-_Y_2fK z?eVRc=S^HvrEp#lTYh-Nr}cev!`8g@`5cd8{uJ`O@n}9ik&Wez$0p3mHazVG4S>+| zMYn@k!)Ox_A3Goa#>H;|th}mbwRuYw7j!eHXRMz+ds|e^l=V;Oc;;16P1prBqo|35 zZdt!EbyZW0T;>WIG|fbnV&&C@{;O2Rm{T(^iAFMM={DRJt@ID{(97bYnkneYLOQ3t zQgKn{3dy8y?ziWTPu%lwy*Xbt)3*NclNR>udWF^}Ra_fu*Z1qWD~h+;s#naH*F|4m zmwb8UeR zQ?vU00p+_FCRKbYOnE$+omZ8)%bGH;jmcArp&80taxD7zAH}?xUQ}(pI&RIRUKXuX z-c%AsA!kp&BwAOpb4KR=kdJP~1?>&7kV_{GGwaPyc`(7i@5O?pTZ(oCXPHhYmaQfh z{)`V=b*zEc#LMcUo-_>%5?inErublydDCD!m0>DbL(>DV@V(8(;OX{T*h(dGBM7YG zrZ+N@N-mAblM-!$i@Is0jogg@?Wnj@?ksHoKHPx2DZP)l%L&9ze1pXJiy_AK%%5s6 z)A9EN)3uHMifGuxKpBeLMb2cy5G+kG|o*AmTu!%oEgcGod%lHs~Cz}GI6M#*LwS%yEk+s`k^Ac;f z!={56L;WgZ8SxHlH^b(;tc|=FL-fxhI%r>KZM2sW0seCL7C^KB%O%Tg@aAB-jW;{+ zj~&}RcC!cn(rI_wQ6J&xMjUvy-+fC2r z1fM9j!Ou(W@O-HQUMO|KFG^kTV(AF{vJ`S7YYyP=DH2(0XcE>M4<28O2>nj~@qXuI zNLUjh>sJMwLw~rfVeQ5`j5Un)6xQD2AbeC}{T;69yM_1tHrlrL=>(^W zZkrTkxxCsY%WmI9Med8r;SH(SyD?Gh*^rAzH=@Np_`*NJlV&5so-<~ndx-N6A@3OH z9rMBK;Jgmx37jYR;2q|?!^rFBy#Bp;0dVgACny#Beg|%Nda9|~zOb#`*z8IeqrQ>T zO<}AoUX_?*jJo*PT{!k2AA4}`vE6RY>qg!X=MDMbg*h*bykX88_QC7ryk6v;;=EIP z^Qtk~M~p&OYjT`qKRMn|&+#44R6WP{-go{VIlf$z<0SjZ@dNc7uje?vTOWpfe5EGG zN%oWD2kSXr&+&&O$FJ4oILUr;yt$s^pLvdV)&Cyf^M96IVibC&CdWzkljE)R9Ixm2 z=Wrk2EysUUlj9`&$?^7jj@NViA<6OUH91bQpB%^Ul-s}a)xRC`?_!_Ux5HibsQS0V zZQoaSiP6(e(>vXQ6RTV`DBlIRgY{OuevV^6cTFUA5d=$X1t83kUmJlxaeE8b&vN|N zN3OxdC2prG8yPQkEaB zPJ4&otv`r|y78?hb`1JS&VfP98MY20o^IoLIf$uTL7Z2udEh^p-Znur4NbLG5N&IARZFLTD9@!PCf_~0}{rIpmVznM9|AfLeKFyyng7(_8etO;r>3QSbcxxb;;YqA)T{M36z5$%6f5J?eo(!~K=(KG!^3&hGo8fk1jK)h6dZ|9_-)iv>}p+w6P4B_u53qSel&l|pYNfL=FVL` ba5$gVPc|Bt+Vd#v#{Yox@mk)NvYr25>YdBA literal 0 HcmV?d00001 From dad4961bd62678af7119d8bacf5ac8443248f8af Mon Sep 17 00:00:00 2001 From: "Jim, Guo" Date: Thu, 21 May 2026 14:14:12 +0800 Subject: [PATCH 3/4] update --- hsa/gfx1250/fmha_v3_bwd/fmha_bwd_dqdkdv.csv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hsa/gfx1250/fmha_v3_bwd/fmha_bwd_dqdkdv.csv b/hsa/gfx1250/fmha_v3_bwd/fmha_bwd_dqdkdv.csv index f17eacd60b..2ff9e1eafd 100644 --- a/hsa/gfx1250/fmha_v3_bwd/fmha_bwd_dqdkdv.csv +++ b/hsa/gfx1250/fmha_v3_bwd/fmha_bwd_dqdkdv.csv @@ -1,3 +1,3 @@ dtype,hdim_q,hdim_v,mask,atomic32,pssk,pddv,mode,bf16_cvt,ts_qo,ts,knl_name,co_name -bf16,128,128,0,1,1,1,0,3,32,128,_ZN5aiter28fmha_bwd_hd128_bf16_a32_psskE,bwd_hd128_bf16_a32_pssk.co -bf16,128,128,2,1,1,1,0,3,32,128,_ZN5aiter38fmha_bwd_hd128_bf16_causal_br_a32_psskE,bwd_hd128_bf16_causal_br_a32_pssk.co \ No newline at end of file +bf16,128,128,0,1,0,0,0,3,32,128,_ZN5aiter28fmha_bwd_hd128_bf16_a32_psskE,bwd_hd128_bf16_a32_pssk.co +bf16,128,128,2,1,0,0,0,3,32,128,_ZN5aiter38fmha_bwd_hd128_bf16_causal_br_a32_psskE,bwd_hd128_bf16_causal_br_a32_pssk.co \ No newline at end of file From 8820afc6f1af9367b1b2c4afbebfa0ed5d5efac1 Mon Sep 17 00:00:00 2001 From: "Jim, Guo" Date: Thu, 21 May 2026 14:14:59 +0800 Subject: [PATCH 4/4] update --- csrc/cpp_itfs/mha_bwd.cu | 139 ++++++++++++++++++++++++++++++++------- csrc/include/mha_bwd.h | 36 ---------- 2 files changed, 117 insertions(+), 58 deletions(-) diff --git a/csrc/cpp_itfs/mha_bwd.cu b/csrc/cpp_itfs/mha_bwd.cu index 2cf92d4845..df42f73c99 100644 --- a/csrc/cpp_itfs/mha_bwd.cu +++ b/csrc/cpp_itfs/mha_bwd.cu @@ -1,10 +1,120 @@ #include "mha_bwd.h" #include "aiter_hip_common.h" #include "asm_fmha_v3_bwd_configs.hpp" +#include +#include #include #include +#include namespace aiter { +namespace { + +struct KernelArgBufferWriter +{ + template + void append_raw(const T& value) + { + const auto* bytes = reinterpret_cast(&value); + storage.insert(storage.end(), bytes, bytes + sizeof(T)); + } + + template + void append_ptr(PtrT ptr, bool compact) + { + append_raw(ptr); + if(!compact) + { + append_raw(uint32_t{0}); + append_raw(uint32_t{0}); + } + } + + void append_u32(uint32_t value, bool compact) + { + append_raw(value); + if(!compact) + { + append_raw(uint32_t{0}); + append_raw(uint32_t{0}); + append_raw(uint32_t{0}); + } + } + + std::vector storage; +}; + +struct fmha_bwd_odo_logical_args +{ + const void* ptr_o; + const void* ptr_do; + void* ptr_d; + uint32_t Hs_o; + uint32_t BAs_o; + uint32_t Seqs_o; + uint32_t Hs_do; + uint32_t BAs_do; + uint32_t Seqs_do; + uint32_t Hs_d; + uint32_t BAs_d; + uint32_t Seqs_d; + uint32_t seqlen_q; + uint32_t head_dim; + const void* ptr_qseq; + const void* ptr_qseq_padded; +}; + +bool use_compact_fmha_bwd_kernel_args(const std::string& arch_id) +{ + return arch_id == "gfx1250"; +} + +fmha_bwd_odo_logical_args make_fmha_bwd_odo_logical_args(const mha_bwd_args& a) +{ + return { + a.o_ptr, + a.do_ptr, + a.d_ptr, + static_cast(a.nhead_stride_o * 2), + static_cast(a.batch_stride_o * 2), + static_cast(a.stride_o * 2), + static_cast(a.nhead_stride_do * 2), + static_cast(a.batch_stride_do * 2), + static_cast(a.stride_do * 2), + static_cast(a.nhead_stride_lsed * 4), + static_cast(a.batch_stride_lsed * 4), + 1u * 4u, + static_cast(a.seqlen_q), + static_cast(a.hdim_q), + (a.cu_seqlen_q_ptr && a.seqstart_q_ptr) ? a.cu_seqlen_q_ptr : a.seqstart_q_ptr, + a.seqstart_q_ptr, + }; +} + +std::vector pack_fmha_bwd_odo_args(const fmha_bwd_odo_logical_args& args, bool compact) +{ + KernelArgBufferWriter writer; + writer.append_ptr(args.ptr_o, compact); + writer.append_ptr(args.ptr_do, compact); + writer.append_ptr(args.ptr_d, compact); + writer.append_u32(args.Hs_o, compact); + writer.append_u32(args.BAs_o, compact); + writer.append_u32(args.Seqs_o, compact); + writer.append_u32(args.Hs_do, compact); + writer.append_u32(args.BAs_do, compact); + writer.append_u32(args.Seqs_do, compact); + writer.append_u32(args.Hs_d, compact); + writer.append_u32(args.BAs_d, compact); + writer.append_u32(args.Seqs_d, compact); + writer.append_u32(args.seqlen_q, compact); + writer.append_u32(args.head_dim, compact); + writer.append_ptr(args.ptr_qseq, compact); + writer.append_ptr(args.ptr_qseq_padded, compact); + return writer.storage; +} + +} // namespace + std::tuple get_padded_hdim(int hdim_q, int hdim_v, std::string arch_id) { if(hdim_q == 192 && hdim_v == 128 && arch_id == "gfx950") @@ -369,6 +479,7 @@ float fmha_v3_bwd(mha_bwd_args a, const ck_tile::stream_config& s) int ts_kv; int ts_dq; size_t arg_size; + const bool compact_odo_args = use_compact_fmha_bwd_kernel_args(arch_id); AiterAsmKernel* impl_ptr_pre = nullptr; AiterAsmKernel* impl_ptr_dqdkdv = nullptr; @@ -429,34 +540,18 @@ float fmha_v3_bwd(mha_bwd_args a, const ck_tile::stream_config& s) if(a.v3_api_check) return 1; - fmha_bwd_odo_args odo_args; - - odo_args.ptr_o = a.o_ptr; - odo_args.ptr_do = a.do_ptr; - odo_args.ptr_d = a.d_ptr; - odo_args.Hs_o = a.nhead_stride_o * 2; - odo_args.BAs_o = a.batch_stride_o * 2; - odo_args.Seqs_o = a.stride_o * 2; - odo_args.Hs_do = a.nhead_stride_do * 2; - odo_args.BAs_do = a.batch_stride_do * 2; - odo_args.Seqs_do = a.stride_do * 2; - odo_args.Hs_d = a.nhead_stride_lsed * 4; - odo_args.BAs_d = a.batch_stride_lsed * 4; - odo_args.Seqs_d = 1 * 4; - odo_args.seqlen_q = a.seqlen_q; - odo_args.head_dim = a.hdim_q; - odo_args.ptr_qseq_padded = a.seqstart_q_ptr; - odo_args.ptr_qseq = - (a.cu_seqlen_q_ptr && a.seqstart_q_ptr) ? a.cu_seqlen_q_ptr : a.seqstart_q_ptr; + const auto odo_args = make_fmha_bwd_odo_logical_args(a); + auto odo_arg_storage = pack_fmha_bwd_odo_args(odo_args, compact_odo_args); auto pre_kernel_launch = [&]() { - arg_size = sizeof(odo_args); + arg_size = odo_arg_storage.size(); int bdx = (arch_id == "gfx1250") ? 128 : 256; int gdx = (a.max_seqlen_q + ts_odo - 1) / ts_odo; int gdy = a.nhead_q; int gdz = a.batch; - impl_ptr_pre->launch_kernel({&odo_args, &arg_size, gdx, gdy, gdz, bdx, 1, 1, s.stream_id_}); + impl_ptr_pre->launch_kernel( + {odo_arg_storage.data(), &arg_size, gdx, gdy, gdz, bdx, 1, 1, s.stream_id_}); }; fmha_bwd_dqdkdv_args dqdkdv_args; @@ -588,7 +683,7 @@ float fmha_v3_bwd(mha_bwd_args a, const ck_tile::stream_config& s) (a.cu_seqlen_q_ptr && a.seqstart_q_ptr) ? a.cu_seqlen_q_ptr : a.seqstart_q_ptr; auto post_kernel_launch = [&]() { - arg_size = sizeof(post_args); + arg_size = sizeof(post_args); int bdx = (arch_id == "gfx1250") ? 128 : 256; int gdx = (a.max_seqlen_q + ts_dq - 1) / ts_dq; int gdy = a.nhead_q; diff --git a/csrc/include/mha_bwd.h b/csrc/include/mha_bwd.h index 0afaeadd22..9652db34eb 100644 --- a/csrc/include/mha_bwd.h +++ b/csrc/include/mha_bwd.h @@ -247,42 +247,6 @@ struct __attribute__((packed)) fmha_bwd_dqdkdv_args p3 _p43; }; -struct __attribute__((packed)) fmha_bwd_odo_args -{ - const void* ptr_o; - p2 _p0; - const void* ptr_do; - p2 _p1; - void* ptr_d; - p2 _p2; - unsigned int Hs_o; - p3 _p3; - unsigned int BAs_o; - p3 _p4; - unsigned int Seqs_o; - p3 _p5; - unsigned int Hs_do; - p3 _p6; - unsigned int BAs_do; - p3 _p7; - unsigned int Seqs_do; - p3 _p8; - unsigned int Hs_d; - p3 _p9; - unsigned int BAs_d; - p3 _p10; - unsigned int Seqs_d; - p3 _p11; - unsigned int seqlen_q; - p3 _p12; - unsigned int head_dim; - p3 _p13; - const void* ptr_qseq; - p2 _p14; - const void* ptr_qseq_padded; - p2 _p15; -}; - // dq_shuffle & dq_convert post process kernel args struct __attribute__((packed)) fmha_bwd_post_kernel_args {