From d7f82d6e9f2e3398fd51ad462ec603dda53e6442 Mon Sep 17 00:00:00 2001 From: Sergey Solo Date: Fri, 22 May 2026 00:05:43 +0000 Subject: [PATCH 1/2] Replace fmha with a new kernel --- hsa/gfx942/fmha_v3_fwd/fwd_hd128_fp8_causal.co | Bin 0 -> 66400 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100755 hsa/gfx942/fmha_v3_fwd/fwd_hd128_fp8_causal.co diff --git a/hsa/gfx942/fmha_v3_fwd/fwd_hd128_fp8_causal.co b/hsa/gfx942/fmha_v3_fwd/fwd_hd128_fp8_causal.co new file mode 100755 index 0000000000000000000000000000000000000000..6b564efe4e238c60903a8b2e05026a51b3beeaab GIT binary patch literal 66400 zcmeHw3w#vS+5Xwhg}{;u6c8xjf~KMMGa?3vfRNdQi;9Sdii$3qWJ8wRHoKb;Zj;SD zga{#^q9R1@0s@Mcfcqtjmr4~{0@%{l?`yo&OU2f<+WPftyWe{*lRYyToXxEC-~N9S zeusC?Gw*rk-Dl6t&biE)ikWj~nN6njSVaF~P!}?>ZEVunc2|gQ%#FiX8cKlwzk~Xt zcz|))bh^8T4KgyxCNMt?OUGo8++54Dk{v+vxEzFlkc3aAIAV5DC(TP1apiEVi~h#; zkEDSuV%E)Q_QkLbR?ZCBNXkj&rO&tEH@+PF`64|YlL_4kiU*J^yZmst!2{bMm~7lzgYB$XVds*t9#v;aTc^7d6!#N^xZS zN*x7uM|QT;>)r2hI&$pGJuaUU_T=Pv0JD4Dj%??ir3J-Vj)L7OMUFz}Znw{4FW#F{ zoSW-)`q1tauWO|<;Xq26qoCAj&vzB&JWJi}S*5wTPERc@w$ts%kqRZK6xx@y!cHh8 zCf^FTbEoUZ3Cb{^Az>C56|B&VvP3%I0F?u3MmbES(B+6SUy9PIazM=}$Eg&$98s1@ zQI1zRpk|b5Dup7BGQx_v7gx*zuTu&%UFDdXfzDAWbjd(j)!yUHb`&@~(sAcgidd;4 z<5NP5fsrejNgvmS_pK>L36SY+(^vbuh?8*cC0ozGw*!>LA$dwJ$AkNEd`tM|4ogRCpm; zg0^T00y+rxdF}bKE90r44i=dZuSZL8GFpODItbtgBA>rBoYBD{li_@{1Q((ucvlAj zk%7B7bm`!b$?##c1fN7pa8U;VTpWtszV3^|7dj|pDts9&!Pn6e{7nY|sKDK{pEe;K z9J~ytO|elkBt*^7UkAfMxH^<6@1swf2I!!Y3oND;p0I&E_6AdySqi<+T4YKGN17~m8sR-7V!9W-(=YNBRnh?=2E2LsfL zV%aIutb;@@#pb9PwnWXaRR;r+f^nme*H?hK{rRadjT znw^iD;X>35@Aig4-#2$es%CZ=Ow4f=Ee%(j3Q`U-FgQ7h04(* zsCHkg6#JHgewHY0&vD|Rt?m)E(`KY{Xzhs22`a@dN5uI(BOah~=m;Vnrc&&3L|oQ0 zVynubBZzpMO0i2u%q~WUywc&J6~+DXna5Ky{g)(jYSavAQ8P@_!61_)Jc@JLoUVgR zc9k|q-J~?ni8=;7O-fFZd|54-A9Xkjqh?qXHN#>Z3^GahTCzk3nY@-n-M}>GMID2l z1|}y-nXHz$qYlRxHA8vS46Ai8$Ry!wiC+hqyp}}W+%z{t9Ye30o5xa?dWuWk(7nC1 z&{^cO=eo+BId(j>{u4Y&@g*`r4Q8fwJ2(nlON$a-?1AGVr_If6DTR)5dv1ZlXJ20I z$!FwbAEGnPevxFqf8Im~OlR{nbj4ktP+=seiKn@vcEJF#AuMKgssu zX@Ub`#$)h@6i8<{_LtHKip8c6qS&|vQ0jyZS)Y!K`K5XBGvLCB*- zbMOq9bg*4ugMiP+`ysysY!Iv)Y&zh6un5XG<2=|_oR5nSA`sdSb^sPU9^wSsh|JbT zbF4KK8~gnbeAbZJOf5ph)}eT72W-d0troW8-Yw z?WiHy*09z!z0$V97P7S|qb_4@#`+8!c*|`JwiC>%FvZ0 zzmDaHhlZ2Z6a9%*{WGW)CbNB$GFk@M z2KWd12UZQppjNQjT4s#SXvwf;_|vbl5u1>a;P312TP55C<`&_521Bn&8TmvFVPLk*N=vMA+(!h7!qsh_8O?_iZgU*dIbI z%x4qv85kN!d?3#Hsma)<1$^q5Pd)L83&jy1h@n1|1oaPMvw_bHTo++`l&%PDv6YJMoEq`{DeK^h6k3h5e1;~@P9r16kaA*Df@1}PmdX6Wg&Zp{B!xHgx>3%C_FNb-Sx-n`=yH z+jbXI#Op-wVOEAD_5%-}^Wp4Cg<< z=i0FoCtS1qtNm!eon2kX>_7I}Ypz)#$V2_W*n1dz;|+@8Yb;>zW9-d0DF)Y9jC}x@ zYD}lNm`)*iOs9}Mrc)>!6OI8sZ;!a*RN}%93@GD*pZ0HzORrWCSMUn>4T9tGdf^{R zjPZH}2U3jJD+u=t*za=a75uJBT!U5O8ln={&~RK+N4;8Un_#OnA=}HghFaG$bmGjb z_rfD5a9wVztu@)MjqR{qYpb<|X4d&bq0qZA$1AF#KGwK*C%La^f}hq#NEa`hhC7A? zYU8ccCXA!D9~waR_fMeqJ6383jH7lCuA7+WuYr=p7vLx0uK{o33wNW?*^<4kv)*T1 zs&(xgSJ%!d>e@L)T{{=9YqPuSe+;Srvt{)kFsT0o>#qNRLH#FKcm0o1um9({n9eI= zIoC}nxDJCIjq5O2E3U&} z$7arq?N~CjrlaD`rjAtfmve*rn$4s>?`QQn6t2(6Ov((Fe1|5Ujr~^ZIId6W^qSa( z!ulYAV((#0s^>O}y^k@eo@Y?(1Hck!tV5+(hYh?$tWkWIC{ra?xEI9NbXlykRAQYS zj@3$I9VW#(V#p<8jplr~gI+s%U2y}6{vL#N4cAI7X>fRAD%K8%e`0zQh7_!a_&&tWt+93RGpCN79epcsh{W5Wh6h_h0R z#D}pFLl(r1qZo+~V}XF(8QKFWV(0(FcKrih7APFOfV87#zqVQEQVktMvRRd4jAYPWy$xAvC*Rd z8$&P>BgU*_0E5p5ek4YWjZFgVYJ!c8Lw<9Hw^B zQEKmhncBs#Q2S5^wM$;3HtS7lbKa&l_ovk6ouhWyi=6}3gbrq=xdwVsc`#<(j? zL>GTYD+*p)6+DrI@7KFdpf8DSLto*0A=sB}L#NO9gE6QLq0Y8XEhZa!i{wv0{={b% z(}|MmD>3#)HNGkFcM#mv*M!^ABdTJoZ84sT)bu0gQYUteo&|*4%j(hZv}fR*tuZmf}IC;9@zO{=Yw4Ub^+Lh zU>AbD2kbpy7lB;__I|MUgIx@EG1!N|J_L3N*d<`Iz-EEX0hB4Gph9HXRf?pJbD$yp?h#5kim?<=fGlfQRme3^57S@Y%gl6#;p+&q^*eu>A zJSxr=wurY2Pl)q`t>PWRHgUetD&8sV5Elr$#Jhw&;zD7cc(<@$yhk`F-YXmu7YRqi z`-JDk`-K<82ZWcz#X_6-pm1D#NC=2_p7_6vOVWxv2* z{@gF{KVSC?{O@1;1^)iGeu2`KZ+7d`gvILToErB2W zz!JFjT1()%>nwrmueStlxWN**@kUEv;zUbe@?=ZkrkgB*sZ%Y1n{T!RX3T)^POtjyo)YJMXjv?z+noxchEP;NE*Jf&1>W1Ri+6 z5_s@IOTcco1RM@aAUoRD6E%Z>u-MiU~W;X08JZ`B@vx=qAX|*#s8yw?1Y@Qy@S0 zNwAQgx7`f=fjiF5-w77-3-+4PG{`SJ02cE195$nyA;0J-SjgZ1vKdW>{Nh)@LjIu+ zGnxVUC9i>neAb(0xb@v}Hs@`ykk9?88E$)boXtB27V^t}Zbml97yJq=#iiw|R?*8s-FhcRmld4BvM9@>}XdNuY6F-nM0T zzarNT*3qwGrLkV#wqnhleJ$f?te3ZK+1>xh#j2zK1uY1-hj3lRZOa6T$=j0@ zleZ@+CT~wJ3D^BhQum?#M=^POl4A1qB*o2pZ%=}*J=&8Fg2~&H@cDYQC$kA=v?q=Bq|u%<+LK0m(r8cq|J9z9erut9;KTO= zxIHQT)-sH=C#Bz7hLQH9^jpg?(w>xlYZ*q`lhSW3!$^Bl`mJRcX-~4>THFSyfmjB+ zpbf{xhcVKg;S zhcVKgz&f)T<Oun+cm3?cSUg(-9q&UrtAzY!PO zU=%7+PE*`Bb0+vPSMNB@@!!np*TEUn=bJejl$^g|dj5v#`8m_`^IquLp!n<&aBAwQ zgHuya9h{naDmg!5dVa+8{2kNtcfHWFS)peHoSJ&-;MCMp2dAc^aBAwQgHuyaCFdth&rg`1Uobtt=!Kp&3Oys>)YMZ4r>34dI5qWD za(>G6{FLeW2d3vAdZA~NLeB^|HTBfNsi~(9PE9?PoWEsy{+8+aN2cc=d!grMg`N>` zYU-(jQ&UeJoSJ$nIX`21e#Z3t6Vvlgz0h;3LeB^|HTBfNsi~(9PE9>IPCQr1$L4^C z=h5Q%;56aq8W3y$Ml-MWFM}21o^X%;61zu#iQS{W#O~2wQoBbthsRt}aT^@QL#6w6 zwfU{my*wVnhq=A7F+aQy-`}gv-^Ba=ZSMJliaH!YJ3_M#>)_O^!#X%M>#&kDg8OsL zHq=Rlo)K_r>ZyZMQ%@b7ntCcZBe*};)bor&&j>g*_0++ssizK3O+A&I5!|0^>Ulw- zX9S#@dg|cR)Kdqirk+a92=31{^?Y5SX9S#@dg|cR)Kdqirk+a92=31{^*p7}GXhRc zJ#}zu>ZyZMQ%@yl1o!8fdY)J483CuJo;o-+_0++ssi%@Ng8Oq#J>OO683CuJo;o-+ z_0++ssVB!tp4A-0#-P}UQE z+_Lp3y8FlJKG`!m-TmWqpL+i|-6wkn<=?7*+^=ZweH;DbIyg1kdpbBZ+j~mR2>P-$ z+s_ROJtN@MY(MMZ)YMZ4r>34t&ItOlHT7&(=ota0rk*-DHTBfNsi~)uGlITsO+B|L z^o)R0Q%@b7ntJNs)YMbS89`sRrk*tlJtN@M)Kdqirk*-DHT6_-M$nh7sb`Zy&j>g* z_0++ssizK3O+A&I5%gti>bY5=X9S#@dg|cR)Kdqirk+a92>P-$_1vn^GXhRcJ#}zu z>ZyZMQ%{bQ^p7v-**|VP)4U+gniZZ;vk>OtkTtR>c6py^PFLv%m!1(~&p4Cyv7pTY z&maF&o@d5u?!-c(bES+mvhbQ<#u{118d=5~S;iV!Mql{9&pKJEec{sYE%cN7$TP3T zGtI^`&G?yXSkvk`vZ^B!)+?W^woaC@4)cZMT5FJJLK_a&0p`~>Ml&g9ti!B- zZFgfGX0~qDc+&opubV|N`MOyYldqdKHatd!+m6OM%yg~i-mb%(Syuw@Lh-te)mE>; zZO}JddGPKbE}y>))_>sgg$1aZ&leY=20mZnMr--J*Ml1Qe5ntu=4zZ#&sP8@(AOTcVA_Uu7|;a3B6 z`yD<+usJ;T+!2D^!ed9DC)lk#_Tmc!yN$LS6yAOWW%GEf6W(eB7`)^s{5j5@CD<|^d*?jC@_Fp%KO>l%$9{2vU?n{E zt6vh#!(;#TF2TG!_Ur#9SUHcq*F~@uJodr+1Y5~tzxj}0t9b0=kC3qrys-|vu@1bk z4!p4rys-|vu?{@#A2Zg0XY2Nv$oLys7jHAP@r-rg@w$SFb>N|Yt74F1J%mfVH_uoH zp1$*vyt^+JZbX%RTit8>h2PtSy4UstOnM($A6gc_x64=up01OE-zAE5{kGo2XRHHn ztOK6|*L>2S1#?0oT?gLi&ocV6jQ%XI%jnPYx{Us;bDhR}4UP928t*kU-fL*Q*U)&c zVXxk6Xw2D)Pv_qAfZsV`%-Op9bGA19naPE9>^aBAwQgHuyaC1(WB8*A!$Mxkc}oSJ&- z;MCMp2dAc*{3>k2(1;MCMp2dAc< zIyg1;RB}e}ys@U9rxbcdz^SRH4o*!yb#Q9xspO2{d1FmI&nxtdfKyXX9h{na>fqGW zQ^^^@^TwKbzN^qP0!~dmb#Q9xse@BfPma@A2 zjL{imbWZg7UsnO1orinw!FbJu?sXNUdwqtHcg0Kh`V1rQ zikI&78D`8mH|Ct<_t9G`TjH(r$+Ok+cg0hTyel5Bsqj&J&+)lly(iliettU@jw^cy zAKhoHF>I_ceBl-2z1tU#tGshuv;VF^(RUfaSej;kXmjs5HTy$#aBB94>fqGWQ^^^@ zSemAuTNHXmz^SRH4o*!yb#Q9xspO1cEKO6-8ik$_aBAwQgHuya9h{naDmfz^aBAwQgHuyaC1(U34dI5qXu!KtYy$7!rF%+`O(B5exNZouPp!v-2_43ABM@w#j}4yU%purU^AjKvva zamHAjF%}2wFu|JR#`AK` z8H~xoyEBL3F^nhDA+Avw}5>D>=R(Og53&s z8`y1NTfw%1-2rw7*j-?Ef!zak57>QR_krCHc0bsIU=M;l1ojZvBVdn!eID%dU_<6t zD-a5WLgv@8tq7TaR1tzL8=7;*_o*3mp_84yF>z*CCAHIcVnZLSb=MmfQ?t8d4E8Dg zjQW%^pHlES#(aFpboSV5AH-w7vd^hs8S^Vs;adKC>a(2rEa$jNIj-_QP``5KSFXaf z;!o6P1@l?Kag}jgE5D+CE1BO)6|Pl(p+2ja&nk{spEb;9 z4aZf^afyGYej@V|Rk+~GBHvF1^QqvtR&ZQ?Qw;I*Ge5ryS7i+KsboHt9M?*Yt16EA zRWZLR6|U+&)Tf&HRC8Rb7?i4D_`ETd^z}+bLCe;exIQ%es#>RPKB%f3hGnOeCj!_ zH5^yNKBCEHnd)C|4P)}EL^;U$usI2!D zto)ju0V$rb>oU#kk&(;3v~Y3B1Z$JNMj?U+gZb}+vkDqK5fQ=gs8XD7$Cj&aG(y*gHRYkJmQ z7z*#K;p%Q3zGzj=PwSyHvE^eH-=J&3txqS~qdu z+n(F0-yY_-M}=$e9n@zp^V!RBJ;UHFI2t9;AMUnBO54uETcf zbC~%Y=D0R;Tt^(#?+EidqQdoDHuZUq`8>yQwQyX|JE`CE%uA@t-&r#-cl;hgO zxMb(wdRBj%de&ct0)!+4yU$3nH8(8^=dzOC~{2%7ZZ-D#;W%(9XzPV?4n5xy; z%$09}e2cRDCRTo9&+;3=eyq-#CO<+=9SBu+;3b3cU)K+) zwNQhGF&oEh!4K(1KbD`*>{tH?730luY=11n=dwJ@cAL4bqI{=kidDEh(>ixQycYQ%|(Nd?!?soX{iX4SbySLcx%X9eb9(SSFp6w{IdrRH!Vvi3zvhy8FT}4Y_ zztf)M%qm@KcNOIp+jEOOc1J;hz1VAadx~>i1y1kvUQhP*1qEe=*8#H3ne8JPYi{cJ zw8^=XvQj6c<&Jkw%AGiQqBF~po#se$OrDUM zpZuE~$7^mP)CXNq;72iRPC$!B(ocx9_%8?2pqM@gO(ogx{~BRyAc{vp_I;;Ids2#u zeNL2;v!ckmq7bDlEhSp*Ki<#S|#&4bc(i&|9R$Z{6gC#2mlo^6h|P0V1MlTv4mi_;wKP^(BAO2D4qs(hh2+^Hf4{S$ytG z#iZ+jWaltPd_BO&N?>{EI&lT${~QVxOXa2Ofn={?{?dM_JcH42*udPy25B2xED!!%62DZRC7TZW_`h^M1KZa_!sVswY4>?@ zEp=SfNp1K4*qQsA2-~ Date: Fri, 22 May 2026 10:19:33 +0000 Subject: [PATCH 2/2] Add FP8 PTPH ASM prefill path Route per-token-per-head Q/K and per-head V descales through the ASM v3 dispatch path so the new SP3 objects are selected without falling back to CK. Co-authored-by: Cursor --- aiter/ops/mha.py | 12 +-- csrc/cpp_itfs/mha_fwd.cu | 6 +- csrc/py_itfs_cu/asm_mha_fwd.cu | 60 ++++++++++++--- .../MI300/fwd_hd128_fp8_causal_qkptph_vph.co | Bin 0 -> 82288 bytes .../fwd_hd128_fp8_causal_qkptph_vph_group.co | Bin 0 -> 82536 bytes .../MI300/fwd_hd128_fp8_qkptph_vph.co | Bin 0 -> 82200 bytes .../MI300/fwd_hd128_fp8_qkptph_vph_group.co | Bin 0 -> 82448 bytes hsa/gfx942/fmha_v3_fwd/fmha_fwd.csv | 62 ++++++++------- hsa/gfx950/fmha_v3_fwd/fmha_fwd.csv | 30 ++++---- .../fwd_hd128_fp8_causal_qkptph_vph.co | Bin 0 -> 82296 bytes .../fwd_hd128_fp8_causal_qkptph_vph_group.co | Bin 0 -> 82544 bytes .../fmha_v3_fwd/fwd_hd128_fp8_qkptph_vph.co | Bin 0 -> 82216 bytes .../fwd_hd128_fp8_qkptph_vph_group.co | Bin 0 -> 82464 bytes op_tests/test_mha_fp8.py | 72 ++++++++++++++++++ 14 files changed, 183 insertions(+), 59 deletions(-) create mode 100755 hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_fp8_causal_qkptph_vph.co create mode 100755 hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_fp8_causal_qkptph_vph_group.co create mode 100755 hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_fp8_qkptph_vph.co create mode 100755 hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_fp8_qkptph_vph_group.co create mode 100755 hsa/gfx950/fmha_v3_fwd/fwd_hd128_fp8_causal_qkptph_vph.co create mode 100755 hsa/gfx950/fmha_v3_fwd/fwd_hd128_fp8_causal_qkptph_vph_group.co create mode 100755 hsa/gfx950/fmha_v3_fwd/fwd_hd128_fp8_qkptph_vph.co create mode 100755 hsa/gfx950/fmha_v3_fwd/fwd_hd128_fp8_qkptph_vph_group.co diff --git a/aiter/ops/mha.py b/aiter/ops/mha.py index 7e0379e285..b06eb25dfa 100644 --- a/aiter/ops/mha.py +++ b/aiter/ops/mha.py @@ -1299,13 +1299,15 @@ def is_fmha_v3_fp8(): ret = ret and ( q_descale is not None and k_descale is not None and v_descale is not None ) - # support per tensor and per head quant scale - ret = ret and ( + pertensor_or_perhead = ( q_descale.shape == (1,) or q_descale.shape == (batch_size, nhead_k) + ) and q_descale.shape == k_descale.shape and q_descale.shape == v_descale.shape + qkptph_vph = ( + q_descale.shape == (batch_size, nhead_q, seqlen_q) + and k_descale.shape == (batch_size, nhead_k, seqlen_k) + and v_descale.shape in ((nhead_k,), (batch_size, nhead_k)) ) - ret = ret and ( - q_descale.shape == k_descale.shape and q_descale.shape == v_descale.shape - ) + ret = ret and (pertensor_or_perhead or qkptph_vph) return ret def can_impl_fmha_v3_fwd(): diff --git a/csrc/cpp_itfs/mha_fwd.cu b/csrc/cpp_itfs/mha_fwd.cu index 1673fa6a05..bb57851b76 100644 --- a/csrc/cpp_itfs/mha_fwd.cu +++ b/csrc/cpp_itfs/mha_fwd.cu @@ -29,6 +29,7 @@ std::string get_kernel_name_key(const std::string& arch_id, int hdim_v, int mask_type, int bf16_cvt, + int qscale_type, bool mode, const CFG* cfgs) { @@ -42,7 +43,7 @@ std::string get_kernel_name_key(const std::string& arch_id, } if(cfg.dtype == data_type && cfg.hdim_q == hdim_q && cfg.hdim_v == hdim_v && - cfg.mask == mask_type && cfg.mode == mode) + cfg.mask == mask_type && cfg.qscale == qscale_type && cfg.mode == mode) { if(arch_id == "gfx950") { @@ -232,6 +233,7 @@ float fmha_fwd_v3(mha_fwd_args a, const ck_tile::stream_config& s) a.hdim_v, cfg_mask_type, a.how_v3_bf16_cvt, + a.qscale_type, a.is_group_mode, fwd_cfgs); auto it = fwd_cfgs->find(kernel_name_key); @@ -374,7 +376,7 @@ float mha_fwd(mha_fwd_args args, const ck_tile::stream_config& s) #endif #if FAV2_ON - if(ret == -1 && !args.v3_api_check) + if(ret == -1 && !args.v3_api_check && args.qscale_type == 0) { ret = fmha_fwd_ck(args, s); } diff --git a/csrc/py_itfs_cu/asm_mha_fwd.cu b/csrc/py_itfs_cu/asm_mha_fwd.cu index d186279c5a..d8218059f4 100644 --- a/csrc/py_itfs_cu/asm_mha_fwd.cu +++ b/csrc/py_itfs_cu/asm_mha_fwd.cu @@ -80,6 +80,12 @@ mha_fwd_args get_asm_fmha_fwd_args(bool has_lse, ck_tile::index_t batch_stride_descale_k = 0; ck_tile::index_t batch_stride_descale_v = 0; + constexpr int asm_qscale_pertensor = 0; + constexpr int asm_qscale_qkptph_vph = 1; + const bool use_qkptph_vph = + q_descale_.has_value() && q_descale_.value().dim() == 3; + int asm_qscale_type = use_qkptph_vph ? asm_qscale_qkptph_vph : asm_qscale_pertensor; + void *q_descale_ptr = nullptr; void *k_descale_ptr = nullptr; void *v_descale_ptr = nullptr; @@ -104,30 +110,63 @@ mha_fwd_args get_asm_fmha_fwd_args(bool has_lse, if (q_descale_.has_value()) { auto q_descale = q_descale_.value(); CHECK_DEVICE(q_descale); - TORCH_CHECK(q_descale.sizes() == torch::IntArrayRef({1}) || q_descale.sizes() == torch::IntArrayRef({b, h_k})); - if (q_descale.dim() == 2) { + if (use_qkptph_vph) { + TORCH_CHECK(q_descale.sizes() == torch::IntArrayRef({b, h, seqlen_q}), + "q_descale for qkptph_vph must be [batch, q_heads, seqlen_q]"); + TORCH_CHECK(q_descale.stride(2) == 1, + "q_descale for qkptph_vph must be contiguous in token dimension"); batch_stride_descale_q = q_descale.stride(0); nhead_stride_descale_q = q_descale.stride(1); + } else { + TORCH_CHECK(q_descale.sizes() == torch::IntArrayRef({1}) || q_descale.sizes() == torch::IntArrayRef({b, h_k})); + if (q_descale.dim() == 2) { + batch_stride_descale_q = q_descale.stride(0); + nhead_stride_descale_q = q_descale.stride(1); + } } q_descale_ptr = q_descale.data_ptr(); } if (k_descale_.has_value()) { auto k_descale = k_descale_.value(); CHECK_DEVICE(k_descale); - TORCH_CHECK(k_descale.sizes() == torch::IntArrayRef({1}) || k_descale.sizes() == torch::IntArrayRef({b, h_k})); - if (k_descale.dim() == 2) { + if (use_qkptph_vph) { + TORCH_CHECK(k_descale.sizes() == torch::IntArrayRef({b, h_k, seqlen_k}), + "k_descale for qkptph_vph must be [batch, kv_heads, seqlen_k]"); + TORCH_CHECK(k_descale.stride(2) == 1, + "k_descale for qkptph_vph must be contiguous in token dimension"); batch_stride_descale_k = k_descale.stride(0); nhead_stride_descale_k = k_descale.stride(1); + } else { + TORCH_CHECK(k_descale.sizes() == torch::IntArrayRef({1}) || k_descale.sizes() == torch::IntArrayRef({b, h_k})); + if (k_descale.dim() == 2) { + batch_stride_descale_k = k_descale.stride(0); + nhead_stride_descale_k = k_descale.stride(1); + } } k_descale_ptr = k_descale.data_ptr(); } if (v_descale_.has_value()) { auto v_descale = v_descale_.value(); CHECK_DEVICE(v_descale); - TORCH_CHECK(v_descale.sizes() == torch::IntArrayRef({1}) || v_descale.sizes() == torch::IntArrayRef({b, h_k})); - if (v_descale.dim() == 2) { - batch_stride_descale_v = v_descale.stride(0); - nhead_stride_descale_v = v_descale.stride(1); + if (use_qkptph_vph) { + TORCH_CHECK(v_descale.sizes() == torch::IntArrayRef({h_k}) || + v_descale.sizes() == torch::IntArrayRef({b, h_k}), + "v_descale for qkptph_vph must be [kv_heads] or [batch, kv_heads]"); + TORCH_CHECK(v_descale.stride(-1) == 1, + "v_descale for qkptph_vph must be contiguous in head dimension"); + if (v_descale.dim() == 2) { + batch_stride_descale_v = v_descale.stride(0); + nhead_stride_descale_v = v_descale.stride(1); + } else { + batch_stride_descale_v = 0; + nhead_stride_descale_v = v_descale.stride(0); + } + } else { + TORCH_CHECK(v_descale.sizes() == torch::IntArrayRef({1}) || v_descale.sizes() == torch::IntArrayRef({b, h_k})); + if (v_descale.dim() == 2) { + batch_stride_descale_v = v_descale.stride(0); + nhead_stride_descale_v = v_descale.stride(1); + } } v_descale_ptr = v_descale.data_ptr(); } @@ -139,7 +178,7 @@ mha_fwd_args get_asm_fmha_fwd_args(bool has_lse, false, // is_group_mode static_cast(bias_type), has_lse, - 0, // qscale_type + asm_qscale_type, false, //has_sink q.data_ptr(), k.data_ptr(), @@ -312,7 +351,8 @@ std::vector fmha_v3_fwd(at::Tensor &q, // [b, sq, hq, d] // H/t Daniel Haziza const int seqlenq_ngroups_swapped = seqlen_q == 1 && num_heads > num_heads_k && window_size_left < 0 && window_size_right < 0 && p_dropout == 0.f && head_size_q % 8 == 0 && - !alibi_slopes_.has_value() && !bias_.has_value(); + !alibi_slopes_.has_value() && !bias_.has_value() && + !(is_qkv_fp8 && q_descale_.has_value() && q_descale_.value().dim() == 3); const int ngroups = num_heads / num_heads_k; if (seqlenq_ngroups_swapped) { q = q.reshape({batch_size, num_heads_k, ngroups, head_size_q}).transpose(1, 2); diff --git a/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_fp8_causal_qkptph_vph.co b/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_fp8_causal_qkptph_vph.co new file mode 100755 index 0000000000000000000000000000000000000000..624fce143b27e9ea4d588aa42f6a02a53f8648f7 GIT binary patch literal 82288 zcmeHw3w#vSz5m%w0%4aZP{2SD1C>IrZA1(Z9ztdl9wJ~wL_~D6kc2FGZFV=A@S1F% z1c(p=Dhdh$%3DxG1MW>0m8un50@~8ndmCR>tG2e)*4x{A_x{dfHfJZZoz1M=fBoMH zpTl?0_x#RxfBV}rvoq(MIj3sI+?i&RDI*@yzc|#1Ozbs=4EFkqTPQm>0dwgn5&pjl z^+mm)9M_fYlo()QGA4p8{=>BVo`g{RmvI9nWJLXF9@hgG5c(m_qp9JTeHDE}8ToqH z*X+rxd`UREn9Ok^4jJ}-W_TG*J-NR8`4)WW&BdQD+V`X0Aan=R!|&JAKm1-Kp4egh zr@*gAQ(8tj_Pv3A7s^rqg?U-CvhPBu<{3wEZh^;&PzO;THz=v&$AagOLC7=bNjOL{QNw3J*~Fa<;ayQC8|_9 zuxyo`NXSawRj$0f&YQ<6-8@L#EG;QorRimv{D%Eh-cZxaWR*%+!plP0%M_J2)bw(U zN~J5|WrghJSd}-_^fFzgQq*2X*)aFxhFRpvlO4@ac~4D8XRB0-=qPL2PkWX-iX3kF zeY2@VyxfrSA)zI~9~}#0tiL$M`b%}xKL}@vyfS-HUWpuq933ir6mny%Ul3z`INRY9 zTaUiQr#<#N^U6GO2wXZO_z-wwtX~;p{k1yk@AKFfh5~TTr$d2{z`43u1sY=&fZxj= zw*v^m?5-d*>re;_!nRliw#O>4Q-=Tq!PXUo-8vM)g0L@EfdjD$9MK{0l*hiJ%pso; z&K=hw5f+8#V-;wPRlu)9;DEXfpaO02*Rcxx zU55ZffxBowZ$dgW_%NI|#mB6W7_&lO9STR_>`lKh&WBO{794AY*k%g$E=pW`*>a6{hM? zfPj?IfRvPTEhIySRCqwLVpfQI0~q*QT;_;kpG z2cs@#g~pf_nsg{YyC@AiM4EMogok2V%nI9MR@kXSfrNrTM0V>C2@l1-m=zAhtZ<|! z3i?J{4&bi$GUx&Bx<)%r-?e80L+4GcD%B!(p2y&9VVJW4m3P$~DmZU?U8Pz?XW3u@ zoP6MRC+vv3KxKN=)g0c;-ile_a?A?v^h815H+NiaW_B1%%ypKm=xT1A^xb>f+&)&R z7V#U5!D-ywx;y=a%DYF=?7mj1_6<9IltkK|n}>(CLL+MD%}C{~wLLZ`s#LoY9v61^ zxSz^fN8xd@O0_HDaYc8JQ&iqM3XjLAREy{_I~hIWk>4KfqPSl^^H?gT|CD4-i&-H( zW`(Ib6v9H%HHvfIoS{Q3>@01L`Igc=J7ypB+){EODGX~R*)hAbFlL3tF)J+9p%4}l zzLn(Y5DRZ5G2dXC3u5*`&kZIQl8Ufa;)>ZFZ_EmnF)OUqp%4}lzLofNh=sS3m~S`D zjWPSsqub3>sVm&2IyH zah3Dt=GN3=N2NW#$l7L-)7>L`i8}WEyN-uRsbW;(GzoP(HwZxMB4IKQRtA2gU+s0A}EFz-+((U=A+726S;b zpbu~(GN)`b$EO721;QQ$&bh1L~F14zJ@9)=ImOYjlmPY(zbBkEnNhMzwk# zaMmH$h&orRh@jh(gkaL`dgLA5;4L5ZItq5QdOq!i z?P^@>M@`#pYuSYwN81`VIHy(Hw%CHU7Lp6B(+0Wb9Zp~KNoO_G zTkk3xUG4IW#@askSk>TH&{dX%8|SIA7qQLgM##6m_#*KU+ctykDaga?kWV-teBnhW z0ul1wb+#7#p0-SQzu>GEtmm6G%QvkS{0L^Y;JMMk%-KF{A2e^Qg3@4BpsC%5UT&LQ z)2F@KmsyYNPOjr#H*&9=<=2yagMx!d`@1H14Z$J7Aq0iRww8>0GF!0DK-)myAm5;x zjLDf>geA6?%zG(6*f!XAjqe)DF9rTS$|u{BeM5XhYR*CXgRr3QoBdUz&23dabf7JW z4nq3|kI|tvA36g0qb&bS+Zl8mxM$m{(TTR{!Er6vZVEcwHU%AP!=G=M*;N&Ed8|_o zwN7tMNS96}q)9{3jy7RD_zJ#dre(JDwe|HS`I2h-W>OR;va$)8E&Xi$eEog>Yx-qU z6jE$0(}!oaWZE))8P|dTfQgxjzCOM_HNqsQmrg&=bR2W=C36(b$J^q43BH7yAlldl zAI}#=n*hzIPp}V(9n@bx9Lj=dGb`Uh%9Dagq#XQPKjr(j78{fYQ46!#N^JTE`x6`R zbN!Uj*ro++8ko&`Vv`U|AU5E``rs&Nr{J3nY^LKj3$L%Wy$tQf+D`bOt)AfZ@HxRJ z8+?XLuqExR!M2&S?Yhj(q^&iTIpG)@T?fb2^{%Z+Z+Nykudju6-|`SWCc@g~OwukR z=~UC>4vtCh=xT5Ii1l#H{gfSZH9H?4Me0@C8sH!G0p5rBu(=G58~5lM@IkH%^+#Qh z>+XJX)UI)K6dOZ&gn*f!gC?sC>V;T#csVwV%eBlJex z!&N_t+g|l5d@lSQ;Lq=Kl{*oVE}woAbq2a#zZ-{-g;b24!6VeYlpChP5 zb9}r411NVT;uZLgieCd&{2HX<*WfO{rd<1SwQZcO+JtN`+8XPfE76$?FW(K9lyF{c zt*9%+KE4hA3Za3x4Fe9wZ5S{aw_(7cxD5ji!)+LFIBvs$DYy*-j?9`7-~Q#b zb?sHJHMOUqzg`;H$809;`4DT*!LIg<%%sjh*>-T!#rSWujpO!|LC=Yus4G55pxnEd zliRtCa_?bIZs(bldmp$&>g!%neQ&kdR0@(&jLi+(R#m6ltB9z@O?y|H- zD9aX~w491ic89pvaymlU0`V!!OA*TM6!%*K5y}>d2QARA;>4E>YRBhY;$c#bFCWy( z7l}tnIlh!oE5BPjM#^VLC|@i-OUiGJP=1eif|SpRP`*SwNy=}FP=2rYA}NQTZdL!5 zil<09{EVxX-zUCA%HbznwfuhZUr9Oq1gnr8cyhO@ZMkrq?zD>#tBa|14zaZtV2<1iMFG+b>gz{qX*QDGXp}a)=H&X72P+ltj zhLl%ED0hkPlJZp%%FD#}N%`ss*Dr zS?x8LJ6nr83ViK^H}x^KqJ)m2HE}7eaqgyusen@f zrvXj_%mB;)v;o=xvjDRIX9CUyoDDb|a1P)cz`1~P0p|nG2h0Y{23!ER0B|ATLcm3U zivSk`E(TlzxCC%1;8MW*0q+OQ0n7nh2Dl6`7cdtvA21)V0I&dXCE!ZHBETZR62KBb z7oZE!4d@2M{XA(xfizxNDNPWHq=`a_lrFfWNrGFNEO@0ULWML{sFZFIR!h@_wbFD! zk}?FJBnUNN{-lQc`%EX@|0rCWs-X^yZ>x=naknk#IV<_V8U z^M#$#?ZOjMwy;~eL)arN5cWxT3QtQ5g#*%E!Xasqa74OWct%<*9GC79o|Bdc&rA0T zFGx#;R_Q+Bv~<7Vm+V5Dlp_Qrhw!qrOn6mVF8o-^6<(L}gr7+H!Z~S$@KdQkxF9)& zi_%KrEvZoWxl|-vmWqX6NhQKNQmOFol1u26%7phMxA1}F5q>KLYF8zYc2Z5-i67dJ zAKJ;8IA7J!%2igsP+5iBkAK}dYudVXapNIPfHVktw{GWVm@qhX|i~sY_EdD?I-s1n`7Z(4Y|77uh^`*uC*S}c&|M9iO|DS)e z`2Xwg7Ju-+Eq;^9>W_=F`V$hY{@%T<{yu%Ieyi2$PfD`-ufE#q@893*|IT-;{y~GR z{%fwW`iBg$`oH^KtN+?-t^Vt-v-(Gju=>CEJ*$7zD64<;XsiDRKd}05xWVfG(T}YD z8*jAwZ@S6qzxigXfBbl>f8s=|f6^qYf65fA|CU><{^`@c8h6tN-46t^WJ&v-<6JtKZ?U z`j;=a`t$Ov{uL|Wv#f@;0LKj4b1|*ZjM6gpT%1s9M&lu$?g51SWYLT!Kz`~PK*&$4 zGNXx*&!`22yluT1R)^YiF>4bb~ZO_H* zy?~Hku-^<*5B6MKco-1!i;kJmEs$S)0ub^`UNoa=kYD-|Ams0FH>2s0&v^w9^2=T` zLwD4ki@9$CLO%azX6TCAbFttOAmms6!i;Q?FZwkg#=RoJh1l(5#_n`xE8RiBT11I1AGA_9cIQjmUaYM_2lka~SH*6Jf z^8GL4hOYxozW-%hN;Pou{V(H2)&s}xf3Yuh#J>T(GC2P*H@Fx$&Ogj0mjTE5hqwF{f0#?D29EO&b0h14XOS9OobAlFNYO z{KMSPa^N`sFgI)!aGZab8@>)W&OgkhR0GHPhq;mUz;XWV18$(Rr5DOzeJ~JH%ndGX z>6J*i!8p#COD=2al|s2>nHyT((rXOmhRWQqRiKy6`e0by;p;$;a>Hf2lxonUT#C$% ztOq^Hjr5_c7jd8O!0S}{d{xFUkgcm9DGh&yZ@SO-8yeS0q<&u6{@&17AM`iE7$g~6 z(J|JMLVdlm{k@^_zb;?(jDL)wzFyh>-q4sJ=c|q}5wxJIj}o_4+~1o>x$r(-%7ypw zQZBrYH>a!ZXOXrKeX^7b@8hLhcpoq2!uxn#UH)M`(#K1=@IGG3h4=ANF1(L-S(kqc ziGR>1OS$kqUdn~{@lr0lkJsJhAJ!v%yp#*?%Q(`<%kF=1zhE8sGSCVAs+@nA zBYnJ_f0!eEyqtfSBYnJ_f0!eEyqtfSBYnJ_f0!eEyqtfSBYnJ_f7^f~eY~81m?M3> zoPU@jeY~81m?M3>oPU@jeY~81m?M3>oPU@jeY~81m?M3>oPYa(BYnJx^}#?)F-Q7% zDHqq{d!03V3P%L@ zlsOR$Nz}NC;q_kQSq(hjIMjd3Jwp(C!NG|-P^>ZlQ7r)?XsiJw>`(4Z zME#a_{``Y@AI4vNO!4w(6o2_eJeh|-tp=X_!q2Pq!_PdzbB#6h`7it#@N<;)Fn_cT zp6h~VH%B+ZoY7X#=j6GtI(YpTcy0UabLSiIY*`IF8@AgkI)msHc#be0S|&U%^#SU{ z_|gaHU6TI{y+`t&(dRIN@SG%y@7woF^m)g-XRM~L0N+1rHR1eMpImJUeg?<955eEi z%dIdoBw9^V!avqb^kJ=@X_|ZeV@;n9&A8tGShG>7`5}wvhb*3-vv_{qgLpP7K6@0J zn(@@3sToflnws%cYW|kR^S3OXzi09MeGlTd@4Trw&cccq%nNX7T)(#q$dm&o6oq&pJgsqtMihrw&cc zcjB-BA!ubYQ|HCre-{KXllk&srf03=cg>5e`fLga}VOVQxVT7G&SR?LsK)JIy5!o z$!X&G!(R5Rbl3By__=AC@bmwO-T%h&{OP7)in&N|js60=Mt^}_qrbqe(O*!zMmKlO zgGj^o;4nu?zHV204phFD$Mfob1<%UL&-&r#E^!&YzE^wB5tn^yuK5FsHXOx$gk~Go zp{d!1b!cj~VWnmi*XNq|P_HZE8HJ{1JauSl##4u;W;~UeQCy#E#`A(Ao>6FO##4u; zW;}IhYQ|Hk8O8OvW;`z|;u(deW;}IhYQ|HCre-{qno(SzYsT|cMLeU>)QqPNP0e`f z(A12lQZtI{bIo|3Q^Ye0P0e`f(A12l4o%H?DmA0HKG%%rTZ(u_p{W^99h#c))S;;v zPo-uQ*XNq?d`A(_C^R+WsY6pUo;ox&1~HH`+aF(|es3|Th=#*ft2%TOCrGS<-e z-&#Y1%x5;%&w%;N#`+oR^LCB(Gt}c^te>GC7i0YlWBm*~x7k=fLw#u1RGw~h5P{@<;iaag%#a!==# z!hCfYdk$MmIW%TX%fi;O2#s0OGW9WQS{Ale!?$Y8+NbD){U*k&b!ck#!RpY|?1NQm zMlrsu*{9v8h-Vas zzN{I~?TUCtp{W^99h#c))S;;vPo-uQd@4Trw&cccq%oc7+=6FO##4u;W;}IhYQ~e(BxBYKx{p~KYkw_BNLki3$8{mhjV1d#Q0~gE{gt6IHZ4E5 z%hm!Tdpp2A3#`-izq0NZUYj-^5}m(n?C+qqzP+)(gZj9rvA=_QT#Wr4!uNkM_IEJ$ zcfjN4#{LfKq_-n;YU{to=pT4I}$Id_!x09r~iCJqz}~esj3uxoW&mwXv`7<wD$*_04K1bNb*rI8|S}4v*EIg>R#p&lj$QwTrp(;v!Vb=SxdaBcCsGp$&ZA<3=0# ze7P5G;`3q!!u#Hb-hWjk!h79^@@rNjyuW=YziutUd)kNcRT9Gc*oX4fK7{wK59MoX z5Ziz*A3l_Sd?&(t!iVy^ zocGl;8g}!uz_1@&^wfd`t>(Ooq6_a03gR zjpvRYAspVbJ!E(68N$uxxo3|P4)4t#vODn{;qZR!A@1b!gu{EVhqxDCAY3-jooXc< z-fKN%_tI&?;r-P^+`sw>hxb$uaqVq{Tg-E30))eRr-$rbd6{rac<$`0gu9pLUi&fO za(M2I*9nK8U^Y4VK=>?>~UD{ky7ZtN>g$E=Kf#o3-SCNgG4 z_H@|>{jSEo;&@LS#lGS&=9HSOGUkNW^7VU?o`JwD-L~AYW>H?zT)a}G4>TV z_7%tT98bdt{&tEvuTWg}8pYfvaexAI~T>HRGv6Q!}1AG&SR?)Qn=SOwD*+P{cC|P0e`f(A12l4o%H?Dm9~6D^oL` zmlg4hLQ^xIIy5!osY6pUo=VLq*2>h3=c|f%Mxm)0PaT??@zkNI8Be8V6l-N_#`Bya zo>6FO##4u;W;}IhYQ|Hk8O2(en(=%~5zi?N;Ahp;=BgNbMykif z*fUZ+F2K%0BwL-fLVYu0cQfv2AmBz2XGGHT)?@2^8x1rW&>seE&yBrxDaq5 z;3B|9fX1`g#sOC0^1qSW*g6DW=1DpU&u0Iv ztvu-@ULUwkOgb4v!MDqfI^XsjbG}{vtn+R0gz7Wf#-6o`F(6~l+RHC}h~6dZ0e_C( zBXg2Ir_W^v(aUX`V``0xczzReW^`z3j;ZO;)ErY&YDO`?MKhkwig-q$sX0ceLsK)J zIy5!osnm>Oev4*2w=3cqg{Edab!ckFQ-`KzJe8VJ%x}?*XPqLRQD|z$Q-`KzJauSl z##5;o#rzh{cs42G8HJ{1JauSl##4u;W;~UeQOs}AjOR8*JfqOmjHeDw&3Nk2)QqQ6 zGm7~wn(^GJh-VatuL+1>^JOzX9A%!1clRx6t%dMtPiX07<(6}$HmyYz}UM0uNz|*jlB!- zxNHGjTB)5^jlBzuy$jA7dlwjc7r^-;v@X7}cR~AEWAB2Fj9R*Gi)}r{tW6YWZlO4P zE5$jFP@KDi;{06{v-eV5u%F_>!xW8mWsLbTQ-lg>s!%E2BCM9C32UY4f+S@KK1mR2 zB%4qtWeN>ame4585H?CPg(hj1uvwZdG)uP%Ez%ren{=D-urycLF3l4jljaLMC1YI~ zV_g}ob!CkCG2im}F>)UqKR-t9gJT?-A7fLQACsvvKPHE8WZf1#XJ+_1m`B5{+k&~2 zYT%$BQr?pqb0h14W9vH0>$?ne4(OG^`G@ze9$X9@=O5;h%Yftj!`#qv;5h#z=B{=TLiuldBMwm4Un3D)%9jRH5+1kQ}Ya~V3@(3b>T z(O0++AIe7C(D@6#KpbjCsH620tI39bLh@~pZ~N40YAah8*EYjjMfkOGa24)rKMiH4 zO?^z?^Ojpr$5m8$GoHQVEssmC0IUG41gr#H4Y(R`E#O)}2~Yy`0r~)I0BZp20P6r7 z02=@s0UH4~0&WCs0&D`@47eGv8L%0!1+WEh8{js;hXEf3+zz-M@G-#00CxiJ1bhPU z3BcWey8-tA?g88fxDW7Yz^4HZ03HB51b7JW2;dRGX8@l8JPvpq@HxQe0E6b2Va=^z zFlc@iV^z@n6)o`()J9}BBBxc(X;uD_+Ep^UN)@eDf2KC8 zn9V9qtAf*7{S~!a&FogIXs!7xwOPY#)^J)Pr?vJ!sNGs-w^l`K-G5S>b)Q^jem;Nsjs&1|YUt<{`XO#-#6VRkht zTD85YO)ayj<+Ro?t*~Ref*sr5?#H$l+LD($wksfC0mpV&eUa5K?_R$g?8~|OBIHG7 z{ne~~W%v4(U|-4AhyCn2Rx9hTVf9yauMd0Tb*$p*!`^otYoNaGbeR+NTie;{HTibS zuPaSHe5};`(0)nO&ob)`ss;n+?on1E(c%S{nyZyN%3lql(t1YpBg8X0wUYs^YYohETgEX4j;m z^}u(j%>&Hl0Zz-uX>Goi+HGcbn^m;7Tt{uTFqQj;1zSnax&CtCnen9XAqdw`;q%+qKY6*K+Mvg1n?`_f@R^ zy6*L1BiW91Tz&XS>8MiHuVM9l-Rt|n-pAFifqad!el4qC-MxM_*jID)Yaw3?^#k&; zw$|he!s{HkDj>(9j~;v5en4Zmjm2=AN}M0Mf!aL8Y#!p`T*rN95C4eTJ1OltEavw4!!+Q@0`nL+LLFuOe}T6JMX@wno4Xo|fb#J>c zJ*}gTYr74QZ&0@V^{jq<_xklI$acYxU)P|-SgC$%}qYz}f-n>npRcTu}T z%LYj&WKKa$3h7)b2R5JFcSj>~d=JEVFr*(`ws2>%;Wv zj%KcY3*=jr^|!M64|cErAlN_1)!z#Ft?c*<$j4g~J^ni7e6z`iP*XcX)$Mq@JN)OWk%yQIWmWV|Ter^PNR`o*O;xY2(uK$L3AQA3t$?-ZIDXbVs^l;<&V2$FiHo-JCmiLLQ4r$a(ICiM;NO zn(>C&++Sbvx;J`$7DvL%YZ#BC&i{>MJo**SBJ?B1r@_S#|Ci4A0_Y*(|M0%-2=zug zuc4mFW%C&ym>kaIy(zH$1qJ^Grp<{cdr7!`F3jL@p@)2m&^4T zjfTJrtSy(z=M@=Kn4w%Q`!C~lP>y>8<#PEvCZl{FV>0A3$geTt@ZirS*O%M5j2TeI z|7GwQczq)zTwgxVhK`>fQp4=4a4Gkv$c~31To-=^{3jom+3dJ=QJTaEzm((GkMYwO zC)W?K<61r+WXLCYX0LdAG!J;pjb-IB$n}rO1Vg#LjEjH|#g4OeO`#U)%rFVca66Oz om)p<%z{$RI)zN{ew4(g^m0YL>tG~nK3QK?#?C7>;By|?jIwQ6fyZN0s{cmKchnBB9J+0HVn_tyWN z<#YJ%`JUhT?r(p4W-@cmnR9Aq&YNX6nX(fReI=kCWMc0zWV81lO{eUp=e(X>cRyNrIIjC?)p zYuZ#+erF`Qn9N7AQ|$B1z{gTgt}lPTg&%keuvDy{N4-Jl7O00m?^0j*vxw{1Vf<2H z;#f+DkY-UIz|Y1YlnyA&&z+rj8$$I@JIV@*J&uf$LU(y#spsC-$1@!6m7cdzYxC0? zjuqZ2N2%ShVnw0HbI4ujD6sq7PH!QU6co6DvwK{Q6@~j(mR2lxls=wO?kFpK+~sxK zD-L8-6cu?2z3A}_k8@37%HfP^M`=}|y~J5w@Dw$-FRv;pDs(r|YI|Ld0=ZI(N~MF# zSKEn%tmIwoD%|V5e!SAnCy1LB<)y1Ny{wc!u)oR&YI>QbQt6xUvPAYWUF8Ecy&S7j z>6`GfTK00B$_HwCnWa)GW-nuGm&La1h#bC`1Hdd%OZW;uYAbLjZzc3kPAh4uyyy?2A|6V7vlP>JWI`V_#Y6kWUC_ zj_Qzzh{AL63UtIP;MXB=&|@!&I5VCJ=un9W#4GU%ycVy(865)f2cm>OHJsC-5fO$r z;}y6VufW?n1V|XTlS7XVjfgP3AFsei@d|vZLjX<=6)tb+p;~zzeb&@phg4)h2F0zA7PrE19STo* z?3E?P=^9Ek!qEIjME_%8Ia7l6|&-1n5IJk z0#Zo>QeMTikZc`Nkpan#TVYPz3iEU*z_nI&#I@F0Q=Sf)$Y3mrTVYAu3d?jTz=cLN zcfEDil&?c1G86@ID-_4AP^v?Lgd%*+b=Kt4Arcu1Z`=wsaVxCTp#XZJ_~oi%l+REyeq9*46f5zYowK2>w5;H>F2m1qCp&ekCoah5j6eM@Pc6Sog~ZYjBtlti?Wytv(26t}{XxD}S^P>2W# z-%9dzh()%NxNk7c#c}(f=LVAtNp(akamDS9H*STRxE0pvP>2W#-%2DMVv(&R?%Pds zOWZzO((UGnjFs++Di?HbuPiGp_u7k`HH8IsJhc8(Jc00eGC>VyriBa~rOuV*DaU*9 zxXM{`TSrEjqsCrT>hRiq748xyPRbCSarVoJ_N5CZIAA)PJ9B(dS+T=jTzEJ|{9t3Sjmb zJkfCJ|IRy8(9|3Z;<=+;on{p1OhBhPktsPqyfG)!ual4*NXJBCa$p?BKIT9kKsI1L z&i6G3@cdT^@)BSGuniEUm;<{3r0&fC1q1a5?1j z0Rw<8z--_pKm_&Ma2{|s&L<@Y5HuhOus?1{$>1l(A?Cn1z-+*5Tn?BA7y!)2<(Gpl zE(erw`M~7BQLOjfRpv1n~-~CliM@8!*h?b5xGV-x++IE09WVsjOp-fbv7|>7jlnjaC=4u z-JVpGh|mV)9nx#EgPNF>ug(XL0daYN604G zr0I1z%{d!$w&d6#*Y>beYCG<%gL)fWm1F8$o-tTkg0E8teg$2Xskrf-sC)t2jA?;< z#|tkIAF*v4*q(qqybt+=^TFp|a3T;PpWSF{$De7-fzJ!hZpV7k?Ag-v2Jj=8(~jp{ z2Xp2~*gk09R12lS+CXcUP;c9^z_viTS-QDC&z2|MBHdCy*EUzWNxG?io^75qUz%Tk zV30WguA+|^jvh?&g@B2`?`Qenvb=;|>YP&Fx2sOdX~O zUmm=i;LzYug2ED8d-feU?O11!ZICor8eE?}C1Gs7hN-{SC|L$O8yTa0CLQT-@>bi!TU9~}%$2#>; z$Bd4oEb&BArZ@~e+$l_e?*snlWahN@v-OiwrPTUGzKV?<7Sjx9&Zz7oD8U`kGk)K}_TFHDAdS@b*4z`r&4k~5m-6K#o7l9W`x@M>Fo zijCSY9Bpft`CGql3;A*D7+dIbgJ@GHgiH#e&46aqH`tf_9`Jb^MnG8*ZDHkGNqK58 zm6XHx*)SC}ZBQOW?aXEyu^A8?Ky2W9ZI}w5WrOk{YGyVYh)q&3iP(V88-m~`i7VL1 zF}}j~60~n?7vY1pCW2Q(T)-zAeCHgnCC5XvZ3{UD8*{dhW1_Xv3CH@FMmYYjc5O?2 z-LuVkbpsrq?f21RH{$r2MUJ0nI(78?0Ot(e?=QKkoCt{*PriYA0^#@X1mV4e z4^~?CF3tzS`S?iI-n?&I&>v+CApp!QAQ7l*4~pfO{8n zuf9q-_};tHLjQA%QtQUV#CW2`#!f%9-jQdCwD0)wXtlYik!2ZS8`htz8JWwb`NepFrCG?1=Uc9JGJJh1x%G(EbS*YX1r9 z?f*^Ar#BToy{Yi&P0pu?_RskhYX6*Hq4v-D6>9%c{L&k%i1r`$E8PCWeudkA*spN= zkK$Jx@d~$p&aY7W=llw_f6lK^`%jSgHvB8NO2%y%a0qV0fN8i50}jJ&7;rdl!+;}j z8wO0rZ5VJ=?##rlFRp0ps(rP!D--?o!l1rpGilFxe7PC-T=RDYSfqT%JDid}2D~(q(Sc z=<|tVDK`ppx$GE0>EUAmb3;-GCZ$l0v^mVB4H%e|PC4@1Vs6;rfk|U2M`DP%;X?-| zp>bhfF*o80(4!pj4|C}wL6368Kg^994SJL#{w)FyAH!%=*gwn-NnMzfLOJ3e=F$c% zOiHI5@egyu1}{t+OF7~n=7tYln1sfM{lnadD?pEO#6QfXj|4r+5&tkZYBcCkj`&vq z9Py9+{rxqZyKr1FKP0t137I~<2psVdb7=#BGZT*Zh`C{dflDA9@ey;whXMz2g}UVT z#@vW2fE!6T;v?qLM*;`m5A2AKm>V@3xT^>^DhWyK+F&ce@hz23=t~%lq`bE~(4dp} zyqnux3D@NwC%&%5{0TJVMa-Y{`K*~frv>iYRm&#$N-XtJ$|m~CEDcf0CiyBXEm6v{ ze3h1sQOYL!JeEyS%BJ|LESsa0P4)RKZBfdm`Bq!*jZ$`lZ>^<0O4)SZdds#bWix!W zmK{;bvVC=y2cwh;z6Q%fQOaz-CdS9*a^o)3@2OCra5Y-#wPS zQOaifwptEGDVyWF*YZS^vKxKdEQg|$&Gp@9IUJ?zCf@^=qfyG{`5v@96Qyjv?_tZc zQOXwh9Fe1}OnzI;$CU+jB|l;cYYwes72M@adcDCJ9h&yeyPqmUNqK&h@&e!Mq}&muywLXsDPJC?yvTQ! zl&^?VzS8$IQeGUTyx7-G%AvDEHGWRt1ya5$O8F|^Tco@sN_mOz7o^-3rM%SlOHy7L zrM%4dYf|oxQeN)+H&X72QeNTv4JogQQttA-L&{f2DX;XsN6ObkDR=umAmwYLlzV(1 zS^^36wWrX9?#zevhwo#n>0L6I^ zQ(Ul%V%}bg3lC6SbeQ7eBNUe$qqy`1ipyT4cy|}Y{Ff;%f0bgv8x)IvMzQz;#Z|wc zSo&*<<-eimdXJ*}L%;-At%+Q#NL{;8;F|itsnNI(t>+Z_f?x;w68BF)*%%u-drk@@ zpbmt(J3h9WZ0IK>e+u%aKCzlkRW@9Xxzi1}qd@8+ys59L10{72t4~PpNO0F?W*@zf zIiY97M8JuFS%6u9Qvjy`P6M0o80G9#Y4R|+TK43oJa=_()1%L&B zMSw+s#el_ts{mI4mI9UnmIIapx&U2(Za_C6?&rxAip2@SDsiGvDozs0#Vo-kP8Qta z6u~P_6{^K)LXCKXutuCNtP^JlqL?j6q9D|ZHlb0>5t_wZp+%f2Y!YV)t>SE9i#SJU z6K@pS#ksft#lj7ZiU$hIIV!jX%9l}fEa^V$mh45ps zKzL0o6n-KW31`HW!cWCw;hg9c&Wo#rH^ma+=VGaFQ7jXFC6)_sixtAZi!Px@tQ6iA z-NO5#NBFH6Xjq*@+DRR4Cw^!@erPAB5~SK;HLI~231=2!Dw?bM3={879sX38Xt9EroO^q-Buqf^;_|JEVL_4oJ%(t$Z*j6jB+aa!3`BT#zauxgmKV`9J>1;{W7#7XN3TTKs?by~Y2>&n^Bx z|H|K0Ce{euTv{g+>E^$#6t^?&bsR{s@OSp8RCY4wjBY4u-qmDN9b zwADXmjMe{xA6Wg@Tx0eB=toxnwbxqx*Ij4zUw^&TKVgE^KWUQHKY6m%KXt0rf5Qz{ z|BM;%+X+^GPL9<-bEefld$!eoK*&$8HKR$8&u##OylsOSR=L`9K6f)9@vf+-JbJ#djTQ8@PHYnOzb(o=rADU7auXB8z8^r7$D@AzF3yw?m>a$tIL<%JjaUyH=O5wx3@!`!GQ z;5h&G0XN9m4ij@&9}L74b3@A7lT#=+1jiY3X_f8C>6A;8xnWi9$zv%uOy-8K2E9Di z2gB-)SPy!X8zI}J*MT18(q(Q`6X;QHl!S6$zo?oq8yf!$`>JRBV=VRco9*unjR|tT>KGG23&VYs zxUJ&;-W1A3_VH3KvX7T?k$t@R;kKVk+CKEjQZBNOmvWJPyp)UV<8_7o!+NBTmvWJP zyp)UVTRTx1`wjc}2Dyg7u6?Bj*F!r#ouK3)gmBKvsZ`+;3#AMXmn8GXFaPi^$^ zs`t?weZ1;%G5UDJV?0J5uhGYg#~6%0UiCh1qmTDOx6#LI^zj;fyhb0d(Z~CLypLDD z|Ajs~FTO{|eZ2DhFXKobuYCW@IMT-}-~Td>^zq8~zl1o<47N`eE-Wh(#Om0 ze{sKHBlt4N3H_>^f0!eEyqtfSBYnJ_f0!eEyqtfSBYnJ_f0!eEyqtfSBYnJ_f0!eE zyqtgAfg^pqoPU@jeY~81m?M3>oPU@jeY~81m?M3>oPU@jeY~81m?M3>oPU@jeY~81 z`+y^TyomL|Kuj@5`gkc9*~d${$Ua`mMfUObXMHeCkM!|UF0zl8a*=(!lp}q-eZQ=S z`D^U?jNN;k^?OQ22KQ7t5e!MxyUO7GUgKE}Jl{Cff6F~X5}K>dJwp?Pxou1U=@T|t)I;&ET*@1f_&ea|J)2#6)PB@VwOfs0ZVV@1u7}{!{cW$$v_p!wAB&mngAczc0{d-S3>Tn!W^l@3hr~ z^Iv{^nJM@w9P<)_*RYEnFf$}pO;gf8)=ZJG)}?8hlmD@%q(d{I&p*~|QEGm`;`srK z=VvUQpIt&cTNK|t22IU)>d@4Trw&cccq%o2%i{T47SG?ac>ewp;@PH%XAGK}@zkNI z8BZOWn(e#qkaA&ci9SUmr53Gv*ah-VC%n(@@3sToflnws%cYJSAx`4Nlf=PaI| zUqU<^74eKgQ!}1AG&SR?LsK)JO3ja1JU?df{3DC!A1@)Et%`WYps5*89h#c))S;;v zPo?JXSUi8n;`t{Q&p%y4Jhv<28H1)~JauSl##4u;W;~UepRjm-!s7X77SBIlLOgdW z;u(XcW;}IhYQ|HCre-`jO+0_t%bt}EKW~bko2Cgr|Bu-HZ#>VRZXT|fiv-u`&$Dav z=h-#-^XwY^d9`bFb9f#^CcX!UIZE<%yV`T0^0hpkSNAJ;R#tx24?lN_%kcHR+H;P$ z>|1lqA5gU6815r9+prEz%{Ht%O%cx+G&SR?LsK)JIy5!osnm?& z`dl-f=M?dbK~poHIy5!osY6pUo=VLauFo~&c~KG17&JBGsY6pUo;ox&CtW<1|i z#4`p>&3Nk2)QqPNP0e^JHDkCw*No@eig?DLsToflnws&{p{W^9PLr%@G?0x!u{B}H zx*0Hjq_$p$+L)5DhQ|NY8X9Civ$1{#%x5;%&rqMYYpkE49v5T%4E4Ae>t`72XW+Tb z#`+oRbDNFzGcI%+>t`72XBg{e80%+z+t<&K$F%;L*U>Q6(4cE*;5o~5ywO-cL!Q5F zte^3}w|>T9<(kQtI6{Dv6e+>%$k;|k6F{Qh_xENRb$qYq7U}# z7_-))so4jsLsPR4R;d}o__Ai7c8emOF=%S`Y3tC`jHeDw&3Gy`V;Eo7jAxr7o-t@@ z##4u;W;}IhYQ|Hk8N>LpW;}N&;u(XcW;}IhYQ|HCre-{qnlX$oYsRxt5ziPjHRGv6 zQ!}1AG&SR?)Qn+#Su>ukig?DLsToflnws&{p{W^9rDhD{%bM}ru83z0nws&{p{W^9 z9h#c)RBFaBzN{I~or-wIps5*89h#c))S;;vPfnAJSugB8W^JtfwJ<4td3cWNBA6RX z_IIG%H@o&%w#wMF{M;^E3ykdT0QW4gPS^j)x?gy0+C)fn{<5*ZgWCG`#{LfKNlI7hN}u?C#H6YV_#o-+!*`% z8vFX9iETzOe3YT)w~ z<*0?vSGv$fKJRg(O?QgJZ{33MKJ%gcy=@5ZEg#BnYe#rL`B478?FjE7AId*)Kf?RQhw=~ZKzOhCQ2ybE z5Z)g?lz(I=!h6Dp^1B{Icpvys{;}N%@BJRi@7;s&e($0D}_(L(oDQ$56WbrNm~&z%Yo4)2{FvU~X@ z!Y$>w)2|TjPM&-9$Aru0xz}GK9DbHFr1!>82)CT)&YmG0-t#L?@4_Q*77}HdBAzz z^MupbSG=|B#Hmxx*?13{M7R)DjyHw&rGYW1)2FFjXkQxOUVW8veQ4eHP=aLaD^Az= zmg5>Q_7#sDV{(zPqsTEP%0-Sb(dPrmv&o?`CdwK6ipz5s*fR{HMz<#+wl7$c@oa&y zuQ>Egsr4Tl`--c_#n@Ne*jF6Sb36%ac;@?zeZ_I#wz03c`WW2Tv9Gu>#$=2!8DmTy zr!mIlaT;Sx7rGZ8fiWL+p?k?Oic4Rhxa>uWcXv_Df0^R)S1A^}L9ys(6pJrVG@flT zo^3LoZ8Dy1(t5V(pLxwqf5MttW8F=6s5Zx1U7>>(67%TfapKW^mPxRTQ9Rn2OOvav(J!D__c=m}8z*y2H?&+?%o|Fzv z&Gn>oXlkw}rPPdJtxV1Jxn6^C^6`v8Q!}1AG&SR?LsK)JO3fJ7%G8YKIYm5U(A12l z4o%H?>d@4Tr&2S9wK6s1c~KG17&JBGsY6pUo;ox&#;{hVW<1X*;u(XcW;}IhYQ|HCre-{qnlY@EsTt2V74eKgQ!}1AG&SR? zLsK)JO3fJ7%G8YK+lqL`ps5*89h#c))S;;vPfpX=GqSY{&yyL{*K9`r^v`FLdsbu5 zNMp}P3w~BjZLW&3XQX;uj6EaO<6`U?Y3vz^=dBofMyk(S`Tt?hNMqiNF>l70H)G73 z!RL8n-i$GCX3B(~5!14dUYG_r9dJ5eHefcO4bTRd3z!Qy3vd?T9Kbn%a{=c9&I6nW zxBzegU>;x|;6lKKfQtYZ0WJny3}`%?Z9JR(fBo5PV@=?1`I^8mHv-na9fapbc=7s; zWw0hN_iQod(kg*76T39oZdes?34|l_DzKgLY_{=iHlAOTUe}(Sp2wc;WOHuRp3QzE zrT3hfzSQGE6nv}lDd$_BBhI&~o^ihAJEr=~wy|fe zVhqUGv-aYPAE0;0dcdEdcgdWj&**d6LG)6m=9pTGBA#EzoEaUOnqz7@G&RT6l$tTj zZ_$irnd@4Trw&cccq%nxnBSrq&mD?*#-OPgPaT??@zkNI8Be8V4D(ww z#xTD{GoG!Ac*dZq8BZOWn(@@3sTohDW(@ONG~>Bl z5ziPjHRGv6Q!}1AG&SR?)Qn+%i)K7`D&iS~re-{KXllk&ho)vcIZb2FTDEt=KhwH0 z@_s#RZI>abu(pe_XRYcSkx9M^3tnqkZT?7>uhN3oT~;fb?DJUg8q8{C#@+?$aWVET zP>+kTcY(2Y0bV!8?lbl-z~i#TaA~D>UN!bEF!nAuZR}lO>|FrohtRtC#@+>8r;WV} zy0aVTx-GU16mvIIoVAtWoNW~6K0tBa!xR_nqL{ar;=%(I7agW(tSe*8kC`e|i_?S} z@djayI9*sL&JaW~TaZLSs26QQqnIN!i@8FJI8)dp&JtS1*}@iaj?gCFD71@nh3(=^ z!u{erVTU+hct~6z>=cc4WsG%YwAPg|=Er=?=f}u>aQyrjxetzUWPXfIWqwSK%KVso z!jW}b@SK?u>tP-Zw{8pO((8bOen@#wYRrvl0*IL<%J zrBwpQ`G>h-RlsrnVQ%_A`QK71$}V?$@pNr42^flznH$5xXK{ea9l)zn$J zKB04_x0dkh65uLa>N*K!Cry1#S9z`eN`MkzJzzaxBVZ$7Ghj1d3t$W2CcsUAt$?k7TL8BJwgI*Qwga{UZU@{Bct7C% zfI9$p06qlx5a3S0oq&%5J_@)Sa5vx{z&(Kb0QUjz2iy;M5bz-2A;3d`PXay(_%z_t zfJXt30zM1)EMU<560Eru3Orq{dq*ajdr+|3dunSR z9P6IbR#Tg+awN8?`h?n4F`Fu|Il*ka$aMb1%kL#)yXw!VT{W|-R?+hPf!g?(jgQl+ z;OWJP)y!r!r&Y~qt@)DLtzmX+RJ7LqmD;RjHfuR8AE&kMKd9Y0 zX17j7YyE#xoAu0QJ*QQ}X^Hd>)t>&~OQv$J*n4P4eRhK|* z>X=O(r?rODs!yVJ^~|nbMXR9?wP|2B4V>0mrWJ8)SF>Z=+xyt|LR<23$96U3tKry= zsPAL-t9sY30{beiz7O&~W&Jg*eogQCHDF)E)rbA;y4NV{uVwXD_pT3n;&rd)>cif5 z-D{z~bh6Tk`mgKh@S3FE^7|T-gpZZRzBGP~EPjnDF>bO_n3>riUf%IsQIwC?#HwYi7c+{0-}oYs~rsNEK3w?##3>y^}IE3?_kY1MIB zZ6m2&8?$Ru(Yp64YI85MxtG(b=d{{KQ@eI%*RGRn#~dx@)G5BYj!{RURQ zu6O-9u&?9lH$c7t>IdXwZJkL9!ux!(9M324gKcF$(&SJP-CC>L_mGO#!;`4Z!_4MkPHO|FwR1AH+sW*9s%SkjmD)VQY#!mXS~#soZ=iOMGP_4r zw06y)HoKV3E>3GBr?p$4cDtF~ZWXP^a;VK?%;qspYZIrnXC}4V!|e8`XziU%ZT2#o zy`0u&rWJAQHM6$c*t_k*^tA3quI)BMzFFD!H?aCmz3VrDeG^xI1LQX->u+N9TYA@r z>9E}`T>VXu-=wU+nbqIeyZ%P7-^kV94EfDaA0KaUT+`#Nksg1$<$Mc0{`TESseR_i|cC9n|h9vpcGy^~?%t^9-|jhSO^2w4N=b zcF!`qXH~S0t)w=`n9VUxYa7#wIR3V<_S@RK{lav{?pCh-Zh`z3W&7XC>fh75{ykuS z4_ALHFzeJemmsbmG!r=`uFy(e=peI%hlfo`EBg@3&_V?D?R>t zQx6jWXO7 z1rDzRWq1o~yu{j3=3D`EVX3hgz0Jy;ARA1cnIjuyPMjqh*d|{;j@jnsOq6YB&B~H(XU&?N zh3rcgOmH~8h3?GpMPFiadP7f_UscbUa?@i|1UCZAVil$$?3 zui(#0frMZ3`Q|cs|CeB}Lar~LS7f}JSabR;FT+rHgSF-Q@_9(cbY>`*%l^xFC6wdd zL%Cc&@5v~i_m~X%4)S}9V<6%0CD#wNf8}w2jp3C|{ty3mExg6`<@0jrxcVWrNxKY} zaxX=80ul~PiLH5zl9y|@_C|04ivLjd_0;DJm$u+ zav9|MM`VJbTwlh;z{|1YO|zy^i#%qS3WfMwEBi0EpSyvRedp`9+(0YJ-(N11KUdBd RN2&i6tN*cV$WU4T{{istjVu5F literal 0 HcmV?d00001 diff --git a/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_fp8_qkptph_vph.co b/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_fp8_qkptph_vph.co new file mode 100755 index 0000000000000000000000000000000000000000..58144a24236cf3b899f9695ecdf6d43f17fc8d1d GIT binary patch literal 82200 zcmeHw3w#vSz5m%w0%4a_pn!oQ1}cSK+lUw-JcP_9JVZo9L_~D6Ase#fwb|Wd!fUd5 z5+FhhsHi9iCqoELPj)z=5al60ihqzJenSjso$a>C?j7F z`JTttFr=DD2{(g%-^cCXo7w_j$ZxFf#>fz73)EE9N5>M}Zb4XKFe+SEW)+M_JQ;!n?vz>hQ>) zyO2sG%MBSH5?T`c(Xlwe`b!h6zg$QCC*e%tthARp%jGEK>rmmNP?%u-;soo%*$$uB zF6mc%!fU_PS?QHS;MO6*hrpL${h9>puhUU~pVz)P9Dp-^9SVE|&NL<}(2}SC{Jp&7 zb^t+`8wo<24uz;7Y)@2RN1_5dbqGKZY>^=B)}as;gnfw$97t5)DIEfjd+jSL9r6j` z%n=!Qr-pMnG@`=r zW}*TY6BT$zhX4r!cXH^_p%E2^4-yslI8lMmbO^x7p~CG8pB%o>ArTdYKP4*g*F**W zu0sH#z+JSTH6a}ud>GD}k`q=)O<1A74uz-S>`<+|jy`J|phGG;AcGTDNKaT{gbszL zz4poydok{oD0G#{2g+Gfh7PUhh+LJh!nlMLexO4Enn<-uK*sBkiVjFt!V1|5D@@m+ z00F6_0V%KIT1bu#spx>@C9E(vVTJiR6yRE`I_g^MtZAVRndo3FPFP`S!V1fED8PkA zHFv#r)|9V9Bsvs@2`dyQtWc^$frKJ*&2`q~)*%ud3SYtsH3=)M)1d%|NQL4M@#~O@ z4n||b3M~mMwCYfRc2N;^h_vYti4Mi~gcWuqtgusu0tp3wi0sxO5*>d}P^lKP^E?4(OQM_&s(h;EP{CQ#Ybw=ZI?DzN z;N%0hJ5fj6IVy8WUCq(W?9GG~E+(w-&ZQ{m`{jkx}>B?;ed znp+a~;gW7Qk7us*R8+a4dwXS>v)pGda@9Bs?RaSYCwKzk3uJ;C%uEX#I7(eB%Ttf_ z;&GL;=C+Q^GDnTQsMO)Jixr*{CQix_opJW|_wCCTOmx6>HqV49MPfZlAl@UhOWPbu!cKaaB8f&hV#4+(XZ2dYqLoF)r#;w!yWK$#f{wyVC8k zuc)Xh_vP=;^sX)|s3?8=2XTLHW=SE;5(-Pe;K$)i(NXOz@>D>G@Hfqo=kHZJJziHu zdAmHhkWK-N`X3h!N`hH{@;pjbil<)=#4{h|d6IuJ2?aZo(Wy>k>Jubi#M~!Hrokrl31(m-xleFB#=hp@LVz5=e4OuR4&wQ* ze#rX)gMe*-DAgR?4dDNlId}vx2si-3?FaiH;*{6bs~`Fy}2pc^m; zct0S5`fWH5xEtqF`UDX)FhAe`+|c@fpBRUlgW~~n0CR9T;6lJ4U_LIt9CUFxpdWA( zGG}ZuCuf9`lfM^&-!^16Q$&bhGwPGk1@Dtm)=9gNdu+YCa#R!YjB4_D$8>n_aWx|M zs780?md7(M&SkFIaj(>MnYK#ZDHkGNqJf*jg*6b8>W5F)^3CH5Nc;O+lbA;&_H4Xer}jH z7TdIgO*6CEKx|S%DZ~bR*bo{6?G${ofz3?ZX5sx6wwIvYSi1-xvNaLB3ce@!WP|UJ z3$~=4HQTn3w%wS!g|xNSN*5eMV;kYPy2`yR?RD=q*HsPB?%VI9$3#@SoK4zgES)-f z+`%#F8(ZhA8npqAxu39Ou72kuV@SO^TQmHlKEUS?AGTD&apM_V4?f6sq5hb2a^2mJ zjhWG8+k(#n9Vn~2qX{8=F2Kjt793If4hhxz-N}$r zA@zqe0McMc>5xW1%7Angq;Zgb0BJm=EJ)dqrbEhslm}@pr1_8*LRt)IDWv6)@*x%0 zhRl9)O#0n5kY5Mw{*}%djmcg1{*7HGw6*JB>TO$WJ9oM2A9aliJ-W+HgXu zC+(!KKna^hp_wDAc1o4Voq-7Hp;z+Ik}zZQto}=QmL;)Wna?=#_%!~EA|F?LQ$^49k+=qLp&TFA;3CT5xQ?T~4wro$F)W6! z3B)yctBS9SReXhK2kGCXNE-+)iTH{Jk@h-5_I2bH=aczskW^YeQZCOJbv`+Rav3r= zdd&Icag-a4xjc4^pp3||fVrV*gHlo{N7@|b(gzMo$)FthZ80}|$e@&Qlp`_3+=yX= zQqcH_ub3No1?W+Z_=mZSQJ_aT;veQlj{!Z(5&sqghmT=2I^rMZhNdk_Nu?a|4|C}Q z7o}uSj`)YU;X@XsjH4X!4|5}iElNQXBK~1+-(fo`OuDT?CH!h`IEEz?lh0e8k-FA;2XOj`)bV5yOCkxI$g> zdt+|o6~K)m9Ptrz8KZ!M?+13oN6d{L1Kf8BH#!CR*|ot|f)iRQU2y+B2Kg7h(}4z` zz~|k(=1REc_PX#jE9Q@*p(inaLKLl8qRRsJx~gRp#S%+>jIv2$nWZ5{*<`W8(h{RA zTdcHfj8Qg4^jbE>D4Qx)SvJQgnWgn-7(7Y#Eq86VwBAiH(U0^D4Q+bW7!*{ zY>v3qav(<8T=8DZlQGI}5Vu(l#weR7-e)-!qwGfU0n3pXW%I=cEzia%yGeZ5@?4Cv z1>z%?qcO^E7I#@XVw5cuAF~{fQFe>C*K#68*&^|A%gGpJw~G5M!5C$W#V0M$XX3(_ z3~I;cZQ>zPjxQh7%9n^wlX841p;msoc$k#WjZwZ-e3q2o5TpDK@hB;u7o&Wcc#M?a z7^D17@kLS&f4WuuTP_|a8j;-i~mZ>;ZLw?xn1lc?Y;Vk)awtmw17cuZmH=N_>lym&7P95r0m~-7(5b#kWa$WsLGN@t36B z6QjIb{5MkWjZt19{)&{>#3*-*?~?M>G0H2&_euGh808-ELsGsrM!8q~$P!GduRVn> zbZ3?H>5Fkv1;uPH#i=62>1!#@sHK?GK+(2=V%}zov$s;5yN%+!2Pn>enBsz66c_HL zxM)Ac#fK;^IZScsQHsl6q`3Sf#k;#G=D$p_;8lu+Z%{1yDaGOo6j%M6V(Bj_mj8;P z`+bU@j{uY0wI*_{;_upxf>+lEPmRI-V?C$Pp9prKFLD0_l#R8av*-N5B-DXWcgH7I zlMVftOd*o!|RhWI+8rKSvf~8WKHZDISFtQ zU^ZYj;8eh=fYSk|1I_@P0hj}r184)Z0p4Z z;3~jXfTe(?faQSYfNnrHpa;+ci2Hf6gkou;uu7UFluDC@aw%JIOH%}oG*$3P(}Zei zx=AZABrOnjN;eCSN(+VE(k;RsX_2r`x>a~WS}YupZW9hlON6JS+l6PO zrNR;E4&gaznee=Hr|^QbT7~W3jxV4bV~U`P;v+_Nd>|y(hA{6Qlapg#bSq*C$PcGzrpVNZF94K$;3^ z8l>rvu7@-O(o9G>kOW9JNV$;mAkBg_8`2y|b0OUTX&$5-ADQYoY|Nac_!Ah{t`Lh?ZJLJEBHu_f^7Z!Ce&KeGgW_ghQg_g`27fBb_b@a3N@ zfxrCO68Mk5S_1$1H%s8Z{%#3`{@W5TnXG}NBx@ig#Tw|_*Ba>8&l<2=t%0;OYv8iW ztbu_8t$}ZU+Zq@$#2UE#a%*7NFl*pD-?0X+xWXE^@=9x9)F^AB&2IkDM25z{)8o2RBYv87vtbvr|1dYY2{_Kb?Z6FoE$Wlc z`G>ioWx#R%VJ^KAIL<%J4X*-@^AB?)Rs+ZRhq;mKf#dwcTt*#moPU@b-2@!x-#*|5 zyW0Dp9M%T|F~!`_vi3fylpBiUjJfp6_C6VuOP9IfRqcJoQEs@*jaUtO3t1lwt2=T% z=uvK@Y?o07dX&qMxzSCaN4e2{l=mX;^R-^7(&wu(hJkF6exxk?9lq*5->+y~BbEC3 z_4fCM$MK-Q5yltE*ouy^jtuJS*W2G49@~rfs%QLT9QF0XUPRLVv7@lr0jkC$@MeZ2XRwx37ZKJ>{_F1nAGa?yRfl#A};bw~WedZdq+a?yRf zl#A};rCf9$Z$ZSr#l%18lcijAA1~#i`*~9?3$14ynx{udJxadCKT*5{7@j_hTZ)S8KuY+*WeZ27fz%IIvcLm{$K3?djHu`wg z`{<26UiG*beY}w|9;1)f=;Os>3`QTXdLOsZ$9tjM=;Jl|c#S?@qmS3<Eo5}e;G&mc;)+F#*sc=`Tm!2q>op=|79HM<7M~1 zxL>dld>QP5epSvt%#l7`&OgkNK3>j0%#l7`&OgkNK3>j0%#l7`&OgkNK3>j0%#l7` z&cE%zkv?9|Kg^LnUd}(vkv?9|Kg^LnUd}(vkv?9|Kg^LnUd}(vkv?9|Kg^LnUe3RL zz>z*)#QI<$rkEpryp)UX}-&sFE14++mzkMVqn@f=1^ z)bkn-nL*IQN3y516xy3l~-JwN*(*^lw(A5*;e8O681NG5aaXVk+JTlhJ%0r*)(cpkBy zKDULx1AhLo3Fd}2!t+@0%;ne?m=D_F{hU0{)d=tZ4DW59eeQZ4o(ZdmXS#O#L{|vC z4A1{1L#u=5oIXH37*BqH-X;0Z(0e5R8GXhg1kX33Y*WND4o%H?>d@4T zrw&cccq%nNV)6Wl#q)P8p1-?UA1)!D+ZFMQLsK)JIy5!osY6pUo=VM6Sv)^w@%$r;=N~U2o;wxs zj6+j1o;ox&>B+Ac8&gm+BLd4GRGkc-(SQ0Bl)^r?HNz`S{~1t`vp7)D?i7FpQ*%U`1)S$8Ae?8 zjk)FzD%x-y_YsI5aimsY6pUo;ox&X_yTMI*NyvSG|<9};?3^G^QSQ`W8DjREKsL#nY*2Ykei?KF_dR&aPF^sh_@VsSX zZ4C8!%f{Lm7rKqLF^sh_jI}Y0wK2ZwYh%b`TK~*zWEksX(DgC!d}KP_XsnGP&&@X0 z#`u4?HpU_4ddQbLuN3B@!`O4wI?3TNYg!hyPDOaknwF`LS<|wpbr`-;W7d8}AM95# zX01b0vkz8>re+_kQZtV6Wz9bA7DYVc(A4bH)}g5xPaT??@ltH74eKiQ!}1AG&SR?LsK)JoF*BwUetTc+E~|XQA$QZWRB}%m>Wy> zZlK)PyRKJ`%Gk90OfFmJi|pF~_bjjm*Z<0zUU=QvWJq-WvaxrA+S>KT-VN&GqQ>40 z>TxmlZiwF7#n`*S*t-FbpBsBOsE?l;dpBI@Hui2X_HHotZZP(4(AvAf7@z*;?BAd| zJ}ut^!`QWttowA{Sl0{h-2nX$WZi9ui?Oa3T@#G#-S8Ey>viyp`mVgIle^v+sd%m$ z??G+s*?aMX)_&D_(637NJdf-(fM!uHYJXv4&)(=g?lt0S?Agoqa~MzhWTW?Upj`BR z4wQ@D&tY_A-U#l`HTLYK`vMz#_R_tDjXitS`zeh*d)4D&?Ac3?8)MI2W6xgPZ(9tP zd1~Wg#-6=j|DL^h&6O@c{07IH8rI{n+SBlB)baU}Rj@8GS6)_%8u)xgIcnkam2QOh zy$|boJqYi0AI?|#5Z>QDoENJR-qSvuUtNRnKK9}K+BFF8T_4V`Ux)C1_2GQ2gzz5q z;e4GR;eF}D`G$Ig_o5Hyn;H?`e?FYw(2Ve&^Wpr)7KHbi59c>;LU?caaQ>cFg!hvV z=eKS_cn|q-{@yl(_l*zdx3wd@S9~~s-*$xehY#l;xF6v?;lud{cObkEd^rE`LkREv z9?n0q6XE^d!}(p0BD}|YIRDsgg!gq1=lAYGcrW*G{_%YX@82HI?|%Z}J=??iCl4Te zObT#JhPgv<0}Gsu=bnCwaCkrVu-)Nj2sfAKo;^Z1yzhG0?&x!b!+WiVxns{04)3oX z=3ab(a0_|vcn9I|KI&n+lP3s=_f8LU{~911-Y-4Ob#)SMDbJk>5)SW+9=3b=CBiM^ zxzn!@?oOV2^+$xu=egHkBOHE~GpzT&hZtNLu>=|zC8E)(uZtNLu>={nS ztc*Rw*?ux6GG;~gYuOI{uEw6>c)uIPp5ZX&l$owF=7iVs4S3U@hqZjqSMGP6_de+| z_6%?BI)3UD-5)#|E<}~%P2oLfU<~T?X=)eVa|XCqU!`1MTK9dF=|zC8II>Uo`5wx^F?FNaNM_T>=~{;2KQC$8E%X*8DmVw7?am!j4^p# z#u(Fu?j?s|%m-cQUV4<`vKJ{XKS}ZKE{gdtQ!IFuV&NMUi+)P6_yR@a*(T%JCga&A z>1gU@ILWD7)!duJ>50elhUE7xt^2`P0jVB zl$vp@m8rQt*J}_?KAv%CYQ|HCre-{KXllk&sTs#wnVRuDr-)}9nws&{p{W^99h#c) zRBFbtR;Fe=FDl|0ho)vcb!ckFQ-`KzJe8Vptd*%5&sP-jj6+j1o;ox&j8u<{v1g=uT#P*a(d2@3)2B-0L}o+0n7ok0onlb0P_H61I`AV3pf{W9^gE{`GE5Q7XU5*TnM-j za1r1lz{P-z0ha(S0W_Y?HlEG?zy55tu_o|0d`)1O8v$$I4#smMe0crFGFTIsd$t&J z>6O5liCwyEH@phCB*Kw-71+*rHrseM8_%!FsB7<&v5-C6$>!XsJ)8YTYVSES$MCb+ zo!+!#@ND+qI;+x-;q`$##k6A~6nd-jY1doc!>+fgo^`z?9#ws2+t{;KF$QGpS$py1 zhv;2A7wN?3=shwg>2vyAb_l)HsX3{+WiM`W^CVZm!HtIZ$D7Aq}y-DS11DWcbc*I-sFGxjb}kBhN)fqGnwy$g)J z3-G!zcG1|o0FTQS!=;tldDYmvz}UOsw6S-Av3CKSAHwV68+#XYoi_F^=+0@N>$cc7 zP|Vv*arRb;aNV9}Z(rlqsnj>tH<_c}n4MMv#PuMQqDBLg27j{TD z2@gpNgq@PHu8gs+jMlm`#{8IX`1}~T500N7Blp2Ej?9m-smzbbRhb`?PdKt}3!XDG zay`tW;nr=zTt*#m&<`o^NsYPDO~A2ro#pji2D=9J$>IFNdsh!F1CH|#bLo}9asFX$ zcolG*f0!Gw8aU2B%#ByqFq+~0Kq%1)U2nZE0*vYtq)uJz?Sd%;(glwJ*34OjzM1Golo4d6P!b$}9}1n39! z1J(o912zIS0yYCS1GWIR0B!=@1lS7L3b+Mu3t$^y8(=$NJK%P}?SS_K-Ve9~a0lQ+ zfDZxg1l$SuDBz=jy8(9t?g88bxDRk2;1hsP03HB50C*7aAmCGgPXRsy_zd6?z$1Xq z0X_#9GQR|CZiPZ2^D7u@L*^gVhTxqI%{}M)#Eg2-Yu&z4Ddrv&>h_)5+84*V=d{(- z=B^xtZK^({HdV}~3T%!u8y_;AKmPLjeXw2i=hUv6*;T7(iNB*ZBC`=Wttw8d=J(XD zhS}AqXs!MuwOP$gs9im?t5?x#=u2%Hm`wwxwU%i`9oyCH*!J~4wtdi+eB7~J4f$#~wxjBctbSGR z`c+_G#nl%fFDmP=VfAZz*RKKl8m>OW%bwht`8f@cCY8^!=IGyT4nuuR^Q*dz8~!UT>W~;*DLEcu=;hq>(_yO z9ap~r@(oZwC?9LH`Rb_>*QH?!NVqV-rVwRw!$JjQ8l;qC|US_kG)7s3m zqK>_0)^;0vw_TW?*4@ap-Db!)E8G4CR==rt{U)$);_7dJ{03$HO{{)P@A@zuw!4L^ zzX|f2l=U~W`Wt)K-w5^_x%!(SzZvS|;|-2$db~B#<8QZ|Z=uKEz8h#9_pvzcQ;GHC zH&UC&na$%|tXsL??TMSH-4o322^FpVH&dJa%w|8Qbq}X?;1+6kfY}{T(R%V$YV#zs zd6LuG!f746joKY#b_Z3o4&6>|4l$cUoYq!O>!~}a-BZl&DHW}!@1!%;Wv?l!J|JLKDy^|!J5_x7%TFWBG9)!zpBZS439%EwzPJ^p&+e4ELS zP-_=LbzOM7JN)|i`xLDdP&&gDhQ5EHZ!9dol;M~E9jd@@$FuTe#xG`hmJP#vS9yuk zGc`lFDKBIG{Je|_<1^u3#<*)rE6P_oJ$ARlS6uEWbK1QXc3-i>XZN_vy!I82a=W+6 z?XK|nz+y#-W2LKnC6qhuh0cPim3CKoQH8yz!ee)omf9=4cDJXZ$W`j}UgPzwxTdtU zy6lI*ta7gKkxWKW*7)qnMUx7$CS(_lcTOsrIC-M8z_B9Rk?oi~A*;|)aP5Ta3dc`! zvY3RO=iZpe`@X0YZWda%PCb*$7BIeMYBZ1croi?s3jQRf&8aALS+sp2(1VitA~YQvL|$;b zpn(Wl(bbg8k&80RD|}9rS-85~ySfZzt}L(0EOvN{5&IwWJ`c+DR1`XV4wUJ0*7%6E zqs+AeXlJRX0)FbkLinLk=8B54GG{p$fR@iu09XvQ={t(3)KTCpwO==RQZ9Qt-Zo`C zdz*b-Ru+3ZCC`@4-eyglCEMDvC&;#Wwu!Rs>{*j#+u5_Svypw-f{6~7&*_;krKqgf zVJ{L3?Zt)HPROy=asfs8c$HaP7oP*KbOOc%jMYt~h4)`S>hpFs1byJ$eh?KMZ z0EROFaZKg^oN-m1wjQxG~nK3QK?#?C7>;By|?jIwQ6fyZF_rr@7~{e%;xN5wzHYl`>+2y z;dA)z`JUhT?r(p4W_ISBGw0OKnm^kT7nhxY=wCeQL2>LghHUn_Z3boMCt@xOCBy&U zLW58el;gUJof-p7Ou=NZ#ebMq-jfij|FW;7go0==&EtCD0zyBac{Dv7&wY!&p@Mup z?CZIytbAEGx|z&GA`Th$einEcO+BT)^7$5h;48qNFWUE`-XL@{)Wh%B-#`3bB%at| z{HMY%##35BCH4b={xy`P0SXIp=j7drQ2o=+vVvl-Goz%?Q(jo=y{GlD45w$6_ifbL z{B(wMrLW3a>Ts@HS?Kj1@)SA?9HPhND}<7Q0uOKwuiLq@@bOip6)T*jk7blQ%L*TJ z`#g?{0~r-XMczUmdMv~1T3eWWIHTHGT2<&Mag`T5Ma>;6s)~vVJx#RQKDV<#sg$fy z>EMbr4kDo_`PR4#_qndUM(ySk#LbHG(lxqXRw{2eSmO0}^})d@nI63GoA?;PzejfE3pc^7OTJ+0|M{^QNo`Z&Kb}M z3&WeS3S5j;;B5l}Bn;fip~rwmSQy@qRp6so1wJ((04IkEw=Z;Z_}qX*SQNg9Rp76& z3jE!G07QYiXg?c=3~2CSI2)G`vqEyr3WE$NJPBuqYV~#W*|@<5q{0I-EM|q&m=#7F zP`-aAeA&AkijcQ+q z$0dC|9<1@!QFxrHQSIySxVo>$X&P@Gg~t;#szvmeos6FLDsK;WQQR+|Wg->vr=*#wt%ml^eRZSCti(`y55Cn!*AH9$Nnioq@wb^07)g{OpxlQKkSoc-b|$FhZ!oG_iubM2&}vSO#BNGxy^ z7hHW!mZQj>1zpQ-pS##m4T;pA^5W&GloJ!Fc5MiLv&Ub7I`WlT=-LGD|7j( z3q4*}MR~h2(U49D?DIb^9vTm`0+o4{%;IXmA$TUFGB1Ks_zTMD^@b3uZ6-~4lgc`AA}m=!zlAS zm9lRt#s%Y=gF*bgb#+=$pfetw>O^r#0rGh)NdYnqHa;nkhKYovz(kA#EP*_LY`}b+ zA7}}<0c6O_fC0cZK$L6=>;aI!WeFSw3;+&>aLZsHfILdE1kM3u1NHz0fX~C_kk1DU z0J;IQftLXh)NjLiz&$vhm=r+Jm}J1gxKSm6pBP720uuqV0kd&AU>;xqFdvs+3c9!) zP{!p$lLAMv-nWwi0YDkB2j_<+1wIGdj4WxJEeUDCgoN(};kyr7EEEwU*o>0Wy5Mzu z;(BQ}a*wZfSB`B$p0Q0HZ+eIKZdW66k8N~UrZ)gr=kbp3@NRQ8F>W{VjBoIG#|Ay# z6qJC_M&ujc?5j$D4F$V9yq_dtyL$J=^cj2X?YmLSczeqx*Ni&*Hha+Cj?xga%XWE2 zT~2e(rkt%gcF46oIjf^*ujo;+ucJfi{p24`hi__a!wQWXKEw&x`lh)ro5gi4-XC}ZT`~W zr36O?M-mj4+S{{l&uPax!|cQ4;qvhM?5R21gk|>joI5B#!ahR2RKAq*%Yna>@~QSz zd89nD{tUEv?Em*H{@U@D&RQ8A>l+Y0t6e$k~^}X9P^nNtOr71M7wBpk5aJ%ro(K4Zh^0(|m$GK~9tt>la;NZ%?*U z`$g&Yc7?y?d-jkYw~V)k-ZzLgcS6YIAld?GK?8#W$@c;8w{Z-V1<_VkzKxWp1XD;k ze4dTdK+_K8LDbG{wiBBn!6C#3KG()+@LqN(529vfvys>&1`~-5_`ER)ev-I?jS}O_ z>@Pw4wsjFcXm28T1;hn>vcqT20b6oBG~2h5W3Vx2D>)`wD_wA`k8gzI{|fi^l-Ir6 zT~{=~@!5VaJ$A#6pV{R2iKJ6U&ku0!@QttYRgK*U=Z2rKb3^^Ehto;DI(swxqdvg< z5FfTy!tw1HUk^Sgb)kOxIi>ENN7H9C*|*~JRR_xK?r1^?pR@4spYE6OkIjR{?jL5|LA>^yc5g$KQ!lr(Cn;^>9sdb59Sl$R}QY*)=-|d1&{Tdk1r;Pg4&6wgC4o=3af3 za`1bBx%Yt6^yv)e(-{=*(-{=*(;3w36ZQe05Y>JqYWRg8C{g!2Ld?A53K3)O1*1{YgufR~seI4-%d|Sh>VH$o7*YImZuU}VM8f&i%&T5u}!Qk8R zCu+OrAi|iUyPNLCF-ql}2&!1d2m(yTdwv z^H-)k2S1^I^LtaCyA=h`R~~Sk_dem$Y)2Qkc6342jxMO$(S_c2G$+)y<4N0|6V|qY zgSJh$P}>F$+BV@rZ987OZNJI+^rp(EH&s5p$@vu4wmH8-ZJYBe)V4XlLTx*OUq)jU z*0y{7>TTP-e)YEPUcY+Vb_Bm-h*xjh=KKn^ZO*Sy+vfZVwe5JBZ^OTYD`4D)0Y~6A z448`BFyJWMh5<+8HVilhw_(6E+=cW~P&#_(ODTn%4mY4|!@!`C^zzNS%MQx#uF4Ugb!EXTu5^t{N&72h1vbA4}I12HkwX*F#9m)9H|8?Xqa5)Mb7^Bik8;F6%#BM2J<1XP76XTm zVKlDSKg^9tS(KPeIpQDYQim)`Orsp}4|Ai2FG`$1IpQDYMvq*Sh_31N4|8KK13k(S z|1g&}7W61b{KMS1bkL(5@vi_l;vf6{{S}7f(3iBj!er1PgZULV3P;ij?0Tq1-9HOv;xd1q6lc^@%x<7)-$*fc3&q*n zD9+tZao+tD=RZVo;ckj~`zS6tKymS5ic5}ATzZV+vKJ^Wf05!{T@>?Qrnur&iUn^_ zEcz+M;tLd4|D0m!uPB!PnxgwXik=Sv zKQW`a57*PU>4w1z^Q=K0jC4b0Gt7s4VVpR2ebp`0_FnF2AmBz z7jQ1%JivK?^8x1rE(BZ%mX=$l&RJvVwR$3-JC*2`DFD(~3q&tO^(p`dIatNJLz7UX{ z!b{Q$;T376@FS@}cugu4ek>IUXQWlaPo!euoa7SDORI%9r4r$1QmJrJDieMwl?!i6 z6~ezuZlOo26yB9Q!uyg}_>B~3Sd&QFNgZt`erP{_XeX!Q<=Rm-Yixd@rWUs!|Aq~= z%ncjjCqbGFX$qt)NY_D{3TYao>5#66Gy~F1NZF7CNOnj$ka8i-f;1b_97uB^-2iDG zq#Gg4hqM6FLP$43%7b(>q(zW!fwUOXt&o;Lx((7&NVh{;2I&q+%OTwf=`KhPNcoVQ zkXAri38?^5A*3Qms~{CaazR=RsRU9fq%uh5kSZX#Ayq>1K=MNJfBccv|H*Hy{?9(O z`hWjBtN#z5Tm66fqt*ZA7gqmY{%rOC$6u}f|NNWP|6hN%`h)*%^~c58{PFQNe`2D| zKVX2(KX9PUZ?oC_DJeGpC70OzLx$M=-~P7EKYX~&f9a(*|HzRx|98G)^IvwE&42mj zHvibMHvf0OYxAe4+x+9l+x*}EzRiE-l{Wtme`xbxb(PJ3_0=~2wb$DGlP1~xQ>NJb z*Ij4xPn%}*Uw^&LKXWF0cY@8IlVkJGnq~9PnPcBVc7BmI&*$se@w{Nt-3RZj1=WYRn z{OoNObRFd9ZU=kZb5d)m;MS6^5ws_pd84%-vfla=R*sePd9hrd&An-TFF=l&imk=bC~OVBJQh$ z`_N&y409vOfK%>&8JAiKoO1umxKUNWDfhpO8@&cN<^Gp(V>SS%-2XBztqwTl{+Ds% znt)^Xzu1>X;@{AuY|cN-jVJ?-^AB^WmB4ZSVQy3vaGZab8@&cN&OgkJ*#I2pALi2P zfaCnb+_)y-IRAD4H_WvtDWCHXb0f-tjpj`I(5V>STC z`G>i*I^a0}FgLCVIL^QQzzuV?C!uWC2Lmz1+=#OFq-4sCz;VW0YGr#;8s$=G#wd1ab)ZMNG=&@21bUPkC!^dKaG$U3a*aM;jWG;l z@9js*#Gm1-?(_YM#x;_upI>i(Z)hA3`Ws<R|8`rXi;w;C2p&@zc-n3;eEW63-9BlTzDUEes9~)C2b%2WGNTk$4j~J zK3>X&_wl-W{lj{skC$@ceY}(l@8hLhcpvYIUjG&o|DaEna^Zcvlnd|UrCfL)ucy~P ztVjBIDHqJKt z*X-jp`*_VhUbBz)|9Bs-a{mi`c0PQMj{A6(`(MV9K3?VimvN+zSGoUX9O>g#?td9a z`goQ5U&fI>UgiFmaiouz-T&f#!A9_9m<#$%il zrCfL)FXh7fcqvEvcn5x25A)a9vln~zx$5_pj1BIsbRig$sCSpa>wV^P7;s^C9MQ7(HRnhlJ*;TV(Zf7#qoR7>B?5=P)`eQ;y>?j81RLad-~n zZ=F>s$MHP&PBG;;%wvD6@+sF_-XpHJs-AJZB_4C3!OMGo_I`ql@#h~=y!a`_UwocG z=Gf1uhbOl1vwVZ`GnMe%WIcUu3x5XuoMscu4Q+(yvEbRz@hvbPw8Q%ud7i5gUjG?h z+duux^*THgRu9i~?eU4OAbJ^|DNKM?2hTaZk9sh^_&$1v!g z4f+Co*8R>YTilm`@13^A;ry2$UlJGm6pm*Z!QZHh9Wd)7TFtn`f2^4-W3B#aT9W>; zrffhne!xH0Y*A}|z~cDc-Q6rfxjdnjf-we#qkadlt{%_amM=Rq>2MQ#YOlGb{;@!X+`XB3*c@id^R8&3n8y75$N ze!}AU35(~SSUmsKk9h7<#WM;`-FO<%)QzVBP2G5Mns{!ok39$7`z$GbCYmPv+&*IW zwy`{0x@EL#o)KK5KhLhwpJ&(T&$Dav=e4fUExmIbGV%R2%s*1D+qItYRIcUmoVj1Z zbFj*DeE6A4T!ydjwVq+bW#5=<{(!0tM{ysa+lCEj>b7A6n!0URtr^Ajx$ZsGYpQrg zp{W~B1Dd+=G@z*)Pqk(g*XO$NJg16h6q>s6G@z*)PXn5|@lpeljNnG3Q#YP!%_y$Vb>sP_DxOhj>c-Q6rfxh9XzIpOtr^Ajxo$k)R>d<4P2G4J(A15m z0ZrX_a++klqM>YjiLLiS*2I9ZA+5DAw8o3f^)ddp*2f@omCdy=V6L*cHiq_`Tyt#< z?YNk0V`#_4TpPn&8w1Zc&&88O8XrZamvm@r*)KH=YJGb>nG3Q#YP!%_zo~b>q2H70)O%b>nG3Q#YOlGAwPqCK%ewJwRmC$3P2G4J(A15m0ZrX_sx_k+ zU)GK14pls((A15m0ZrX_8qm~@r&=?L@nzk3?o!1w3QgU38qm~@rvXjfcygL#%z9DZ zF>7;OuSJPzD|+X+E{3_WWbX#beZA{?WowL0E6?Pzb-u{H4RFr_YjFLqtm%ccoJuYhQ-Jl&8bMJ=myG~226OKQ zbMFRo?*_fS8_eTJur-2`^dUa=goD!@ZJs3|3KE=hPasPdeJq($leWK z(YjuTKCkb}y)vQejWMd{s_`Dw=AOM5PwMSgoeTY{WY6>7y#~-M%7yJOZ0^|`zQ?^z zT+Kau*?tZaNuO-^eh!oi-_L<^;rls^>zy}(`*Y1bd+ENw=AONDZ((!KUhRHLbI)Gw zxR`tP(&NV5v)A0S7x&v1!)2bKx!uP8??e7@3+@V@sUJ+BAhz3xN#Dj&l8+lTUEHNtz^hw^J`5Z=c=lwY?N z;l1la`3>t4-mgBCuaywqqdt_clM&vRK9p~$M|dy#P`;@V;r-`B`Hjs8?>QgJZ)!n! zpZQRJ%Vvc4mJj9cZbf)M`A~k_R)qJE59RM^LwMi#P=0$m!h6Ms^7rmQcz^g%{{H(A z-V;8Qe_$uV`@o0t4?T$R-tVFO!@CgP?>&^?{RqN)yod6S?m>88_fUS{UWE5@59J@* zkMREOq5Ofz5#F;slz-wN!pEcl$7F~*3^%aA*?I1%Ckco5V-MLKd75x@dG48`gv0x; zhwP3$OE|pOdWbvz9O3Z(>LKoh=Lwg`b0<0ohxbtr*}Zs@aCqGM+pA3gPbHxmSNgxO|>_{WZejXE{T9Z~T~WD|qhg8N%WH z%tLlR{R!dl`7p$FpCcUJ(U>$#Y1-Qy-T>YJomx-gj>gRAN~fJdxo2P zhMRkan|p?vdxo2PhMRka(=jV^&v3S%OdJ`rBKx)MfPPnV&v3lojcU(u7<0-<)fjWa zYx(-UDbK-LzUM0sxXyc@aG86Cw|1R4b&Bo}o&Xo3>hY$~o-;58b^0{53+*`r+^err zZUC+O9*UREJ;Ui5-%4Bq=APl4F(z}2$sA+yy38>quge@`y3oDk2#ool3*AePQC#){#pN$jysL|1{>v0syh^d) z4T?oSrC5A{qWNr-`D~Na^m-SwmlXzH#fWk6GRJt?(j6l-PbuFv%vgp-eF6q>s6G@z*)PXn5|@l>c;aG zRXn56)QzVBP2G4J(A15mS~H5ZGIis5MitK}Gs6G@z*)PXn5|@lDt4o)fQu&_nOA}B%xAOBXS4DAnzXw1q_jNt zY$uy@qxEd|8_9j=%pAwhW_Nm1j>EIrf9tGDIgZx{?i5pw2T|~?%BNg!d5^f>s(Qxt zmUv9_nQe2=TGbekxo7Rg7e7Gn;JHXAKSS@5IZ2<<=dy$7rB2;3wH8%8zlu3C1~hfY z)C_3qj;X0NqnO{K8_zaXJfqOm9iud$sT)rNn!52+Yeq4@MK_*1Rq>2MQ#YOlG#p^C>m0c%#t#}P) ztuk}(0`0h%dlzWO#oW8V+`9m;8{-hoy$kTTY%yF~X`NTiy$j5}3r?GR7npk&!1*Dx zF21>ULDy+>?}F~^2D)yGeIv!(Efi;Oqd0dv#d-Hroc|ETg}W){?W4Hp0L8_JDVpob znDb+%3Dwecp+>r1SS!sC)=M)5Ny-*vNf7EKyU-}*2+dNi&?3zeHcPXGR%wo~Rhlcb zNjC`X(mY{@bfa*eG+)>$Ef5}*77Dv0b6pv8T^YS~Wz6|8-|+b{N*^3QKSt?;V;q?u zW7n7;lcO;|CZBL*-4;A&X3PedN5ie#g1NLh;GiE;*^?S`pCmzy9{#;P0Hr{ z!+TecCiFr9(5qp-SM$4&W?Ue z@|}?H{KOX5S-By;bC$1`@ayB@DqQY531uha2F88YS7kdHUtR0Ve&&L&Dn7Luuo|!i zum*4~;99`-fa?JzKnYL=lmY7j>j4`98v&aEn*mz@TL3o$ZU$@xYz5p3xD~Jsunn*s zupMv*;10n10Ph3b3AhvRLBIzAcLDAKd<5_jz&(I_0QUm!1>6s~AMkO&#{mxl9t1oD zcnI)Gz$XEp27DUuDBw}RX91rD3|d}-HMfGnpyd^ewL!~|YJ>2~j^>{8eQZHJ=(TR& z*hEVY3U>QWZ5x1N-E-O&*XFJqi*2etp*B^_rV4CMFdH9=JAdNk_mZ$(^=H(sn%Pxr zXoT25;XrzOY5 z6FZsN$r@U9@zkb{+0=1bYdNj@L~2*h?CLeN8U|3C24>U1X{}>gVaIkgJGOm&k8K~c zB_DTeS3|xUj_t7eBCB83w|*7aS8?@4$cyUwYgzr8zV&OszJ{w0``LA`Ro7p~>aXcr zANIuSUc=Rgz3;l$L4EmTr3(#S-_zlXllLgEYvN>ltTYa!@oQx9Yt)Ewla1OmF`FhX z#_PH7t2u?*H8ZD<+NHyQoB}W*Q%j)_jjnx-OT20PD|#rwq8cf zr#9P|%{ES}j?-!zOYPd2U7Lp1J>R7^_b{7#IIVh4t393CwKKbR4Xy3tsm*p~vz^mw zU|M0vjl|mR`o8UUJ+#yHT)UMZFR9yoEvvtwZ++NEwtE9tAAVB0Yt{AZS$(;0eHrX! zu6{k_>(%ueSpB-b_3OaCj;r4Q`39&TP>!|radHq|=fhP2ISyrd?CtnIjo}U!!yOuN zzV}LMb1$>Gmy2^F_nqDMLuz*)v%612>;9{#&Hc>geom{2)7p78wcE+;c4}xna4ofY zfZ062X*F|N4^EG$4HMAa@LTw&mHV<)H8#%3A*HOD&%x;&4*2B}N&BM&*VNR=s z(|Y83YWE1UdqhKP_e^TDo7wEDOzVaHxGYrBno+b&E`>u%)QZZqVY)op(xtKZbOeiPU?arHMs zexthnW>&wYZ+(~!+ug#|-wgT9>iSz){Y`!AZvy*GT>UMO-vagV@dn2=J>DAW@wZ3G zx6tEn{|z*b`&k_KYsC7o8>!7>%;qsJ)~($4_V@y7_c*hATtn-?P1NQ9vpK+N-OXto zyqVe^WOfHNw4S(y+C0H*p5U~$a$1LOrFMsy-60LF!?#hJ!_4L|r?rjKdh&K^_aw7> zQbX&hJE+Z5%;qUhtBuoo`c7*1G_!kJL+glx+8kjvM>wr}IIW{jYIl^`9o5i!W+kTieq zc6R&)l;f?H9)CSbzAa8hsI?2Bx-Puk9sc|4_bJ*apj3v53>*K2zOb@<0mCo*Rj%B?(kN*-4z}mSgb5@u5y*Hg7QK~LE(z3RSs8qQH7(Z!sBq3mO3iD z4!5VG$W>bCy~^uZc~xm?b=ePrSzWl&M>1(enG>_76ir@{c}-T)#KOr%lcr26T;W`q z<;-$UxhAu~x#H?;t}U22xsb&q_!}%=6KL-~{{2$(z9iai}Bj`Ed{}z(W05m$4daRV;y(zGL_Ox)Gey;;i zo{B$@%`M63+_G?c{N)dgAArzwY|#6+8(t1U(7LXqT((k_QC{IIL>UEZ%DrpKP{ykA zs*GZ%w-~YiG4J!B3{ORY)8|APzQP(Gv38cZRsvmE>ZyPxP*4C30A;MKC@U*02LsUZ zIadG{Lv8wsA}V#RC@ghMyf!D3y}UN-TIJ=$oJm=X&B~mZ!(LuDD{CTqnK{|6*xIvZ zDYm&eS1Y!&XJ;z5vu975h#bonPI9_@g`R6C6_ph`9Ytb+qqyMeYqA_g?kq=TiQDHc zc2q-J<*BH0&+4yzMo9s3g7Sjl7vq%zipue=u(&Qh zXDHQ_^N4~MQ3F1+O61z4}UhMx&AN0&6Sv3)p!_ z!8B&5lq>!#csZ2g-a@5ZInOCr!0Iay-ue;3;lZ3zUuo|OW-ApLs`FO?kLxSv;m~pP z16twSw^Z+_%1&Z+@n^t)%JF$_Dm8Rdn#2gdRN^<7@iQ2w)X%F<40VPn(UiJ29 z0q~fc$jTK^>K{=EhDv<}mjEA%9cTMFJOAV{!xSjP=T^mkrTyFmoZ>rI9c5onD=ME~ XDO288$rnecf0WgS@rtlNb^ZSV4aI77 literal 0 HcmV?d00001 diff --git a/hsa/gfx942/fmha_v3_fwd/fmha_fwd.csv b/hsa/gfx942/fmha_v3_fwd/fmha_fwd.csv index e64533c7fd..72c08c0d42 100644 --- a/hsa/gfx942/fmha_v3_fwd/fmha_fwd.csv +++ b/hsa/gfx942/fmha_v3_fwd/fmha_fwd.csv @@ -1,29 +1,33 @@ -dtype,hdim_q,hdim_v,mask,mode,bf16_cvt,ts_qo,ts_kv,knl_name,co_name -bf16,128,128,0,0,0,256,32,_ZN5aiter24fmha_fwd_hd128_bf16_rtneE,fwd_hd128_bf16_rtne.co -bf16,128,128,0,0,1,256,32,_ZN5aiter24fmha_fwd_hd128_bf16_rtnaE,fwd_hd128_bf16_rtna.co -bf16,128,128,0,0,2,256,32,_ZN5aiter23fmha_fwd_hd128_bf16_rtzE,fwd_hd128_bf16_rtz.co -bf16,128,128,2,0,0,256,32,_ZN5aiter31fmha_fwd_hd128_bf16_causal_rtneE,fwd_hd128_bf16_causal_rtne.co -bf16,128,128,2,0,1,256,32,_ZN5aiter31fmha_fwd_hd128_bf16_causal_rtnaE,fwd_hd128_bf16_causal_rtna.co -bf16,128,128,2,0,2,256,32,_ZN5aiter30fmha_fwd_hd128_bf16_causal_rtzE,fwd_hd128_bf16_causal_rtz.co -bf16,128,128,0,1,0,256,32,_ZN5aiter30fmha_fwd_hd128_bf16_rtne_groupE,fwd_hd128_bf16_rtne_group.co -bf16,128,128,0,1,1,256,32,_ZN5aiter30fmha_fwd_hd128_bf16_rtna_groupE,fwd_hd128_bf16_rtna_group.co -bf16,128,128,0,1,2,256,32,_ZN5aiter29fmha_fwd_hd128_bf16_rtz_groupE,fwd_hd128_bf16_rtz_group.co -bf16,128,128,2,1,0,256,32,_ZN5aiter37fmha_fwd_hd128_bf16_causal_rtne_groupE,fwd_hd128_bf16_causal_rtne_group.co -bf16,128,128,2,1,1,256,32,_ZN5aiter37fmha_fwd_hd128_bf16_causal_rtna_groupE,fwd_hd128_bf16_causal_rtna_group.co -bf16,128,128,2,1,2,256,32,_ZN5aiter36fmha_fwd_hd128_bf16_causal_rtz_groupE,fwd_hd128_bf16_causal_rtz_group.co -bf16,192,128,0,0,0,128,32,_ZN5aiter28fmha_fwd_hd192x128_bf16_rtneE,fwd_hd192x128_bf16_rtne.co -bf16,192,128,0,0,1,128,32,_ZN5aiter28fmha_fwd_hd192x128_bf16_rtnaE,fwd_hd192x128_bf16_rtna.co -bf16,192,128,0,0,2,128,32,_ZN5aiter27fmha_fwd_hd192x128_bf16_rtzE,fwd_hd192x128_bf16_rtz.co -bf16,192,128,2,0,0,128,32,_ZN5aiter35fmha_fwd_hd192x128_bf16_causal_rtneE,fwd_hd192x128_bf16_causal_rtne.co -bf16,192,128,2,0,1,128,32,_ZN5aiter35fmha_fwd_hd192x128_bf16_causal_rtnaE,fwd_hd192x128_bf16_causal_rtna.co -bf16,192,128,2,0,2,128,32,_ZN5aiter34fmha_fwd_hd192x128_bf16_causal_rtzE,fwd_hd192x128_bf16_causal_rtz.co -bf16,192,128,0,1,0,128,32,_ZN5aiter34fmha_fwd_hd192x128_bf16_rtne_groupE,fwd_hd192x128_bf16_rtne_group.co -bf16,192,128,0,1,1,128,32,_ZN5aiter34fmha_fwd_hd192x128_bf16_rtna_groupE,fwd_hd192x128_bf16_rtna_group.co -bf16,192,128,0,1,2,128,32,_ZN5aiter33fmha_fwd_hd192x128_bf16_rtz_groupE,fwd_hd192x128_bf16_rtz_group.co -bf16,192,128,2,1,0,128,32,_ZN5aiter41fmha_fwd_hd192x128_bf16_causal_rtne_groupE,fwd_hd192x128_bf16_causal_rtne_group.co -bf16,192,128,2,1,1,128,32,_ZN5aiter41fmha_fwd_hd192x128_bf16_causal_rtna_groupE,fwd_hd192x128_bf16_causal_rtna_group.co -bf16,192,128,2,1,2,128,32,_ZN5aiter40fmha_fwd_hd192x128_bf16_causal_rtz_groupE,fwd_hd192x128_bf16_causal_rtz_group.co -fp8bf16,128,128,0,0,1,256,64,_ZN5aiter18fmha_fwd_hd128_fp8E,fwd_hd128_fp8.co -fp8bf16,128,128,2,0,1,256,64,_ZN5aiter25fmha_fwd_hd128_fp8_causalE,fwd_hd128_fp8_causal.co -fp8bf16,128,128,0,1,1,256,64,_ZN5aiter24fmha_fwd_hd128_fp8_groupE,fwd_hd128_fp8_group.co -fp8bf16,128,128,2,1,1,256,64,_ZN5aiter31fmha_fwd_hd128_fp8_causal_groupE,fwd_hd128_fp8_causal_group.co \ No newline at end of file +dtype,hdim_q,hdim_v,mask,mode,bf16_cvt,qscale,ts_qo,ts_kv,knl_name,co_name +bf16,128,128,0,0,0,0,256,32,_ZN5aiter24fmha_fwd_hd128_bf16_rtneE,fwd_hd128_bf16_rtne.co +bf16,128,128,0,0,1,0,256,32,_ZN5aiter24fmha_fwd_hd128_bf16_rtnaE,fwd_hd128_bf16_rtna.co +bf16,128,128,0,0,2,0,256,32,_ZN5aiter23fmha_fwd_hd128_bf16_rtzE,fwd_hd128_bf16_rtz.co +bf16,128,128,2,0,0,0,256,32,_ZN5aiter31fmha_fwd_hd128_bf16_causal_rtneE,fwd_hd128_bf16_causal_rtne.co +bf16,128,128,2,0,1,0,256,32,_ZN5aiter31fmha_fwd_hd128_bf16_causal_rtnaE,fwd_hd128_bf16_causal_rtna.co +bf16,128,128,2,0,2,0,256,32,_ZN5aiter30fmha_fwd_hd128_bf16_causal_rtzE,fwd_hd128_bf16_causal_rtz.co +bf16,128,128,0,1,0,0,256,32,_ZN5aiter30fmha_fwd_hd128_bf16_rtne_groupE,fwd_hd128_bf16_rtne_group.co +bf16,128,128,0,1,1,0,256,32,_ZN5aiter30fmha_fwd_hd128_bf16_rtna_groupE,fwd_hd128_bf16_rtna_group.co +bf16,128,128,0,1,2,0,256,32,_ZN5aiter29fmha_fwd_hd128_bf16_rtz_groupE,fwd_hd128_bf16_rtz_group.co +bf16,128,128,2,1,0,0,256,32,_ZN5aiter37fmha_fwd_hd128_bf16_causal_rtne_groupE,fwd_hd128_bf16_causal_rtne_group.co +bf16,128,128,2,1,1,0,256,32,_ZN5aiter37fmha_fwd_hd128_bf16_causal_rtna_groupE,fwd_hd128_bf16_causal_rtna_group.co +bf16,128,128,2,1,2,0,256,32,_ZN5aiter36fmha_fwd_hd128_bf16_causal_rtz_groupE,fwd_hd128_bf16_causal_rtz_group.co +bf16,192,128,0,0,0,0,128,32,_ZN5aiter28fmha_fwd_hd192x128_bf16_rtneE,fwd_hd192x128_bf16_rtne.co +bf16,192,128,0,0,1,0,128,32,_ZN5aiter28fmha_fwd_hd192x128_bf16_rtnaE,fwd_hd192x128_bf16_rtna.co +bf16,192,128,0,0,2,0,128,32,_ZN5aiter27fmha_fwd_hd192x128_bf16_rtzE,fwd_hd192x128_bf16_rtz.co +bf16,192,128,2,0,0,0,128,32,_ZN5aiter35fmha_fwd_hd192x128_bf16_causal_rtneE,fwd_hd192x128_bf16_causal_rtne.co +bf16,192,128,2,0,1,0,128,32,_ZN5aiter35fmha_fwd_hd192x128_bf16_causal_rtnaE,fwd_hd192x128_bf16_causal_rtna.co +bf16,192,128,2,0,2,0,128,32,_ZN5aiter34fmha_fwd_hd192x128_bf16_causal_rtzE,fwd_hd192x128_bf16_causal_rtz.co +bf16,192,128,0,1,0,0,128,32,_ZN5aiter34fmha_fwd_hd192x128_bf16_rtne_groupE,fwd_hd192x128_bf16_rtne_group.co +bf16,192,128,0,1,1,0,128,32,_ZN5aiter34fmha_fwd_hd192x128_bf16_rtna_groupE,fwd_hd192x128_bf16_rtna_group.co +bf16,192,128,0,1,2,0,128,32,_ZN5aiter33fmha_fwd_hd192x128_bf16_rtz_groupE,fwd_hd192x128_bf16_rtz_group.co +bf16,192,128,2,1,0,0,128,32,_ZN5aiter41fmha_fwd_hd192x128_bf16_causal_rtne_groupE,fwd_hd192x128_bf16_causal_rtne_group.co +bf16,192,128,2,1,1,0,128,32,_ZN5aiter41fmha_fwd_hd192x128_bf16_causal_rtna_groupE,fwd_hd192x128_bf16_causal_rtna_group.co +bf16,192,128,2,1,2,0,128,32,_ZN5aiter40fmha_fwd_hd192x128_bf16_causal_rtz_groupE,fwd_hd192x128_bf16_causal_rtz_group.co +fp8bf16,128,128,0,0,1,0,256,64,_ZN5aiter18fmha_fwd_hd128_fp8E,fwd_hd128_fp8.co +fp8bf16,128,128,2,0,1,0,256,64,_ZN5aiter25fmha_fwd_hd128_fp8_causalE,fwd_hd128_fp8_causal.co +fp8bf16,128,128,0,1,1,0,256,64,_ZN5aiter24fmha_fwd_hd128_fp8_groupE,fwd_hd128_fp8_group.co +fp8bf16,128,128,2,1,1,0,256,64,_ZN5aiter31fmha_fwd_hd128_fp8_causal_groupE,fwd_hd128_fp8_causal_group.co +fp8bf16,128,128,0,0,1,1,256,64,_ZN5aiter29fmha_fwd_hd128_fp8_qkptph_vphE,fwd_hd128_fp8_qkptph_vph.co +fp8bf16,128,128,2,0,1,1,256,64,_ZN5aiter36fmha_fwd_hd128_fp8_causal_qkptph_vphE,fwd_hd128_fp8_causal_qkptph_vph.co +fp8bf16,128,128,0,1,1,1,256,64,_ZN5aiter35fmha_fwd_hd128_fp8_qkptph_vph_groupE,fwd_hd128_fp8_qkptph_vph_group.co +fp8bf16,128,128,2,1,1,1,256,64,_ZN5aiter42fmha_fwd_hd128_fp8_causal_qkptph_vph_groupE,fwd_hd128_fp8_causal_qkptph_vph_group.co \ No newline at end of file diff --git a/hsa/gfx950/fmha_v3_fwd/fmha_fwd.csv b/hsa/gfx950/fmha_v3_fwd/fmha_fwd.csv index f418e10cd0..e640af834f 100644 --- a/hsa/gfx950/fmha_v3_fwd/fmha_fwd.csv +++ b/hsa/gfx950/fmha_v3_fwd/fmha_fwd.csv @@ -1,13 +1,17 @@ -dtype,hdim_q,hdim_v,mask,mode,bf16_cvt,ts_qo,ts_kv,knl_name,co_name -bf16,128,128,0,0,0,256,64,_ZN5aiter19fmha_fwd_hd128_bf16E,fwd_hd128_bf16.co -bf16,128,128,2,0,0,256,64,_ZN5aiter26fmha_fwd_hd128_bf16_causalE,fwd_hd128_bf16_causal.co -bf16,128,128,0,1,0,256,64,_ZN5aiter25fmha_fwd_hd128_bf16_groupE,fwd_hd128_bf16_group.co -bf16,128,128,2,1,0,256,64,_ZN5aiter32fmha_fwd_hd128_bf16_causal_groupE,fwd_hd128_bf16_causal_group.co -bf16,192,128,0,0,0,128,128,_ZN5aiter25fmha_fwd_hd192_hd128_bf16E,fwd_hd192_hd128_bf16.co -bf16,192,128,2,0,0,128,128,_ZN5aiter32fmha_fwd_hd192_hd128_bf16_causalE,fwd_hd192_hd128_bf16_causal.co -bf16,192,128,0,1,0,128,128,_ZN5aiter31fmha_fwd_hd192_hd128_bf16_groupE,fwd_hd192_hd128_bf16_group.co -bf16,192,128,2,1,0,128,128,_ZN5aiter38fmha_fwd_hd192_hd128_bf16_causal_groupE,fwd_hd192_hd128_bf16_causal_group.co -fp8bf16,128,128,0,0,0,256,128,_ZN5aiter24fmha_fwd_hd128_fp8_gfx950E,fwd_hd128_fp8.co -fp8bf16,128,128,2,0,0,256,128,_ZN5aiter31fmha_fwd_hd128_fp8_causal_gfx950E,fwd_hd128_fp8_causal.co -fp8bf16,128,128,0,1,0,256,128,_ZN5aiter30fmha_fwd_hd128_fp8_group_gfx950E,fwd_hd128_fp8_group.co -fp8bf16,128,128,2,1,0,256,128,_ZN5aiter37fmha_fwd_hd128_fp8_causal_group_gfx950E,fwd_hd128_fp8_causal_group.co \ No newline at end of file +dtype,hdim_q,hdim_v,mask,mode,bf16_cvt,qscale,ts_qo,ts_kv,knl_name,co_name +bf16,128,128,0,0,0,0,256,64,_ZN5aiter19fmha_fwd_hd128_bf16E,fwd_hd128_bf16.co +bf16,128,128,2,0,0,0,256,64,_ZN5aiter26fmha_fwd_hd128_bf16_causalE,fwd_hd128_bf16_causal.co +bf16,128,128,0,1,0,0,256,64,_ZN5aiter25fmha_fwd_hd128_bf16_groupE,fwd_hd128_bf16_group.co +bf16,128,128,2,1,0,0,256,64,_ZN5aiter32fmha_fwd_hd128_bf16_causal_groupE,fwd_hd128_bf16_causal_group.co +bf16,192,128,0,0,0,0,128,128,_ZN5aiter25fmha_fwd_hd192_hd128_bf16E,fwd_hd192_hd128_bf16.co +bf16,192,128,2,0,0,0,128,128,_ZN5aiter32fmha_fwd_hd192_hd128_bf16_causalE,fwd_hd192_hd128_bf16_causal.co +bf16,192,128,0,1,0,0,128,128,_ZN5aiter31fmha_fwd_hd192_hd128_bf16_groupE,fwd_hd192_hd128_bf16_group.co +bf16,192,128,2,1,0,0,128,128,_ZN5aiter38fmha_fwd_hd192_hd128_bf16_causal_groupE,fwd_hd192_hd128_bf16_causal_group.co +fp8bf16,128,128,0,0,0,0,256,128,_ZN5aiter24fmha_fwd_hd128_fp8_gfx950E,fwd_hd128_fp8.co +fp8bf16,128,128,2,0,0,0,256,128,_ZN5aiter31fmha_fwd_hd128_fp8_causal_gfx950E,fwd_hd128_fp8_causal.co +fp8bf16,128,128,0,1,0,0,256,128,_ZN5aiter30fmha_fwd_hd128_fp8_group_gfx950E,fwd_hd128_fp8_group.co +fp8bf16,128,128,2,1,0,0,256,128,_ZN5aiter37fmha_fwd_hd128_fp8_causal_group_gfx950E,fwd_hd128_fp8_causal_group.co +fp8bf16,128,128,0,0,0,1,256,128,_ZN5aiter36fmha_fwd_hd128_fp8_qkptph_vph_gfx950E,fwd_hd128_fp8_qkptph_vph.co +fp8bf16,128,128,2,0,0,1,256,128,_ZN5aiter43fmha_fwd_hd128_fp8_causal_qkptph_vph_gfx950E,fwd_hd128_fp8_causal_qkptph_vph.co +fp8bf16,128,128,0,1,0,1,256,128,_ZN5aiter42fmha_fwd_hd128_fp8_qkptph_vph_group_gfx950E,fwd_hd128_fp8_qkptph_vph_group.co +fp8bf16,128,128,2,1,0,1,256,128,_ZN5aiter49fmha_fwd_hd128_fp8_causal_qkptph_vph_group_gfx950E,fwd_hd128_fp8_causal_qkptph_vph_group.co \ No newline at end of file diff --git a/hsa/gfx950/fmha_v3_fwd/fwd_hd128_fp8_causal_qkptph_vph.co b/hsa/gfx950/fmha_v3_fwd/fwd_hd128_fp8_causal_qkptph_vph.co new file mode 100755 index 0000000000000000000000000000000000000000..f29060b234b175dc6fb75a82fa198dafebeacf18 GIT binary patch literal 82296 zcmeHw3w#vSz5m%w0%4aZP{2SD1C>IrZA1+a9ztdl9wK5yL_~D6kPTV#+E+5+HQ77~ z5FrLsR1^dh1Qis~fP0fgrD}zifL7XiZ{w?K)z-G!_VzyZ{?21IXD73rWmfvH|2xa) z@ZIx0zw_PS{`Smd=A1L<)XbbW%WN{`BqRDtLfy#3-ebsN@Bex|Wp7TwTsBID|1UxP zQ6DJBb!9sxhM1U)sbGs=n3g}25Q_f-S5ZPnG=S!DJ#Yb`AJROU7LECr&<~W6uZMlj zpTf$QM5CL@946wBVV`G)xAD}I>&xG7!AJf={QctnJn9WXw?IAoc^CS^pGD${9mX#O zCXb_Z1Zfua1^oS3gfakyoAPGo--b}#Gmf&tVxJ?k#OW<}miq2(c|6nMUEzBdwKP4G z=~(Wsbd=g1%a=QSz5`yTqtGsTU4AE&6c&1cv-{kR<<2KolvXTrls=wW?kICU?)H1_ z75g$Pii&(rKYBdV=UVMdJ(yYLD6MqbOI+oJPg8UIvdW?&r?-(-+v9c=%9T=8D(zpk z%1$I?CI2e7bC2t~Yn5(3N!+X`FI}bSrAPk20V*G;>1Db~rHkQZiR@*D$_HwCIZmb0 z#qhF9_Hw++2Wonmtx_p&FXL>O`*6c7^*Lonb5uT4)6qF9m0~)|n)VaE<&IK^SN`05 zDv>NVWPC_yN$^L}ncnXtmRgcW|MLjjsdl}bRy>yU~LNLIoM*$FF5)1d$X z@z8*jS8^>RM~75&K=Kk+n3J%=JRJ&ftyL9ut##UzuR|s}7z-0tSe&rJQXL9#p;5(M zZ=E(3=n#nxMPb4U#R)5v>QEq|=(*-PZF1`ni4KK7VTJ0171rudfJ3B0afk$T$V3OD zAz_8)gcVwJC_uZYh&n`Cb%;cVVq3xr+Y?sUp+kX$fp|CA~c! zpz_gic$}_M?P7Rb)!X9?m5+|Y<8dn0VtUL@M$h=qFYJAx0~kX zgnhW6+s$K{E4&qzZs^`#QRXc7+lyS)&O$pLTK_4YK==ZgpawJ3A_k69*NXDgqrG@s z<+QoAJ+sVFZ7(Wy`0Zkaw}gq4GDK&b{bG`R$@~cpn9k;`Y0X?NyLg6je{2Fn;DsnQpJE%HelLeCcr(J)h}ydSIGd)Msvm zDbEmDB5URg|~M6A#&ZKqcVfK}j%6P@ZSW3?>5(gqe{!Y@TKg!w==(3)`w*Kqs-|@XkXI!2M-A}h<$0oN zcK=Au6gD-5!}xdT>_|ePj%0MA1DX1Sh!5sI^y?(`31wg+xld?3#=hoIK0pp&0nYa` zhw%K@0OSLJA;4BZlxhy`0tj4U4jl#z0S;?=0pO4ESUjP^a zbOYu99{@y9zZK^Jcj0_WpAdou6aXB68&V(e6XQ^GXgpvJU=A(^%m)ks7U1&BKo^$- z1^_oAbH+wVd2E`o^~VwzwJ@w-b5C)_Z-U!d_n*f(g3ok$-HHzjDm$DBRWV`>YSP zt8=d(Gkuq>Z6|6TYir)%nqF($Vhh{aPzFM_fGse+Hn%BvL+<8W8{}Fab_H6Gx@w`` zdbek6t=l&iYX{(K)q-DPw=yDWSe!C~QH1ck-6wwyb1+px}H+u*>Ez>vC} zDY;vOCAPNQJ1IZZHZ*Wq;4;cD1^zC|r`ys4!ve$VPC@&Fu%PdogEeE#9W?>8zaxyE zg!T;{qXQiQ^c3WuX8C72PN2iUJ=;->j&#fjU)zT5rlNx#Q_-Og{QX9l-8Esi&pP!$ z`;7LKZ0T4^mNXna+#yT=U%|KBtlYN#w*G;%Kw4e@T#CXZRyHxWZGde+U|?Wi-GE$* zLWZqv#>m{ZTw87+=L+y2Ff}(d&@a%hPM8e!vg!AkfnyH7jlDS6QCLO3-=?jgZk@7LRlDXX60K*d0IG)l!Jfkr+(koW`puDYGXEAiOs<9Kw<-a zuAe#<+q8jA6SG-QY*NB0#0Gp=A07kk6nwLR%?#XT;r->dm!aKQI|(1QH4?lMz9;x( zgYS?Fwxpdk**255-H^MPw6zwG3yz_&4RBmt>E4?5hHtCu%6e$`ZTHb*BC1`^BJDDk zPAxs|;F$D}t@T%qS`WwEPuVe7x8sp9q+YG93I0(Z;B$x%n>}#cc*oX(4{}|oKjw^F zch_TMrZ?I)Q@|Y{&{zMA6KZo=sq^}_T71Dn}LNx(* zGNe>U{UHs2G#FAkq!ExZAYBP*9HbvY8V@N8QZ}S%ka8g9L7D?;9;AFo3n49rv=mYS zq{5o8IY5rdfV&#HL>E+h*I2ovyk^U8BN}?sTDKgx*NH zzvd@N+iPBh?}dK@{QU#&N*6-X`QvY*?oiMB_rmZ#fDbHM_CC&sdh#FOW98V16Ruf- zwE;BXmhNt34jg;s71yjV*gknKfVbE9h|I&tRZ+Y!17&a3T> zjV9Yw$(^>VY>l??%%(s%9DXfADfY>rm^zN4 z1q~qO{ZlDkl0k96IEsUC+r&D53wqL?hd*C`3;NQYzYT@YdiJ@_`kr*DwzYFyTRW#{ zYv&Yg?Oab=n;mKYNu>SHj%xqFLHj3Mr2PX2?VoUw_MfEQ{@>z!dQ0KcTMD1v;(Ur~ z|D0cu_RskhY5$yGk@g?MFTJsfYX3cc^|b#Uzk1q#k6%6QKZaik#H*+MbACnIKj&AZ z{d0ar+J91jZ^OStXfSTWfJ1Q`2296o7;rdl!+;}j8wMPS+c01TZo`11^JXS@esy_6 zXU%IZomuEF=LYvPn@M{9ViGhs5wT zk>lZJdhX=oif_E=--DjGhHjx8AJ@hTo~BJFjA?CZ$O&nENNDXFx4q+Fgc>TGfbA%jxJQI5nAb0dZgNE%5&tkZdJO1Mj`+6_ID8DF(LMfQZfM$qlvK(Q|1g(6a6w82<%oZn8$M(~$~ejq z|1dXV*n$*vZI6GL8+keCQI7bBxr|YuM>*mj=0=YJJ<1XP3V|d3vA@5+f^!#+E9Qr$ zwWT1_=jVYVK4LC?AaG{F5g#!(d>gyi%~XFEVIa(9gKQt=n0+!Ld`O#Bro_r@qM7ypfv`(l(=h`%P~ z)iKK5;(MfgRg7|v_yH+j9i!YUeniUG#3=WPA6r66bu}l@xvs2|K7BDxte}|fqc}yR zIBgBZ=`|E{>M7dRQ_R~$an=@!bGA~P`vApx4^y1KlVbiJiVOBpTzHV;qC*rHAECJ9 zMT$#bqIh>F#e!ETE_;n);hPkTenzqQ9L1IIP%Qlw#qwWMbbmn6`!QgWyT(MWRRW#6 zQ0S_<(1|g)53TzI`ifvX`Wp97K-pLuI(;S(N;C#S*zO0apU91S|zC1uO?F2Xq6v z0lk1;K-|xhB@{~&gq6}np;VeAluOxyTbe9*r7416nkrOD(}Zg2dSSIRU05s45F{x_ z2uOlZC)tDsDOYHc@`Pq-rm#_(CA3Jhh0W3&p;fv;Xp`m&+oT(X`=xopcIhVJA!)v_ zL%LabRLU22Nw)~Qr3J!X=~m$hX`!%Rx=lDBEfSuRZWo@B77K@^JA~(?CBpO4ox%&! zQlVYCOE@mwEd(XI&>2tSufh4WIG@Jp#&cvq?r{#|kl-I7OmU-AkcN`w4DT@{RE+% zoJb1P46j~g4GPsYxcvmzt+QsWTbDEe(nLs;AZ0_E3~36asgR~Yx*pPWNHZYiKoTI? zAmu{JgESM;EJ(8<&4F|Sq`8o8gftJ*O_1h8x*1YFq+1{@fOIRQg^+H8v1WDQQ9Yz5r zHF)EV*5FMyS%Wvv)&=%mBLA%dpm6%ah&hE1lE6ivDg#4mIW^_H|7aswH{E`>VXgcJVz61#QyF1Nj2ILD~0fhXr*UZoz zwfk)0n}Con`k5KJqIRDxJ_iW-mG7954f3VG0)%||ugxeI^6n1+A@BXz4Cm90o%r6c z=JggbUV`&}xaS<~I-7#~>fkFqiHDj`I(5 z!z+Q~{KMRcRlsrnVQ%C);5h#mytsbbeS7o+16(q<%Y}Lh*hAM&-!3k-I41+k8&eryNp`Uqg;l}jcx=z z%8d@7yccnw@8By``g~Q!Fp#aMA1Moehi|&i_Zu45NTq&W-2UFkSRdCntz)bsgZg@L z`+Fnfe?7kH8UGkZeZ9E-y^%3N&Q~2{B4|NRA0=+9xW6}*a?yRfl#A};rCf9$Z$VGn z&m(Og`eZ2=-N#G0=ssS`MfdT#d;G(Cq>qi|*s4Ty!7rvL62y691r2mU7X3 zyp)UXZ8?&Hm2f8+Q*UV(7YeY`fpMfdUM5-z%r z7vc(kGo$->9fXVSj9X1o^x66GJv#2=mG6HUNBVf>`(MV9 zK3@6$mvN+zSHAyc9O>hg?|&Ib`grC0U&fI>UUvVB`vn`om%%RRSLOV}9O>ib{KFjS zib{KFjSib{KFjSib{M!Z`>Eq@6!yM`3<^01O>Eq@6 z!yM`3<^01O>Eq@6!yM`3<^01O>Eq@6!yM`3<^0gltPciaiaFB9OS$MiUdl!H z@lr0jk9Pp;gJF84kC$@MeY}*5?&GB#>ErG9bsfxKW6x*o+T*I*T{0@X+v7qoBvI!s zgZF!kXEpGA<4FH4_Y6s7t~&P&No1~ijAuxUXEnN`o*{|MRW}FpJgc#uJgaf=n}1fL z!;^Ltk8^bR(vHHj8h`7kOgoC_x_5|aM`5n}+n%RgZ~G3p-mZMs^|pA#g$69`{`rT= z0gUf_Lh=0P6o2t$GMR@zy$+uI!q2M>z|TCwbB%TM`7it(@N<-nFn_cGp6h~VH^(-^ zoY8jQ7v#CH26+ExcyIgs3)dU)Y*`&V8@9_Yy29ucc#bd`S|&U%^&#rU_|k{yJ(B+% zy-)I=)8{b4@SG${?%)3_^hMWuC#pnei|5BIp1)`D{QU*Q zbGss*acFABQ-`KzJauSl##5>J35(|^ES_Jocz$^S@oZ4UGY(D7c(U89?O=0RlPdvKVe zBwx3yJqIdZ%j0=n%>(JC}!#XrI z+ptnIj_Y&Hd#KkH@r*-LGoCs$HRGv6Q!}1Q%{Z>lHRE|k5zja@HRGv6Q!}1AG&SR? z)Qsc$Tr-~M74eKiQ!}1AG&SR?LsK)JO3gT~&o$%usv@3oXllk&ho)vcb!ckFQ>hup z^|@v|PbuOVho)vcb!ckFQ-`KzJe8VpT%T*k^DRX@t(2oDH&^M{G-;;AoH1x z^)p~Tv$1}L`n+9Z{S5WE80%-K$HiDb!&pB9&uuo=&rqM+Y^U(7P0c=S9h#c))S;;vPo-uYd@4Trw&cccq%pH7+=&%1wuiqT0c&-}nQ*G?)d;YlAp4NHLuS)h#@7a$4 z&7@q^9>m7JzVC(Im97BphQ@1EX~fmo*O%@2FrM_uM(_DRx#&F~C>Oow!|0xQBe*}; z*w>ft9c=9DOZOi(_Vrcor!@BURga6YuP;4rjD3BLeSLAiZ82QQsEvym`}$t|eSPzq zJgxxz2FDuf*Wt0+lkjWQ^7)dLuy!$5URH|g`Fuq=YUcAEH`>7GeO|PY&sX}3X=Og(on-Sh;K9aw;72&<*Bl)du z2=6B!$=|mP;XULd`3LStc;EO){=w}C?-d`(Kl~8F`@=`_kL*BrPxwfF=c5Sk10Tsh zwhQ6C-y``wyAj^+J(7QXFT#7gNAmlgKzLvGNdC$F2p^LI9Fr05Al$$LXXCl2pCTOI zvpr&W=o!Mz;kjoI6Atgq9=Ewh^Mu2Dut&HTUm#pQ&mC(g9Nud^V)xQ< z!r}eZBiz3R35WMok8qtGgj>vWCqjh7d#6Y2UU`{tOL*?&tAx9g=U)3U;R<-}jn@f> zpXH3`z4;TuE#tY0d_7ykw6*u-3H}(}b_7ykw6*u-3r(;&ezT#}p850?^B73@QgML?I zUva!Aj$&VN7<0-@R~d7{YxxF!Y0tx2zGprATxWexx{Q6rTRM-OIN_R&_n=9J3sL2G zQ)FKn7=t=_lG;V~r2+1>*C^MQ*8Kn_1&n>g=^Ed1Ttmjb;?ZMFZZdWhJ;p@2=rJbx zd;obiIWopXIb&aOdF}#xhGF!WwiLwn1xqoWEim>KhrTJb{$pcbarL+u`-&U;isN~X z$6*c60@2u49QSP-`--cN!F?0^iW_50#u$?^#^iGuV@y7mF~)SRYtbPX^Fimj79XLw z>1fj_JvPmpZE}rC0*d2?wadK>Cn_%PfCZT=6X^}%{bP|)LftIbqFUP z&p0$S}&3Nk2)QqPNP0e^JHRD(-Q!}3D74eKi zQ!}1AG&SR?LsK)JO3gUd%G8YKtBQEWp{W^99h#c))S;;vPo-uYYh`N2^OPcttYTRQPPnZfj9HGg3V+ z#-5SJo{@Ooim_*;`n;9@ANGti=FJ%MW{i0=#=IGPo;T*r81rVPOz0jtE$8sLX@JuK zrvv5y<^b9NZGd@zd4RJ3X93OuoC7!)a4z6Hz1VC~z%cy5Fruisb(YXWo67Go~m1Du)IrOS51D}hTQ z9GO>v?Tlx$jc2p*{F;o~wmup8?AcB>=SJ<>>^D<;&zU)jpUv*@r5%N5v;WpnnRXPf z58NT99Sx)K+n%RgZ~G3p-mZMs^|p9K^_gvB&sxP8kg;d&`IkOI?~(O@zd-MkIZ0p8 z=d#1-P zev4*2TNLq(LsK)JIy5!osY6pUo=VL)=C^3ZbDJWbacFABQ-`KzJauSl##5;o$NUz} zcu zYgujnNVe#);B}YP$|j3G3tod+t<2cFKs_$T-UaG$G4?Jn_AbEd#@Iz;?*cq7TMU<0 zYUfpB?*e1*f|JJH1;*Y5aDIrai*M{*(0S6>yPzwlp03+sTTd}>6UA9uD9+hRaqa^Y z=RHhu{!WVddnhj0M{(gnipIJ!#{8J6LX|X4sFto5R!h@`wbBehl5&KABnWkqO=ysE zg(fLaXqILQ8>Lx7i!@u2}ankQ_RZW11n<_kL{V_g|zT^X%) zWsLbT-}3n}avvN&KSu6@V;q?uV^f(QldCd6rhsr{-4;A&X5>1UN5ie#g1L-Z;GiE; z-jf=0qZ@%^>pIKpy9{;>>XXCyhxe`?S_T~FALi0Mz;XUzZg?edoPU@bu?jfOKg^9> z2OQ@g=8Sb^aQrUrx-u(&SJ#QxeBx6_lDozPYkKcOp{wdbC&u8p4BZ{*D}wFlYutwq zWn*pV^qD{?3AH2C)&8l~WJ5n8`3}f;d}cLuc-AF#%=Fg~er*z5g$Fv1L)mdtKhyX8 zmDb})RW<&cXV3X7lhUgIs{pG3s{vO7t_EBSxE4?XlmG*O0l+%II=}|N2EZo3CctLE zX26Yr8v$DYTL3o$ZU$@xYz1rsYy;c|xDD`r!21EW18xU=2=F1m9e_Ik9|e3Aa2Mb% zz}QJYoFW)-JZ z#c8enn%b>qcB@si*8GLqtYJ25I4zOWTKgZ=ZY{H0tD?2;KdH?+X0wjds^+w$|E6{l zvy)V`;Kw4rPYtuF;j~t9S^-lMu?sM}fQnXa61AyiHnp7AYEG*zh1%6IyE+xE`o7er zp4rrMT5FhA)UjQ~j%|PMW7`jH$_z8$m&=2u3riEm0W!h@}jc-YF59x zcl~OxujcB*es*1}mG#%K`m1`^hduGSR&n)V@4K!wP(N_o<3a=0cDMUYfnDoK_vD z)i#FOwK2Ok6|Jpfsm)eqvz61TXIfFmjl|mR+TQJUEwt0MT)UMZFDcu74XeMdcYWAM zwrd?%AO56t)hO%NvHF4D^#foZ;Of^wzD`-cp4G4IUB4FWYq|RMkgtdOA^BKaYYK$n zeF0n*kmE2wkG*X_pfTLWVz^Bu&i7qKZSG?>_i=G<;C{3Fe?;x>XLk3iXgzQ>wRwQq zJiuu+a$4K3p?2Gu-F6kN2d|?x4>FqvIjts6>!AtM?jdIPkc!sBlc>$Z%;sTEYdxp6 zV=}ed!R&UZXgxBO+C0K+9^tf_Iju*pr*@AryGK>DcFv$SJDJT+PHO|FwM(FOyO`ZB z6|Ki|sm)`|<}prdBd4`{Cbiqm>~^bY?U_w&_Ar}0oYp3$6?N=2v9{aLyY0gCw5|rO z?KVNaN!j+-v-*v_>oRzmeKJ&TJm%V%@_1Zcp4q z?VezEPpD|^yP4YTV>bIZt$R4F{kKrN{mgE^iq@02Qky54&6AwgW=`wCZPe}nvpb-o zb?|m-bCB5_p3U2dyd&Xr=oRa1+_WCY>seRTbWkW z@wb_^-79}x+?1CwZ(d%;wc|74 zU&gqrODoD(IK6ha!(UwPD0A9<6?T8I!*BPx%Y63bj&i%N((SJB`oUs(iDQMUdATwJU4C_GX;s;efLZBW?kAayqO9@R zlZqxT%epqZXuNY`(S%78oXZ@`vmM!vN!Ml-I+k5??RACYCpv!+al6+W6Jh$IR=i<0 z_v%mH_eIao=16#Z8RK!(`PXR1qhInYLO)`B7F-PRuWZH_Lk|i6!uzr#)EDizjCv-Q z&1ZaQN;Hr6roi@R6#P4wHm9QeCDHbUKo3gli_kP|(DTC6EeE1LDCa85<;X>u z$}C(}?psxcGFOyWW)?eq#fbfndA}EBdMgSYeh13*JFES~+EM0O4z#n>TLC|HVIlm` zD06v5S(&pO3_#28SO!=Owdp&GsMN8{S!%y-(&X{%?Rc9li@nXhE<2aKojh~AY>+ka zI@!RMJu#aZ$;+{Vu!s*EVLIFUUO}>y~v#n zZM4$oD7AY^+OuT#!mo$Sa%K4q z!{l*t0Y&*ZmRVdEpBvIm{5BFYsrjvb=mAxfJq$4u>n``ttcg#w(eCmV1{zJ?7xgxKsoLel*{GwnT+!JjLDGiAiu{r783qma{WkqSGId> z46hvWf6sqc!&_WmKHo-;qaRY6{7Y~t_fljhKoPEszXN{B$7en}e%+KNF~V=<_zhtE zbjHc`L+tpL&kH&7iJsXjJ|5i!Jm$u;av9|Mhh&1GTwlgTz{|1YP2E$eMK&``gF@WS pWdG&%b2o6Z?|l9J(`ZHc`^#nW=gRrw81>Vyr%EPys#ROtYTMiP-u?g1V>V|dvz=vD@2&qk z;dA)z`JUhT?r(p4W_ISBGw0OKoHxr77nhxY=wCeQL2>LghHUow^XZhmIT3SNC>j3$ z7c>ARK{>7~+o>_g#AHkcTl|M<`8^4t`cJx=5;CHJG>_|n3kZFm=Fv1Ird~$hP)5EU z_BC}XE5B2TZYJ}Q>=b)H3-Hm@lk3Z$Z{Y{N0xT8n`%!NYx&`Xt_q)_T{9eR$>@faQ z;lOc}mQntg{eb@4ScK95h55O&^KL_^{%L1fL9y4FQBvqBFD&)m+xmEh)3eh1Hfn8t zI>WibSLG~qI9IGF^m-3^3Y`TGzsKb(gpz^+4{#2z+qt6fiIt@l%blf!Q}+%QYMg|egB8tV)sABBP#>lepZAI^68 z#CA#F;uBuSt%a3dIRtJ45_|}JG1jk%vHm&(_4j!li^2goBNpDc zlG_0UVNP!l+6*Wtg0MYSfgQ05>@*+%L9q7*VYdMVMG*GIDsUiHfu{@zJnnU@taQpJ zgfmACNGPK4T&w~eu?hqX2psS_N)%_tGeH9?ia@*)tH5iq3Y;+@06!2V{Hfub0S!eM z-i%e?Vypsh8xSC2;7$%b1~e35ct2Kwk75=0)PMk-94g$t@X6tG0}_fTd=aa_Ut$&b zy8!`+0(a4VHVzrk;KOh>E+J-x*%v_0}V(i12QCLh18f8MjB8! z>~&O@IErz%M1iYJK2Xlar5VstM&zoP6~@J^@O=Xc&_t>=0y5r!lrkWhF)L)ntT4@h z0tBRz2Bf@-Ya!VNq?7^4jagw%%nI`iD8RK=wc=XqY+RlJ8D%gQ#jLO-W`$)26yQRm zn!DaQ8<%fDL>Y>Lm=%g+Rwy-~Ktj=b&2=`;Z9qgB3SZ0$H8Cr!GoS#6NQLSUkqpQv zgV7kXLQBjFtp*gJT~sIzkv0P&%1~^NSz$-a3OfxbkWlc4$Zi87%24c!S>Zs;3Qt{% zg0az-1GwwG5_*7puhEXucfGWMp|f!v8r33pp2y&9iNe{S#=Ghc6`YNGO`}>wXW3u@ zoP6MRM{&fRqcWG&)vRo0Z^o=}F=mCgFGa!FH+NKSW)2ukEO3>t>}_s6^xZGDxqYNj zE#fyAgVVUV^>zAljdzct+5JVM+E?uKVG?OaK_MR63XiCrwIGeR*7w+wtWoXj@VKO} z#{)IqItq_dHL86b9#{AEI8EcNqwsi~Mzx3@vy;)&Uit0eE{gl*vy7)=#!pF>%$OCj zVpf=DKtU0b-cg*hmTUuJinFvO=37e3oS1zua!bjDq(sq5@?v&pQOpWUVpdpYKtT}_ zzLn$~5L332m~SvG#WDL}mf<%aI&Pf`);Nkvoj!-Z!c)S;Ng1Lu&VGJ_W9fnkPMFT-$(&qNR_t^X`3oGy1=n7e zrkj)$IWqMy-2`{gV zAAlO+L(B6-drSX!Ud4stnnNKxceJb1f`Xm#=u{_)O9~QiEJ^g^#3u#QFp-cH9FMV| zC71`04VaJf{VhQ}|5bv#1Q-Nt14PM|;BElvUo63+fI+~45H1PqgOEolmf$&nY``AC zAnO9`D9p0_3CdTbT zp0N!c@92=nn}T3c?gr!=+w7}Ke+`AYJG`GHVY_WAId;gkJ?xU&j=Soh-UfH&*gCg&EY_Cb$xH`?3rd)jm0{X(=SjCnx75$I&y{YHZmOSW zpC`?i=GX5ZVhMt)=p%;dLutMcFah`{SpL^6FQJz@r_}fFs*`e>us>58xz{b+>o)oI z6lrK^C~5PThb|{LJT#o3u*BY;eMe3^))`_SA`O*>)@M)2*(xlxx98kR`C;~9(&f_S zlwSt?U6fC?r%J=6;q_;r&13(+YYEhjwRF}>=s;%(JqgDYL;)S_l+aU95xTA&+f7A>I;Wx|oj7Kk2S_PWO8tNwiozsTHZiAtpnae; zNE%c>Fo&X$W^bP{DyKcio+D*n37-)#IVV}_FZHh%CPTd}`k80o?;3o`NvHV)dxDfG zCDt#z%HE!Ar}hie?d>vu>v!#8KW-gs55I2+ZR&)ONg=cu(1QAh`jhVi-fzPwC<~!2 ztb8jePYI=va`-$Orh=v&%0sA~*=!>=gF=Id4ScQ*Q{lbrP#!|f%w_|zNem?t8}NBU z2>c{*1sgfWSJ+>I_HFATe8}EJ@G6K4_+*FAoCCJxcxbk7A;(~2&K7b^v{t&{SRdO6 z$NyFCZ7HvNx4EurfaA0MK6>mbj-OfN_=%)bN6!y%?(mJR^Hq)B0Oy7uvvWiJ&PUQo zy*hg{{G&d=`w$Ao?#I%nH`%w~^Hm4R?Cxkn2%od?@t^Ll zN;y$=!X>#s5+(PikUod>1*E?~`a2|4E4dROB|{njX&|H_kWwLygp>y9DoEoXeIL?z zNSTnbAWegm4JjAW97yvZ5U0p#RD3l z?QiY+S2&;8ckXi4Kk6DCdUTfy!3pB^`1@;r6u+bP75H5EJHVe`a#y(!5-*;71N8)Z zU%wNA*AhNhY1z9tAMDM)hmXG#r%t(MN$Vsu@Rpt)WRXt1{IY9S2=dVGG4~GUPM@Y6 z{%ry7UCh1uD&^q!0(0*Hr|Huf&Zjd->C+jc^yv)h^$GicPl#&25;gq750t3=f=ju7 z=$Fx0fnUMP;I~!f7rv0d93QXXAj113qT$z24Znu<`Ze{6m+I`- z+3Vtv{RMkVlWP?^b?&9x5xN@A!yQdcarUbdy6jimo9v;P%~B{7dOQAPZ3DE&M)%|C z?#o)?oV^Lsrx(w{6;(3Dq%?}j<0x9uKvF&+nc}~sQ5-mq;$Yl1vCdxul_}4`&+K0V z-jwHVL!tAP`(5X~Pr5YQ+6AtyT~M{P3#ztup|`Eg4!8e!(*9>F+COm6{s|Xu|G+`} zCtSGw$7{F$H#wi)RQdF#%BMFupA_w%^DEr`Ilsc~pYto+{v-HhG**iC-|JUz`|tIu zxBd6})!Y6f_!UFEdfPweSGfIieudjV=U2G>$4h)0{smkm<2DR947XvxRNRIEN8mOL zI1;yEz)`pj1E%3N3^*otWV$^xOl=5A2By_ zIB*bGs7tfHcxiRU$eTQ&k5|PBN4Ym@zuBFlieF^DE z%6q#54LOO=ySdGka9!?o;pe82vuuLD#9AMrY@)x++7O{^ zlE1>*5}_>1UuoSKp=`3>Yuyx~Y>L0ix;aAGRKMTa7NKmKf3@}A2xT|;*IL^nluh@q zw{DA2Hp5?Q-4UTI+h1pWFhZH&Z?HZTq0H`YvOXN4EXTjWx;sKyu79KTu?S@|{hO_O zB9zVY-(%eyp=`E)tMx#HvN`^HtxrZMyV1YRdN4xST>pL6LlMeu@;_ib8lh~S|3T|B z5z6NKAGSUlp=^Qw5$mxCWjFhGSvw+><@q18o`_I(i+`{6WQ4MX{>QB^Mku@0zuy{+ zP`1ebq!s#MT=-fCQe~6Uh%LlFU#s0&j9A8RkmEZ0^LdxevC|}}#hLqnJq5Ka2 zF;YG^LitkvaZ-L$gz`K6FOYKh>DKgbng0YSho5oH^1J*ml5+S-*DSx=|8Jxmeu6d2 z9sVv-z9d3{zav7q)BiFlUmBr&x&JgNzcWJl3jeF5JU>Etf&X<Hi5SFOE=N?C&P!(Al9GKbQXkDPI+#e3kz#QeG0Fyu|-AQtpmW zUh4ljDX)xBUgrNLDfdJuFZcgDDfdPwukinhl-EQkcl+NV<*OrxQ|Ll>W=T>%j1wy;W_c-2@l%|(mg4kUirEbm?HefOZl*YEE5$k6 zD9(L=;=G3`F4#peZ!g7#`zbCuL~-#Eic5}BT>1jVWiL{^yNhD}%M_QtO0nP#ibX%6 zSbTxvs-ICT{UycnUr}_wN73^kV7$9Fj$ErqUAs~6>iXcRbliv5a|(SyumgRG`zN4m ztR0;_Ck5kC2SVK)AKT*W=tm@f3i78uvBjOLY`7eAryFobfz(C#xc+e+D6xA)eSBI+ zyr(uZ`{;$t2|c4G0!{?X0?Y!O0yqV58sId*>44J#vjMXK?SOW`T)Ig z6=w@u#5qEnc%#rR&K0(cHwpKP^MoDZeBmK+fv{7&S$I^;6LyQY2z$hZ!ang<;R$h( za6r6GI4CX_o)T{to)(t~N5wmYXT_z$bK;%C^Wrk0L%d5kDc&svM2FBR<_kg5DZC^u z7hVxp2tO1HgxADE;YVVTa7J7y{8%g&&WSGJytqnuQ!EjFDwYZt#WLX+V!80PSRwp} z=oWg!O5t76BfKwqg9ON z9;Eq@7C^cgQXZsRAT5M+E2Kq`ZiBQK((RCzK)M6cQb>0~S_bJZNOwbWK+1>YgtQ#e z3P=Tz3LzCiS_!Ebk_*x*NF|U;A(cTYhg1Q{4XF~62a*?3;Ny?1flq#84Se>gHSoLN zS_8lT+#2}fAFP2dzpw`W{3mPRKmTG4{OzySz<>SS8VLQjH4qnP3&h9U0*Q&XK)-&r zK>z->fX!wLq@>samtAHH3>st$eCu1bz|f(#z~z_Q0>g*f0^k0&EpWvZw!oEF+5)3T z+XCPDjxCU$ZVQYZYYTkud$z#USK9(V_<=2O%{8{bwb$AL*I#c7OqgH`Oqyg1OrC5D zOr2^A+;D>}Fk=RMcY-aDlVc0aoM{Wpo^1=NKw+URuyQ4QmNn28;Fv*s&S#cb zP-ga?^Ajs9XaeN3ynv9O;*Z{J{nRj&4&&)p0N`B_^n zXfovIYy*V++y^XZ3gqWK3<&uJyDTtnx95D`UO>n%+;4%Y5_`@sIs^#$#YZgY2FNct z1_=43FIdoY$S->l5b}3-Sj zWpd84%-vfla=R*sePd9bpd&An-TFH0`&imk=bBODFBJQh$`_Lh{ z40FTEfRpck8JAiKoP7VwxDi#r$@jmE8@U=d`Tm!2qt*i_-~Tc$tqwT({+DrMnt)^X zzu1>X;@{w;Y|cN-4J!kV^AB^WmB4ZSVQxefaGZab8@U=d&OgkJS`QrOALi2PfaCnb z+?XcdIRCZ-H^j9tDWCHXbHmDjgtj`I(5qt*k*`G>i* zI^a0}FgK!LYic)`K49M#*+*b)ZMNG?^RI1bUPkBca?EaG&pxD>eFjHO4TIy|*7J z6Mu%Uy3hA38rMjsety0Ez2UJwu5a4FSVtQ5_3Q2L4Uhlz`f6nSV;uGM>+SCij|pmSx5eY})Y_VH3q z*~d${zJ0ve>^F|@;}r;}?Blf)PT9wsLpWt0FT@poGnIY3PQoesc;WMbowAR21>wv- zUg)Pb`*^kc=*>P}?YNkIyuD*QW*@KF$BV}p%syW2K5ny*_d>VX$7}ZSnti-xAFtWR z`+vNTSHAy+K06=2N5_4<^8GL4NFT3!|I0Yi$1C6eGLH1|%J;vFBYnK`{V(H4AFq7> z%Q(`<%kF=1zhEQyGQoPU@jeY~81m?M3>oPU@jeY~81m?M3>oPU@jeY~81 zm?M3>oPYa(BYnJx^}#?)F-Q7%DW~k?rJS;lmvYKJ-hr$ShUt+$Udk!^cqyms#E;VGCH)U(uH71qTXEwulJhIYT)_C;r?6h8Itf^b?zCG@Lcr> z&ybkUYV;_cAqme_w@5~w)!0Cu)j0IkKdaGMnQ|PDb98!Bj>EGWf9|o1;q{;3wf)o2T(85kW%clE z*lwTS6+$n=vxy1NGU0it_fZeV7vD$kko>3UU6TKlK8F#4XD?C0fB|2i&${0^WsCa~ z@V(QvIGq3T)p?M5`H>_>VP{C9HL6nwF%0tSK4LjPLi4HCxo0 zAFz0Sz~cECi|1#T5YHCXXOBWtH=YJGb>nG3Q#YP!&0n*4{+h+}w=AB&y@YtSsp1)h zrfxh9XzIq(fTnId)tVo&cz(#@`8yWR-(5mHcc|hSg{E#i4QT4d(}1RKJk^>Xv3P#O z;`up?=jWFY&qh@|qtMiirvXjfcpA{uji*}kV;0YkSv-Hw;`#eah-a%Ro>6G(#?yeN zZafWW>c&&8`5P9`->`W8fyMI=mk`hGs(41BsT)rNn!52cps5>AwdN-*o}aLI{*lG= zkCzb7ovL_7p{W~B1Dd+=G@z*)PfiohANH|lrF)+@#m`OCgrEOM?EW{F=TA3}RLw<# zYxL*YHTv`H8vS{8jsCpWHM*sD9z-U-2ZuRI@^!n`bD;9IJf2tg3wTyme%22^cZtjJ z^}W_}j=1a_bIl)Awc#l4BXrxa0ZrXDY(P`D4XZVyxIWjthk8vF&nPr?<7q%sH=YJG zb>peljNnG3Q#YP!%_y$Vb>n$a70)O%b>nG3Q#YOlG_`r z^|@|5-&Dmj3QgU38qm~@rvXjfc&as{xIWj7=i91yMxm)2PXn5|@id^R8&6J?tZ6iu zjX|+BVaU1}Fn*-9UWV3~lDUS)|I`{9WInUGeg@2EHrLP4p0{hRpP?NWbNvkMxR~o_ znCoZYxy|PK8QOE3&Gj=bberpEnCoYl>t~qjXMEGw&ydHo{+ZX&FxSwaYiQs(%XGZa zTt7peziqCc@xQlz#v%2Z$(K5>6y~eL*t23SAwPqCK%ewJw zQ^hk1P2G4J(A15m0ZrX_sx_k+U)GK14pls((A15m0ZrX_8qm~@r&=?L@nzk3Hmc$o zg{E#i4QT4d(}1RKJk^>}j4$iPvsD$(C^U8BX+Tpqo(42^qotk}>OreaEcLwZ9f7rY-NC z zgZ8+nxxa&UT+IC)l>5J!`#YHXJK*tibAJcz@pE&3hYQ{2{to8;4(9$2=Kc76%%`*Y2Ged*r8=Dxmk|6y}qU+sQMb6;QWxS0F; z(&NV5*Vo+F7x&v1!sKx!uP8??e7@3+Hu8C|2W{f>RX((t&-<$p-uFKI{;O*c-s?V`U%Lk3{q4i~_3IGc z(>|Q96%pRYKAf+U5Z=2!oNuT{c)$8^zNr!6J?g{x4b2GeOCQc}Y(aQ0`fz^pCWQB& z59jY`MR?EoaDM9+g!h>b=kIMpcyIY|ep@@j`^ks%_iaab5BYHZf%_5OH$I$ya0kMB z#fS3`KZNl9@ZtO;I}zR!KAhk6D8l=|hx3o^MtJY{aDML|g!g+7=O5pP@E-5s{Qf5p z-q$^xfARpr$D{zqWSBbyH?Y9jdG7F2gu{EbhwY9$O}IHc_smhk;l0_zcE_G29Nv#T z%pHG@aCi^)F!#dqgv;Z(6CH%Zd##77kZ^eK z^swE_FA;7j&z*jSaCh?Dt3M=MKF_`W8sYG>oMF8;enhzCJa_gC;qad4VY{FFm~i-f z80Naq5f1Nr6Xq_QCmbH{3UhD0NjSWRd6@gzPYLJdxu0Jo9Nw!uZ1>Aw5DxE89_If2 zZNlL_$-~^Q{)2EeJoioy;qcz$VY~Ow87e~rw2#m#-i&3(nqeZ|dv z#m#-i&3(n`n3cJ&INNh3j*MB6Jzcg#zpJ^gINlRSwXZmgIc213j5*=8d;{K;=U^@0 z^OgHu=e2Oc%NrAAvC+bfJ66F^WrHpt$Tsig$NW%zv5U@>eMqyg{+(Clre> zP&A)yGM{ZSpKUUqZPI(T>7RMc%|QH`T65h^y6z^vH#47Yf@hZU$@tGm#oC)r!i|!- zH1pY}3nQvvOo;9YIReLJAB>fKmCrW4{s;PO(~d!^aVB%m$R4sUd@TFK2VpGf68Ci1 zT~Eq@rtW%D1~hfolTvF&u~w$;`dqI;IQe)+p{W~B1Dd+=G@z*)Pqk(gYh~)j^PDQ4 zQE2MM(}1RKJPl~-##5~s#afxV@w}*tXB3*c@id^R8&3n8y75$NMzL0=ZaiO6#WM;` z-FO<%)QzVBP2G5^HKSN7Q#YPxRPl^LQ#YOlG2I;1fBNUM z$vvyNXQa7jq!mA_rZrc^+%r-;F6N$*+Ho=Wj5PO*#Pe3nJtMW}t^EJ6XQVlA#+)}} z&YLmk&EWIAId8_CH#22I&!}nHM=wkRoDMi0FdHx%&<U6W-wwfZ zBYb%M#xhtFn0vMubE%cUS%_V#Y&W6`xOl>mc@@~sd^X#BHXF~cNvms5O3P!qYlm;|)<7q%sH=b(EDCW25#&d@%o>6G(#?yeNZafWW>c&&8 z8O8h--FP;t;u(deZafWW>c-Q6rfxjdno-Pe(T!)TDxOhj>c-Q6rfxh9XzIpOtr^Aq z7TtJmSH&|5P2G4J(A15m0ZrX_sx_mS-=Z7OovL_7p{W~B1Dd+=G@z*)PfpX^vzF~$ z@XxfajJ#hDTiaz=3ass7?pdojM`V(}!iv{g)|x+(<*&5jb(giuCi}fsyauyYnYnj? zc3jN83$){6?p3-GvXFPMdodn0pt%`60Y6zPWcn z*J*R_g6`}Fx^9bo1I66U6lZOvIAxiL-=Oakj8UoFlY}Hwx|I zTw%L-lW@N{PuL;O7akH92s=e{T^Vy-8NGF7%=t0j@cA)v9~?hFM(%@S9GM?u*O(uZ zqcJ}wpKxT|7CdKW)Owgl!>!wbxwJaqpdV7+lNxhlnt)^LI?L<33~>!k%I5sTdsh!D z1CH|#bE%cUasFX$L=|wHf0!G&8aU2B%#B(P9OobA%yngO{J!3GWmf&Rz6-DU#HY@9 zcWoT3>Af2TudWZCO2=~*C=mT0FWjh&PUF*w!=7O&(KD8RK8n6bi25=4F z8o+gc>i|VS5l{k@0P6wk0UH4u0h6F-1+WdU4X_=s9dJ9~ zcEI}q?+4rgxC8JZz=r^L0`3HS6!1~N-GI9R_Wt01p5j06YkI5b!C$ zrvRS@d>Zg5;8DP50iOj7SzdxQw?d(iUPsm*F;vzpVY=Csy)N$u7!yEPhGYyV7b)-s#5oR*){ zTKAvSZXL5*r=hj}Z`5Wzvsuq+)o@zke^EP;*@+rj@MV$jrTfYYEYquN})R#_Hy3oLNJsrL{X}A2kCQib~N@IT-zeX0nMvWLZ*{DquvuWaDypH?6 znp3Df(Asbrwb{UIHgH<&IjxpK)UJiuwPlX(+YZ#OyX{ zXl=fn+H7Vvn>npoPOEh|wQFT|tr}YQe4EQTrlmM; zMAmNC^=-H7pq;Mc+N}tAQQhuqS^f2W>%&H}-Rrsf@RQPAtFB+q>Pvm=OJFZ?_3I&D zudd&~>euzHUkCPeT>S>fH$eTMe5|dDlS1%1AFc|>aVXJaZ~OOX47alwZr6zOeOFVP z`Pa9T~A){bkb-415ALqqGq>#5Cy%;rH( ztC`b!Xaco+h}k`)q4n@2YV$C&d6?7Mz-jHAOzn0uyPX(Lvi z-J{IzQ4OtKGpNljX0waa+Q@0`7O35BX17~I>#-ba^BA*vjMLi0Y3-Rw?e;LcJsMhj zXH%QK%w{jAwV7!tj=g5qb{qS)U6`KM-N?1wX2>_I+x`Ytzo~EiCa`bf>TiJj26g>S ztbR-1`Y;`~yM?R23G$oN^*6Kn8~fJZ2=*Jf`kNuY8S3NX4UTJiyfxC}Z?~Lpp~v68 z8)+Q(u{iG2i1p()QJcq^&Es6GTe<(yXJ$Wm&d6L;Y$!Trjv<}`z?G7@#gBn_gZl^Yfn9U(hYb&Sq)E(6BDQ5SShSuRb zsm)<#bC}a=i`qTS?4H)pI^v)!_339c6Y$HME{tL2aI4HqUTc z?VQ%Lh1Bj@X7{Xy*0Gh;<`}a%#%XP1T8iUu3v0ivecLZgXY6j}+V2*~Z&A1Zt*rh% zee2%?_V;l0w?clax_&#W-`2N2OrP#<@eB-}?80{k>fMZIIu_j=!LM zytUHfuSd?e#YqUYb|F;Pg}1xIe*^!OqKyJdWthltCo8wIyu|R!{|;5)mrGcA0^`@Q zJj;e*o~yj1&@&}Xn4glPOU)s29N-N4&7J3|Rr?0r&Syt%qRycgcPM^c$ zF7rB8ILjU0D!04B;{%HoCC-(u@|93t=qM;$UbWKUDle*V6jgW}&eBpxh1cQsR1~>N z3%%EPJu9v$Ev+v50WhlySNKRKttfMR)}*3|%QLUbDjHunv1r1i35Cm@E3%wf&Pms0 z7C4t*d)@T~<0lq=FYI=o7jcB?hhCYXEE>T0C2%vt|6R`bb#r(Yp)rhK6jbs*VEk*p zQ1V%f|2teH@qc(YYk^ z{N)dh?}yMdQZfABVR$tNC86xADU~gkWRzF<3QT>VuGL*5hyegyE=`BX=f6V(l zD8o}x;Pg3BhOe;3N35M?t`$HRmU=3n2^17S13(!oD$2?V%fSG&e9q;7#Za5RqKHbJ z%L_{#%MtSs4f z)~v}{$gy<61gFba=*gU1R95VC6!{Ar#Rb=1m*psOXF)Tr@;XZ$l_hSUyVy|;X{D#4 z%I#QLR5N+P_?fZQ&L}BBj@&yI2=i~GjX{pMA;jM@e5=SJ*Op%}95_xcpeP^rGK=ft zbBSC{KEKE)9|rjRg5N6{68@9VJD0)hKZQaSa((&yBI8xWx>sMW&uBCpUSMsxzI;BC zF^w6@<+A@WUJ2#67f~*k&wnyXtiBA&qXMC^P=GnPe)u?09}i3*o1qNXzzbYoK0k+# zukTZf)XP-wsLD=Yb@6AwfAVpg%8qL{rAdtNOF4c689$wIa{U%|+{@>S8aYtRUiJ29 zKJb_u&&p+x>mQK`hH`xw7XvTHjyKJkMlJG~VG0!DbFS>a+dE!x@3-g!e;)pRv3?%)2BDjv9{#-EzVK&}cw&d~ zONC#JrL>H4?E3=!4wR(<3OD4;S$GRVwNJT9@(O*f^rC!kX@0Tq?&e3+UEWo`w^4KB zQ|Ydi{t8#I)3tJCzR!2aoA1hVie9%rA4>A_yudkq9@onJ$5s`Wt#B1TnqKNE$$!-2 z_d3fCq?Z*G`11Ye(R81CZGQ6M^h#H8MZUAhU7GhKHFvJ4C@9GHHqdJOJgz*sQnE&+ zgDciJiG-}=U*pN&=e}lw+Rev_n`NcNYjnLVmp^cT#s}(pnW|CgoA9zo_A*W519iO| zr%~yf@Ul|&a=gX|>Ux=}Q7L9GV{DiQaKkM2<;#v{X?&)xqjNPXMRk-l?Z$63EB&id;O)Zg!OE{+7?Ou&EwAAvLV@d`A>D*%5l zd)*Eo2y?rG&|*MA5riG_3hazmV3z>_2!f+K2zv}DD1xv*UV(%03Or#z;8CA*Rk=$( zA)GmCKtd6PXX6!Ui&r3MK;WRyS)@2Ko(UOHQ3T@Ucm-aKSKy2R0r&$^#Ge|@8PHIK z;f;6&F2*bHwgCYW2JYn0Wk5p_hWFzY_$Xe1PYnpb$)U{SkDMGnHz1*i!WZ!hd=;<2 z-wg;r6u67_vle7PgAc=5OJdv#$#E<6H=ytYoE<9F*U@J!0}Mzh12Q;nh19qeMi@|d z(&sEMau(uli9C0Se4w1Qq#4jsM&ydP6~@J_@FN2X&_pUV0y5r!lrkV0aVuoTtuWny z0tBR-2BfrtYav+%q?7^4iCbZA+zRszD8RK=rQ%xatYx7A8D%gQ$E~n5ZiVFr6yQRm zlDpnIYsobrq6|e|+zN$pD-;`0Aff2K<~nQf7!Xm0!XLLnRon{e4Jg1NQl>gY0tRH1 z!Kja0p($>KW&;Y)F3J>#NQ(gxWhi#Ut*|q0g|NGSM2WRC$6WhnN?t#B}Ig(rHW zU~IJIAntlEhaTYWYqX>EQ+qcsbk@?QQ7vlcc^u9bDVz;ye5&qH!CA|z8r7mY%LWVJ z+je@aX?x@_%oG_S}=Pq5<-Q2q9r}wtGeWXz> z>Mt0F)3~|yboz6RPmiJ5eWg+DYj*lc5@}~%J|5bNjHsQpB8`vM_t=`OQSF=XxTvSc z12jH529HxUs(lk4SN8NcP2;0u@OYd?wWuDmlhIQ?`Qza(iu=oF9Z$uKpOUN@aVuoT ztuWnyf+8f{qc~@+Sq8)uXK8EPx0KeoarjN6^XaVsp1TVc5Y1w}~s zR+4K#Oxa4}zQME>#_fZV8%!=Fm5Ns4iQ652+zM53E37x5pa==yN&*JNl&vK0+f8dz z+&=W`cJoB~DsNeZ2fDXcmE@QDodxcy{5&TfTK^fIK=>S)pawJ3A_lHv_o~w5<2`s> z<*c=(Exp85F-O!b5@1%KJg<_KPXHWpa|n~wGb6lzaIr`nOFPl$XIYo8FA2Aj|)l!l4KKB4g#`&vT_0kQydalW55gy+8o zARhn>0k#05WNT;-K;S#p&{4n;-~b4B0PI7MM=93OIe;v{F2E4*3voH*a{)tu9>6T% z1AqwXx8OYB9-L3=6GG6a0)PW>jR(vE%);e>3jsrbxw!mN(8c9|0l>}3 znzq@Rm=;b<{6QFg+pyJ25g~$&s83o4yiZ74FYQL2v9+G^Q4PpDs=@0U)8@O&U5`AY z>OJLS>VT{9`o^~Twz(S^w;OrK)_HxS!d_nrf=RagVbvJ4n>uB2Ko>Al2<_J4lQ5r&yfFm%YCc80vQ})(u2jp5FbO&0FyKA7{Mo;kp7IpjI8T&6k8Q>_LB8$z=ZTNlwgqfYKpx&le8TzgbI-dGh>*{& zceLWqbY#Qlh3B+ly}+C~ff;q+M>x9`&y5ae&kbPvuyu1al!mKA%^d;sV*AwEejPP| z>;_zSYCZS9iF@B7zn>Zy5*|X@-=*P82@VSnBPc9&v}WCw-HLSvI|c`a1cuaRP0ijW zEOWGG-%k0Vj-i1|1D8^MIq-K-KGl&L7#0{-dj{Gcga!S;8mt~`ZLbcXgY9ATIJ9r@ z7#(U4peG>zB+Eb5ehM80?&G-Ea(zt%u|43eWbG*L>UESJXkfZ@q^e6N+{@o3zVlIyLmT zgJaS^w#HvEY9kzTKV!#S?XHK$ka{(aM)*g4fX^X5Y%Pc5#yhqae30ux{W0g{x_cfO zGo!(=6`u#%P)284148&*fRC#&o{E$c6(`&Q&qq?g^C_gyA$E3X)vTzNFyMnLAnCcI7mN&G#*k0q)bTDA!R|zfixG=d`Jr+Erzrd(sD?-kn*a- z)&Mys1D-0#uZMR3a{G+>#13cw`VI@)*6}a3j;)SeyWO=9yGMl|-t9(-2)&kYZ}m?T zc2>U(-wXc+`1=Pu6>fy2izi=4U7_yx?}Xuf03TSi>|LA>b?4v1$I6LQr`)px>jP-O zO%eVTIkoDAH%n0w_F%E5UUbMFDC>C+j`r!z?D(;1}n z=?v=j3Ht!ww})Rz8h+siG1Pv+rO`k1%V?~?uh1p%HwiwEj~D&}j5$7DA-G1t93QXH zAj*9c@d|xc!>_>_ehty^YiPG$S6b_41~ksw-ZiQ*FjsW_dGhrb4fG&dEE@@ z(~D=}DkGUp}QT;iL~tm(zfR)+BR^|wh0$$+rUBFCS0U#Cuq0r zH#nc(Q2F$R%BMFtpA>DI^DEM}Ilm%poAWEuwxjrEG**hX-R)O*+wS(OyKQ&-)!nwE z_!URIy4yDASEOxoenr|g=U1d{Cj|I5{40b8<2DR96t`i(RNRIEhvPO3I0Cm}z>&BO z1E%3N3^+PxR$|8|m(_PvztY^1f&O}7a6hY+wC6*tJ%_v7GqRF8gJs*HDd!Wv);5mY zQx-knb)oL~Ac1o4U`}r54$8fYIk}x@Q|>+BlBuslWnWVVM)5U{-xB3$_&Q6&*V!7r z&gu3wjry7@`#O9`6kp>x9&V)PMLw?hriz~HyW<+ViE?~gLyIVf<2sJ|I$ZX3#IPv7 z#u3-h%^JQg*6O4 zqsN?297nm)n9E_u2ukZd7BDw7Wl&NwuTV%{ zP!by7?JMR+UIu!UBmQA7Z4~HHj`)YU(PKc5a>T#Iz~N&Ujqdgjb3;=WB_&gi_=maF zfs2yTC`bIm-0&fblEzVv_=mX>!xkl>3Elo-ZscX4M>*mj=F&!i9_5ICm>WF?^e9LC z%L9)1$Nv8Q8qQrfu9zR1(wc-UpI!ux_=vgGfxuY_M|{NG@FBn@5RUkWxe>#FgSbLn z@_S=$2{$?k1=zL0Hi8qH%H44PJq869zTJif zpTy_goW^pv=JvVqH7n*%prJ2d{-h|{Geoxy?sYZGCW=M2+9+j{#1dOwl(NZUnXM^G zS*BQS+Z3g2is-X#j#4&Ntgvl~QZ`K#Z7osCri*KAcSk9^R$OOmjZ!v4++f=trEI2H zZQB{8EK97h-5;e)5bJCYL@9HK4YmiPlx2$>ZF{1W<%pYXk3=b(C2q0pjZ!vSyvw#P zO4%H7o9$qfvbo~jw#TECT_|;(fNGQOf3v_uHP1Qg(y*pzWC` zWedcIY{#OM-6-z1wM8jgC_Z945vA-Vai8sEl(I$QqqY~Kl-(>Iu!W+OEfycQL7#~m zUovPNpSOsINjbiJ&?;XdK1s^)rG!@bt>O_%8SHbl5$U!@?!BVQeGaVyhQvp zDfdPxFBSidl>4HTmx;e2om0d`*<{a`8Ph>)`m`v!Tn=hr_dJ!+t8P|e*((JI?&m3flvZ!L#VUu zW4pzHeoFGEAb;u;yX91Q-KCg2U57gg0v&|6^s}^~q|V{B32AK!-s+63qZcwJc8#0_ zI0-NlFcWYp;8eirfYSkI0L}o+0?Yz*06GA30CNCm1I`AV3pf{W9^gE{`GE5Q7XU5* zTnM-ja1r1lz{P-z0ha(S0bB~W6mS{fGQj14%K`5Myb~}NFc)wI;0nMzz&yYLzyiQR zz(T;)fU5zE0gC}k0ZRcrfF3|EpcfGL^JEBx(nMjkG)X9yCJUufrr?pL2wrKb;FqQe zmC|&fO1f59E6otrOEU#Y$`S&SAk<0@p@sZ@AdDii))@(5j0x$v&!72cP8 z!f&Ne-I^rQPHJd72}1h`LOVH?5U3trwZKBOBUEr4_*q=k@f zg0u+I&5#yDx&_h_NVh^-3h6dT%OKqjX*r}jAl(Vc2`Lwn3(^WmDtAfa|MV84F$pxtf{rli<| zmt0~G4jgC?e)qff;E*Bq;H8(^gTsc|gWvm}J$Tt=_Tc50+k>M<*@NHzzCAc*j6FDZ ztUdU{AKHUgUTF{h_{a9(Rae=AS6^)pUUQ8-IB}vqIC-)?IAw}GIBl9ecz1UUKrmvO1( zz{&T&j2m77oP7VwxDjiBlka~SH*y1T^8GL4(rSQ{?|&ILx&b(L|BHR8C;koUlg0Um zxuGS%asFX0wH!FkKgb;y9OvH-;0C)F z^~vS@!`#pk;5h#w|%qVs2!q+Ar7W^VJx`K#uNyqzwEWzV1HXuW4K( znfm$7_V-4{@u0sE#uv%hih;3?H0tX&+us`*+w1n#$oR)N>gzY#-y0e4<9szRCW01q z_fg`uiu-$$DW~k?rJS;lmvYKJ-rVlCpF`R{^vP0A*~d#cWgjo)lzqIOZvU_z>EorG zvX7T?%06DoDf@U=bo;lM_y>KmlvDQcQcl^&OF3m9ueaMjtVjBIDW~k?rJS;lmvTM( zc(d5wIKGcpAe^#~*FiXCA8$6{lzqGqSNNN$?BjJ2PT9u`-w*7ReY`6PXZGop=|79HM%o`7Zs=F#{KFjSib{KFjSib{KFjSib{KFjS zEq@6!yM`3<^01O>Eq@6!yM`3<^01O>Eq@6!yM`3<^01O>Eq@6 z+YcP+<3+3w24adi(#K0VWgjo)lzqIEQ}*!=V0|!5kM!|UPT9vxIb|O&~EGo z>3-99#QkQ))9yFLV{SBHdDk!APYhuEg`934hbJfH9zrcIPr=PiBgJ;5O;hC;Ie$gF9 zFTwMFiO}lcIj8qg7seOfNAHmQr|4ah|CBys5r*d*QDXo8U!c!A-#KNsdbQn^Y^`oXNxMHF=*<>(}1RK zJPl~-##62NA&cjSES`U0@%%$C;<-~5&logy<7q%sH=YJGb>pel{D{T#BNorkSv)`Q zMLg?O@r*%JH=YJGb>nG3Q#YP!&5v0;KW6d#Ba7!BdlAoORXk(R)QzVBP2G4J(A15m zTJv`-p1))9{1c1kpL!9`9jbW7ps5>A1Dd+=G@z*)PqpSJES{gRc>bBi^UuAA=Pp$| zW6;!%rvXjfcpA{ujVGsx=LY-PbI{$-lHzBgX~NI#BX(~a&$FdlMyTc)!8Q7G>>B+! zc8&fVyGDOb>l)qKJ;xyf-(SQ0Bl)^r>lsh^S{~1t`xQI~D?i7FpQ*%U`1)S!8Ae?8 zt-0n8soHQ1_Yt~n*np;P8#bV++lJMeFAwPp<0=eqH{sETI{n!52cps5>A1Dd+=RBOg?eXbkN zmsRnMK~p!L1~hf!X+Tpqo@&h)uFrMjc}5k_7&LX`X+Tpqo(42^X{IS_?yKyvST1<9};?3^G^QTpI)CDw}I#XwS(t*T&F}i@7$2c3jN0G0e3w@VsSn zZ4B*s%jVh`7dp+gG0e3w%(XGhwK2Z!Yh%b`TK~*zWSHw?(DgC!d}KP_Xs(SR&&@X1 z#`u4?HpXG~ddR(h{5^HDefG*6q`7QpGa{P2E0i1Dd+=G@z*)Pqk(YpeljA49PH=a9G@r*%JH=YJGb>nG3 zQ#YP!%^1d)b>q2970(znb>nG3Q#YOlGw4k68=(Jzth)_yG1v8?Yl4xz8@{G>y$*d|+mUl+V#n(vRnJx9 zJ*dq+doP~U+pjtY`c=uE=iPe^pjnht>@RHY*{j^+UMH^Rp1o{8hw-FOR=J-8<&^t5 zP)@m@!|3jLBe*};+_RVN3vBM$OZOHw_w3c~r!@EM)sBm~XD>Z&%sqR}J$rG#Z6RFd zX^o4Sd-i_wd-moumb(M+8=PpU+knSvPs6WK!{>`u!@9&=c}X#<uM3+i$0QXs7HAJ`AB|aBf@*mNAjDR5Z-4#lHal!;l1S}`Ma7C-cLS~ z-?kOuJ>(<#yIT<6H$IZz-iq*E@sa#JI}qL&JZg!g-o(t0KZx)# zDZnuq;SR$MEN~8zUvXYW6ux{@3kJ`jz3E{yuW&c zd;U4XE#$cqZG^-7s7LHxI7v9XcY1{T*C65We(4deqn&U|dG1t*aCl$zh}}ys5^fpK zoqm~cxAWX9KOtN$&%O34;qbGZ5xv)cO1Kp~clHe7@P6hIyPyAzaQJ)};X2O|4)19b z;Vzsf93JnAaBsdrIJ`f3g!|<$28ToHPIxWfpfBZFSj+c(`2qKN-{Wp` z&+z7s6Q@qm{lOFALR3B86xnkI#-L80rgo7%XMlU<70UIcb>Bk?0dvoAy2iI0*O0kq zxN?liL&lDjV@#A&jxo{a1IV+wK!DCWLIam6bX^IoS|@Nhna?(v z&o=2j+w{-8=4LQqZMC`XCS7+E-n$X70(znb>nG3Q#YOlGp ztd*%7&oioc#-OPiPXn5|@id^R8&9=n3~OcT#`6tTJY&$*ji&)k-FO<%)QzWFGlsP? zb>sQADxNWD>c-Q6rfxh9XzIq3(=_*tZ0^AGWCr)MTG2oK^V#H{)!Z}E+%wXKpH1DP2g|&n!qqO0@l7AjORx9@%oJ=uqH6~Y%%6i z%Ym~JyHwe3cm;3?gd_7Ru$}pAw)t!}o?nwz)7mF(A$zuy&AHKfHv9GDo^xi7<7czm zeJRJ`+3dfySEL-r>jSroDaXSo{AT%+?l*l$+;3Jq?S4}{ruodaxo53v49MKG_Tmd4 zpm*?Gq?4bacgdWj&**d6Vf13V?wDGWDxP1*oEZa}x?^evGnG3 zQ#YOlGiEVtoxm$k~Kh&~%$gITN0+`B+KF6Q0^+Ho=WE-?2l z!0X01MRV^0JT6-ZmsVQmRdeqGbMJ!F=H3P7-UV=eh^&im?p@Gv+T6RKGpmlS+v3fpb!E)?G1G)fX}VA)T`R1WW(e!0 znSvx`2?0qEY9)tIFJ%jjQjX9h%@Q_CvxR19j<8jlE3`=039ZsRVTW|RaIZ99*eTs0 zJRmI)c1h;CGUmE6dh5!V^JBi{^JC;bIDUSN+y}=vGC#(lF+V0-V}497;mEozc+Sko z4KR;}Tek&sX*Iw>Kcu`THReV)0LRvKme+R~>>ku7i}MffT|Kk}IL<%JrIrK7`G>jT z6~J-+VQ$14;5h#aYDZrXY(rn-K71$}>p*AE1wsj^4WZ7qkL?x*`YFk`L%#hJyQRH+Lqhv3e>LIP zC%{#BpyMQzowW3`eBWPTKbcTj?azApg1;gmwGyxrunMpWa4q0k!1aLZ0VO~QFaQ_; ztOcwEtOu+IYy@lsYyxZo+zhxGuoKj41A#{eG#JP3FY@DSi3z$XBo0DKDYDZrzEM**J! zdd(Qu{6?LIkJN=`QtX(MF=|8otFOGHBX}hJx zQ$7mYRD42hDws_L*qmTCeq=d+;-&ZcV7toCs9hzqtJKgE|3Ga-W+QT16`WSpAE{jx zv#Zk3TJvXWvxeEM;j}6_t+iiLyS2=2t%laRzfzlZ%w`>@C30Hp|AX4CXLjo~v^M-F zwb{UIHgH;1oR;)o)J|e{l7<%iSmgJqW;WHF)*4PLU`Zf$0cID_(5gwGHZ{zqhSOTh zY1JlCyIN*ftD#lbm)g`Zn>tQw9n(@A+m-Ct_V+xt{m_>D+_7B=`ARsp74=0{zoKXT z3b3!>>Wh#U)%Dl1`c*yaSAl&MS0DDX>s+g@zmC;k)3ZM8iPyP?s}FnMb*_W@fs^HK zG+=#Ko8J=HBfqb*1n{v^-;c(xp2e?TBgPGOYSX}M8n_s*=YFro6l&MV>>4$+HeNz) zHZq%yoYn?Tt7#y$Yhrdy8d{saOKmnWn@yaS#A$6FLhUv)yUiL}TP~$GTbRujPOF;J zY92=Inwed*hSpu*qc(Rjo4Yuz0H?L}GHSP#*=^O(+IBg$*~Vt zG_>yiKDD`<+1$-()pA;`W2jv#vuoAR+CG-rY-cvxIjuUTr8sUR)^6ALY`5#7ov!EF ztps^V-R`Sd{S7_q!$z{58@T%LC#ADmUB8yq5A>`b0Q&$}zZUYf>iTu8eofE%HDF)E z)vtqm9n=rW$J%;JAPnzw;i`ZfhXH!*?f4;$;SLtV9U5`I=SpgG53{+4i*r5qo89|k zYIiTQyH`W&zN@Iseaz-QPOE{_+Icm#+sW*9YG~bm4Yj$S+1$@*HF8=HOr&-XFuMmd zv>u#HZ60Jc4{}-?Ijvn&sNF7Rw@X9op=s3SA!hRsr`5!1J$x;-dzjfhtf94gCbikk zY<6>6n>ejK0=3)2?DlAAJ(5jr9$_|*a9W!=t-Z6T-Ckz5S3_&x9BQ+V+3e%AwlFQl zvDe7jZhg$$eu2>C{J+uz9QH}tIE0QL=B{f&^{sII@6)oWm{uWk$Q_uRFzv3~S= zYV#^O!YIA_u9N@I>; zb6Q(DtwXm^yF<+GkcQUbTdB=qW^Po^+gSa(de*-S?C;|0Z-e|c zb^TUWzoln=m_FUv!qsnue5<=rm z?m(!f18;YSUtj%@qMZUtWthaU{$J=D8_O3k{PMrUW%z9tD^Fy69?P?A1m?R-i}Jlw z(}Wvx(&o?4Nt-Y}9sZ?_yQ;XXbXC6B>2diBOI;=TPG6bRU+D5Xy`B=EbET`)>8tQ~ z%DjHCSXty+mCw8Ks^a3x zk{<)JI)9~~WYP*U#%E40n6x5eLT17E{7D59Cr`{@;aZvL%5+VhkdfzFarK02^2Sff zXEBL5&%Lpb_kB?@-Y}bc^(XK9qLw)v32!fDd=5mNe~o5*+gm(~(2p7aDO}v}uS~|L zK@SQ4!uzr#)E6zjlzJwYEns}rR3(r1roi@1D*iQ0Ta!`pGNpYU(1Q~CA~YQvbieL_ zw*wKhqAMwvB^RZamihBhdfu8+-prLQb2DakJd1JLrjRsa@4ZTgNPDt4{NFLqutc~UleJKiy6 zJbRmYO-2TLJ0-`F$=+s6oF&^jGAGEkIgW|4?d(~TW!u@aGBc5L*@B5Kw?E%|&7^{o zLYK2Z%ySmzT|FVwS>VZZmKS;aoDF@Ry z;$IPdHH23hColO8!>`851r+7uS7vctd`^(7$>#|f=Q2Zlp1_}#%<}U2;}XdKB^)l3 z>&xc}8LwdWa=Bce(P$XF!P;`Ud|r_;jTy@2vi~w(4&}HvP%f9xV=~I;F(yO4gZv&N z4iElba(%g-%a{da{40yk!27Ep;rjAbW^WsvJ1kqL%!eHoVkABi1j zJ8l}a$Yh2oP=?!?{QJx8=T6{c-?{22cRH;oe}B15{#-d<7^VI>R{y+g$WUGXe*h%# BBiR4| literal 0 HcmV?d00001 diff --git a/hsa/gfx950/fmha_v3_fwd/fwd_hd128_fp8_qkptph_vph_group.co b/hsa/gfx950/fmha_v3_fwd/fwd_hd128_fp8_qkptph_vph_group.co new file mode 100755 index 0000000000000000000000000000000000000000..a3d3403872871925a5f0486cd6a4a5d66900da2c GIT binary patch literal 82464 zcmeHw3w#vSz5mJPLD(e~C}5z7fto_EH6jKG4m@9#Wjb9OS@S!T8W`oFV$ z4&OcB^E==D?QhRaX3jZtPVKDuvrTbv*$Ifg;!zKZWA8C!v-i7aQ1<3T%w?fu`2Ra- z07`;#TvxVJVt|Rsm<+b~g=zUS38DB;x|$L)qJcDz>wyah{fOq#^l&`)9r}SX^7XK< z=ccmqrQzseGUJIjWZ37K;B7SZ4k68=et@5kMJNqWSdcp>?>2<$pSG737JKX&B@TDFqttV6>*E=A_e#&(sI~d& z4EqXimA%wvU$MgB@f>nH?1eU;+v#;cNnxQIIGe|1U*Xulvb18kz4Y;na(kKMahKO^ zt2mHRQB>q{c+uk-9_JcI^5KkXduf%!R^lu#e2SXemRA)OIowUO+FqBvP_C4$Qt9CG z)ixp_D|uJD9DAMDPf)t~1aY&XymYmumzDAd4pjL-O)pbbDt!}PmdIYFseGWOm*Z3_ zeG^_*%U+IG`9MuCvs5ZY?PZh=^8jv`r5=auXtv5{YC1YsrBXykS<~L{Sz#}=yXDW# zqY?>nL&k@MmIQxvEQ+!Ik{IhR(^3BkI8!((ZKaNKISK_jRQM& z`W5$kY_~cpJ#q+KIwbfIcw?+z6Jz~#I_mH9*cOEXa7NOhz(?RrW2^!#u?oQ7%S&zt z5QMqCL1@#V5Eg{(u?p;nRbZzM0SJP%Hwe3RD1-%JU#tQLV-V*nK*GSC9C~zUgoWY#SOq?gRp2un0&sGuaCt)~hc9$Ughk=YSOxwXtH6Kh z5P&Ff7wu=`kPZz#3}@pKVpd3wSz&+6$kq!lDBGoDZ8LvYsJRq4dD`ds2FkOcN z1f-G%q`Zo2A=x^l!UK{Uv%=h%73S+ufNQPluxqWead|pq!h^9WW`!j&D=gEY02dn7 z-1XMkxB?v_;h`vuS)n*)g;E^~Bow{ZTxa84Iz+-l;f-0LCT4|oIuzg#sZbmuk`9^h zU^K?8&=Rvks}2Qd7ZqWLNSh9k@K9`zSz$-a3OjWukWlc4$Zj1X;i1?Uv%Itq`+sZ@*TF*_MO?U6qo?xMKAe5UbKO#dm#lo_)^ zR?G_1btr^|q<0kOtSMWESlC(G6!R^mX>QCu=((ljLQ)deO7dcMXHm=wOJY`7rb8ht zBz!9=&>=3y}h!`QSP-BIcpq+HaxWcQ#^t2c``u_W~PM5zCQix_opJWdNw%d6C)!~;n>%wtQCYFwR^%(R6&GGN zAO^r#0pf!xiGH2;Vh_pNGpKUjP^Y zbOB}qF99N`--h#myKz1-DS)66Nq_@!V@d))F%B~Y#sg*pX5(_eJiq{80WQBBba6SL zgv$pf1&(69?c4M8e8wG9NmQ6qnq5G^bXHG&PL=K-RP=JZvd{&?HSwQ+3IX!+%DuE+u-($4!S)l zC;_1j$UC;#Tb2GA3U+sRK25@Q^{x%+Gj?0sccGTC)|QRV8FkjJ)}XZ=r6FXMtkR6S zoaUU3Ia_k9kZXI`DYYGU)F3(u3Ey35R1HXc<$`suAPE@{tZN|1hzT<@# zh>zH|4Qx+99^Qw1!ujCyFE|m1kk4+kw&TyV=D_C#=d@!zY0eyJMg#Z}%xTASr-M0j zC2SuwZK{RRU~QnaOQ^SQS!i7--7MW)pJ&aJZjo-OpJ$yX-6Y*qKi@iES|BZ`KQP1; z09Vn+4AY0wyaO-+`28&ZAs+k#1l7q9k)O@$U{c)+@|iai|HJU0v5mldCr9@>r%F>X_M) zm?fS_%oInUhdYId@O{AloXni|0oDOhij-16AcvwbnUzh-X&-1EC=HSZ)ep>}D5P22 zXO7Bg&#~r6*;m4M1We9JmikNm>xC&$FN=QXnfSK`UvkoEKEawGB}$3)`Bz!nldaS~ zKi%3c^SA!M8uH`TvDVP%2GOQY2$>W_n*mLzf3QFKJ>c^;jDoTt+QQ1WlJb;b3Mq&0 zvtb%&TA@6M+L_HZVlyZ>h}gjQ+As}1%L?T|)XZ!)5SzqcBC!FVHw3{?5?8R1V|<16 zC1~H4F2V<`O$4ukxPVVq_|7?COOA(T>lShhHs)+0$3$zT6OQ$)=>=rE^AOLYHkoBeeakUH<~-6YI`h z&iY54ql1s`aw0fEydHml?N8!&)V>1W3;zcA`%A7WCqm-IlW(A&K=1o^g799#2P-Xm z7v}@L`S1lcKpfO z254)IuE*0|m$kzAcoU@0E}n&}rDTdpX%v&kQ8c50qa`2fMarJC3JmyMPpa(tF2v`=r0$B^f#GEdp^Y4bFjBP zBNM4JM7AB4az5c}ZR5B-Wz+Lr59*B%5-9f$=Hzy6rQExiliPU?<=z7>nff|R_BC}- z1YcwLEm5wDud`Hqovq^QoL*nksIRHAuOo&=@HLj>;bwYX5wfo%hez-=hPVc9Rq=I^im&htB=$ed*TucQqQRuSj+A{Jb;bDv{u(5i zmXDIl(?*|9NTXbu%#BGupD>PcV=$M?juDjBdn{mXSjynUWXh2?hq=^2gA>yzM}Aw( zjTkyOaUA7H3^6xy_~1k|zSmdGjk*H#C`bImT-s>Rqa5)Mb7Rs$k8;GnMZn=>7>()m z4|Bs(@)MIONBqNF>Y)6@G|CbGFgIdoe&RUF5&tkZa(I3sn$YVX=0;rsdXyvnVJ>Yn z=uwXNhq*E7phr34Umw^{9R!?-aKuNW49IEX9MCBHZ3MqL5iXu=U6F_$(PIQV{GM|{NGm~`O2Pq;CONMhFpTM15R zsdPd=K{}H1-tIs{PU7=!ZgVAEb9Z>wuj!-tu=QFoOD4XtEZN4``*$uw6 z=Jp6>GkojK+ai?B^wpYoL@3Ml)tMiRP$u{q%nwB5C+h~3) zLfI_eX7ip1WwU+vnD<5~o8#MRJ{X~FuJ2y+6A{X8^ldXAicmJscc1xigtD7_515Ze zD4Xwl(ELn6|pK;alyL>N_a`=<3T7I|h zUr9Oq305t)`MOB?k_hDmzEh<9jtJ#;-^-+YX@v6SzSE@q&Isize6N!7f(Ye>zSl{) zJwmy|_Xa6n9-+L*cb1f|h)}-L_cKyn9HG3}*G@$u@%6Q*(1q^IlB9kZCsk0)@=%=Wqd0vn#Tm5}vl}Q{H&D#o zOmX&BigUM7oc935`43ZExQk-mUW)k#C@wloaq$s~OO8=o`U1seFH*d_i(#W$LfsvoSmLbcCnSFg@~1wv#GR^axEyn*8*oQ~)J6EX{&5{B zv3o>)d|F4myEZfX=!MLQJ)$lfHMHI0kZ+EfL6dmo0ha+T1H2pX zZomS-0>I^f%K-}k3jvD&ivWuOivd>wt^zCtECnnFEC+M}x&YmPZb016lPMI76NOdc zB%xHCER>5`f=iqtxW%c0SDYqPi_?V~@djayI73(`&J;v3TaZLSs28n5qnIN!i@8FJ zI7`?h&K6q5Il>lkuFxjlD71_7gze%@!u{fWVTZUtct~6*>=bVn9u@P1-Qq369x-3o zC*CUT7Z(W!#oL5K;$q=R@pj>9afxtLyhC_aTq-;#-YGmUE)zP$yM&YC-GX1V37ukr z5D@LcOX70j6>){|W3f_u95sQQ~;!5GCVzF>ebPDIiRl=KMiSTo=RJbUX3BMA{ zg}22D;on7<&?8m~?}~2WebFO)C4E>1IfIkZysL59wA&iy++wX)&bRAuWM)2c)Ht?u4`q(p`}5hGc_O z0Lcz%IiwYk3L!Zl6+v1FsTh(I(ke(LkV+wyK`Mt-0m%ia5|SH|2a^AjkInv1e{1%C z{+ZeT``?-UfB3@e|I;7M{;$3?`~UK1v;RN-YWDx<-^~91{x7pX_&;WUT%5%pA8+v| zCR+Uc`dR$_`&;}Li^ZRkV)0*gnZ-Y7kj4Mq?^^srhg$rXUvBXaA8zq~?|T;i6<1jN zS6*rHj~;FDfB*Xye|oybKX$Ce|HB_z{8wLX@n3U|#eeO!7XNkES^U>uZ}Cr@Xz@>; zZ1GQ-V)0L#X7S%}gT+5{Cj54S#h;U7@z0uN@z0rK@!xo(#edUH7XN|;7XQsRTl}}& zV)5U4tHpoYZ5IFSw_E&o++p$Gd8fsH*IgFB&1Ui2?H2!v6&An4VezkA3EyQ6v;{b3 z(4O;|B_@=az32R-3KNb z144fGRuh^6`MKKwAwTZ{6PgP7`40m^e&H??jLGdepSKqf^7#i$Fdbsg`9+5TA;0*D z3EcqsCC30Ezw`wYngRJ`F9JgT?k*FW3HgGT0U^KqRTFea?Kxlg1|Z~%erAHMs6FS4 zF91S*)h|rQ3i;As146$1Hzt$=dDnY@kavG%g7fL7E_`oT`&ug*3&D9W+;a|bo=?Pm zb#Na#1ealMSQ&8g{V(HED}j^me;GHT3OM=xmvJLk11I1AGH%p*;N<&X#--H(C*S`v zZcGz!?EV+~(n$OpoRrP^hq+;8z;XUzF0~Rk&OgkJr~;1j4|5||1IPJ?xl!wZj|%%xTW$N7i35mmr({$Xz9YT!8kFgI#F zaGZabOREEp^AB@lnt~Acpq;L;llfPA+GQ@GrW)2PPp(sUif}s z7v9Iaf^bG3FZ5FzeZ1;@^hO`AdR&Y?-rg}DqmS3<op=|79HMj0%#l7`&OgkNK3>j0%#l7`&OgkNK3>j0%#l7` z&OgkNK3>j0%#l7`&cA)Ykv?9;`d}cYm?M3>lnd|UrCfL)FXh7fcn7jR7^X-1cqtd& z$4j~JK3>X^KHmOc)x-QX_Uy&(z0UeQC8L9TDxC<1BLy9~9L5In9LC|V|2d4#%9P`H45QPNavYw+_*-XH z%5gl8z0;R+9OkjVRr!?jEzc3>TUF0E-|`)EqJhhLe*S)fgz*<2Q@r>Y#b17rK<3!b zsD~%E@Uwga@iUe1++;m{ZVP`0{G4VJ%nfaX=ds|~(6KErAGE{sIeDI|5#Ik9-djKW z-1#~@6IKt;bnW)~oI&(5JX4qetqz`ZdLQ**eDQts4#|Ip-X;0Z=ra~UcxDnM3>few z`n>y{Qd@4Trw&cc zcq%nNX7T)(#q$dm&o3?^o{frlMxm)0PaT??@zkNI8Be9=CoGwC3l7;)LR=9)jCXv0z5M`*TT9h#bLScj%&8&+yYaec0N5A~WNo>6FO z##4u;W;}IhYQ|Hk8O8OvW<1X+;u(deW;}IhYQ|HCre-{qno(SzYsT}UBA!ubYQ|HC zre-{KXllk&sTsxfxn?|HQN%L}P0e`f(A12l4o%H?DmA0HKG%%r8AUv!(A12l4o%H? z>d@4Tr&2SD>vPR`zNv_36q=gx)S;;vPaT??@lapOb5>jiDYF zV{HuexEO0=7;9tTdCSJy80zzujkPf@bQ^197;9q~YhxH|V|?4!#*oLf{+ZXvFxJPQ z>to>g$aK8XSQ|s0n{BL(@&9gZjKj+HkS}#!Da=KOvFEULl0#$Gv@C3$iqM!fEmI$} zre$I4Fnp`VtR+Ps?AI}7twU3@4_1e!W*@9lGm7zL%|7iGMLeU>)a=vNp{W^99h#c) zRBA>szN{I~Hbp$6(A12l4o%H?>d@4Tr&2SD@ny|;?oh-t3Qf&;>d@4Trw&cccq%oc z7+=lB~ ztm~DZn6|ukj_V?r8%y?XpxigRu2;6o*tGmiE?eh|?ArkMEU*UGKV(fWyzXoQBszcD z*t7O-GIl>jlCPx$Ip$u8!mJkdp8(+HyC?2 z7<)Hp?cHFEPk(#%Z%`edmhXXK?AlA#eL8Qf>xK7jfc^)v?l#25Sl5fL2}bsA_?p)B zI`l<-SMJpbU2lw1JXekPpf>jGy?9bGF|e_>j9H zvzHz>#-6>#p1ru=wiqt+)W*e(J$t|TJ$rMTE1eSj1}B;t*5k3-)9`E5@%fTfur4uI zURH`4__hprYY^VM zK9pa-4&nXkL-|?};XUd@`8o;Ved$B_hI)keq7UVp8WG-qK9t|kjPRcGq5Q@cg!h>b zFZWRX@qGyI z-yX^z*pKj@?VpycRgfx z>{-I$z1BnA@#hGK_g4>bFFa4UJf1tzK{&jRddTj@lZ3;2r-!(I^%D;7mmcD}ItjOg z=S~F(hxbJf*}eP{;g<5;=~oDMC(ph5W5N~i-0QCq4nNBo(tG14gj>#YXU`B0?`Iye z``J$khtG#0uKOI}@SZjy?!tM(;qk5z_tu+)!~2tmxL^F7a4w$vL*3-Y48zp8M!SWb7Gk>=|zC z8E)(uZtNLu>=|zC8BWKnj6K8Iell@n%!=&SvK{(ejXlHhem9Ce!(q%RBUNS039ser z_oO@rYx$n9Jm5UC@CMwC4+%W(}DdxnRPF}cXtQTP}W<-*69=<@;O+2qg|6XlFO!{xaP>=}kJ z>FtS#?fI2xJX>Jw84i6@YW>H?p5f|oG4>2M_6*1K98ba;o&`Q*&v4wgZR{DYJ_h%7 z>=|y1F&Se_#u$^wX^b&>oW>Z_h3>^iV9Wgg zP%Qcx#o`MTjc1#TXPb;?n~Z0hw4QDHXI^vDAHSy7Sa*}IyNU13jAxtRnWX|U{xdRc z?M*x3M#)^7@odwD5mhiIME8Uof#b3d#>&3VXPaLCBYn1M#~{Txld)%HPt5zo2VgAe z68Ci1Tu(}ersjH5Iy5!clTvC%u~w$$`dqI;IQe)+p{W^99h#c))S;;vPo-uQYh`N2 z^PD1{QD|z$Q-`KzJauSl##5;o#afw~@w}*rXB3*6@zkNI8BZOWn(MzL0=W;|a} z#4`#_&3Nk2)QqPNP0e^JHKSN7Q!}1t6!DBgQ!}1AG&SR?LsK)JO3f(N%G8YKn~Hcw zp{W^99h#c))S;;vPo-uQYh`N2^KC^uqtMihrw&ccc>1hGh3Cl(>2ETj zfBNUM$vvyFXQZ)bq!~Y}rZ!i_*fUZ+F2Txmlj5PL)#Pe2+JtNiUt^EJ6XQVN2 z#+Wx_%$qUh&EWIAF>l70H#2o&'yM=wkVoB=okFdHx%&4| zoovpH+OyejB=?;&a~waL-RVg=4$o%)t+OiSI9?yP)0c8Qh=OlbKIMGNbHw>p)ichw ze8*Iu**5m9Rg3`{d)8ij@dNY@o{MzybM!8mlk_=#E<1=`>eL)lYf;4W>zFg6LsN51 zO^2rDn3_^Eiuo;?@oZDXGYU=3F-jepn(@@3sTohDW)$;VG~>BL5ziOev4*2w=3cqg{Edab!ckFQ-`KzJe8VJ%x}?*=T1dDqtMihrw&ccc{-k9F8F6!S4Q5ihpp{0ECtqfG4`xgog*^YS7F9$EvwBR$?{d2@w&@uWm9|}GhTyP zt<2cFKs_$T-UaG$G4?Jn_AbEd#@Kwu-UWDEwiqt0)XuBM-UY_q1*eU@3yi%B;QSC; z7vI>spzE};cR_b{16{Ypx`ATuW{R`7Qk=Vu;=Bha&VQKV!d(>e_EOA0KylGwipIJ! z#{8IRLbW(us1a`v)`&BNb>d7x6te|M6oh)wDm02YLbI4Fw1~5WP2y~!Rh%Pi5$6hR z;*CPPI8WFv-Xz>F&KGuw3xtQng~CqJSXahaS4L}H8DoCTw|st#+y}?ckCFS}7)R#E zSXJi7Dyld?Jg z@ZQzK%7Eki!(3`5aGZab8&L%u=O5-qt_F_t4|Ai|1IPJ?Ib&TJ9KUaNU71zCtM9^V zKJlqD-c=h1YkKcSfvf8Sr_%9UhMrFJCBY8#74E}_vawcl_M8-mM;!=tcYI=rv!b7n zd?(~PKefblR<4iloaL=0{JMC!3YWT0LfOf<{&C;;R#{HQSJ!&8pSj?zichTutOl$B ztN~mDxCU??;5tANPz00!CBS;Xdca1&M!;skX22G}7Qjt_n*dt@TLHHKZUJlqYy)fu zYzN#9xE=6*!21Dr0PX;M2=F1moq#(59|e3Aa5vy?z&(I_0QUjz1Kba|AMhaHLBKkZLZ4E*rw`JYE#8*s=($1v+<(1^Cw<@FA3XKe@^YHnO(Jt zmhbn}#>Z@YoK_X5Rr3dGSHtXTRJ2zAiQ24YHmf z%W3&It#$uF?bb27bt+ox|C8FRXEy6Otr|{C{BLR}GCNU43w|u}`_wXU1X{}{iVaIkgJGQ-jk8Lls zB`{?W`Hh!1dY-BbYIW3XX+BB5f zZDMwtRJ1ln{Av{ z1JepSZba5@*Y$0;>!6*kEyH;7hp4FH7)|bFu z;_BB!zFt|sfz_|;TfYwM>$v(2kZ*wc0r^;47bgYbeF0n*kmFFI$KLiI(im=MG2E^a z=liavHuo```?xqaa=+RA*HF9rnce*=S`S=HZ607Y4{%ydoYs!(sND``w?jqi!Rx8b zgUseZPOF*IdT1iGdx+URq@wlkWNPy;vw4`)+Q4b;oI>q(GP|8BT8~VlHjglyM>wq( zPV3PdsNJK??ok!3T{Ef8E@rcf)7r>s?G~urZf3VzMeDH~YV#Pgd5qKA#A)rBMeX)5 zyFDsed*@J_z077Wr?r`Bg&lj%tnD`TZM!f%t-FzHyUmbqR<`{OtbSA9`b}Wp#MR#b z`3=hYn^^spzV%@`YrxNSOZ=yDjGn>b`ShsS&+x`XAZa=fzucCF}W@>YQ*&N`s?%}i! z-a_pTGP{E+T2I_cZJuB@PjFgWIITmsQM*IT?vRSs;oGUrVPTl=TiYoR%QKmR==%peV9Jo-Nw~#hkU!T{x(+s-oEwk1^aus`r9DC zjU9gh`FLxk$6t?}Z;O)hhNG6LeWA2r7}!pxSf@oSsp*1j9*{;Y zg~#S{R}?u*9iD4F?iJUTmR6Tt1I#MN3NOi|6=jaknp`w#dFF(yqVbMNMH44abS$^8 z$g*eICr`*Mv@gGI!u5sYCpmuD>$mb<93lFlS7wHn3}AdF+|2N=%Nf69F3%z~hVcUe z;rum>mwpw_XEFXcxJcq(cwcse`k{}Z=Y)SPB$<9_WGeMoF2j3MVEgQ8;d~+E^Avm! z=1s}y+|qD+{NoRf?}yNIY|#7K4{rydB$RzM<+A0XjPeSv17#GhF88c1Lm4Z}t1^o1 zo?^uQ$Gq2#GTaq~cCQ^}cpWufVr?&Tt^nFm>aKt$P*?~J0A;MGC@XW6g8^uH?aKj+ zp*DR-5tZ7PJ4$Wiug}S3Z?Dg~UVb}1XJQs(vogo$u(wlYWsPTVGbdSPTWi)V*)})l zI@xyi>`d8q_Uwt{k!|V1iFT*g;m({;R90-a75NHn#f8^R$g&l=vTT(lF0ZTDRt;&T zyQ0cvTUk^yW#af*I;&-r6e3&h9Sem8H`1S9j&Tf#e_i-BkxAB&-!M!VCl^qZk9V2H zb@4ewt|p&XWR#mfKCj@m{MutKgcpI2nOidb{{EHA@wc!Ra&`to^5 z#x!Oqm&^XkcqNqM-b1-uKJUpWpZAyy`3~}XjAJ3;?&xfm&~f!6YIE*8xRiS-vJ;^Q*TvrfzvScf+*E4lqBMySek;d+Ame8+POhJQC#@%+ zCjxSyn7!iT(E{KxH=dQtAlE-46Ab10GA;&Qjva6M1v?+*F~bxn#OGSsf4TkK4V>&d fUqAZ>T2cP~a+&bhqk", q_deq, k_deq) / math.sqrt(d) + if causal: + seqlen_q, seqlen_k = q.shape[1], k.shape[1] + mask = torch.tril( + torch.ones((seqlen_k, seqlen_k), device=q.device, dtype=torch.bool) + )[(seqlen_k - seqlen_q) :, :] + scores = scores.masked_fill(~mask.view(1, 1, seqlen_q, seqlen_k), float("-inf")) + + p = torch.exp(scores - scores.max(dim=-1, keepdim=True).values) + rowsum = p.sum(dim=-1, keepdim=True) + p = p.to(dtypes.fp8).float() + out = torch.einsum("bhqk,bkhd->bqhd", p, v_deq) / rowsum.permute(0, 2, 1, 3) + return out.to(torch.bfloat16) + + +@pytest.mark.parametrize("causal", [False, True]) +def test_flash_attn_fp8_qk_per_token_per_head_v_per_head(causal): + if torch.cuda.get_device_properties(0).gcnArchName.split(":")[0] not in ("gfx942", "gfx950"): + pytest.skip("ASM v3 FP8 PTPH path is only enabled on gfx942/gfx950") + + torch.random.manual_seed(0) + batch_size, seqlen_q, seqlen_k = 1, 256, 256 + nheads, nheads_k = 4, 2 + d = d_v = 128 + dtype = torch.bfloat16 + quant_dtype = dtypes.fp8 + + q = torch.randn(batch_size, seqlen_q, nheads, d, device="cuda", dtype=dtype) + k = torch.randn(batch_size, seqlen_k, nheads_k, d, device="cuda", dtype=dtype) + v = torch.randn(batch_size, seqlen_k, nheads_k, d_v, device="cuda", dtype=dtype) + + q_quant, qscale_bsqh = _per_token_per_head_quant(q, quant_dtype) + k_quant, kscale_bskh = _per_token_per_head_quant(k, quant_dtype) + v_quant, vscale = _v_per_head_quant(v, quant_dtype) + qscale = qscale_bsqh.permute(0, 2, 1).contiguous() + kscale = kscale_bskh.permute(0, 2, 1).contiguous() + + out = flash_attn_fp8_pertensor_func( + q_quant, k_quant, v_quant, qscale, kscale, vscale, causal=causal + ) + out_ref = _ref_fp8_ptph_attention(q_quant, k_quant, v_quant, qscale, kscale, vscale, causal) + + abs_diff = (out - out_ref).abs() + max_diff = abs_diff.max().item() + assert max_diff < 0.08 + + parser = argparse.ArgumentParser( formatter_class=argparse.RawTextHelpFormatter, description="config input of test",