From d89b7e873ee5391fcdaf7ee59fb8fecf428bd5a7 Mon Sep 17 00:00:00 2001 From: Alexios Lyrakis Date: Thu, 21 May 2026 13:56:44 +0100 Subject: [PATCH] mla: add fp8 qh32 seqlen=1 persistent kernel support on gfx950 Add the mla_a8w8_qh32_qseqlen1_gqaratio32_ps kernel for gfx950 (MI350X). This covers the decode case with gqa_ratio=32, fp8 Q/KV, and seqlen_q=1. - asm_mla.cu: add seqlen=1 dispatch branch for gqa_ratio=32 fp8/fp8 (sub_Q=32); update error message to reflect supported seqlens 1/2/4 - v1_2_device.cuh: add seqlen=1 to natively_supported conditions for gfx950 nhead=32 fp8 - mla.py: add gfx950/nhead=32/fp8/seqlen=1 to the native-path selector - mla_asm.csv: register new .co entry for qh32 seqlen=1 persistent kernel - mla_a8w8_qh32_qseqlen1_gqaratio32_ps.co: compiled kernel binary - test_mla.py, test_mla_persistent.py: enable nhead=32 fp8 seqlen=1 test paths --- aiter/mla.py | 7 +++++++ csrc/kernels/mla/metadata/v1_2_device.cuh | 2 ++ csrc/py_itfs_cu/asm_mla.cu | 7 +++++-- .../mla/mla_a8w8_qh32_qseqlen1_gqaratio32_ps.co | Bin 0 -> 73488 bytes hsa/gfx950/mla/mla_asm.csv | 1 + op_tests/test_mla.py | 2 +- op_tests/test_mla_persistent.py | 7 +++++++ 7 files changed, 23 insertions(+), 3 deletions(-) create mode 100644 hsa/gfx950/mla/mla_a8w8_qh32_qseqlen1_gqaratio32_ps.co diff --git a/aiter/mla.py b/aiter/mla.py index 2b97c9bfac..926b4a517c 100644 --- a/aiter/mla.py +++ b/aiter/mla.py @@ -395,6 +395,13 @@ def mla_decode_fwd( and kv_buffer.dtype == dtypes.fp8 and max_seqlen_q == 1 ) + or ( + get_gfx() == "gfx950" + and nhead == 32 + and q.dtype == dtypes.fp8 + and kv_buffer.dtype == dtypes.fp8 + and max_seqlen_q == 1 + ) or ( get_gfx() == "gfx950" and q.dtype == dtypes.bf16 diff --git a/csrc/kernels/mla/metadata/v1_2_device.cuh b/csrc/kernels/mla/metadata/v1_2_device.cuh index 7f26f9c16c..6b3d3ed2d5 100644 --- a/csrc/kernels/mla/metadata/v1_2_device.cuh +++ b/csrc/kernels/mla/metadata/v1_2_device.cuh @@ -501,6 +501,8 @@ void get_mla_metadata_v1_2_device(const torch::Tensor& seqlens_qo_indptr, // [ba const bool natively_supported = (num_heads == 16) || + ((arch_id == "gfx950") && (num_heads == 32) && q_is_fp8 && kv_is_fp8 && + (max_seqlen_qo == 1)) || ((arch_id == "gfx950") && (num_heads == 32) && q_is_fp8 && kv_is_fp8 && (max_seqlen_qo == 2)) || ((arch_id == "gfx950") && (num_heads == 32) && q_is_fp8 && kv_is_fp8 && diff --git a/csrc/py_itfs_cu/asm_mla.cu b/csrc/py_itfs_cu/asm_mla.cu index 8733f89713..ff2d9b8451 100644 --- a/csrc/py_itfs_cu/asm_mla.cu +++ b/csrc/py_itfs_cu/asm_mla.cu @@ -328,9 +328,12 @@ void mla_decode_stage1_asm_fwd( } else if((max_seqlen_q == 2) && persistent){ config_max_seqlen_q = 2; sub_Q = 128; + } else if((max_seqlen_q == 1) && persistent){ + config_max_seqlen_q = 1; + sub_Q = 32; } else { - AITER_CHECK(false, __func__, - ": fp8/fp8 with gqa_ratio=32 only supports decode_qlen=2,4 in persistent mode"); + AITER_CHECK(false, __func__, + ": fp8/fp8 with gqa_ratio=32 only supports decode_qlen=1,2,4 in persistent mode"); } } } else if (gqa_ratio == 64){ diff --git a/hsa/gfx950/mla/mla_a8w8_qh32_qseqlen1_gqaratio32_ps.co b/hsa/gfx950/mla/mla_a8w8_qh32_qseqlen1_gqaratio32_ps.co new file mode 100644 index 0000000000000000000000000000000000000000..3ff1ad2fbe7f261535a4d86cd5a727ca5df2b1ad GIT binary patch literal 73488 zcmeHw349b)w*F0b0-=f}AUdscF#$r@L)bgbVwy!zwy-FuXcI_6M?w;ku&CJG2?0@2 zF(U4YZ0>?IAh@I>;EoE!C5j8KqchGpZ{C}E^X9#0{@=NEDpse1zzpxNGgkk8Ctshh z&bd|Rc2`&Rm+Di&u@ldC#KmRB3*+Gw$3z@IC&=RGYkzI9Qxb@Ci$wfwD_V;dSWdZS zTvM#&!AwjOYg0501tX z9_CJspYZw|a8D-B%s1x}VIoD;PIrl9Q znuld6K<}h6}%E4Vr@@v-zJ zIfaY!dz>#&#kv!g8}Y4wG=9=luQ~p~&?&CqAhwe}8-8>R4=-4PkD9`Q^2XJy zJ=N8$JBih7p6VM_*0_odr@D&GC$Wl+mKAJ2)fMbKi4{CjSUi7#ucR=qsIidRQ(e!2 zA6w4}KY95xZ`0Ht_Vj~?j1f=m0C2FrnXJY)>q&3Nbjx-eYi7g~?q&zSZYJyGw}X#{ z8>cmX@E!bPGXtJ@Gyd94*2!-MzbqTaG#-oIh!f3>c*5;?BQCL-w7+9RCYbNgH{z0- z8Sx}G#MMk%REr`)%+xO4#z zu9o4|qSLDPGklj$9-LE9o>w|>$il)LUyggJ+qcL+aDZ=7S>7V-`Tc$K7v+@Zlou3( zD=8cMY1FdvN^#=0xTt*2L+NE#Eu33i`2N6CT}S$YTpS}B zM|vZ!Zd>}&oF#errNx*PDlW@Bqg;|#T2@e8RBfK_;;ggz2L*I+;_TEszvr+4!1g#J z5ApYdEaoZRv~%#>OXt5KgG-!$-gDz#@d+m=GFsH^2Apc);#3nbQ&@M@BV?Ya8*GTb zcSOao;|6-1twj}zFu!Ls7k4ULTtnQdh6ZG`Y*Bj*aD#B9+~9~$X^4+cYruLM91a78 zP`FC8NQvONLsZr{#fqAEQB~tiXd4LDu8DMN=%mJKo7kcjLy4O)CT%Azb5yyjJ2!Mz z%iSp{p-|_}mDy)HRNPu1eIH3~NLI@!{-TRTWV?4|P2VlRj=(A4T|nVzQA;44E!1)m-=a2!{Dc;@8H6ob)MgVV zIciG)S-=pm7PuA|0&W0`_Kw=^fGl7w`DeAL{Uf2)qL$uJAz%`awts{7;EE}e4~H7O z7glHCC9d+it0#yXYAzJjHEwZp%^|TKc6UvKxUq)bkqzP~<86$$Fy6s*U>jcsxKIQL3MnezSZgdc7z5@JratDx|->Ir-vqq zJvHuW(?X+bXH*a9t4D_6H64KWf*ZB>BXqe!p|j7f42hR&2K0S7G`df>>TzOoO}EzW zK&acE3OUXlz|2M(+-~jrhTs(0&K>9^RvvM?9io$1bwmzx2szB{mw2A4xRpob1R+vA zBd@G*5BGN)x&GR2BRz#Qh8*D*$-_p93b#k`am2xQ0+SOa1_;x{%QfS=xvNv%dqN?x zx5nK^i)2=0uFM=aE)*v+cZS^kSN0d%Ybu31rR7?8^{>UC!*Y=jeTNM3R}LAHDw48~ z{o2(cbOC(z{A}ZS1K!8kUcULF!DLaY$(w2;FS2w~1% zz`D;B;&7+T&+zUDMgHa(9yoWhXh;SpM|>u#MvhEPb~*yI9Sw-5v88CCwxwmQyIP*- z@prHT`Dyvpl*>Uqaok3HTm+pU|>uV0D97RWt0-J?FK+`#sCy4z$4j-D#{R(BO!t{k6U#En2)g4?5IxkqZ9uq-(t6u1TZO_kf7`hf7HRe3yWe}FV8+#XMBIjn=fm9wLY zS>bl4z*1KVPfDf7liDg1%dnjCX;}q*R$+TOWx9J=#veP|ruAQ}faoW=o;2{}T+kfRb-1zQDM4Z8uh z26i)SEvy>04z?b4C+u$6y|DXX8(hM>^WFX zkd7P4BJr^6dK}-KBmMMQC9>i~%ZAKqY`t)2(mrtcv1E}zpHCIFIOoV};)j~}wM~4{ zzDa%)Kh(spZQ_d#(fR6g`yAtQ8}o}J?nST$@NY(4Bw*xj&u zVfVu}z&64*!?wa6hCK><9QGt^J8TDRCu}!t59}G(bFdm%Eo>j`0PH2$E3nsK2Vrl* z4#AGV-iEyktA~96`w;dq>=^7b*cY&`U|++23;P!K2iPBBe}er7>@Tpt!u|&PAK3RW zf!EOii-)ywE{4s5T?(5Gy8<=`HW!u)%ZK@4 zSHcQmMX(ZBDXbi}1hx!zHEcO75L}z}eqBfi*R71zjJGkaW4xVlJ>wmWcQW3^csJuc zjQ29$$9O;E1B@FOA7tFfxQTHy;}*uPj1MtB%=if7ql}L+KF;_A_7@udXVT>@=GQPmLk8wZa0mc^@Ut)Zj@fF5b8DC?3 zo$(;!8;oxcZ|&*TXBFhj;*Qw4$ib@kF6atv20lC z36HI3WTyGg7?x%nTT@e0E7Q_a)iDBJwA`twes@|Ljv0Gv#9ARTQ+xO`(|Y+cd-U;V z_DWY{SKzo!$NmZ&`zvrve?SaRs~SE$rE>Ui9Op+G=K{37BS+#~K=YrgYyM;ytz$I~ z({8P`e>I-FwYG}Gz78(QU#V%Sm91N+R<>=MTA7se!!~J}-`%>k-`%#YALsGL@eJGM z)^Hxsx+D0rz`OkJ4(H&!K>FQho$GhIQv7rbJllE0Rz`PqcYzl+hb z?Dmz?e2@Luc?iYM>Rc(sc(u;)gjnZ9;^*|Qlugf5#6%UBO^CP|6zBIMKU?v$2*J-L zzGP~pYEZeHS}GWUY87P54|pg+3oj9!dad7Ng)oLC_{WMw*e+D7^iz&|UMO?hfpGAoLOT|Rx&nAR_1;v$2 zHR6yzhvJYwmzKlNC4`?(am#WLm#Fgngvh^AWGlar5PlKGg%%lc$SaQM2Ngf;^9{j!e2FgT0-QYSQ(jy??Y1Dd!!|F zkLi&;p_QVhb5%{Gjk_jNP*L+jL1oSUf)zC{7Bo0Us_#^(V*jB}G9eT|Pe_4|knF7% zs}W25G3N30{&4bW@U<@}{!`|=5I+|DzWs{-jQOs_j{|?;Ma7%r)Y$QR5JF!jUJ_5(Wt@Sj6AVn7VqlNy2KJh1V4uqjO!pa>G0(sO^9>xdz`!BJ z2D-}(^ei=S_%#NO3<90dF|i$_L3Y|jnr7W$Ic!OM1NLd`pAD(vwL>xi8fSv%EXS}e z@$FGZ`?y<0Cue*h!Q;^qx^&Und%FZu#OsGs^zy3Cz|O!fz%Ib9z^=elU@EX1up2N9 zm<;V!>;dcv>;>!%><#P#>;voz>rUUx{`vEh68NmL){=fmi0l@Apu2Z+pocd*(9=6D(91g`(A#@)ppSP}ps)AR zK)QE!pr7}OK!$ftpucx+V1PF_FwmPH807T_279jz4Dl8QhI)$vZf{8-(_0$wc*_IB zyh{SZy~_e4yjKTCdY1>z^9BO#TP12Kcr80;x*c6$U16!PZm=|1cUTWtPgpNlZ&)8# zUsyVS$$0EC}5#UZ-D?=c++zt>EM=nB8jWkC4pJ_n?jjYl%(0pSmr?-1SK4_W|( zKcv_p(%`$xfbcy_9ilt@;nx7+j|^hmU0H#yyYM<}8Wl*}H9pXN7hZ!+7Y5p6Ze&GB zlPfAUxw1-=*R9gz^{X{`;|-d;X^kdtxmlC9uGQpi)tbD0ohI*CugSaa)Z{&PYx2H( zHTl5(ntX7BCO2)=MNRj{WVR#aZrPA5AuVugSPLT{@k*Oi0jW%a*!q)k>GT zuFE!UblI+*F59=))^6ay9`RiZnvQsBrCMWB%b7x(4?W)Ue-E`T# zyDoe7)MfA9y6oFmm;L(bvVVVF4jibC)@f<>=A6 z96MH*++(DbUAaTE-$%6mzQ0p%gZmMnlsV=X&N|)DMqswcr z)nzcK%ZdtJu2`YVl`D05-F3RW{(4>Bc%v?Fx=EL}+@i}{Z`I{(x9Rfs+jV)z9lE^h zE?wSpk1p@KPnQoopvwmz)a9m4y4tS+B_UYC)G-YKiTZf%@!$zdLi^qfw3PTZ}`k+x%U_|LS8Bb~?O zZexx#ACp_h9O*wMcRO>W1)1D>=13PZxjUF6jmYHgWRCPAle>#K(vD2-ZstfwGP!%0 zBTdQV?q!bjC6l|4IntU;?tbP-cQUyLm?I6!E@yI2Ge;Vo$?ai|^g5H<%N%KUCie_;q~n>~v&@mEXL8RmNBW-0 zJsa%#nU*axXGR zTB6Cl#2o30CigOPq%oS@E6kDJXmYPIN7|#wy~Z5rkS6y!bEHX{+(G6@pENmKw52|q zNUJotH<=^d(&XM^jxv!HBc0Ra>X;+V)8yV}j`UBHdxtsF zLQU>n=13Pcx%ZeOjnw4onIpZ_hbELtV+%e`zk2SeZnImo1OmG37+{*_{m@(T&!7p+BHiFcHWLw*UxA-|NC!!IX{+bI{tEi! z+7QhRwh@{vc%{vPpRob_9NMNt#Vd^#{Os-Ep_}37Jc?IZE%@A8@FiFuX|)(fX|~`C z4}o8X^^;}`UTL@B%a4H%c`MtIb_-rZ*2ZgHbq$U+pHhg&?(l`Z?Qh5-(r0~s^20`>9>fRLHVYB3ts8B;D1!VMcf?H zZxI)z--1{AE%+bRZxN^TTf~*q`c3^7ywY#M|DWi$7(Yb%Eyh>+EqtZl!aphfwhimI z_*s_HZ{aKb7Jig|+m7|yB-U@?EBzLJlzxkRrQaf7>9_EeehWWJzeT>%Z;`L`Tlh-9 zg&(EgB46pZ$XEI;e5K#QkJ4|Euk>5wEBzL}(r@8M>9@#N`YrO6ehXjexA2vIi#(;@ zB2VeJ@RfcG-_~!TdD;4{p~KtytuYU_erxNuwtj2tx3+#e)Yflp{kG25Z|m!9{kFc& z)^F?UZ2h*r&em`1>po&#(bjK0+E1z9LhDfaEwm1$-@;e=E&M3`7Mh3BZ=rc8{T9B` zZ{bJjx5!ueE%KFq3t#EC@T2rw5^QTi?Nm41tSrQgC=`Yrq@{TBI3 zzeT>%Z{aKb7Jig|i+rWuB46pZ@RfcGKT5wvzS3`zuk>5^O237FLjAS_>$hh#(Qm;k z{T6(behXgdx8S4nTkuN11s|o~f>-)2_$d7rywY#MN9niVm3|9e>9_EeehXjexA2vI z3%^mnU6c^3-u<3>rQc$G6KRa&U4M%>rQaehO1}lK^jq*h zs^21R4(Ydui_&kwEBzLHA+7&-*WV&e>9>fB(r>{l{TBTHiGGXmL!{qge5K#QSNbjd zlhSV!Sifz-`Yn8=-@=d5Z(Fi{o5=bte5K#QkJ4|Euk>5wEBzL}(r@8M>9@#N`YrO6 zehXjexA3F%TjVSK7WqoQg|GBm_)+>T@|Avze5K#QSNbjdDE$`sO20+E(r@7_{T9B` zZ;_|;TjVMI7QWJN;oJJHt=}FpuYVp$*O$&Wt}nG-Uu|E1Yv{W6^|$u*xAyh7_Vu^+ z^|$u*xAyh7#x;2M^|v0)zW&yu+1KBCH2eBnk7i$g>(T7%Z#~-ouj_B2btwH7T8Gka z;Vb5^QTi?Nm41tSrQgC=`Yrq@ z{TBI3zeT>%Z{aKb7Jig|i+rWuB46pZ@RfcGf5`@DbmsN9$XEI;@|AuIU+K5-qx4(k zEBzMvO237#^jr8R)Nidl5vkpd}EIU>-D$bm416B=Ei#cEqJBhf{)T~ z!8i6!uwH)){-k>-nAhLNiK@u9?<&#b0qy%>Q*^n>U_I;q`HAu^Qa!S1J^`Yzc06w2*>3U)sQyPtyHPr>e|z}I%! z{S-W!-A}=z+5Hqen%z&qquKowJeu84!K2yz6g-;UPXWCP?0yO--B01Bzr|jUYxh&A zRQ+)6ehS9*xVVNrO25T5>vlf{TfepSTheLmez;1fwfo`P{c!DmxOP8WyC1II510Dp z+Wm0r>+F8G^>ubX-1<7ZA8vh}-4D0E&hCd>UuXBjt*`rg`r)FVfZb04eQxc33hMX! zjr|ll{W(^@wfiX;`mNnh!O(AU4SSS+i)+^HehRjJYwNeRetX!ypP79>Gy8sK_WjK4 z`Dj>usSA&0sB63dMKs^q!+gD0L z^WJIDT4Q?eiizJJ1%J}LcB!W>;!nD_u9&UjPuxovJ#R7YaeLne53Gq~pqBA*j&lgJ z+wYTvvpVmSLL4}%_FW*}yXwAW@a?|0tq&YkJy2#GII4P}0I3H`BHlw84enJ9LI~WA zJg`#Kbgrt2w5h0hp`fy6f5D2H7Yly4fBmb6KEeBUbxovz-k&MZB!;zt|&r^N5uulO#+k2Uzuh(GY6;=2+*&fu-%rxJfb?D*Y?pAb8K8u622$L~)3 zl-Tim5I;3`{GP;5j~%}k@fXF8-<$ZEvEx&J1d09#P3NaC@t4JppHBScvE%n6-WNN5 z2Jtzu`_enOfvQv+J~&;(>{}oeWvO7w9h1CpJ_Ti?K8>P zXPS;r`%E(SnWp2@K9h`nrs?>!&m?1?X*xdbGs)OznvPHVOfvSFrsLZ^MU3}@-BYCQ zaIcw5?4Bax^}~I2yQj!-yQhfVQ^f8mV)qoWdy3RO$a7%#6tR1XQ2%wir-kRQ_@MM$>ZSAGcpL z^%a5kd)z+Jv>f@T)F0aH18w$wHv2rAeVq@5XbwlUGy66tLMMGikM6mgg86N#$D{YK z(&O>JhKaPC?uAEsJb0zYgP%dX-wQrl@#@}p;2ZCKM|wQsl^zd1m*SV@AbytO)xGY( z7ZOkRx+6Uv@k);eUrv0py^x{DBTnh@h*S5XgRkyI2Y=0p^!Qd(8}xYe6;^sYe5J?3 zkGcI45 zgjQ(l@uA2dng?5tkJx(r{zISe{|9V6-q6eKdo~*LK>x!^#{aOI{=bFxnPmJAi~ip- zWBT-vgZ_W(^x)cry2B}>4V}HqI0Nn97}&otuzzD<|Hi=nje-3e1N%1y_HPVGQ?h?! zVE@K|-Us$?4D8<+(EGstjlrq^#vlv&>OZQ-L+ez(OHf)W^mzD6k0&ja^mwJA0+oh} zZl7>opJK~idue8(I%2)SwB<+-zE6o&$cy&KV`09R+@bgJ;N50bI5wGs!2w&aD z5q=RZN4(PG5wGsw2w&a55&n`B_4%gdh*x?%;??~c;j8;K!f*C|jen0G|3$)y^?3A> zR(kxen&|Q1l^zd%2I=Ld9uHpW@!+HMc<@S(2Op)!gI9Vy_$WOdywc;rN9pn4mkHdP z5IpX4`TbG&N{@%H^mzD6kB5IkJ-!X=@#xK?^mzD6kB1+n$D?nw(&Lj@kB6`Hc=%Cz zJo1$uk9?)a!&iDd{3t#CpRo_Mt;fe+Ki$*T;|s9+*?K(xe#6$|Z9TrJ-fZjfwjQ5? z>()Et`t{DhF2F9puE4IqRA4Hw8?YNN4VVV(4(tx>0qg*MKtVBd4| z^j;qit@F5j-bs&F8Y)m}sLG6nHdOUok$HP~8JmS^$@rb`#px+OC=*vdmcGBY! zuk?7tD?J{*(&OQuHa#9gN8+9CKs>HTi-WG#X%~9wI1&T0RidL4EQEdwsp7RmvK4yg zh=jwA3L)Z08?o_Y6ID(Ez6W*+mc)-pOw{6?Sr}_nAS>&sx|9lsYZT#1hieSs^$yqh zgf}@{;|XtdxGp5T-Qk)@c$dR9neaY`E1U2^hie+)7KdvF;Uf;$#e`2dT(bzDa=0!f z+~si1Cfw_AT|xM~!!?KS1&3=c;foGeF5#;VS3cnz4ws+su)}pF;X4jjA>sQDR}tYy z4p#}`rw&&s;g=3qIpH@B*Al|tJ6y{M|IOjLn()sK*K)%Da<~G7|Lbst2;-ctO2P!E ztBSCd)3u7QjnlQ7u)WiD1L2uY*BZjJovxb+J2_oz2|GJo)r8%gu62Yxov!tSeVwj5 z3Hv);cM}eFy6z>+bh_>*9N~0rAoMz28wtlcU7HCnaJselwnuPSz%Hb2yb$_ULm~I>3WUucBktg;ayJGn}qi{U55xCbh?fZZgINaCVa%{ zdYAACr>mauDW~fL!d*_+hlG2bu8#?yce=XZJZvq_!@2@*1$H5RtON7o++CKq zyE*P2OWfTYcaJ6RUXHuZ5_d1h-DiorpW_~|#NE$v)YH?P7rG}C%^&^d)tr|N97n%@ zHRCpN+$KxhMvmKLiQCL^TP$&#Ic|$3ZY#$*~> zOWdOz_n0N_agL+=Fq!qzwKw!Wc)}7#=l3)(+bnT(u1|5>EOFa8?kP*$c8+_>61Rin z!j`xl92d64?c}&!mbjf9x62Z@o8z9g#O>y|r!8@NIBu^cZV$&%PfzpprF$|_z0X?W zp5Zw9{i_-G9LGIxiF=OYp0~u+a9qR^SHp1;OI$6-yc zxPu(`h9&MG$Gu^Rdz0hdvc$c~ac^1T4sqOJOWYxjJ8X$N!f|z$xFZ}#Jw46Wm+r|# z?}K+Nac^@R{r=UAdza(hv&6m2aqn5;>N)OxOI$t2y>E&8fa8u@;y&QGqn5Z2IqoA% z+=m?ZktOb9j{C$C_c6zPVu?G(adaOh^L;?q-q7p&sU_|+j{DpaN9Xz!_qiqR3y%BJ z688nieQAmNisOD`iTjG)%W=Q6#Qm1zerJjMmg9bJiTjr0 zsHdm-`qDj_=zZ{=CGHO#N56lixS65&$m;l&k%kRue3dA~%n9ylcVc2F)T&kGgsFHg zZm4NQvA3pWrn|aQv`uNibBfPw+qU%z;ZCb^yHjPmwtjcpwtlrNM9Y%=RU0y^XClYl zx^*bjrcLEkdM<9Rc|deXtL)GrmBwo&1|4p-NQerIvl8R15T3LOk0+(d<4L7Hkd?xd zq7cif#PHNH;ZR^~xH52lxGFF%ybArp0`)RWpluk?E6^qkI7)O;I9ha7I0miNh#xD^ z<_mbfKwB^1IMID#cx_-(xH>R7ye=>$yxtgZyui1mnf8_%&6ulHq5WN*n6nzv< z5`7g;7U>G7hW&KKPa!-oP3!$$%|;kN_D;dhPk{9I!J*LWq@xPWUcuv65?C!8KNKjVrmvRb1nBqWhX~Qt+m5hv3cOj=@{PXBp$I<{Gc(8gJkl zZ{!-+aE&){jW=_Rx163D*M(Dpw}-m~*N0PscZAc7ZCcAU-pVyrbB(ugjqA9^+quT| zT;m<0`@P|e;CW+Nv?6*>8WvNcusIvcy4faI5+roIN#W&?OfwiT;mR| zG0ZjY5hilx+H9o^NKFc*e$2C6BHP&#A zk<(M-fp94JVz@H+Qn)Jka(I=oO|@L(3tZzqu5mxtcz|nsk!yU3YkXOBKNwycd?Q>P zd^5Z*_*QtmG2Sa&ABMLFKML;%ejMHz{3N{F7_Xjde4lIlfNMO; zHGar7e#A9?%r$;;dTKn`^Pw!9s=XlMX_2Ogae7G3gyN$E0`A9Fwj=b4Z0{8Hw4J8GplCM|~Mm^2rfW71q`j!9dgIVNp|=9n}Tnq$&XXpTuMp*h|Qq&a>D zNOMe@2+c8RA2i3Leb5|}#zAvT8pkhXj%!gX&G8FBn&W*yn&bU62SCzaXpTvLp*bd< zh31%a7Mf$yQ)rG!PoX&`-Gt_tbQ79m(nn~HNgts(CLM(4m~@a|${Zg;tu)7^>ChaL zc0+Sa+6~PyX*4v)q|wkElNLjBOj-=hF=;L|$E3N?9De|$IVNp|=9n}Tnq$&XXpTuM zp*be4aZ&>WL4Lvu{J49zj= zFEq!bzt9|$&O&ocIt$G)=_xeFq^JB6=2#?^9BY-}3?;@_8v0N?e*8lE(5zXZ&?T3! zKJ-D&gr>R>;*P5eWf{8ANuS#Y|Bh;hKl4jK_>RF5YI;F)M7;ap}`W9kZV`>bN5E^QvRcexr`LFB^5_zHZc! z|CUjQzs{)R%J+;q3Xd9f6n*mZs-xs{qmI(w7i;zASpMHe z9f5|QR~;dzQ|%v>EuCursM4Kk|5(+|srHZ6XE@dVal@~iYX4aCYp2>jZcax0&uITx z+tsL}y1P-wy52?|>-!mX+&R#wbU>BpI03lMjLf(9B0(Ad4f^L)+t6E z4^KDhcyy*w$K#h7bv)_&dDXFfo>9k+`9>W(7Z`QyE;j1eQ)blh%u=I{=dLm8s0seO z>Zn~|)Uoe6qmBbN8g;yMi&4ibw;6T3c85{N!F!B4-hANaRmY)CMjb~UGU|ByF{6%m zw;6TRhmATuc-pAr!)J{;K92l+>Nx4+KF+g#N=<`~*Z&gd9hk?TQn&b5n#Z3~oA_6n z$DdMv_*a_8pHgG^SDMG4Qb+h#na2&dwjoYj^z=m+$492ttc;+=2CO22p6KckLUR&& zpjU}{xERrUTs>OjqH21eM~ZsnGyeJehGKg93g<&wP>*lfXSWe178EVWEA^y!Cyhy& zIB`tMfC1?l=@}_K`V|%z&Ce_KmE@HBi*gp``O1oY<^G&#hh-&Gc&F|^PA5#83R+8FMKMR$8XH>Xem-kqWQ7RAHr2D*2hfdzeOWu z>jTXk@MtMIHW_C*^N(Mso;3csnfbywjg)$_LH%tha+_@DZsvzCQBNBGyu$qd+}KE| zCm%5X+=J2l@0s8E*J%EKnNR&Rns2A3x20J3YBb-O`S=~t{6OZ{J`>H4Wj^IdG(U^^ z?cYT63z$!Iswa&LuVp^t%xE6}zr&-Y$Zqm}Xh}bzcO(k`;OO{`a635BxxdN#1p2dR zFItEY?{8T&EIrbTip%pvdhS(4Wmhc}>GO*gr~7lt{DS`yUtTKGON(=J%5y||dET;e z6`ixNU>@ka!qQ^wI=Q*naYXvO;)M(IiV%RY%5&xd{m3?+87K;K=H?ap<`w7W`SO+( zl(Sz_<|`}DDK*{loPt8y+2-Yyl{IFR=J^VXi%Wd{W0(wP7Uq;KP^0=xCa17qeoGAk1VT$=lx)mZyxU(KX>q>DSG%@(o)N#i8~mJD3@xWhxvKt=g&d&uw$MtnDd9< zSgFTI{3XuJujkLhdd~0BNz8mBe#1-^7c-nR90iU451jv6>gs r$1c6r=JuQKpDVzb+iuNII@`!M>o=E~>otAf)pVVaIn&$sM)B_fI literal 0 HcmV?d00001 diff --git a/hsa/gfx950/mla/mla_asm.csv b/hsa/gfx950/mla/mla_asm.csv index 6c27175923..99005f1cd4 100644 --- a/hsa/gfx950/mla/mla_asm.csv +++ b/hsa/gfx950/mla/mla_asm.csv @@ -16,6 +16,7 @@ fp8,fp8,16,0,1,0,0,0,_ZN5aiter33mla_a8w8_qh16_qseqlen1_gqaratio16E,mla_a8w8_qh16 fp8,fp8,16,0,2,0,0,0,_ZN5aiter33mla_a8w8_qh16_qseqlen2_gqaratio16E,mla_a8w8_qh16_qseqlen2_gqaratio16.co fp8,fp8,16,0,4,0,0,0,_ZN5aiter33mla_a8w8_qh64_qseqlen4_gqaratio16E,mla_a8w8_qh64_qseqlen4_gqaratio16.co fp8,fp8,32,1,4,0,0,0,_ZN5aiter36mla_a8w8_qh32_qseqlen4_gqaratio32_psE,mla_a8w8_qh32_qseqlen4_gqaratio32_ps.co +fp8,fp8,32,1,1,0,0,0,_ZN5aiter36mla_a8w8_qh32_qseqlen1_gqaratio32_psE,mla_a8w8_qh32_qseqlen1_gqaratio32_ps.co fp8,fp8,32,1,2,0,0,0,_ZN5aiter39mla_a8w8_qh32_qseqlen2_gqaratio32_v3_psE,mla_a8w8_qh32_qseqlen2_gqaratio32_v3_ps.co fp8,fp8,32,1,2,0,0,1,_ZN5aiter43mla_a8w8_qh32_qseqlen2_gqaratio32_lse_v3_psE,mla_a8w8_qh32_qseqlen2_gqaratio32_lse_v3_ps.co fp8,fp8,128,0,0,0,0,0,_ZN5aiter31mla_a8w8_qh128_m32x4_n16x2_msk1E,mla_a8w8_qh128_m32x4_n16x2_msk1.co diff --git a/op_tests/test_mla.py b/op_tests/test_mla.py index 8996e10ac3..50f15f31e9 100644 --- a/op_tests/test_mla.py +++ b/op_tests/test_mla.py @@ -515,7 +515,7 @@ def test_absorb_decode_gluon(): 128, ]: err, us_asm_decode = test_absorb_decode_bf16() - elif kvtype == dtypes.fp8 and nhead in [8, 16, 128]: + elif kvtype == dtypes.fp8 and nhead in [8, 16, 32, 128]: err, us_asm_decode = test_absorb_decode_fp8() ret["decode:err"] = err diff --git a/op_tests/test_mla_persistent.py b/op_tests/test_mla_persistent.py index c42449f3bb..25799dee6b 100644 --- a/op_tests/test_mla_persistent.py +++ b/op_tests/test_mla_persistent.py @@ -490,6 +490,13 @@ def torch_mla_extend_split_kv( and is_fp8_kvc and max_seqlen_q == 2 ) + or ( + get_gfx() == "gfx950" + and nheads == 32 + and is_fp8_q + and is_fp8_kvc + and max_seqlen_q == 1 + ) or ( get_gfx() == "gfx950" and nheads == 8