@@ -110,7 +110,11 @@ FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
110110
111111#if defined(DATA_A_Q4_0)
112112#define BLOCK_BYTE_SIZE 18
113+ #elif defined(DATA_A_Q4_1)
114+ #define BLOCK_BYTE_SIZE 20
115+ #endif
113116
117+ #if defined(DATA_A_Q4_0) || defined(DATA_A_Q4_1)
114118FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
115119 if (binding_idx == BINDING_IDX_K) {
116120 uint vui_lo = uint (k_packed.k_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 0 ]);
@@ -119,19 +123,113 @@ FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
119123 vui_lo >>= shift;
120124 vui_hi >>= shift;
121125
122- return FLOAT_TYPE(k_packed.k_data_packed16[a_offset + ib].d) * (FLOAT_TYPEV4(vui_lo & 0xF, (vui_lo >> 8 ) & 0xF, vui_hi & 0xF, (vui_hi >> 8 ) & 0xF) - FLOAT_TYPE(8 .0f));
126+ FLOAT_TYPEV4 nibbles = FLOAT_TYPEV4(vui_lo & 0xF, (vui_lo >> 8 ) & 0xF, vui_hi & 0xF, (vui_hi >> 8 ) & 0xF);
127+ #ifdef DATA_A_Q4_1
128+ return FLOAT_TYPE(k_packed.k_data_packed16[a_offset + ib].d) * nibbles + FLOAT_TYPE(k_packed.k_data_packed16[a_offset + ib].m);
129+ #else
130+ return FLOAT_TYPE(k_packed.k_data_packed16[a_offset + ib].d) * (nibbles - FLOAT_TYPE(8 .0f));
131+ #endif
123132 } else {
124133 uint vui_lo = uint (v_packed.v_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 0 ]);
125134 uint vui_hi = uint (v_packed.v_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 1 ]);
126135 uint shift = (iqs & 0x10) >> 2 ;
127136 vui_lo >>= shift;
128137 vui_hi >>= shift;
129138
130- return FLOAT_TYPE(v_packed.v_data_packed16[a_offset + ib].d) * (FLOAT_TYPEV4(vui_lo & 0xF, (vui_lo >> 8 ) & 0xF, vui_hi & 0xF, (vui_hi >> 8 ) & 0xF) - FLOAT_TYPE(8 .0f));
139+ FLOAT_TYPEV4 nibbles = FLOAT_TYPEV4(vui_lo & 0xF, (vui_lo >> 8 ) & 0xF, vui_hi & 0xF, (vui_hi >> 8 ) & 0xF);
140+ #ifdef DATA_A_Q4_1
141+ return FLOAT_TYPE(v_packed.v_data_packed16[a_offset + ib].d) * nibbles + FLOAT_TYPE(v_packed.v_data_packed16[a_offset + ib].m);
142+ #else
143+ return FLOAT_TYPE(v_packed.v_data_packed16[a_offset + ib].d) * (nibbles - FLOAT_TYPE(8 .0f));
144+ #endif
131145 }
132146}
133147#endif
134148
#if defined(DATA_A_Q5_0)
#define BLOCK_BYTE_SIZE 22
#elif defined(DATA_A_Q5_1)
#define BLOCK_BYTE_SIZE 24
#endif

#if defined(DATA_A_Q5_0) || defined(DATA_A_Q5_1)
// Dequantize four values from block (a_offset + ib) of a Q5_0 / Q5_1 buffer.
//
//   ib, a_offset - summed to form the block index.
//   iqs          - element index inside the block: its low four bits pick the
//                  pair of packed qs words, bit 4 picks the low or high nibble
//                  of each byte, and the whole value is the first qh bit used.
//   binding_idx  - BINDING_IDX_K reads from k_packed, anything else from v_packed.
//
// Every lane is a 4-bit nibble from qs plus 16 when its qh bit is set.
// Q5_1 returns d * q + m; Q5_0 returns d * (q - 16).
FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
    uint blk       = a_offset + ib;
    uint word      = (iqs & 0xF) / 2;    // index of the first of two packed qs words
    uint nib_shift = (iqs & 0x10) >> 2;  // 0 = low nibbles, 4 = high nibbles

    if (binding_idx == BINDING_IDX_K) {
        uint w0 = uint(k_packed.k_data_packed16[blk].qs[word]) >> nib_shift;
        uint w1 = uint(k_packed.k_data_packed16[blk].qs[word + 1]) >> nib_shift;

#ifdef DATA_A_Q5_1
        uint qh = k_packed.k_data_packed16[blk].qh;
#else
        // Q5_0 keeps qh as two halves; qh[1] supplies the upper 16 bits.
        uint qh = uint(k_packed.k_data_packed16[blk].qh[0]) | (uint(k_packed.k_data_packed16[blk].qh[1]) << 16);
#endif
        // Bit (iqs + lane) of qh is the fifth bit of that lane, worth 16.
        FLOAT_TYPEV4 hb = FLOAT_TYPEV4((qh >> iqs) & 1, (qh >> (iqs + 1)) & 1, (qh >> (iqs + 2)) & 1, (qh >> (iqs + 3)) & 1) * FLOAT_TYPE(16.0f);

        FLOAT_TYPEV4 nibbles = FLOAT_TYPEV4(w0 & 0xF, (w0 >> 8) & 0xF, w1 & 0xF, (w1 >> 8) & 0xF);
#ifdef DATA_A_Q5_1
        return FLOAT_TYPE(k_packed.k_data_packed16[blk].d) * (nibbles + hb) + FLOAT_TYPE(k_packed.k_data_packed16[blk].m);
#else
        return FLOAT_TYPE(k_packed.k_data_packed16[blk].d) * (nibbles + hb - FLOAT_TYPE(16.0f));
#endif
    }

    // Any other binding index: same decoding, reading from v_packed.
    uint w0 = uint(v_packed.v_data_packed16[blk].qs[word]) >> nib_shift;
    uint w1 = uint(v_packed.v_data_packed16[blk].qs[word + 1]) >> nib_shift;

#ifdef DATA_A_Q5_1
    uint qh = v_packed.v_data_packed16[blk].qh;
#else
    uint qh = uint(v_packed.v_data_packed16[blk].qh[0]) | (uint(v_packed.v_data_packed16[blk].qh[1]) << 16);
#endif
    FLOAT_TYPEV4 hb = FLOAT_TYPEV4((qh >> iqs) & 1, (qh >> (iqs + 1)) & 1, (qh >> (iqs + 2)) & 1, (qh >> (iqs + 3)) & 1) * FLOAT_TYPE(16.0f);

    FLOAT_TYPEV4 nibbles = FLOAT_TYPEV4(w0 & 0xF, (w0 >> 8) & 0xF, w1 & 0xF, (w1 >> 8) & 0xF);
#ifdef DATA_A_Q5_1
    return FLOAT_TYPE(v_packed.v_data_packed16[blk].d) * (nibbles + hb) + FLOAT_TYPE(v_packed.v_data_packed16[blk].m);
#else
    return FLOAT_TYPE(v_packed.v_data_packed16[blk].d) * (nibbles + hb - FLOAT_TYPE(16.0f));
#endif
}
#endif
200+
201+
#if defined(DATA_A_IQ4_NL)
#define BLOCK_BYTE_SIZE 18

// Dequantize four values from block (a_offset + ib) of an IQ4_NL buffer.
//
//   ib, a_offset - summed to form the block index.
//   iqs          - element index inside the block: its low four bits pick the
//                  pair of packed qs words, bit 4 picks the low or high nibble
//                  of each byte.
//   binding_idx  - BINDING_IDX_K reads from k_packed, anything else from v_packed.
//
// Each 4-bit nibble indexes the kvalues_iq4nl table; the four looked-up
// entries are multiplied by the block's d.
FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
    uint blk       = a_offset + ib;
    uint word      = (iqs & 0xF) / 2;    // index of the first of two packed qs words
    uint nib_shift = (iqs & 0x10) >> 2;  // 0 = low nibbles, 4 = high nibbles

    if (binding_idx == BINDING_IDX_K) {
        uint w0 = uint(k_packed.k_data_packed16[blk].qs[word]) >> nib_shift;
        uint w1 = uint(k_packed.k_data_packed16[blk].qs[word + 1]) >> nib_shift;

        return FLOAT_TYPE(k_packed.k_data_packed16[blk].d) * FLOAT_TYPEV4(
            kvalues_iq4nl[w0 & 0xF],
            kvalues_iq4nl[(w0 >> 8) & 0xF],
            kvalues_iq4nl[w1 & 0xF],
            kvalues_iq4nl[(w1 >> 8) & 0xF]);
    }

    // Any other binding index: same decoding, reading from v_packed.
    uint w0 = uint(v_packed.v_data_packed16[blk].qs[word]) >> nib_shift;
    uint w1 = uint(v_packed.v_data_packed16[blk].qs[word + 1]) >> nib_shift;

    return FLOAT_TYPE(v_packed.v_data_packed16[blk].d) * FLOAT_TYPEV4(
        kvalues_iq4nl[w0 & 0xF],
        kvalues_iq4nl[(w0 >> 8) & 0xF],
        kvalues_iq4nl[w1 & 0xF],
        kvalues_iq4nl[(w1 >> 8) & 0xF]);
}
#endif
135233#if defined(DATA_A_Q8_0)
136234#define BLOCK_BYTE_SIZE 34
137235FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
0 commit comments