@@ -1725,7 +1725,7 @@ void tq_metal_repack_q4(const uint8_t* src_qs, const float* src_scales,
17251725#define TQ_REPACK_CACHE_SIZE 128
17261726static struct { const void * key; id <MTLBuffer > qs; id <MTLBuffer > sc; int out_dim; int in_dim; }
17271727 g_repack_cache[TQ_REPACK_CACHE_SIZE];
1728- static int g_repack_count = 0 ;
1728+ static int g_repack_count __attribute__ ((unused)) = 0;
17291729
17301730static void encode_q4_matmul (id <MTLComputeCommandEncoder > enc,
17311731 id <MTLBuffer > input_buf,
@@ -1736,8 +1736,6 @@ static void encode_q4_matmul(id<MTLComputeCommandEncoder> enc,
17361736 if (!tq_pipe_matmul_tq_q4) return ;
17371737
17381738 int n_blocks = in_dim / 32 ;
1739- const int TILE = 32 ;
1740- int n_tiles = (out_dim + TILE - 1 ) / TILE;
17411739
17421740 /* Fast Q4 kernel: llama.cpp-inspired uint16 mask trick + SIMD-group.
17431741 * No repacking needed — reads original row-major Q4 layout.
@@ -2173,15 +2171,21 @@ int tq_metal_forward_layer(
21732171 [enc setBuffer: g_gpu_k offset: 0 atIndex: 1 ];
21742172 [enc setBuffer: pos_buf offset: 0 atIndex: 2 ];
21752173 [enc setBuffer: kvd_buf offset: 0 atIndex: 3 ];
2176- [enc dispatchThreads: MTLSizeMake (kv_dim, 1 , 1 )
2177- threadsPerThreadgroup: MTLSizeMake (MIN (kv_dim, 256 ), 1 , 1 )];
2174+ {
2175+ NSUInteger tg_w = (NSUInteger )(kv_dim < 256 ? kv_dim : 256 );
2176+ [enc dispatchThreads: MTLSizeMake (kv_dim, 1 , 1 )
2177+ threadsPerThreadgroup: MTLSizeMake (tg_w, 1 , 1 )];
2178+ }
21782179 [enc memoryBarrierWithScope: MTLBarrierScopeBuffers ];
21792180
21802181 /* Write V to cache */
21812182 [enc setBuffer: vc_buf offset: 0 atIndex: 0 ];
21822183 [enc setBuffer: g_gpu_v offset: 0 atIndex: 1 ];
2183- [enc dispatchThreads: MTLSizeMake (kv_dim, 1 , 1 )
2184- threadsPerThreadgroup: MTLSizeMake (MIN (kv_dim, 256 ), 1 , 1 )];
2184+ {
2185+ NSUInteger tg_w = (NSUInteger )(kv_dim < 256 ? kv_dim : 256 );
2186+ [enc dispatchThreads: MTLSizeMake (kv_dim, 1 , 1 )
2187+ threadsPerThreadgroup: MTLSizeMake (tg_w, 1 , 1 )];
2188+ }
21852189 [enc memoryBarrierWithScope: MTLBarrierScopeBuffers ];
21862190 }
21872191
0 commit comments