diff --git a/ds4.c b/ds4.c index 6b9c45a3..69763014 100644 --- a/ds4.c +++ b/ds4.c @@ -9560,19 +9560,23 @@ static bool metal_graph_encode_decode_layer( if (ok) { metal_graph_debug_dump_tensor("attn_norm", g->attn_norm, DS4_N_EMBD, il, pos); } - if (ok) ok = ds4_gpu_matmul_q8_0_tensor(g->qr, model->map, model->size, - layer->attn_q_a->abs_offset, - DS4_N_EMBD, q_rank, - g->attn_norm, 1) != 0; - if (ok) { - metal_graph_debug_dump_tensor("q_lora", g->qr, q_rank, il, pos); - } if (qkv_rms_fused) { - if (ok) ok = ds4_gpu_matmul_q8_0_tensor(g->kv_raw, model->map, model->size, - layer->attn_kv->abs_offset, - DS4_N_EMBD, DS4_N_HEAD_DIM, - g->attn_norm, 1) != 0; + /* Pair-fuse Q_A and KV_A matmuls: both read the same attn_norm input, + * so we share one prequantize pass and one warp-of-8 launch instead + * of two back-to-back launches. The same pair primitive already + * powers the shared_gate/shared_up fusion. At n_tok=1 the kernel + * uses identical Q8_0 quantization of x and the identical dp4a + * accumulation path as two sequential matmul_q8_0 calls; the only + * difference is one kernel launch and one prequantize amortized. */ + if (ok) ok = ds4_gpu_matmul_q8_0_pair_tensor(g->qr, g->kv_raw, + model->map, model->size, + layer->attn_q_a->abs_offset, + layer->attn_kv->abs_offset, + DS4_N_EMBD, + q_rank, DS4_N_HEAD_DIM, + g->attn_norm, 1) != 0; if (ok) { + metal_graph_debug_dump_tensor("q_lora", g->qr, q_rank, il, pos); metal_graph_debug_dump_tensor("KVraw", g->kv_raw, DS4_N_HEAD_DIM, il, pos); } if (ok) ok = ds4_gpu_dsv4_qkv_rms_norm_rows_tensor(g->qr_norm, @@ -9588,6 +9592,13 @@ static bool metal_graph_encode_decode_layer( 1, DS4_RMS_EPS) != 0; } else { + if (ok) ok = ds4_gpu_matmul_q8_0_tensor(g->qr, model->map, model->size, + layer->attn_q_a->abs_offset, + DS4_N_EMBD, q_rank, + g->attn_norm, 1) != 0; + if (ok) { + metal_graph_debug_dump_tensor("q_lora", g->qr, q_rank, il, pos); + } if (ok) ok = ds4_gpu_rms_norm_weight_tensor(g->qr_norm, g->qr, model->map, model->size, layer->attn_q_a_norm->abs_offset, diff --git a/ds4_cuda.cu b/ds4_cuda.cu index 6a864ba6..4821b841 100644 --- a/ds4_cuda.cu +++ b/ds4_cuda.cu @@ -2049,10 +2049,37 @@ __global__ static void matmul_q8_0_hc_expand_preq_warp8_kernel( acc += __half2float(*scale_h) * xscale[b] * (float)dot; } acc = warp_sum_f32(acc); - if (lane == 0) { - const uint32_t d = (uint32_t)row; - block_out[d] = acc; - float block_v = acc; + /* Broadcast the per-row block_v from lane 0 to all 32 lanes so we can + * parallelize the n_hc HC outputs across lanes 0..n_hc-1. The previous + * implementation collapsed to lane 0 for the entire HC epilogue, doing + * n_hc * n_hc serial HBM reads of residual_hc -- the dominant cost of + * this kernel. Using lanes 0..n_hc-1 we issue n_hc parallel residual + * loads and share them across lanes via __shfl_sync, then each lane + * computes one dst_hc output and writes in parallel. */ + const float block_v0 = __shfl_sync(0xffffffffu, acc, 0); + const uint32_t d = (uint32_t)row; + if (lane == 0) block_out[d] = block_v0; + if (n_hc <= 32u) { + float block_v = block_v0; + if (has_add) block_v += block_add[d]; + const float *post = split + n_hc; + const float *comb = split + 2u * n_hc; + const float my_res = (lane < n_hc) + ? residual_hc[(uint64_t)lane * n_embd + d] + : 0.0f; + if (lane < n_hc) { + const uint32_t dst_hc = lane; + float hc_acc = block_v * post[dst_hc]; + for (uint32_t src_hc = 0; src_hc < n_hc; src_hc++) { + const float src_res = __shfl_sync(0xffffffffu, my_res, src_hc); + hc_acc += comb[dst_hc + (uint64_t)src_hc * n_hc] * src_res; + } + out_hc[(uint64_t)dst_hc * n_embd + d] = hc_acc; + } + } else if (lane == 0) { + /* Fallback for unusual n_hc > warp size (not used by DSV4 which has + * n_hc = 4). Same serial path as before. */ + float block_v = block_v0; if (has_add) block_v += block_add[d]; const float *post = split + n_hc; const float *comb = split + 2u * n_hc; diff --git a/ds4_gpu.h b/ds4_gpu.h index ff64d2ab..fa277716 100644 --- a/ds4_gpu.h +++ b/ds4_gpu.h @@ -151,6 +151,23 @@ int ds4_gpu_matmul_q8_0_tensor( const ds4_gpu_tensor *x, uint64_t n_tok); +/* Pair-fused Q8_0 matmul: two weight matrices share a single prequantize of x + * and one warp-of-8 kernel launch. out0/out1 can have asymmetric output + * dimensions (e.g. Q_A and KV_A in DSV4). Falls back to two sequential + * ds4_gpu_matmul_q8_0_tensor calls for n_tok > 1. */ +int ds4_gpu_matmul_q8_0_pair_tensor( + ds4_gpu_tensor *out0, + ds4_gpu_tensor *out1, + const void *model_map, + uint64_t model_size, + uint64_t weight0_offset, + uint64_t weight1_offset, + uint64_t in_dim, + uint64_t out0_dim, + uint64_t out1_dim, + const ds4_gpu_tensor *x, + uint64_t n_tok); + int ds4_gpu_shared_gate_up_swiglu_q8_0_tensor( ds4_gpu_tensor *gate, ds4_gpu_tensor *up,