TrevorS · TrevorS · May 22, 2026 · May 22, 2026
diff --git a/ds4.c b/ds4.c
@@ -9560,19 +9560,23 @@ static bool metal_graph_encode_decode_layer(
     if (ok) {
         metal_graph_debug_dump_tensor("attn_norm", g->attn_norm, DS4_N_EMBD, il, pos);
     }
-    if (ok) ok = ds4_gpu_matmul_q8_0_tensor(g->qr, model->map, model->size,
-                                              layer->attn_q_a->abs_offset,
-                                              DS4_N_EMBD, q_rank,
-                                              g->attn_norm, 1) != 0;
-    if (ok) {
-        metal_graph_debug_dump_tensor("q_lora", g->qr, q_rank, il, pos);
-    }
     if (qkv_rms_fused) {
-        if (ok) ok = ds4_gpu_matmul_q8_0_tensor(g->kv_raw, model->map, model->size,
-                                                  layer->attn_kv->abs_offset,
-                                                  DS4_N_EMBD, DS4_N_HEAD_DIM,
-                                                  g->attn_norm, 1) != 0;
+        /* Pair-fuse Q_A and KV_A matmuls: both read the same attn_norm input,
+         * so we share one prequantize pass and one warp-of-8 launch instead
+         * of two back-to-back launches.  The same pair primitive already
+         * powers the shared_gate/shared_up fusion.  At n_tok=1 the kernel
+         * uses identical Q8_0 quantization of x and the identical dp4a
+         * accumulation path as two sequential matmul_q8_0 calls; the only
+         * difference is one kernel launch and one prequantize amortized. */
+        if (ok) ok = ds4_gpu_matmul_q8_0_pair_tensor(g->qr, g->kv_raw,
+                                                       model->map, model->size,
+                                                       layer->attn_q_a->abs_offset,
+                                                       layer->attn_kv->abs_offset,
+                                                       DS4_N_EMBD,
+                                                       q_rank, DS4_N_HEAD_DIM,
+                                                       g->attn_norm, 1) != 0;
         if (ok) {
+            metal_graph_debug_dump_tensor("q_lora", g->qr, q_rank, il, pos);
             metal_graph_debug_dump_tensor("KVraw", g->kv_raw, DS4_N_HEAD_DIM, il, pos);
         }
         if (ok) ok = ds4_gpu_dsv4_qkv_rms_norm_rows_tensor(g->qr_norm,
@@ -9588,6 +9592,13 @@ static bool metal_graph_encode_decode_layer(
                                                              1,
                                                              DS4_RMS_EPS) != 0;
     } else {
+        if (ok) ok = ds4_gpu_matmul_q8_0_tensor(g->qr, model->map, model->size,
+                                                  layer->attn_q_a->abs_offset,
+                                                  DS4_N_EMBD, q_rank,
+                                                  g->attn_norm, 1) != 0;
+        if (ok) {
+            metal_graph_debug_dump_tensor("q_lora", g->qr, q_rank, il, pos);
+        }
         if (ok) ok = ds4_gpu_rms_norm_weight_tensor(g->qr_norm, g->qr,
                                                       model->map, model->size,
                                                       layer->attn_q_a_norm->abs_offset,

diff --git a/ds4_cuda.cu b/ds4_cuda.cu
@@ -2049,10 +2049,37 @@ __global__ static void matmul_q8_0_hc_expand_preq_warp8_kernel(
         acc += __half2float(*scale_h) * xscale[b] * (float)dot;
     }
     acc = warp_sum_f32(acc);
-    if (lane == 0) {
-        const uint32_t d = (uint32_t)row;
-        block_out[d] = acc;
-        float block_v = acc;
+    /* Broadcast the per-row block_v from lane 0 to all 32 lanes so we can
+     * parallelize the n_hc HC outputs across lanes 0..n_hc-1.  The previous
+     * implementation collapsed to lane 0 for the entire HC epilogue, doing
+     * n_hc * n_hc serial HBM reads of residual_hc -- the dominant cost of
+     * this kernel.  Using lanes 0..n_hc-1 we issue n_hc parallel residual
+     * loads and share them across lanes via __shfl_sync, then each lane
+     * computes one dst_hc output and writes in parallel. */
+    const float block_v0 = __shfl_sync(0xffffffffu, acc, 0);
+    const uint32_t d = (uint32_t)row;
+    if (lane == 0) block_out[d] = block_v0;
+    if (n_hc <= 32u) {
+        float block_v = block_v0;
+        if (has_add) block_v += block_add[d];
+        const float *post = split + n_hc;
+        const float *comb = split + 2u * n_hc;
+        const float my_res = (lane < n_hc)
+            ? residual_hc[(uint64_t)lane * n_embd + d]
+            : 0.0f;
+        if (lane < n_hc) {
+            const uint32_t dst_hc = lane;
+            float hc_acc = block_v * post[dst_hc];
+            for (uint32_t src_hc = 0; src_hc < n_hc; src_hc++) {
+                const float src_res = __shfl_sync(0xffffffffu, my_res, src_hc);
+                hc_acc += comb[dst_hc + (uint64_t)src_hc * n_hc] * src_res;
+            }
+            out_hc[(uint64_t)dst_hc * n_embd + d] = hc_acc;
+        }
+    } else if (lane == 0) {
+        /* Fallback for unusual n_hc > warp size (not used by DSV4 which has
+         * n_hc = 4).  Same serial path as before. */
+        float block_v = block_v0;
         if (has_add) block_v += block_add[d];
         const float *post = split + n_hc;
         const float *comb = split + 2u * n_hc;

diff --git a/ds4_gpu.h b/ds4_gpu.h
@@ -151,6 +151,23 @@ int ds4_gpu_matmul_q8_0_tensor(
         const ds4_gpu_tensor *x,
         uint64_t                n_tok);
 
+/* Pair-fused Q8_0 matmul: two weight matrices share a single prequantize of x
+ * and one warp-of-8 kernel launch.  out0/out1 can have asymmetric output
+ * dimensions (e.g. Q_A and KV_A in DSV4).  Falls back to two sequential
+ * ds4_gpu_matmul_q8_0_tensor calls for n_tok > 1. */
+int ds4_gpu_matmul_q8_0_pair_tensor(
+        ds4_gpu_tensor       *out0,
+        ds4_gpu_tensor       *out1,
+        const void             *model_map,
+        uint64_t                model_size,
+        uint64_t                weight0_offset,
+        uint64_t                weight1_offset,
+        uint64_t                in_dim,
+        uint64_t                out0_dim,
+        uint64_t                out1_dim,
+        const ds4_gpu_tensor *x,
+        uint64_t                n_tok);
+
 int ds4_gpu_shared_gate_up_swiglu_q8_0_tensor(
         ds4_gpu_tensor       *gate,
         ds4_gpu_tensor       *up,