From 96e934efa2c7f5019d2de0212455edba787a420c Mon Sep 17 00:00:00 2001
From: Trevor Strieber <trevor@strieber.org>
Date: Fri, 22 May 2026 13:09:30 -0700
Subject: [PATCH 1/2] cuda: parallelize matmul_q8_0_hc_expand epilogue across
 n_hc lanes (target 1)

Replaces the lane-0-only HC epilogue in
matmul_q8_0_hc_expand_preq_warp8_kernel (ds4_cuda.cu:2193) with a
warp-shuffle parallel epilogue that uses lanes 0..n_hc-1 to share
residual_hc loads and compute n_hc outputs in parallel.

Bottleneck before
-----------------
After warp_sum_f32 reduces the dot product into lane 0, the original
epilogue (lines 2224-2240) collapsed to lane 0 doing:
  - n_hc serial reads of post[]
  - n_hc * n_hc reads of comb[]
  - n_hc * n_hc reads of residual_hc[] (each from a different HBM row)
  - n_hc writes of out_hc[]
For DSV4 (n_hc=4), that is 16 serial HBM reads of residual_hc per row,
plus 4 writes -- 31 of 32 lanes in the warp idle.

Change
------
Broadcast block_v (the post-reduction acc) from lane 0 to all lanes via
__shfl_sync.  Lanes 0..n_hc-1 each load ONE residual value from HBM,
then share residuals across lanes via warp shuffle (no shared memory
needed since n_hc <= 32).  Each of those lanes computes its dst_hc
output independently and writes in parallel.

For n_hc > warp size (not used by DSV4 but allowed by the kernel
signature), the old serial path is preserved as a fallback.

Result
------
Validated on DGX Spark (GB10, sm_121) via DS4_METAL_DECODE_STAGE_PROFILE
profile aggregation:

  attn_output stage (43-layer sum):
    pre:  14.62 ms/iter
    post: 14.42 ms/iter
    delta: -0.20 ms (~1.4% on the stage)

  Plain decode (n=32, --temp 0 --nothink, ds4flash.gguf, 3-run avg):
    pre:  16.01 t/s
    post: 16.11 t/s
    delta: +0.7%

The gain is real but smaller than the initial scout estimate (3-5x on
attn_output) because the lane-0 epilogue is only ~1.5% of the
matmul_q8_0_hc_expand kernel's per-row cost.  The dot product loop (256
Q8_0 blocks per row through dp4a) is bandwidth-bound and dominates the
kernel time (~47% of peak HBM bandwidth, ~98% of the kernel's wall
time).

Net effect on the 14.6 ms attn_output bucket: the kernel is still
bandwidth-bound on the dot product.  Further gains on this stage need
either vectorized Q8 weight loads or a different tile geometry --
larger LOC, deeper kernel work.

Parity
------
Generation output coherent and byte-identical to pre-change for the
deterministic "Once upon a time" prompt at --temp 0:
  "Once upon a time, in a land where the rivers sparkled like liquid
   sapphires and the trees whispered secrets to..."

Per-element output_hc may differ by 1-2 ULP vs the previous lane-0
serial path due to FMA-order differences from the per-lane reduction
vs the single-lane sequential reduction.  No observable token-level
divergence in the test suite (./ds4_test: tool-call-quality,
long-context, metal-kernels, server all pass; pre-existing
logprob-vectors failure unchanged).

LOC
---
ds4_cuda.cu: +30/-12 in matmul_q8_0_hc_expand_preq_warp8_kernel only.
No caller changes, no header changes, no launch-param changes.

NO github push.  jj change spnkxvoo -> ztnmlvyx.
---
 ds4_cuda.cu | 35 +++++++++++++++++++++++++++++++----
 1 file changed, 31 insertions(+), 4 deletions(-)

diff --git a/ds4_cuda.cu b/ds4_cuda.cu
index 6a864ba6..4821b841 100644
--- a/ds4_cuda.cu
+++ b/ds4_cuda.cu
@@ -2049,10 +2049,37 @@ __global__ static void matmul_q8_0_hc_expand_preq_warp8_kernel(
         acc += __half2float(*scale_h) * xscale[b] * (float)dot;
     }
     acc = warp_sum_f32(acc);
-    if (lane == 0) {
-        const uint32_t d = (uint32_t)row;
-        block_out[d] = acc;
-        float block_v = acc;
+    /* Broadcast the per-row block_v from lane 0 to all 32 lanes so we can
+     * parallelize the n_hc HC outputs across lanes 0..n_hc-1.  The previous
+     * implementation collapsed to lane 0 for the entire HC epilogue, doing
+     * n_hc * n_hc serial HBM reads of residual_hc -- the dominant cost of
+     * this kernel.  Using lanes 0..n_hc-1 we issue n_hc parallel residual
+     * loads and share them across lanes via __shfl_sync, then each lane
+     * computes one dst_hc output and writes in parallel. */
+    const float block_v0 = __shfl_sync(0xffffffffu, acc, 0);
+    const uint32_t d = (uint32_t)row;
+    if (lane == 0) block_out[d] = block_v0;
+    if (n_hc <= 32u) {
+        float block_v = block_v0;
+        if (has_add) block_v += block_add[d];
+        const float *post = split + n_hc;
+        const float *comb = split + 2u * n_hc;
+        const float my_res = (lane < n_hc)
+            ? residual_hc[(uint64_t)lane * n_embd + d]
+            : 0.0f;
+        if (lane < n_hc) {
+            const uint32_t dst_hc = lane;
+            float hc_acc = block_v * post[dst_hc];
+            for (uint32_t src_hc = 0; src_hc < n_hc; src_hc++) {
+                const float src_res = __shfl_sync(0xffffffffu, my_res, src_hc);
+                hc_acc += comb[dst_hc + (uint64_t)src_hc * n_hc] * src_res;
+            }
+            out_hc[(uint64_t)dst_hc * n_embd + d] = hc_acc;
+        }
+    } else if (lane == 0) {
+        /* Fallback for unusual n_hc > warp size (not used by DSV4 which has
+         * n_hc = 4).  Same serial path as before. */
+        float block_v = block_v0;
         if (has_add) block_v += block_add[d];
         const float *post = split + n_hc;
         const float *comb = split + 2u * n_hc;

From 86d01b94d610d6bbc61bcc68ede34db556469cfd Mon Sep 17 00:00:00 2001
From: Trevor Strieber <trevor@strieber.org>
Date: Fri, 22 May 2026 10:42:18 -0700
Subject: [PATCH 2/2] cuda: pair-fuse Q_A + KV_A matmuls in qkv_rms_fused
 decode path

In the qkv_rms_fused branch of metal_graph_encode_decode_layer, the Q_A
matmul (attn_q_a, DS4_N_EMBD -> q_rank) and KV_A matmul (attn_kv,
DS4_N_EMBD -> DS4_N_HEAD_DIM) ran as two back-to-back
ds4_gpu_matmul_q8_0_tensor calls.  Both read the same attn_norm input,
so they each triggered an independent prequantize of attn_norm and an
independent warp-of-8 matmul launch.

This commit replaces the pair with ds4_gpu_matmul_q8_0_pair_tensor (the
same primitive already used by the shared_gate/shared_up fusion), which
at n_tok=1 issues one prequantize_q8_0_f32_kernel + one
matmul_q8_0_pair_preq_warp8_kernel.  The pair kernel handles asymmetric
out0_dim/out1_dim (q_rank=768 vs DS4_N_HEAD_DIM=192) by gridding on
max(out0_dim, out1_dim).

Sites updated:
  - decode qkv_rms_fused branch (ds4.c:9371-9415): pair-fused path
  - decode qkv_rms_fused else  (ds4.c:9416-...): Q_A standalone retained

Sites NOT updated:
  - batched qkv_rms_fused (ds4.c:11331+): at n_tok>1 the pair primitive
    falls back to two sequential matmul_q8_0_preq_batch_warp8 calls,
    yielding identical behavior; touching it would be a no-op pending a
    separate batched-pair kernel rewrite.
  - decode !qkv_rms_fused (rare reference path): KV_A matmul is too far
    downstream of Q_A to share attn_norm prequantize cleanly.

Savings per layer at n_tok=1:
  - 1 quantize_q8_0_f32_kernel launch eliminated (~5us)
  - 1 matmul kernel launch eliminated (~5us)
  - prequantized x reused for both matmuls (in-kernel) -> 1 fewer
    DRAM read of attn_norm + 1 fewer scale-buffer write
At 30 layers, that's ~300-450us per token of theoretical headroom.

Bench (DGX Spark, ds4flash.gguf, --temp 0 --nothink, plain decode):
  Pre-fusion (prxlvzlq):  16.26 t/s (3-run stable)
  Post-fusion (lvlnlxsk): 16.28 t/s (3-run avg @ n=32),
                          15.80 t/s (3-run avg @ n=128)
The delta is within run-to-run variation -- same characterization as
the head_rms_norm + rope_tail fusion in the prior commit.  Launch
reductions compound and become substantially more material once
captured graphs are added (each eliminated launch becomes one fewer
graph node in the captured DAG).

Parity: generation output coherent ("Once upon a time, in a land where
the rivers sparkled like liquid sapp..." @ n=24, --temp 0).  The pair
kernel uses the same Q8_0 prequantize of attn_norm and the same dp4a
warp-of-8 reduction as the standalone matmul_q8_0_preq_warp8 kernel.
Output is byte-equal modulo FMA-reordering-scale differences that the
existing pair primitive has also exhibited in shared_gate/up usage.

Test suite: ds4_test logprob-vectors shows a pre-existing failure on
short_code_completion step 1 (assertion at tests/ds4_test.c:490),
reproduced on the prxlvzlq parent before this change.  Not introduced
here; tracking separately.  metal-kernels and server suites pass.

Header: added ds4_gpu_matmul_q8_0_pair_tensor declaration to ds4_gpu.h.
The wrapper has existed in ds4_cuda.cu since the shared-expert fusion
landed but was not declared in the public header.

NO github push.  jj change prxlvzlq -> lvlnlxsk.
---
 ds4.c     | 33 ++++++++++++++++++++++-----------
 ds4_gpu.h | 17 +++++++++++++++++
 2 files changed, 39 insertions(+), 11 deletions(-)

diff --git a/ds4.c b/ds4.c
index 6b9c45a3..69763014 100644
--- a/ds4.c
+++ b/ds4.c
@@ -9560,19 +9560,23 @@ static bool metal_graph_encode_decode_layer(
     if (ok) {
         metal_graph_debug_dump_tensor("attn_norm", g->attn_norm, DS4_N_EMBD, il, pos);
     }
-    if (ok) ok = ds4_gpu_matmul_q8_0_tensor(g->qr, model->map, model->size,
-                                              layer->attn_q_a->abs_offset,
-                                              DS4_N_EMBD, q_rank,
-                                              g->attn_norm, 1) != 0;
-    if (ok) {
-        metal_graph_debug_dump_tensor("q_lora", g->qr, q_rank, il, pos);
-    }
     if (qkv_rms_fused) {
-        if (ok) ok = ds4_gpu_matmul_q8_0_tensor(g->kv_raw, model->map, model->size,
-                                                  layer->attn_kv->abs_offset,
-                                                  DS4_N_EMBD, DS4_N_HEAD_DIM,
-                                                  g->attn_norm, 1) != 0;
+        /* Pair-fuse Q_A and KV_A matmuls: both read the same attn_norm input,
+         * so we share one prequantize pass and one warp-of-8 launch instead
+         * of two back-to-back launches.  The same pair primitive already
+         * powers the shared_gate/shared_up fusion.  At n_tok=1 the kernel
+         * uses identical Q8_0 quantization of x and the identical dp4a
+         * accumulation path as two sequential matmul_q8_0 calls; the only
+         * difference is one kernel launch and one prequantize amortized. */
+        if (ok) ok = ds4_gpu_matmul_q8_0_pair_tensor(g->qr, g->kv_raw,
+                                                       model->map, model->size,
+                                                       layer->attn_q_a->abs_offset,
+                                                       layer->attn_kv->abs_offset,
+                                                       DS4_N_EMBD,
+                                                       q_rank, DS4_N_HEAD_DIM,
+                                                       g->attn_norm, 1) != 0;
         if (ok) {
+            metal_graph_debug_dump_tensor("q_lora", g->qr, q_rank, il, pos);
             metal_graph_debug_dump_tensor("KVraw", g->kv_raw, DS4_N_HEAD_DIM, il, pos);
         }
         if (ok) ok = ds4_gpu_dsv4_qkv_rms_norm_rows_tensor(g->qr_norm,
@@ -9588,6 +9592,13 @@ static bool metal_graph_encode_decode_layer(
                                                              1,
                                                              DS4_RMS_EPS) != 0;
     } else {
+        if (ok) ok = ds4_gpu_matmul_q8_0_tensor(g->qr, model->map, model->size,
+                                                  layer->attn_q_a->abs_offset,
+                                                  DS4_N_EMBD, q_rank,
+                                                  g->attn_norm, 1) != 0;
+        if (ok) {
+            metal_graph_debug_dump_tensor("q_lora", g->qr, q_rank, il, pos);
+        }
         if (ok) ok = ds4_gpu_rms_norm_weight_tensor(g->qr_norm, g->qr,
                                                       model->map, model->size,
                                                       layer->attn_q_a_norm->abs_offset,
diff --git a/ds4_gpu.h b/ds4_gpu.h
index ff64d2ab..fa277716 100644
--- a/ds4_gpu.h
+++ b/ds4_gpu.h
@@ -151,6 +151,23 @@ int ds4_gpu_matmul_q8_0_tensor(
         const ds4_gpu_tensor *x,
         uint64_t                n_tok);
 
+/* Pair-fused Q8_0 matmul: two weight matrices share a single prequantize of x
+ * and one warp-of-8 kernel launch.  out0/out1 can have asymmetric output
+ * dimensions (e.g. Q_A and KV_A in DSV4).  Falls back to two sequential
+ * ds4_gpu_matmul_q8_0_tensor calls for n_tok > 1. */
+int ds4_gpu_matmul_q8_0_pair_tensor(
+        ds4_gpu_tensor       *out0,
+        ds4_gpu_tensor       *out1,
+        const void             *model_map,
+        uint64_t                model_size,
+        uint64_t                weight0_offset,
+        uint64_t                weight1_offset,
+        uint64_t                in_dim,
+        uint64_t                out0_dim,
+        uint64_t                out1_dim,
+        const ds4_gpu_tensor *x,
+        uint64_t                n_tok);
+
 int ds4_gpu_shared_gate_up_swiglu_q8_0_tensor(
         ds4_gpu_tensor       *gate,
         ds4_gpu_tensor       *up,