Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 22 additions & 11 deletions ds4.c
Original file line number Diff line number Diff line change
Expand Up @@ -9560,19 +9560,23 @@ static bool metal_graph_encode_decode_layer(
if (ok) {
metal_graph_debug_dump_tensor("attn_norm", g->attn_norm, DS4_N_EMBD, il, pos);
}
if (ok) ok = ds4_gpu_matmul_q8_0_tensor(g->qr, model->map, model->size,
layer->attn_q_a->abs_offset,
DS4_N_EMBD, q_rank,
g->attn_norm, 1) != 0;
if (ok) {
metal_graph_debug_dump_tensor("q_lora", g->qr, q_rank, il, pos);
}
if (qkv_rms_fused) {
if (ok) ok = ds4_gpu_matmul_q8_0_tensor(g->kv_raw, model->map, model->size,
layer->attn_kv->abs_offset,
DS4_N_EMBD, DS4_N_HEAD_DIM,
g->attn_norm, 1) != 0;
/* Pair-fuse Q_A and KV_A matmuls: both read the same attn_norm input,
* so we share one prequantize pass and one warp-of-8 launch instead
* of two back-to-back launches. The same pair primitive already
* powers the shared_gate/shared_up fusion. At n_tok=1 the kernel
* uses identical Q8_0 quantization of x and the identical dp4a
* accumulation path as two sequential matmul_q8_0 calls; the only
* difference is one kernel launch and one prequantize amortized. */
if (ok) ok = ds4_gpu_matmul_q8_0_pair_tensor(g->qr, g->kv_raw,
model->map, model->size,
layer->attn_q_a->abs_offset,
layer->attn_kv->abs_offset,
DS4_N_EMBD,
q_rank, DS4_N_HEAD_DIM,
g->attn_norm, 1) != 0;
if (ok) {
metal_graph_debug_dump_tensor("q_lora", g->qr, q_rank, il, pos);
metal_graph_debug_dump_tensor("KVraw", g->kv_raw, DS4_N_HEAD_DIM, il, pos);
}
if (ok) ok = ds4_gpu_dsv4_qkv_rms_norm_rows_tensor(g->qr_norm,
Expand All @@ -9588,6 +9592,13 @@ static bool metal_graph_encode_decode_layer(
1,
DS4_RMS_EPS) != 0;
} else {
if (ok) ok = ds4_gpu_matmul_q8_0_tensor(g->qr, model->map, model->size,
layer->attn_q_a->abs_offset,
DS4_N_EMBD, q_rank,
g->attn_norm, 1) != 0;
if (ok) {
metal_graph_debug_dump_tensor("q_lora", g->qr, q_rank, il, pos);
}
if (ok) ok = ds4_gpu_rms_norm_weight_tensor(g->qr_norm, g->qr,
model->map, model->size,
layer->attn_q_a_norm->abs_offset,
Expand Down
35 changes: 31 additions & 4 deletions ds4_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2049,10 +2049,37 @@ __global__ static void matmul_q8_0_hc_expand_preq_warp8_kernel(
acc += __half2float(*scale_h) * xscale[b] * (float)dot;
}
acc = warp_sum_f32(acc);
if (lane == 0) {
const uint32_t d = (uint32_t)row;
block_out[d] = acc;
float block_v = acc;
/* Broadcast the per-row block_v from lane 0 to all 32 lanes so we can
* parallelize the n_hc HC outputs across lanes 0..n_hc-1. The previous
* implementation collapsed to lane 0 for the entire HC epilogue, doing
* n_hc * n_hc serial HBM reads of residual_hc -- the dominant cost of
* this kernel. Using lanes 0..n_hc-1 we issue n_hc parallel residual
* loads and share them across lanes via __shfl_sync, then each lane
* computes one dst_hc output and writes in parallel. */
const float block_v0 = __shfl_sync(0xffffffffu, acc, 0);
const uint32_t d = (uint32_t)row;
if (lane == 0) block_out[d] = block_v0;
if (n_hc <= 32u) {
float block_v = block_v0;
if (has_add) block_v += block_add[d];
const float *post = split + n_hc;
const float *comb = split + 2u * n_hc;
const float my_res = (lane < n_hc)
? residual_hc[(uint64_t)lane * n_embd + d]
: 0.0f;
if (lane < n_hc) {
const uint32_t dst_hc = lane;
float hc_acc = block_v * post[dst_hc];
for (uint32_t src_hc = 0; src_hc < n_hc; src_hc++) {
const float src_res = __shfl_sync(0xffffffffu, my_res, src_hc);
hc_acc += comb[dst_hc + (uint64_t)src_hc * n_hc] * src_res;
}
out_hc[(uint64_t)dst_hc * n_embd + d] = hc_acc;
}
} else if (lane == 0) {
/* Fallback for unusual n_hc > warp size (not used by DSV4 which has
* n_hc = 4). Same serial path as before. */
float block_v = block_v0;
if (has_add) block_v += block_add[d];
const float *post = split + n_hc;
const float *comb = split + 2u * n_hc;
Expand Down
17 changes: 17 additions & 0 deletions ds4_gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,23 @@ int ds4_gpu_matmul_q8_0_tensor(
const ds4_gpu_tensor *x,
uint64_t n_tok);

/* Pair-fused Q8_0 matmul: two weight matrices share a single prequantize of x
* and one warp-of-8 kernel launch. out0/out1 can have asymmetric output
* dimensions (e.g. Q_A and KV_A in DSV4). Falls back to two sequential
* ds4_gpu_matmul_q8_0_tensor calls for n_tok > 1. */
int ds4_gpu_matmul_q8_0_pair_tensor(
ds4_gpu_tensor *out0,
ds4_gpu_tensor *out1,
const void *model_map,
uint64_t model_size,
uint64_t weight0_offset,
uint64_t weight1_offset,
uint64_t in_dim,
uint64_t out0_dim,
uint64_t out1_dim,
const ds4_gpu_tensor *x,
uint64_t n_tok);

int ds4_gpu_shared_gate_up_swiglu_q8_0_tensor(
ds4_gpu_tensor *gate,
ds4_gpu_tensor *up,
Expand Down