Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ help:
@echo " make clean Remove build outputs"

cuda-spark:
$(MAKE) ds4 ds4-server ds4-bench ds4-eval ds4-agent CUDA_ARCH=
$(MAKE) ds4 ds4-server ds4-bench ds4-eval ds4-agent CUDA_ARCH=sm_120

cuda-generic:
$(MAKE) ds4 ds4-server ds4-bench ds4-eval ds4-agent CUDA_ARCH=native
Expand Down
1,268 changes: 1,105 additions & 163 deletions ds4.c

Large diffs are not rendered by default.

75 changes: 75 additions & 0 deletions ds4.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,77 @@ typedef enum {
DS4_BACKEND_CPU,
} ds4_backend;

/* KV cache compression dtype selection.
*
* DS4_KV_FP8 (default): the historical path. The non-RoPE part of each compressed
* KV row goes through an in-place E4M3 round trip in groups of 64 - values stay as
* float32 in memory but pick up the FP8 quantization error so the CPU reference
* matches what the Metal graph would store as packed FP8. No layout change.
*
* DS4_KV_TURBO3: TurboQuant+ port from TheTom/llama-cpp-turboquant. Storage layout
* is packed 3-bit Lloyd-Max indices + per-group FP8 scale bytes - the cache buffer
* is byte-addressed at `row * ds4_kv_row_bytes(head_dim, n_rot, ...)`, NOT
* float-addressed at `row * head_dim`. Every attention kernel inline-dequants
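* the packed bytes on V-load. The Randomized Hadamard rotation + N(0,1) Lloyd-Max
* codebook + matched-norm L2 scale are computed once on cache store; reads pay only
* the dequant (one byte load + LUT lookup + FP8-to-f32 multiply per element).
* NOTE(review): the row-layout comment on ds4_kv_row_bytes also lists a 64-point
* inverse WHT per group on the read path - reconcile the two cost descriptions.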
*
* Memory savings per row: ds4 head_dim=512, n_rot=64, group=64.
* fp8 (float-sim): 512 * 4 = 2048 bytes
* turbo3 (packed): (448*3/8) + (448/64) + (64*4) = 168 + 7 + 256 = 431 bytes
* -> 4.75x smaller per row (2048 / 431). Upstream TQ+ docs quote ~9x, which is
* a latent-only figure; the latent-only ratio here is 1792 / 175 = ~10.2x.
* RoPE-tail floats are unavoidable on MLA. */
typedef enum {
/* Default, historical path: rows stay float32 in memory but carry the E4M3
 * round-trip quantization error; no layout change. */
DS4_KV_FP8 = 0,
/* TurboQuant+ layout: packed 3-bit indices + per-group FP8 scale bytes;
 * rows are byte-addressed via ds4_kv_row_bytes(), not float-addressed. */
DS4_KV_TURBO3 = 1,
} ds4_kv_dtype;
/* Maps a ds4_kv_dtype to its textual name.
 * NOTE(review): presumably returns a static string that the caller must not
 * free; behavior for out-of-range values is not visible here - confirm in ds4.c. */
const char *ds4_kv_dtype_name(ds4_kv_dtype dtype);
/* Parses `name` and writes the matching dtype to `*out`.
 * NOTE(review): the return convention (which value means success) and whether
 * `*out` is modified on failure are not visible from this header - confirm in
 * ds4.c before relying on either. */
int ds4_kv_dtype_from_name(const char *name, ds4_kv_dtype *out);

/* Packed turbo3 byte layout per cache row. GROUP_SIZE is 64 - the same WHT
* group cadence the float-sim quantizer uses, one matched-norm L2 scale per
* 64 elements. See `dsv4_turbo3_kv_quantize_row_inplace_cpu` in ds4.c for
* the per-group algorithm.
*
* data section : (head_dim - n_rot) * 3 / 8 bytes
* packed 3-bit indices, 8 values per 3 bytes
* (b0 = v0|(v1<<3)|(v2<<6), b1 = (v2>>2)|(v3<<1)|..., b2 = ...)
* scale section : (head_dim - n_rot) / DS4_TURBO3_GROUP_SIZE FP8 E4M3 bytes
* one per 64-element group, matched-norm L2 scale
* rope tail : n_rot * sizeof(float)
* untouched RoPE coordinates (these carry positional freqs)
*
* NOTE(review): the next two sentences disagree about where the inverse
* rotation happens (on write vs. on read) - confirm against ds4.c and keep
* only the accurate description.
* Stored values are in the ORIGINAL basis (we apply the inverse rotation on
* write so the dequanted values match what the float-sim path produced).
* Readers dequant one 64-element group at a time into a small stack scratch
* via `dequant_group`: load 24 packed bytes + 1 FP8 scale -> 64 floats in the
* rotated basis (centroid * scale) -> 64-point iWHT-with-signs -> 64
* original-basis floats. This trades ~3.5x dequant compute per group for
* ~10x less memory traffic per group (25 bytes vs 64 * 4 = 256 bytes in the
* fp8 float-sim cache). The advantage is
* that every existing reader (attention dot loops, compressor pool, disk
* save, MTP draft) sees the same original-basis values it did before - only
* the storage byte layout changes. */
/* Elements per turbo3 quantization group; one FP8 scale byte is stored per group. */
#define DS4_TURBO3_GROUP_SIZE 64u
/* Size in bytes of one cache row for `dtype`; callers address the cache at
 * `row * ds4_kv_row_bytes(...)`.
 *   head_dim : total elements per row, including the RoPE tail.
 *   n_rot    : number of trailing RoPE elements kept as float.
 * NOTE(review): presumably returns head_dim * sizeof(float) for DS4_KV_FP8 and
 * assumes (head_dim - n_rot) is a multiple of DS4_TURBO3_GROUP_SIZE for
 * DS4_KV_TURBO3 - confirm in ds4.c. */
uint64_t ds4_kv_row_bytes(uint32_t head_dim, uint32_t n_rot, ds4_kv_dtype dtype);

/* Footprint estimator broken down by section, parameterized on dtype. Used by
* `ds4-bench --print-kv-footprint` to print side-by-side fp8 vs turbo3 sizes.
*
* raw_bytes : the SWA ring window across all layers.
* compressed_bytes : per-layer compressor output + indexer (always float).
* total_bytes : sum of the above.
*
* For turbo3, `raw_bytes` reflects the packed-byte layout. The compressed
* pools (attn_comp + index_comp) and the compressor state arrays remain
* float because the compressor pool integrates softmax-weighted accumulations
* that require an original-basis read. */
typedef struct {
/* Bytes for the SWA ring window across all layers. */
uint64_t raw_bytes;
/* Bytes for the per-layer compressor output + indexer (always float). */
uint64_t compressed_bytes;
/* raw_bytes + compressed_bytes. */
uint64_t total_bytes;
} ds4_kv_footprint;
/* Estimates the KV cache footprint for the given backend, context size and
 * KV dtype, returned by value as a per-section breakdown.
 * NOTE(review): `ctx_size` is presumably a token count and is a signed int;
 * handling of zero or negative values is not visible here - confirm in ds4.c. */
ds4_kv_footprint ds4_kv_footprint_estimate(ds4_backend backend, int ctx_size, ds4_kv_dtype dtype);

typedef enum {
DS4_THINK_NONE,
DS4_THINK_HIGH,
Expand Down Expand Up @@ -73,6 +144,10 @@ typedef struct {
bool warm_weights;
bool quality;
bool inspect_only;
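/* KV cache dtype. Default DS4_KV_FP8 keeps the historical path; DS4_KV_TURBO3
 * selects the TurboQuant+ 3-bit-per-element path on CUDA and CPU reference.
 * See ds4_kv_dtype above for the algorithm summary.
 * NOTE(review): this comment previously called turbo3 a "quality simulation",
 * while the ds4_kv_dtype comment describes a real packed byte layout - confirm
 * which is current. */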
ds4_kv_dtype kv_dtype;
} ds4_engine_options;

typedef void (*ds4_token_emit_fn)(void *ud, int token);
Expand Down
Loading