antirez · TheTom · May 24, 2026 · May 24, 2026 · May 24, 2026 · May 23, 2026
diff --git a/Makefile b/Makefile
@@ -82,7 +82,7 @@ help:
 	@echo "  make clean               Remove build outputs"
 
 cuda-spark:
-	$(MAKE) ds4 ds4-server ds4-bench ds4-eval ds4-agent CUDA_ARCH=
+	$(MAKE) ds4 ds4-server ds4-bench ds4-eval ds4-agent CUDA_ARCH=sm_120
 
 cuda-generic:
 	$(MAKE) ds4 ds4-server ds4-bench ds4-eval ds4-agent CUDA_ARCH=native

diff --git a/ds4.c b/ds4.c
diff --git a/ds4.h b/ds4.h
@@ -20,6 +20,77 @@ typedef enum {
     DS4_BACKEND_CPU,
 } ds4_backend;
 
+/* KV cache compression dtype selection.
+ *
+ * DS4_KV_FP8 (default): the historical path.  The non-RoPE part of each compressed
+ * KV row goes through an in-place E4M3 round trip in groups of 64 - values stay as
+ * float32 in memory but pick up the FP8 quantization error so the CPU reference
+ * matches what the Metal graph would store as packed FP8.  No layout change.
+ *
+ * DS4_KV_TURBO3: TurboQuant+ port from TheTom/llama-cpp-turboquant.  Storage layout
+ * is packed 3-bit Lloyd-Max indices + per-group FP8 scale bytes - the cache buffer
+ * is byte-addressed at `row * ds4_kv_row_bytes(head_dim, n_rot, ...)`, NOT
+ * float-addressed at `row * head_dim`.  Every attention kernel inline-dequants
+ * the packed bytes on V-load.  The Randomized Hadamard rotation + N(0,1) Lloyd-Max
+ * codebook + matched-norm L2 scale are computed once on cache store; reads pay only
+ * the dequant (one byte load + LUT lookup + FP8-to-f32 multiply per element).
+ *
+ * Memory savings per row: ds4 head_dim=512, n_rot=64, group=64.
+ *   fp8 (float-sim): 512 * 4 = 2048 bytes
+ *   turbo3 (packed): (448*3/8) + (448/64) + (64*4) = 168 + 7 + 256 = 431 bytes
+ *   -> 4.75x smaller per row.  The 9x figure in upstream TQ+ docs is a latent-only
+ *      ratio (here 1792/175 = ~10.2x); RoPE-tail floats are unavoidable on MLA. */
+typedef enum {
+    DS4_KV_FP8 = 0,    /* default: in-place E4M3 round trip, rows stay float32 */
+    DS4_KV_TURBO3 = 1, /* packed 3-bit indices + per-group FP8 scale bytes */
+} ds4_kv_dtype;
+const char *ds4_kv_dtype_name(ds4_kv_dtype dtype);
+int ds4_kv_dtype_from_name(const char *name, ds4_kv_dtype *out);
+
+/* Packed turbo3 byte layout per cache row.  GROUP_SIZE is 64 - the same WHT
+ * group cadence the float-sim quantizer uses, one matched-norm L2 scale per
+ * 64 elements.  See `dsv4_turbo3_kv_quantize_row_inplace_cpu` in ds4.c for
+ * the per-group algorithm.
+ *
+ *   data section   : (head_dim - n_rot) * 3 / 8 bytes
+ *                    packed 3-bit indices, 8 values per 3 bytes
+ *                    (b0 = v0|(v1<<3)|(v2<<6), b1 = (v2>>2)|(v3<<1)|..., b2 = ...)
+ *   scale section  : (head_dim - n_rot) / DS4_TURBO3_GROUP_SIZE FP8 E4M3 bytes
+ *                    one per 64-element group, matched-norm L2 scale
+ *   rope tail      : n_rot * sizeof(float)
+ *                    untouched RoPE coordinates (these carry positional freqs)
+ *
+ * Stored values are in the ORIGINAL basis (we apply the inverse rotation on
+ * write so the dequanted values match what the float-sim path produced).
+ * Readers dequant one 64-element group at a time into a small stack scratch
+ * via `dequant_group`: load 24 packed bytes + 1 FP8 scale -> 64 floats in the
+ * rotated basis (centroid * scale) -> 64-point iWHT-with-signs -> 64
+ * original-basis floats.  This trades ~3.5x dequant compute per group for
+ * ~10x less memory traffic (25 vs 256 bytes per group) vs the fp8 float-sim
+ * cache.  The advantage is that every existing reader (attention dot loops,
+ * compressor pool, disk save, MTP draft) sees the same original-basis values
+ * it did before - only the storage byte layout changes. */
+#define DS4_TURBO3_GROUP_SIZE 64u
+uint64_t ds4_kv_row_bytes(uint32_t head_dim, uint32_t n_rot, ds4_kv_dtype dtype);
+
+/* Footprint estimator broken down by section, parameterized on dtype.  Used by
+ * `ds4-bench --print-kv-footprint` to print side-by-side fp8 vs turbo3 sizes.
+ *
+ *   raw_bytes        : the SWA ring window across all layers.
+ *   compressed_bytes : per-layer compressor output + indexer (always float).
+ *   total_bytes      : sum of the above.
+ *
+ * For turbo3, `raw_bytes` reflects the packed-byte layout.  The compressed
+ * pools (attn_comp + index_comp) and the compressor state arrays remain
+ * float because the compressor pool integrates softmax-weighted accumulations
+ * that require an original-basis read. */
+typedef struct {
+    uint64_t raw_bytes;        /* SWA ring window across all layers */
+    uint64_t compressed_bytes; /* per-layer compressor output + indexer */
+    uint64_t total_bytes;      /* raw_bytes + compressed_bytes */
+} ds4_kv_footprint;
+ds4_kv_footprint ds4_kv_footprint_estimate(ds4_backend backend, int ctx_size, ds4_kv_dtype dtype);
+
 typedef enum {
     DS4_THINK_NONE,
     DS4_THINK_HIGH,
@@ -73,6 +144,10 @@ typedef struct {
     bool warm_weights;
     bool quality;
     bool inspect_only;
+    /* KV cache dtype.  Default DS4_KV_FP8 keeps the historical path; DS4_KV_TURBO3
+     * swaps in the TurboQuant+ 3-bit-per-element quality simulation on CUDA and
+     * CPU reference.  See ds4_kv_dtype above for the algorithm summary. */
+    ds4_kv_dtype kv_dtype;
 } ds4_engine_options;
 
 typedef void (*ds4_token_emit_fn)(void *ud, int token);