diff --git a/README.md b/README.md
index 8ad6ebf..a9beca0 100644
--- a/README.md
+++ b/README.md
@@ -31,10 +31,6 @@ cmake --build . --config Release -j$(nproc)
 
 Builds two binaries: `ace-qwen3` (LLM) and `dit-vae` (DiT + VAE).
 
-**CI (GitHub Actions)**  
-- **Build**: on every push/PR, builds on Ubuntu (BLAS) and macOS (Metal); smoke test runs each binary `--help`.  
-- **Test generation**: on release or manual trigger only; runs the same checks as **local** `tests/run-generation-tests.sh`. Validate locally first (build + `./models.sh`, then `tests/run-generation-tests.sh`), then use CI to confirm. See `.github/workflows/`.
-
 ## Models
 
 Pre-quantized GGUFs on [Hugging Face](https://huggingface.co/Serveurperso/ACE-Step-1.5-GGUF).
@@ -143,16 +139,10 @@ cd examples
 ./partial.sh          # caption + lyrics + duration
 ./full.sh             # all metadata provided
 ./dit-only.sh         # skip LLM, DiT from noise
-./cover.sh            # cover mode: decode precomputed audio_codes (no LLM)
-./cover-reference.sh  # cover + reference_audio for timbre (WAV/MP3; needs reference.wav or .mp3)
-./test-reference.sh   # reference_audio (WAV or MP3) + audio_cover_strength
-./lora.sh             # DiT + LoRA adapter
 ```
 
 Each example has a `-sft` variant (SFT model, 50 steps, CFG 7.0)
-alongside the turbo default (8 steps, no CFG). For **reference timbre**, set `reference_audio` to a **WAV or MP3** path; dit-vae loads it (MP3 decoded in memory via header-only minimp3, no temp files), encodes with the VAE encoder (requires a full VAE GGUF that includes encoder weights).
-
-**LoRA adapters**: use `--lora <path>` and optional `--lora-scale <float>` with dit-vae to run the DiT with PEFT-style Ace-Step LoRAs.
+alongside the turbo default (8 steps, no CFG).
 
 ## Generation modes
 
@@ -180,11 +170,10 @@ Run `dit-vae` to decode existing codes. See `examples/dit-only.json`.
 
 ## Request JSON reference
 
-All fields with defaults. Only `caption` is required. Built-in modes (text2music, cover, repaint) and audio inputs follow the [ACE-Step 1.5 Tutorial](https://github.com/ace-step/ACE-Step-1.5/blob/main/docs/en/Tutorial.md); see [docs/MODES.md](docs/MODES.md) for what is implemented.
+All fields with defaults. Only `caption` is required.
 
 ```json
 {
-    "task_type":          "text2music",
     "caption":            "",
     "lyrics":             "",
     "instrumental":       false,
@@ -199,12 +188,7 @@ All fields with defaults. Only `caption` is required. Built-in modes (text2music
     "lm_top_p":           0.9,
     "lm_top_k":           0,
     "lm_negative_prompt": "",
-    "reference_audio":    "",
-    "src_audio":          "",
     "audio_codes":        "",
-    "audio_cover_strength": 1.0,
-    "repainting_start":   0.0,
-    "repainting_end":     0.0,
     "inference_steps":    8,
     "guidance_scale":     7.0,
     "shift":              3.0
@@ -214,12 +198,7 @@ All fields with defaults. Only `caption` is required. Built-in modes (text2music
 Key fields: `seed` -1 means random (resolved once, then +1 per batch
 element). `audio_codes` is generated by ace-qwen3 and consumed by
 dit-vae (comma separated FSQ token IDs). When present, the LLM is
-skipped entirely (cover-style generation). `reference_audio`: path to a **WAV or MP3** file for global timbre/style (MP3 decoded in memory; encoded via built-in VAE encoder; requires VAE GGUF with encoder weights). `src_audio`: path to a **WAV or MP3** for cover source; dit-vae encodes it (VAE + FSQ nearest-codeword) to codes internally, no Python required (see docs/MODES.md).
-
-**Reference and cover strength (not the same as guidance_scale):**
-- **`audio_cover_strength`** (0.0–1.0): Controls how strongly the **cover/source** (from `audio_codes` or `src_audio`) influences the DiT context. The context is blended with silence: `(1 - audio_cover_strength)*silence + audio_cover_strength*decoded`. Use 1.0 for full cover influence, lower values to soften it. Only applies when cover context is present.
-- **`reference_audio`**: Timbre from the reference file is applied at full strength; there is no separate strength parameter for reference timbre.
-- **`guidance_scale`**: This is **DiT classifier-free guidance** (conditioned vs unconditioned prediction), not reference or cover strength. Turbo models ignore it (forced to 1.0).
+skipped entirely.
 
 Turbo preset: `inference_steps=8, shift=3.0` (no guidance_scale, turbo models don't use CFG).
 SFT preset: `inference_steps=50, guidance_scale=4.0, shift=6.0`.
@@ -241,6 +220,7 @@ Output naming: input.json -> input0.json, input1.json, ... (last digit = batch i
 Debug:
   --max-seq <N>          KV cache size (default: 8192)
   --no-fsm               Disable FSM constrained decoding
+  --no-fa                Disable flash attention
   --dump-logits <path>   Dump prefill logits (binary f32)
   --dump-tokens <path>   Dump prompt token IDs (CSV)
 ```
@@ -262,10 +242,6 @@ Required:
   --dit <gguf>            DiT GGUF file
   --vae <gguf>            VAE GGUF file
 
-LoRA:
-  --lora <path>           LoRA adapter (adapter_model.safetensors)
-  --lora-scale <float>    LoRA scale, e.g. alpha/rank (default: 1.0)
-
 Batch:
   --batch <N>             DiT variations per request (default: 1, max 9)
 
@@ -276,6 +252,7 @@ VAE tiling (memory control):
   --vae-overlap <N>       Overlap frames per side (default: 64)
 
 Debug:
+  --no-fa                 Disable flash attention
   --dump <dir>            Dump intermediate tensors
 ```
 
@@ -320,10 +297,7 @@ conditional and N unconditional sequences are packed into a single forward pass
 `logits = uncond + scale * (cond - uncond)`. The KV cache is a single 4D tensor
 `[D, max_seq, Nkv, n_sets]` shared across all batch elements and CFG paths. Shared
 prompts are prefilled once and cloned to other KV sets via copy, avoiding redundant
-prefills. Embedding lookup bypasses ggml_get_rows entirely: rows are read directly
-from the mmap'd GGUF file on CPU, dequantized, and uploaded as F32 input tensors.
-Decode uses a dedicated single-backend graph allocator (gallocr) with no scheduler
-dispatch overhead, while prefill uses the multi-backend scheduler for flexibility.
+prefills.
 
 ## Accuracy
 
@@ -343,42 +317,42 @@ python3 debug-dit-cossim.py       # DiT: per-layer cossim GGML vs Python (turbo/
 
 ## Patched GGML fork
 
-Uses a patched GGML fork (submodule) with ops added for the Oobleck VAE decoder.
+Uses a patched GGML fork (submodule) with two new ops and a CUDA bugfix for the Oobleck
+VAE decoder. The new ops cover all backends (CPU, CUDA, Metal, Vulkan) and F32/F16/BF16 data types.
+The DiT uses only standard GGML ops and needs no patches.
 
 The VAE reconstructs audio from latent space through 5 upsampling blocks (total 1920x),
 each running a transposed convolution followed by 3 WaveNet-style residual units with
 dilated convolutions and Snake activations. A single tile builds a graph of 36 snake
 activations, 5 transposed convolutions, and 32 regular convolutions. At the final blocks,
-sequence lengths reach 491520 timesteps, which stresses GGML ops designed for short NLP sequences.
-The DiT (flow matching diffusion transformer) uses only standard GGML ops and needs no patches.
-
-Patches on top of upstream GGML, oldest first:
-
-| Commit | Scope | Description |
-|--------|-------|-------------|
-| `8c70db84` | CUDA | `conv_transpose_1d`: replace O(T_in) brute-force loop with bounded range |
-| `b65bf458` | CUDA | `im2col`: grid-stride loop on OW to fix gridDim.y overflow when T > 65535 |
-| `e0e36f3c` | Metal | `conv_transpose_1d`: same bounded loop fix as CUDA |
-| `2b9080bd` | CPU, CUDA, Metal | New `GGML_OP_COL2IM_1D`: scatter-add for GEMM-based conv_transpose_1d decomposition |
-| `02c8041f` | CPU, CUDA, Metal | New `GGML_OP_SNAKE`: fused activation y = x + sin^2(a*x) / b (replaces 5 element-wise ops) |
-| `3f60b19c` | Metal | Fix snake kernel to use current C wrapper API |
-| `cb5d7067` | Vulkan | Guard `VK_EXT_layer_settings` for legacy Vulkan SDK (fixes MI50/gfx906) |
-| `1f0f4214` | Vulkan | `col2im_1d`: add Vulkan backend |
-| `efbf3df6` | Vulkan | `snake`: add Vulkan backend |
-| `6608cd11` | Vulkan | Fix rvalue ref for `col2im_1d` and `snake` push constants |
-| `06101d38` | Vulkan | Fix double-division dispatch for `col2im_1d` and `snake` |
-| `91416cee` | CPU, CUDA, Metal, Vulkan | `col2im_1d`: fuse padding crop via p0 parameter (saves 5 allocs + 5 memcpy per VAE tile) |
-| `20675b09` | Vulkan | `col2im_1d`, `snake`: 2D dispatch (fixes workgroup overflow on MI50) |
-
-**Why col2im_1d**: upstream `ggml_conv_transpose_1d` uses a naive CUDA kernel (one scalar
-FMA loop per output element, no shared memory, no tensor cores). The VAE spends 40% of its
-FLOP budget on transposed convolutions. We decompose it as `mul_mat + col2im_1d`, routing
-the heavy GEMM through cuBLAS/BLAS/MPS tensor cores. The col2im_1d gather has a 2-iteration
-inner loop and is pure bandwidth.
-
-**Why snake**: the Oobleck VAE uses Snake1d activation (x + sin^2(a*x) / b) 36 times per
-tile. Without a fused op, each activation requires 5 separate GGML kernels (mul, sin, sqr,
-mul, add), causing 5x the memory traffic. The fused kernel reads x once, writes y once.
+sequence lengths reach 491520 timesteps, which stresses GGML ops designed for short NLP
+sequences.
+
+### `GGML_OP_SNAKE` (fused Snake activation)
+
+Computes y = x + sin^2(a * x) * inv_b in a single kernel.
+The Oobleck VAE calls this 36 times per tile. Without a fused op, each activation
+requires 5 separate GGML kernels (mul, sin, sqr, mul, add), causing 5x the memory
+traffic. The fused kernel reads x once and writes y once. BF16 cast nodes before/after
+each snake call halve that memory traffic at the cost of negligible precision loss
+(cossim > 0.999 vs F32 baseline).
+
+### `GGML_OP_COL2IM_1D` (scatter-add for GEMM-based conv_transpose_1d)
+
+Gather-based reconstruction of a 1D signal from GEMM columns [K*OC, T_in] to
+[T_out, OC], with fused padding crop via the p0 parameter.
+Upstream `ggml_conv_transpose_1d` uses a naive kernel (one scalar FMA loop per output
+element, no shared memory, no tensor cores). The VAE spends 40% of its FLOP budget on
+transposed convolutions. We decompose each as `mul_mat + col2im_1d`, routing the heavy
+GEMM through cuBLAS/BLAS/MPS tensor cores. The col2im_1d gather has a 2-iteration inner
+loop and is purely bandwidth-bound. BF16 cast nodes around col2im_1d halve that memory traffic.
+
+### Bugfix: `im2col` gridDim.y overflow (CUDA)
+
+Upstream `im2col_kernel` uses OW directly as grid dimension Y, which exceeds the CUDA
+65535 gridDim limit on long sequences. The VAE calls `ggml_conv_1d` (im2col path) 32
+times per tile at output widths up to 491520. Fixed with a grid-stride loop on OW and
+`MIN(OW, MAX_GRIDDIM_Z)` clamping.
 
 ## Acknowledgements
 
diff --git a/_codeql_detected_source_root b/_codeql_detected_source_root
new file mode 120000
index 0000000..945c9b4
--- /dev/null
+++ b/_codeql_detected_source_root
@@ -0,0 +1 @@
+.
\ No newline at end of file
diff --git a/buildcuda.sh b/buildcuda.sh
new file mode 100755
index 0000000..67f711f
--- /dev/null
+++ b/buildcuda.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+set -euo pipefail  # stop at the first failure so cmake never runs outside a fresh build/
+rm -rf build
+mkdir build
+cd build
+# Configure with the CUDA backend enabled, then compile with one job per processing unit.
+cmake .. -DGGML_CUDA=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc
+cmake --build . --config Release -j "$(nproc)"
diff --git a/ggml b/ggml
index c04770a..55e062a 160000
--- a/ggml
+++ b/ggml
@@ -1 +1 @@
-Subproject commit c04770a7056267bf0264b7c96d34cd84b24b04e8
+Subproject commit 55e062ab597eccaa3e7ee7c7b230197d83d94bc8
diff --git a/src/cond-enc.h b/src/cond-enc.h
index 7de70a8..880cbf7 100644
--- a/src/cond-enc.h
+++ b/src/cond-enc.h
@@ -69,6 +69,7 @@ struct CondGGML {
     ggml_backend_t backend;
     ggml_backend_t cpu_backend;
     ggml_backend_sched_t sched;
+    bool use_flash_attn;
     WeightCtx wctx;
 };
 
@@ -78,6 +79,7 @@ static void cond_ggml_init_backend(CondGGML * m) {
     m->backend = bp.backend;
     m->cpu_backend = bp.cpu_backend;
     m->sched = backend_sched_new(bp, 8192);
+    m->use_flash_attn = true;
 }
 
 // Load from ACEStep DiT GGUF
@@ -191,7 +193,8 @@ static void cond_ggml_forward(CondGGML * m,
     for (int i = 0; i < m->lyric_cfg.n_layers; i++) {
         struct ggml_tensor * layer_mask = (i % 2 == 0) ? lyric_slide_mask : NULL;
         lyric_h = qwen3_build_layer(ctx, m->lyric_cfg, &m->lyric_layers[i],
-                                     lyric_h, lyric_pos, layer_mask, S_lyric);
+                                     lyric_h, lyric_pos, layer_mask, S_lyric,
+                                     m->use_flash_attn);
     }
     lyric_h = qwen3_rms_norm(ctx, lyric_h, m->lyric_norm, m->lyric_cfg.rms_norm_eps);
 
@@ -236,7 +239,8 @@ static void cond_ggml_forward(CondGGML * m,
         for (int i = 0; i < m->timbre_cfg.n_layers; i++) {
             struct ggml_tensor * layer_mask = (i % 2 == 0) ? timbre_slide_mask : NULL;
             timbre_h = qwen3_build_layer(ctx, m->timbre_cfg, &m->timbre_layers[i],
-                                          timbre_h, timbre_pos, layer_mask, S_ref);
+                                          timbre_h, timbre_pos, layer_mask, S_ref,
+                                          m->use_flash_attn);
         }
         timbre_h = qwen3_rms_norm(ctx, timbre_h, m->timbre_norm, m->timbre_cfg.rms_norm_eps);
 
diff --git a/src/fsq-detok.h b/src/fsq-detok.h
index 29eef5f..7430db7 100644
--- a/src/fsq-detok.h
+++ b/src/fsq-detok.h
@@ -64,6 +64,7 @@ struct DetokGGML {
     ggml_backend_t backend;
     ggml_backend_t cpu_backend;
     ggml_backend_sched_t sched;
+    bool use_flash_attn;
     WeightCtx wctx;
 };
 
@@ -73,6 +74,7 @@ static bool detok_ggml_load(DetokGGML * m, const char * gguf_path,
     m->cfg = detok_config();
     m->backend = backend;
     m->cpu_backend = cpu_backend;
+    m->use_flash_attn = true;
 
     GGUFModel gf;
     if (!gf_load(&gf, gguf_path)) {
@@ -169,7 +171,8 @@ static int detok_ggml_decode(DetokGGML * m, const int * codes, int T_5Hz,
 
     // 2L encoder + norm (non-causal, no mask needed at S=5)
     hidden = qwen3_build_layers(ctx, m->cfg, m->layers, m->norm,
-                                 hidden, positions, NULL, P);
+                                 hidden, positions, NULL, P,
+                                 m->use_flash_attn);
 
     // proj_out: [2048, 5] -> [64, 5]
     struct ggml_tensor * output = ggml_mul_mat(ctx, m->proj_out_w, hidden);
diff --git a/src/qwen3-enc.h b/src/qwen3-enc.h
index 07dce03..02bf9c9 100644
--- a/src/qwen3-enc.h
+++ b/src/qwen3-enc.h
@@ -71,6 +71,7 @@ struct Qwen3GGML {
     ggml_backend_t backend;
     ggml_backend_t cpu_backend;
     ggml_backend_sched_t sched;
+    bool use_flash_attn;
     WeightCtx wctx;
 };
 
@@ -94,6 +95,23 @@ static struct ggml_tensor * qwen3_linear_bias(struct ggml_context * ctx,
     return ggml_add(ctx, out, qwen3_f32(ctx, b));
 }
 
+// F32 manual attention (fallback when flash_attn_ext is disabled): softmax(scale * K^T Q + mask) applied to V.
+// Works for 3D [D, S, X] and 4D [D, S, X, N] inputs; mask is forwarded to ggml_soft_max_ext and may be NULL.
+// Returns same layout as flash_attn_ext: dims 1 and 2 swapped vs input (result is made contiguous).
+static struct ggml_tensor * qwen3_attn_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * q,
+        struct ggml_tensor * k,
+        struct ggml_tensor * v,
+        struct ggml_tensor * mask,
+        float scale) {
+    struct ggml_tensor * scores = ggml_mul_mat(ctx, k, q);
+    scores = ggml_soft_max_ext(ctx, scores, mask, scale, 0.0f);
+    struct ggml_tensor * vt = ggml_cont(ctx, ggml_transpose(ctx, v));
+    struct ggml_tensor * out = ggml_mul_mat(ctx, vt, scores);
+    return ggml_cont(ctx, ggml_permute(ctx, out, 0, 2, 1, 3));
+}
+
 static struct ggml_tensor * qwen3_rms_norm(struct ggml_context * ctx,
                                             struct ggml_tensor * x,
                                             struct ggml_tensor * w,
@@ -114,7 +132,8 @@ static struct ggml_tensor * qwen3_build_self_attn(
         struct ggml_tensor * x,          // [H, S]
         struct ggml_tensor * positions,  // [S] int32
         struct ggml_tensor * mask,       // [S, S] or NULL
-        int S) {
+        int S,
+        bool use_flash_attn = true) {
 
     int D   = c.head_dim;
     int Nh  = c.n_heads;
@@ -164,10 +183,13 @@ static struct ggml_tensor * qwen3_build_self_attn(
     k = ggml_permute(ctx, k, 0, 2, 1, 3);
     v = ggml_permute(ctx, v, 0, 2, 1, 3);
 
-    // 6) Flash attention (handles GQA)
+    // 6) Attention (flash or F32 manual fallback)
     float scale = 1.0f / sqrtf((float)D);
-    struct ggml_tensor * attn = ggml_flash_attn_ext(ctx, q, k, v, mask, scale, 0.0f, 0.0f);
-    ggml_flash_attn_ext_set_prec(attn, GGML_PREC_F32); // F32 accumulation
+    struct ggml_tensor * attn = use_flash_attn
+        ? ggml_flash_attn_ext(ctx, q, k, v, mask, scale, 0.0f, 0.0f)
+        : qwen3_attn_f32(ctx, q, k, v, mask, scale);
+    if (use_flash_attn)
+        ggml_flash_attn_ext_set_prec(attn, GGML_PREC_F32);
 
     // 7) Reshape back: [D, Nh, S] -> [Nh*D, S]
     attn = ggml_reshape_2d(ctx, attn, Nh * D, S);
@@ -203,11 +225,12 @@ static struct ggml_tensor * qwen3_build_layer(
         struct ggml_tensor * hidden,
         struct ggml_tensor * positions,
         struct ggml_tensor * mask,
-        int S) {
+        int S,
+        bool use_flash_attn = true) {
 
     // Self-attention block
     struct ggml_tensor * norm = qwen3_rms_norm(ctx, hidden, ly->input_layernorm, c.rms_norm_eps);
-    struct ggml_tensor * attn = qwen3_build_self_attn(ctx, c, ly, norm, positions, mask, S);
+    struct ggml_tensor * attn = qwen3_build_self_attn(ctx, c, ly, norm, positions, mask, S, use_flash_attn);
     hidden = ggml_add(ctx, hidden, attn);
 
     // MLP block
@@ -227,10 +250,11 @@ static struct ggml_tensor * qwen3_build_layers(
         struct ggml_tensor * hidden,
         struct ggml_tensor * positions,
         struct ggml_tensor * mask,
-        int S) {
+        int S,
+        bool use_flash_attn = true) {
 
     for (int i = 0; i < c.n_layers; i++) {
-        hidden = qwen3_build_layer(ctx, c, &layers[i], hidden, positions, mask, S);
+        hidden = qwen3_build_layer(ctx, c, &layers[i], hidden, positions, mask, S, use_flash_attn);
     }
     return qwen3_rms_norm(ctx, hidden, final_norm_w, c.rms_norm_eps);
 }
@@ -287,6 +311,7 @@ static void qwen3_init_backend(Qwen3GGML * m) {
     m->backend = bp.backend;
     m->cpu_backend = bp.cpu_backend;
     m->sched = backend_sched_new(bp, 4096);
+    m->use_flash_attn = true;
 }
 
 // Load standalone text encoder (Qwen3-Embedding) from GGUF
@@ -372,7 +397,8 @@ static void qwen3_forward(Qwen3GGML * m, const int * token_ids, int S, float * o
 
     // N layers + final norm
     struct ggml_tensor * out = qwen3_build_layers(ctx, c, m->layers, m->final_norm,
-                                                   hidden, positions, mask, S);
+                                                   hidden, positions, mask, S,
+                                                   m->use_flash_attn);
     ggml_set_name(out, "output");
     ggml_set_output(out);
     ggml_build_forward_expand(gf, out);
@@ -409,27 +435,33 @@ static void qwen3_forward(Qwen3GGML * m, const int * token_ids, int S, float * o
     ggml_free(ctx);
 }
 
-// CPU vocab lookup utility
-// For lyric embedding: look up token IDs in text encoder's embed table (bf16 -> f32)
-// GGUF keeps mmapped data alive. Output: [H, S] float (H contiguous per token).
-//
-// embed_data: pointer to bf16 weight data [vocab, H] in PyTorch layout (H contiguous per row)
+// Embedding lookup via ggml graph (reuses text encoder weights + scheduler)
 // token_ids: [S] int32
 // output:    [H * S] float (ggml layout: H contiguous, S tokens)
-static void qwen3_cpu_embed_lookup(const void * embed_data, int H,
-                                    const int * token_ids, int S,
-                                    float * output) {
-    const uint16_t * bf16 = (const uint16_t *)embed_data;
-    for (int s = 0; s < S; s++) {
-        int tok = token_ids[s];
-        const uint16_t * row = bf16 + (int64_t)tok * H;
-        float * dst = output + (int64_t)s * H;
-        for (int h = 0; h < H; h++) {
-            // bf16 to f32: shift left 16 bits
-            uint32_t bits = (uint32_t)row[h] << 16;
-            memcpy(&dst[h], &bits, 4);
-        }
-    }
+static void qwen3_embed_lookup(Qwen3GGML * m, const int * token_ids, int S, float * output) {
+    int H = m->cfg.hidden_size;
+
+    size_t ctx_size = 16 * ggml_tensor_overhead() + ggml_graph_overhead();
+    struct ggml_init_params gp = { ctx_size, NULL, true };
+    struct ggml_context * ctx = ggml_init(gp);
+    struct ggml_cgraph * gf = ggml_new_graph(ctx);
+
+    struct ggml_tensor * t_ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, S);
+    ggml_set_name(t_ids, "token_ids");
+    ggml_set_input(t_ids);
+
+    struct ggml_tensor * out = ggml_get_rows(ctx, m->embed_tokens, t_ids);
+    ggml_set_name(out, "embed_out");
+    ggml_set_output(out);
+    ggml_build_forward_expand(gf, out);
+
+    ggml_backend_sched_alloc_graph(m->sched, gf);
+    ggml_backend_tensor_set(t_ids, token_ids, 0, S * sizeof(int));
+    ggml_backend_sched_graph_compute(m->sched, gf);
+    ggml_backend_tensor_get(out, output, 0, (size_t)H * S * sizeof(float));
+
+    ggml_backend_sched_reset(m->sched);
+    ggml_free(ctx);
 }
 
 // Free
diff --git a/src/qwen3-lm.h b/src/qwen3-lm.h
index 5395b5a..29b254f 100644
--- a/src/qwen3-lm.h
+++ b/src/qwen3-lm.h
@@ -45,14 +45,8 @@ struct Qwen3LM {
     WeightCtx wctx;
     ggml_backend_t backend;
     ggml_backend_t cpu_backend;
-    ggml_backend_sched_t sched; // prefill (variable shapes, runs once)
-    ggml_gallocr_t galloc;      // decode  (single GPU, tight loop)
-
-    // CPU-side embed lookup via mmap (avoids ggml_get_rows which lacks
-    // CUDA K-quant support, preventing costly cross-backend tensor copies)
-    GGUFModel gf_mmap;
-    const void * embed_mmap_data;
-    enum ggml_type embed_type;
+    ggml_backend_sched_t sched;
+    bool use_flash_attn;
 
     // KV cache: per-set, per-layer [D, max_seq, Nkv] f16
     struct ggml_context  * kv_ctx;
@@ -150,7 +144,7 @@ static void qw3lm_init_backend(Qwen3LM * m) {
     m->backend = bp.backend;
     m->cpu_backend = bp.cpu_backend;
     m->sched = backend_sched_new(bp, 8192);
-    m->galloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(m->backend));
+    m->use_flash_attn = true;
 }
 
 // Allocate KV cache
@@ -253,19 +247,7 @@ static bool qw3lm_load(Qwen3LM * m, const char * gguf_path, int max_seq_len, int
     }
 
     wctx_alloc(&m->wctx, m->backend);
-
-    // Keep mmap alive for CPU embed dequant lookup
-    m->embed_mmap_data = gf_get_data(gf, "model.embed_tokens.weight");
-    m->embed_type = m->embed_tokens->type;
-    if (!m->embed_mmap_data) {
-        fprintf(stderr, "[LM-Load] FATAL: embed_tokens not found in mmap\n");
-        gf_close(&gf);
-        return false;
-    }
-    m->gf_mmap = gf; // transfer ownership (no gf_close here)
-    fprintf(stderr, "[LM-Load] CPU embed lookup: type=%s, row=%zu bytes\n",
-            ggml_type_name(m->embed_type),
-            ggml_row_size(m->embed_type, c.hidden_size));
+    gf_close(&gf);
 
     // KV cache
     qw3lm_alloc_kv_cache(m, n_kv_sets > 0 ? n_kv_sets : 1);
@@ -287,7 +269,8 @@ static struct ggml_tensor * qw3lm_build_attn(
         struct ggml_tensor * cache_v, // [D, max_seq, Nkv] f16
         int kv_pos,
         int kv_len,
-        int n_tokens) {
+        int n_tokens,
+        bool use_flash_attn = true) {
 
     int D   = c.head_dim;
     int Nh  = c.n_heads;
@@ -356,10 +339,13 @@ static struct ggml_tensor * qw3lm_build_attn(
     struct ggml_tensor * k_full = ggml_view_3d(ctx, cache_k, D, kv_len, Nkv, nb1, nb2, 0);
     struct ggml_tensor * v_full = ggml_view_3d(ctx, cache_v, D, kv_len, Nkv, nb1, nb2, 0);
 
-    // Flash attention
+    // Attention (flash or F32 manual fallback)
     float scale = 1.0f / sqrtf((float)D);
-    struct ggml_tensor * attn = ggml_flash_attn_ext(ctx, q, k_full, v_full, mask, scale, 0.0f, 0.0f);
-    ggml_flash_attn_ext_set_prec(attn, GGML_PREC_F32); // F32 accumulation
+    struct ggml_tensor * attn = use_flash_attn
+        ? ggml_flash_attn_ext(ctx, q, k_full, v_full, mask, scale, 0.0f, 0.0f)
+        : qwen3_attn_f32(ctx, q, k_full, v_full, mask, scale);
+    if (use_flash_attn)
+        ggml_flash_attn_ext_set_prec(attn, GGML_PREC_F32);
 
     // Reshape: [D, Nh, S] -> [Nh*D, S]
     attn = ggml_reshape_2d(ctx, attn, Nh * D, S);
@@ -401,14 +387,12 @@ static void qw3lm_forward(Qwen3LM * m, const int * token_ids, int n_tokens,
         ggml_set_input(mask);
     }
 
-    // Embedding: CPU dequant from mmap, fed as F32 input.
-    // This keeps embed_tokens out of get_rows (no CUDA K-quant support)
-    // and only in mul_mat (lm_head) which has full K-quant CUDA support.
-    struct ggml_tensor * embed_out = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, H, n_tokens);
-    ggml_set_name(embed_out, "embed_out");
-    ggml_set_input(embed_out);
+    // Embedding via ggml_get_rows (scheduler handles backend fallback)
+    struct ggml_tensor * token_ids_t = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);
+    ggml_set_name(token_ids_t, "token_ids");
+    ggml_set_input(token_ids_t);
 
-    struct ggml_tensor * hidden = embed_out;
+    struct ggml_tensor * hidden = ggml_get_rows(ctx, m->embed_tokens, token_ids_t);
 
     // Transformer layers
     for (int l = 0; l < c.n_layers; l++) {
@@ -421,7 +405,7 @@ static void qw3lm_forward(Qwen3LM * m, const int * token_ids, int n_tokens,
         struct ggml_tensor * attn = qw3lm_build_attn(
             ctx, gf, c, ly, norm, positions, mask,
             m->kv_k[kv_set][l], m->kv_v[kv_set][l],
-            kv_pos, kv_len, n_tokens);
+            kv_pos, kv_len, n_tokens, m->use_flash_attn);
 
         // Residual
         hidden = ggml_add(ctx, hidden, attn);
@@ -450,18 +434,8 @@ static void qw3lm_forward(Qwen3LM * m, const int * token_ids, int n_tokens,
     // Schedule + allocate
     ggml_backend_sched_alloc_graph(m->sched, gf);
 
-    // CPU-side embedding dequantization from mmap
-    {
-        const int64_t row_size = (int64_t)ggml_row_size(m->embed_type, H);
-        const ggml_to_float_t to_float = ggml_get_type_traits(m->embed_type)->to_float;
-        std::vector<float> embed_buf((size_t)H * n_tokens);
-        for (int i = 0; i < n_tokens; i++) {
-            const void * row = (const char *)m->embed_mmap_data + (int64_t)token_ids[i] * row_size;
-            to_float(row, embed_buf.data() + (int64_t)i * H, H);
-        }
-        ggml_backend_tensor_set(embed_out, embed_buf.data(), 0,
-            (size_t)H * n_tokens * sizeof(float));
-    }
+    // Set token IDs
+    ggml_backend_tensor_set(token_ids_t, token_ids, 0, n_tokens * sizeof(int));
 
     {
         std::vector<int> pos_data(n_tokens);
@@ -507,7 +481,6 @@ static void qw3lm_forward_batch(Qwen3LM * m, const int * token_ids,
                                   const int * kv_sets, int N, float * logits,
                                   int lm_offset = 0, int lm_count = 0) {
     const Qwen3LMConfig & c = m->cfg;
-    int H   = c.hidden_size;
     int D   = c.head_dim;
     int Nh  = c.n_heads;
     int Nkv = c.n_kv_heads;
@@ -530,10 +503,10 @@ static void qw3lm_forward_batch(Qwen3LM * m, const int * token_ids,
     struct ggml_context * ctx = ggml_init(gp);
     struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 16384, false);
 
-    // Embedding: [H, N]
-    struct ggml_tensor * embed_out = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, H, N);
-    ggml_set_name(embed_out, "embed_out");
-    ggml_set_input(embed_out);
+    // Embedding via ggml_get_rows (scheduler handles backend fallback)
+    struct ggml_tensor * token_ids_t = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
+    ggml_set_name(token_ids_t, "token_ids");
+    ggml_set_input(token_ids_t);
 
     // Positions: [N], per-element kv_pos
     struct ggml_tensor * positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
@@ -546,7 +519,7 @@ static void qw3lm_forward_batch(Qwen3LM * m, const int * token_ids,
     ggml_set_name(attn_mask, "attn_mask");
     ggml_set_input(attn_mask);
 
-    struct ggml_tensor * hidden = embed_out;
+    struct ggml_tensor * hidden = ggml_get_rows(ctx, m->embed_tokens, token_ids_t);
 
     for (int l = 0; l < c.n_layers; l++) {
         Qwen3Layer * ly = &m->layers[l];
@@ -639,10 +612,12 @@ static void qw3lm_forward_batch(Qwen3LM * m, const int * token_ids,
             m->kv_v4[l]->nb[1], m->kv_v4[l]->nb[2], m->kv_v4[l]->nb[3],
             (size_t)s0 * m->kv_v4[l]->nb[3]);
 
-        // Batched flash attention: 1 kernel per layer instead of N
-        struct ggml_tensor * attn_result = ggml_flash_attn_ext(ctx,
-            q4, k_batch, v_batch, attn_mask, scale, 0.0f, 0.0f);
-        ggml_flash_attn_ext_set_prec(attn_result, GGML_PREC_F32);
+        // Batched attention (flash or F32 manual fallback)
+        struct ggml_tensor * attn_result = m->use_flash_attn
+            ? ggml_flash_attn_ext(ctx, q4, k_batch, v_batch, attn_mask, scale, 0.0f, 0.0f)
+            : qwen3_attn_f32(ctx, q4, k_batch, v_batch, attn_mask, scale);
+        if (m->use_flash_attn)
+            ggml_flash_attn_ext_set_prec(attn_result, GGML_PREC_F32);
 
         // Output: [D, Nh, 1, N] -> [Nh*D, N]
         struct ggml_tensor * attn_cat = ggml_reshape_2d(ctx, attn_result, Nh * D, N);
@@ -673,20 +648,11 @@ static void qw3lm_forward_batch(Qwen3LM * m, const int * token_ids,
     ggml_set_output(lgt);
     ggml_build_forward_expand(gf, lgt);
 
-    // Allocate (gallocr: single-backend, no scheduler overhead)
-    ggml_gallocr_alloc_graph(m->galloc, gf);
+    // Allocate
+    ggml_backend_sched_alloc_graph(m->sched, gf);
 
-    // CPU-side embedding dequant
-    {
-        const int64_t row_size = (int64_t)ggml_row_size(m->embed_type, H);
-        const ggml_to_float_t to_float = ggml_get_type_traits(m->embed_type)->to_float;
-        std::vector<float> embed_buf((size_t)H * N);
-        for (int i = 0; i < N; i++) {
-            const void * row = (const char *)m->embed_mmap_data + (int64_t)token_ids[i] * row_size;
-            to_float(row, embed_buf.data() + (int64_t)i * H, H);
-        }
-        ggml_backend_tensor_set(embed_out, embed_buf.data(), 0, (size_t)H * N * sizeof(float));
-    }
+    // Set token IDs
+    ggml_backend_tensor_set(token_ids_t, token_ids, 0, N * sizeof(int));
 
     // Positions: per-element kv_pos
     {
@@ -710,8 +676,8 @@ static void qw3lm_forward_batch(Qwen3LM * m, const int * token_ids,
             mask_data.size() * sizeof(uint16_t));
     }
 
-    // Compute (direct backend, no scheduler dispatch)
-    ggml_backend_graph_compute(m->backend, gf);
+    // Compute
+    ggml_backend_sched_graph_compute(m->sched, gf);
 
     // Read logits [out_V, N]
     ggml_backend_tensor_get(lgt, logits, 0, (size_t)out_V * N * sizeof(float));
@@ -720,18 +686,17 @@ static void qw3lm_forward_batch(Qwen3LM * m, const int * token_ids,
     for (int i = 0; i < N; i++)
         m->kv_pos[kv_sets[i]]++;
 
+    ggml_backend_sched_reset(m->sched);
     ggml_free(ctx);
 }
 
 // Free all resources
 static void qw3lm_free(Qwen3LM * m) {
-    if (m->galloc) ggml_gallocr_free(m->galloc);
     if (m->sched) ggml_backend_sched_free(m->sched);
     if (m->kv_buf) ggml_backend_buffer_free(m->kv_buf);
     if (m->kv_ctx) ggml_free(m->kv_ctx);
     if (m->backend && m->backend != m->cpu_backend) ggml_backend_free(m->backend);
     if (m->cpu_backend) ggml_backend_free(m->cpu_backend);
     wctx_free(&m->wctx);
-    gf_close(&m->gf_mmap);
     *m = {};
 }
diff --git a/tests/CPU-BF16.log b/tests/CPU-BF16.log
new file mode 100644
index 0000000..b20ebae
--- /dev/null
+++ b/tests/CPU-BF16.log
@@ -0,0 +1,257 @@
+[Load] DiT backend: CPU (CPU threads: 16)
+[Load] Backend init: 1.5 ms
+[GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024
+[DiT] Self-attn: Q+K+V fused
+[DiT] Cross-attn: Q+K+V fused
+[DiT] MLP: gate+up fused
+[Load] null_condition_emb found (CFG available)
+[WeightCtx] Loaded 478 tensors, 3007.9 MB into backend
+[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128
+[Load] DiT weight load: 464.0 ms
+[GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024
+[Load] silence_latent: [15000, 64] from GGUF
+[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048
+[Load] VAE backend: CPU (CPU threads: 16)
+[VAE] Backend: CPU, Weight buffer: 161.1 MB
+[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations
+[Load] VAE weights: 651.3 ms
+[Request 1/1] ggml-turbo/request0.json (batch=1)
+[Request] parsed ggml-turbo/request0.json (18 fields)
+[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0)
+[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s
+[Pipeline] 434 audio codes (86.8s @ 5Hz)
+[Pipeline] T=2170, S=1085
+[BPE] Loaded from GGUF: 151643 vocab, 151387 merges
+[Load] BPE tokenizer: 31.9 ms
+[Pipeline] caption: 70 tokens, lyrics: 167 tokens
+[Load] TextEncoder backend: CPU (CPU threads: 16)
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[Load] TextEncoder: 28L, H=1024, Nh=16/8
+[Qwen3] Attn: Q+K+V fused
+[Qwen3] MLP: gate+up fused
+[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend
+[Load] TextEncoder: 226.8 ms
+[Encode] TextEncoder (70 tokens): 59.7 ms
+[Debug] text_hidden: [70, 1024] first4: 3.704526 2.436253 0.222853 -13.131872
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[Encode] Lyric vocab lookup (167 tokens): 12.7 ms
+[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809
+[Load] CondEncoder backend: CPU (CPU threads: 16)
+[GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024
+[Load] LyricEncoder: 8L
+[Qwen3] Attn: Q+K+V fused
+[Qwen3] MLP: gate+up fused
+[Load] TimbreEncoder: 4L
+[Qwen3] Attn: Q+K+V fused
+[Qwen3] MLP: gate+up fused
+[WeightCtx] Loaded 140 tensors, 1160.5 MB into backend
+[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond
+[Load] ConditionEncoder: 230.8 ms
+[CondEnc] Lyric sliding mask: 167x167, window=128
+[CondEnc] Timbre sliding mask: 750x750, window=128
+[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens
+[Encode] ConditionEncoder: 274.9 ms, enc_S=238
+[Debug] enc_hidden: [238, 2048] first4: 1.758296 -0.049593 -0.132844 0.058496
+[GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024
+[WeightCtx] Loaded 30 tensors, 200.3 MB into backend
+[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64)
+[Load] Detokenizer: 34.6 ms
+[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz)
+[Context] Detokenizer: 958.8 ms
+[Debug] detok_output: [2170, 64] first4: -0.124160 1.435260 0.310138 -0.624584
+[Context Batch0] Philox noise seed=42, [2170, 64]
+[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656
+[Debug] context: [2170, 128] first4: -0.124160 1.435260 0.310138 -0.624584
+[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1
+[DiT] Batch N=1, T=2170, S=1085, enc_S=238
+[DiT] Graph: 2129 nodes
+[Debug] tproj: [12288] first4: 0.260222 -0.161617 -0.097078 0.052346
+[Debug] temb: [2048] first4: 0.000077 -0.132559 -0.035432 0.064735
+[Debug] temb_t: [2048] first4: 0.001069 0.026790 -0.052756 0.063697
+[Debug] temb_r: [2048] first4: -0.000991 -0.159349 0.017324 0.001038
+[Debug] sinusoidal_t: [256] first4: 0.562379 0.789627 0.439928 -0.023645
+[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000
+[Debug] temb_lin1_t: [2048] first4: -0.049513 -0.051899 -0.014138 -0.038434
+[Debug] temb_lin1_r: [2048] first4: -0.013266 -0.018319 -0.016375 0.008532
+[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.039830 -0.969685 0.533102 0.446442
+[Debug] proj_in_input: [192, 2170] first4: -0.124160 1.435260 0.310138 -0.624584
+[Debug] enc_after_cond_emb: [2048, 238] first4: -0.168787 0.814833 0.326668 -0.562433
+[Debug] layer0_sa_input: [2048, 1085] first4: -0.719501 -0.764459 -0.047725 0.261760
+[Debug] layer0_q_after_rope: [128, 16] first4: -1.541141 -1.045404 0.186748 0.455664
+[Debug] layer0_k_after_rope: [128, 8] first4: -0.168787 0.814833 0.326668 -0.562433
+[Debug] layer0_sa_output: [2048, 1085] first4: -1.500309 0.170627 -0.354600 0.512837
+[Debug] layer0_attn_out: [2048, 1085] first4: -1.541141 -1.045404 0.186748 0.455664
+[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.541141 -1.045404 0.186748 0.455664
+[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.599016 -0.822108 -0.298718 0.492092
+[Debug] hidden_after_layer0: [2048, 1085] first4: -9.098095 0.568142 52.394512 -0.905627
+[Debug] hidden_after_layer6: [2048, 1085] first4: -21.346304 0.043589 33.440353 -4.467471
+[Debug] hidden_after_layer12: [2048, 1085] first4: -14.856287 -18.096371 72.046799 28.866295
+[Debug] hidden_after_layer18: [2048, 1085] first4: -27.298880 15.859982 59.802349 20.914667
+[Debug] hidden_after_layer23: [2048, 1085] first4: -11.120972 45.536430 196.515015 145.620667
+[Debug] dit_step0_vt: [2170, 64] first4: 0.017592 1.109134 0.340961 2.380328
+[Debug] dit_step0_xt: [2170, 64] first4: 0.193536 2.105835 -0.187373 0.739460
+[DiT] step 1/8 t=1.000
+[Debug] dit_step1_vt: [2170, 64] first4: -0.231590 1.299610 -0.120825 1.895337
+[Debug] dit_step1_xt: [2170, 64] first4: 0.206168 2.034947 -0.180783 0.636078
+[DiT] step 2/8 t=0.955
+[Debug] dit_step2_vt: [2170, 64] first4: -0.025322 1.214425 0.100767 2.387164
+[Debug] dit_step2_xt: [2170, 64] first4: 0.207857 1.953985 -0.187501 0.476933
+[DiT] step 3/8 t=0.900
+[Debug] dit_step3_vt: [2170, 64] first4: 0.242072 1.092567 0.260294 2.643174
+[Debug] dit_step3_xt: [2170, 64] first4: 0.187684 1.862938 -0.209192 0.256669
+[DiT] step 4/8 t=0.833
+[Debug] dit_step4_vt: [2170, 64] first4: 0.292635 1.007325 0.109474 2.707222
+[Debug] dit_step4_xt: [2170, 64] first4: 0.156330 1.755010 -0.220921 -0.033391
+[DiT] step 5/8 t=0.750
+[Debug] dit_step5_vt: [2170, 64] first4: 0.268947 0.924783 -0.284788 2.767856
+[Debug] dit_step5_xt: [2170, 64] first4: 0.117909 1.622898 -0.180237 -0.428799
+[DiT] step 6/8 t=0.643
+[Debug] dit_step6_vt: [2170, 64] first4: 0.170391 0.634803 -0.816809 2.824526
+[Debug] dit_step6_xt: [2170, 64] first4: 0.083831 1.495938 -0.016875 -0.993704
+[DiT] step 7/8 t=0.500
+[Debug] dit_step7_vt: [2170, 64] first4: 0.002176 0.183052 -1.467304 3.113325
+[Debug] dit_x0: [2170, 64] first4: 0.083178 1.441022 0.423316 -1.927701
+[DiT] step 8/8 t=0.300
+[DiT] Total generation: 18721.5 ms (18721.5 ms/sample)
+[Debug] dit_output: [2170, 64] first4: 0.083178 1.441022 0.423316 -1.927701
+[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128)
+[VAE] Graph: 417 nodes, T_latent=192
+[VAE] Upsample factor: 1920.00 (expected ~1920)
+[VAE] Graph: 417 nodes, T_latent=256
+[VAE] Graph: 417 nodes, T_latent=186
+[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz)
+[VAE Batch0] Decode: 51818.0 ms
+[Debug] vae_audio: [2, 4166400] first4: 0.000519 0.001024 0.000897 0.001200
+[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo)
+[Request 1/1] Done
+[Pipeline] All done
+2026-03-01 19:57:38.585 | WARNING  | acestep.training.lora_utils:<module>:29 - PEFT library not installed. LoRA training will not be available.
+2026-03-01 19:57:38.585 | WARNING  | acestep.training.lokr_utils:<module>:24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora
+2026-03-01 19:57:38.585 | WARNING  | acestep.training.data_module:<module>:25 - Lightning not installed. Training module will not be available.
+2026-03-01 19:57:38.586 | WARNING  | acestep.training.trainer:<module>:28 - Lightning Fabric not installed. Training will use basic training loop.
+2026-03-01 19:57:38.586 | WARNING  | acestep.training.trainer:<module>:36 - bitsandbytes not installed. Using standard AdamW.
+2026-03-01 19:57:39.413 | INFO     | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa
+`torch_dtype` is deprecated! Use `dtype` instead!
+2026-03-01 19:57:40.961 | INFO     | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation...
+2026-03-01 19:57:40.961 | INFO     | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs...
+2026-03-01 19:57:40.966 | INFO     | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0...
+2026-03-01 19:57:41.132 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0...
+2026-03-01 19:57:41.134 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - 
+======================================================================
+2026-03-01 19:57:41.134 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference)
+2026-03-01 19:57:41.134 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ======================================================================
+2026-03-01 19:57:41.134 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt:
+# Instruction
+Generate audio semantic tokens based on the given conditions:
+
+# Caption
+An upbeat and anthemic pop-rock track driven by bright, slightly overdriven
+
+# Metas
+- bpm: 83
+- timesignature: 4
+- keyscale: G major
+- duration: 88 seconds
+<|endoftext|>
+
+2026-03-01 19:57:41.134 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ======================================================================
+2026-03-01 19:57:41.134 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text:
+# Languages
+fr
+
+# Lyric
+# Lyric
+[Intro - Guitar Riff]
+[Verse 1]
+Dans le monde des tutos virtuels
+G ta toise en nouvelle passion
+Avec Ggendoline et Pumbé à midi
+La communauté, c'est l'unité
+Quel joie, une clé
+
+[Chorus]
+Dans le monde des tutos virtuels
+Gândoline et Pumbé à midi
+Une famille à connecter, c'est vrai
+D'un enfant qui voit toi fusionner
+
+[Guitar Solo]
+
+[Verse 2]
+Dans le monde des tutos virtuels
+Gândoline, Pumbé à midi
+Une famille à connecter, c'est vrai
+D'un enfant qui voit toi fusionner<|endoftext|>
+2026-03-01 19:57:41.134 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ======================================================================
+
+2026-03-01 19:57:41.140 | INFO     | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings...
+2026-03-01 19:57:41.153 | INFO     | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings...
+2026-03-01 19:57:41.153 | INFO     | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda))
+2026-03-01 19:57:41.175 | INFO     | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)...
+2026-03-01 19:57:41.483 | INFO     | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents...
+2026-03-01 19:57:41.483 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16
+2026-03-01 19:57:41.483 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.00688624382019043, 'diffusion_time_cost': 0.30014586448669434, 'diffusion_per_step_time_cost': 0.03751823306083679, 'total_time_cost': 0.30703210830688477, 'offload_time_cost': 0.0}
+2026-03-01 19:57:41.498 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE...
+2026-03-01 19:57:41.500 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB
+2026-03-01 19:57:41.500 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.44 GB
+2026-03-01 19:57:41.500 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage...
+2026-03-01 19:57:41.500 | DEBUG    | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.44 GB
+2026-03-01 19:57:41.500 | DEBUG    | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.44 GB
+2026-03-01 19:57:41.500 | INFO     | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170])
+2026-03-01 19:57:41.775 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB
+2026-03-01 19:57:41.777 | INFO     | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors...
+2026-03-01 19:57:41.780 | INFO     | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors.
+[Request] Loaded request0.json
+[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-BF16.gguf
+[GGML] Running acestep-v15-turbo-BF16.gguf...
+[GGML] Done, 47 dump files
+[Python] Initializing acestep-v15-turbo...
+[Python] Generating (acestep-v15-turbo, 8 steps)...
+Using precomputed LM hints
+Using precomputed LM hints
+[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo)
+[Python] Done, 40 dump files
+[Turbo] Cosine similarities GGML vs Python
+  stage                          GGML vs Python
+  text_hidden                          0.999816
+  lyric_embed                          1.000000
+  enc_hidden                           0.999841
+  detok_output                         0.999995
+  context                              0.999997
+  noise                                1.000000
+  temb_t                               0.999999
+  hidden_after_proj_in                 0.999988
+  enc_after_cond_emb                   0.999832
+  layer0_sa_output                     0.999960
+  hidden_after_layer0                  0.999982
+  hidden_after_layer6                  0.999924
+  hidden_after_layer12                 0.999332
+  hidden_after_layer18                 0.996692
+  hidden_after_layer23                 0.993786
+  dit_step0_vt                         0.975712
+  dit_step0_xt                         0.999946
+  dit_step1_vt                         0.979525
+  dit_step1_xt                         0.999833
+  dit_step2_vt                         0.981808
+  dit_step2_xt                         0.999552
+  dit_step3_vt                         0.982382
+  dit_step3_xt                         0.998917
+  dit_step4_vt                         0.980777
+  dit_step4_xt                         0.997480
+  dit_step5_vt                         0.978078
+  dit_step5_xt                         0.994264
+  dit_step6_vt                         0.974849
+  dit_step6_xt                         0.988142
+  dit_step7_vt                         0.969102
+  dit_x0                               0.979106
+  vae_audio                            0.901370
+  vae_audio (STFT cosine)              0.975816
+[Turbo] Error growth GGML vs Python
+  stage                         cos    max_err   mean_err     mean_A      std_A     mean_B      std_B
+  dit_step0_xt             0.999946   0.136541   0.006626  -0.002312   0.972951  -0.002342   0.972003
+  dit_step1_xt             0.999833   0.265486   0.011288  -0.005309   0.942692  -0.005313   0.941730
+  dit_step2_xt             0.999552   0.451896   0.017477  -0.009347   0.909217  -0.009311   0.908527
+  dit_step3_xt             0.998917   0.642624   0.025957  -0.014710   0.873863  -0.014577   0.873624
+  dit_step4_xt             0.997480   0.778374   0.037868  -0.021751   0.842047  -0.021660   0.841995
+  dit_step5_xt             0.994264   1.244624   0.055630  -0.031814   0.825360  -0.032109   0.824593
+  dit_step6_xt             0.988142   2.080976   0.082605  -0.046091   0.856212  -0.046482   0.855546
diff --git a/tests/CPU-Q4_K_M.log b/tests/CPU-Q4_K_M.log
new file mode 100644
index 0000000..508a20c
--- /dev/null
+++ b/tests/CPU-Q4_K_M.log
@@ -0,0 +1,257 @@
+[Load] DiT backend: CPU (CPU threads: 16)
+[Load] Backend init: 6.3 ms
+[GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864
+[DiT] Self-attn: Q+K fused, V separate
+[DiT] Cross-attn: all separate
+[DiT] MLP: gate+up fused
+[Load] null_condition_emb found (CFG available)
+[WeightCtx] Loaded 478 tensors, 895.6 MB into backend
+[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128
+[Load] DiT weight load: 118.4 ms
+[GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864
+[Load] silence_latent: [15000, 64] from GGUF
+[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048
+[Load] VAE backend: CPU (CPU threads: 16)
+[VAE] Backend: CPU, Weight buffer: 161.1 MB
+[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations
+[Load] VAE weights: 696.2 ms
+[Request 1/1] ggml-turbo/request0.json (batch=1)
+[Request] parsed ggml-turbo/request0.json (18 fields)
+[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0)
+[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s
+[Pipeline] 434 audio codes (86.8s @ 5Hz)
+[Pipeline] T=2170, S=1085
+[BPE] Loaded from GGUF: 151643 vocab, 151387 merges
+[Load] BPE tokenizer: 33.0 ms
+[Pipeline] caption: 70 tokens, lyrics: 167 tokens
+[Load] TextEncoder backend: CPU (CPU threads: 16)
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[Load] TextEncoder: 28L, H=1024, Nh=16/8
+[Qwen3] Attn: Q+K+V fused
+[Qwen3] MLP: gate+up fused
+[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend
+[Load] TextEncoder: 148.2 ms
+[Encode] TextEncoder (70 tokens): 58.0 ms
+[Debug] text_hidden: [70, 1024] first4: 3.704526 2.436253 0.222853 -13.131872
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[Encode] Lyric vocab lookup (167 tokens): 12.6 ms
+[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809
+[Load] CondEncoder backend: CPU (CPU threads: 16)
+[GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864
+[Load] LyricEncoder: 8L
+[Qwen3] Attn: Q+K fused, V separate
+[Qwen3] MLP: gate+up fused
+[Load] TimbreEncoder: 4L
+[Qwen3] Attn: Q+K fused, V separate
+[Qwen3] MLP: gate+up fused
+[WeightCtx] Loaded 140 tensors, 352.5 MB into backend
+[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond
+[Load] ConditionEncoder: 37.5 ms
+[CondEnc] Lyric sliding mask: 167x167, window=128
+[CondEnc] Timbre sliding mask: 750x750, window=128
+[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens
+[Encode] ConditionEncoder: 294.2 ms, enc_S=238
+[Debug] enc_hidden: [238, 2048] first4: 1.759313 -0.049345 -0.129442 0.055759
+[GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864
+[WeightCtx] Loaded 30 tensors, 64.7 MB into backend
+[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64)
+[Load] Detokenizer: 10.1 ms
+[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz)
+[Context] Detokenizer: 354.8 ms
+[Debug] detok_output: [2170, 64] first4: -0.106265 1.448869 0.309591 -0.650098
+[Context Batch0] Philox noise seed=42, [2170, 64]
+[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656
+[Debug] context: [2170, 128] first4: -0.106265 1.448869 0.309591 -0.650098
+[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1
+[DiT] Batch N=1, T=2170, S=1085, enc_S=238
+[DiT] Graph: 2063 nodes
+[Debug] tproj: [12288] first4: 0.261574 -0.159668 -0.089874 0.048361
+[Debug] temb: [2048] first4: 0.000181 -0.133893 -0.034492 0.065095
+[Debug] temb_t: [2048] first4: 0.000984 0.025702 -0.052155 0.063359
+[Debug] temb_r: [2048] first4: -0.000803 -0.159595 0.017663 0.001736
+[Debug] sinusoidal_t: [256] first4: 0.562379 0.789627 0.439928 -0.023645
+[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000
+[Debug] temb_lin1_t: [2048] first4: -0.049462 -0.052971 -0.011985 -0.047441
+[Debug] temb_lin1_r: [2048] first4: -0.015463 -0.031534 -0.021259 0.006135
+[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.057382 -0.990466 0.522861 0.451163
+[Debug] proj_in_input: [192, 2170] first4: -0.106265 1.448869 0.309591 -0.650098
+[Debug] enc_after_cond_emb: [2048, 238] first4: -0.171472 0.759029 0.290676 -0.533397
+[Debug] layer0_sa_input: [2048, 1085] first4: -0.732369 -0.771010 -0.041992 0.259081
+[Debug] layer0_q_after_rope: [128, 16] first4: -0.171472 0.759029 0.290676 -0.533397
+[Debug] layer0_k_after_rope: [128, 8] first4: -1.587325 -1.063579 0.053489 0.460284
+[Debug] layer0_sa_output: [2048, 1085] first4: -1.605205 0.165836 -0.485558 0.452734
+[Debug] layer0_attn_out: [2048, 1085] first4: -1.587325 -1.063579 0.053489 0.460284
+[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.587325 -1.063579 0.053489 0.460284
+[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.703787 -0.846621 -0.436453 0.503148
+[Debug] hidden_after_layer0: [2048, 1085] first4: -8.930592 0.456150 48.587612 -0.801327
+[Debug] hidden_after_layer6: [2048, 1085] first4: -21.484295 -2.022109 30.954683 -3.475530
+[Debug] hidden_after_layer12: [2048, 1085] first4: -18.011547 -13.821573 70.228333 29.257874
+[Debug] hidden_after_layer18: [2048, 1085] first4: -17.142008 9.257736 59.313492 18.404408
+[Debug] hidden_after_layer23: [2048, 1085] first4: -20.417297 8.254404 182.146759 136.554886
+[Debug] dit_step0_vt: [2170, 64] first4: -0.054831 1.071052 0.246038 2.201593
+[Debug] dit_step0_xt: [2170, 64] first4: 0.196828 2.107566 -0.183059 0.747584
+[DiT] step 1/8 t=1.000
+[Debug] dit_step1_vt: [2170, 64] first4: -0.128807 1.226092 -0.249701 1.890724
+[Debug] dit_step1_xt: [2170, 64] first4: 0.203854 2.040688 -0.169438 0.644453
+[DiT] step 2/8 t=0.955
+[Debug] dit_step2_vt: [2170, 64] first4: 0.003495 1.153559 0.065743 2.214043
+[Debug] dit_step2_xt: [2170, 64] first4: 0.203621 1.963784 -0.173821 0.496851
+[DiT] step 3/8 t=0.900
+[Debug] dit_step3_vt: [2170, 64] first4: 0.260204 1.180074 0.269396 2.564617
+[Debug] dit_step3_xt: [2170, 64] first4: 0.181937 1.865445 -0.196271 0.283133
+[DiT] step 4/8 t=0.833
+[Debug] dit_step4_vt: [2170, 64] first4: 0.294849 1.093781 0.087178 2.615031
+[Debug] dit_step4_xt: [2170, 64] first4: 0.150346 1.748254 -0.205612 0.002951
+[DiT] step 5/8 t=0.750
+[Debug] dit_step5_vt: [2170, 64] first4: 0.142651 1.068169 -0.503217 2.724137
+[Debug] dit_step5_xt: [2170, 64] first4: 0.129968 1.595658 -0.133723 -0.386212
+[DiT] step 6/8 t=0.643
+[Debug] dit_step6_vt: [2170, 64] first4: -0.109419 1.023015 -1.102168 2.820799
+[Debug] dit_step6_xt: [2170, 64] first4: 0.151852 1.391055 0.086710 -0.950372
+[DiT] step 7/8 t=0.500
+[Debug] dit_step7_vt: [2170, 64] first4: -0.463452 0.896626 -1.673395 3.222673
+[Debug] dit_x0: [2170, 64] first4: 0.290887 1.122067 0.588729 -1.917174
+[DiT] step 8/8 t=0.300
+[DiT] Total generation: 21769.5 ms (21769.5 ms/sample)
+[Debug] dit_output: [2170, 64] first4: 0.290887 1.122067 0.588729 -1.917174
+[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128)
+[VAE] Graph: 417 nodes, T_latent=192
+[VAE] Upsample factor: 1920.00 (expected ~1920)
+[VAE] Graph: 417 nodes, T_latent=256
+[VAE] Graph: 417 nodes, T_latent=186
+[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz)
+[VAE Batch0] Decode: 52184.7 ms
+[Debug] vae_audio: [2, 4166400] first4: 0.000272 0.000786 0.000556 0.000990
+[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo)
+[Request 1/1] Done
+[Pipeline] All done
+2026-03-01 20:03:15.903 | WARNING  | acestep.training.lora_utils:<module>:29 - PEFT library not installed. LoRA training will not be available.
+2026-03-01 20:03:15.903 | WARNING  | acestep.training.lokr_utils:<module>:24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora
+2026-03-01 20:03:15.903 | WARNING  | acestep.training.data_module:<module>:25 - Lightning not installed. Training module will not be available.
+2026-03-01 20:03:15.903 | WARNING  | acestep.training.trainer:<module>:28 - Lightning Fabric not installed. Training will use basic training loop.
+2026-03-01 20:03:15.904 | WARNING  | acestep.training.trainer:<module>:36 - bitsandbytes not installed. Using standard AdamW.
+2026-03-01 20:03:16.714 | INFO     | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa
+`torch_dtype` is deprecated! Use `dtype` instead!
+2026-03-01 20:03:18.309 | INFO     | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation...
+2026-03-01 20:03:18.309 | INFO     | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs...
+2026-03-01 20:03:18.315 | INFO     | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0...
+2026-03-01 20:03:18.480 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0...
+2026-03-01 20:03:18.482 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - 
+======================================================================
+2026-03-01 20:03:18.482 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference)
+2026-03-01 20:03:18.482 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ======================================================================
+2026-03-01 20:03:18.482 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt:
+# Instruction
+Generate audio semantic tokens based on the given conditions:
+
+# Caption
+An upbeat and anthemic pop-rock track driven by bright, slightly overdriven
+
+# Metas
+- bpm: 83
+- timesignature: 4
+- keyscale: G major
+- duration: 88 seconds
+<|endoftext|>
+
+2026-03-01 20:03:18.482 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ======================================================================
+2026-03-01 20:03:18.482 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text:
+# Languages
+fr
+
+# Lyric
+# Lyric
+[Intro - Guitar Riff]
+[Verse 1]
+Dans le monde des tutos virtuels
+G ta toise en nouvelle passion
+Avec Ggendoline et Pumbé à midi
+La communauté, c'est l'unité
+Quel joie, une clé
+
+[Chorus]
+Dans le monde des tutos virtuels
+Gândoline et Pumbé à midi
+Une famille à connecter, c'est vrai
+D'un enfant qui voit toi fusionner
+
+[Guitar Solo]
+
+[Verse 2]
+Dans le monde des tutos virtuels
+Gândoline, Pumbé à midi
+Une famille à connecter, c'est vrai
+D'un enfant qui voit toi fusionner<|endoftext|>
+2026-03-01 20:03:18.482 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ======================================================================
+
+2026-03-01 20:03:18.488 | INFO     | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings...
+2026-03-01 20:03:18.501 | INFO     | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings...
+2026-03-01 20:03:18.501 | INFO     | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda))
+2026-03-01 20:03:18.540 | INFO     | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)...
+2026-03-01 20:03:18.854 | INFO     | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents...
+2026-03-01 20:03:18.855 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16
+2026-03-01 20:03:18.855 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006970643997192383, 'diffusion_time_cost': 0.3072662353515625, 'diffusion_per_step_time_cost': 0.03840827941894531, 'total_time_cost': 0.3142368793487549, 'offload_time_cost': 0.0}
+2026-03-01 20:03:18.869 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE...
+2026-03-01 20:03:18.872 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB
+2026-03-01 20:03:18.872 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB
+2026-03-01 20:03:18.872 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage...
+2026-03-01 20:03:18.872 | DEBUG    | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB
+2026-03-01 20:03:18.872 | DEBUG    | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB
+2026-03-01 20:03:18.872 | INFO     | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170])
+2026-03-01 20:03:19.148 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB
+2026-03-01 20:03:19.151 | INFO     | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors...
+2026-03-01 20:03:19.154 | INFO     | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors.
+[Request] Loaded request0.json
+[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q4_K_M.gguf
+[GGML] Running acestep-v15-turbo-Q4_K_M.gguf...
+[GGML] Done, 47 dump files
+[Python] Initializing acestep-v15-turbo...
+[Python] Generating (acestep-v15-turbo, 8 steps)...
+Using precomputed LM hints
+Using precomputed LM hints
+[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo)
+[Python] Done, 40 dump files
+[Turbo] Cosine similarities GGML vs Python
+  stage                          GGML vs Python
+  text_hidden                          0.999816
+  lyric_embed                          1.000000
+  enc_hidden                           0.997095
+  detok_output                         0.999577
+  context                              0.999730
+  noise                                1.000000
+  temb_t                               0.999896
+  hidden_after_proj_in                 0.999903
+  enc_after_cond_emb                   0.997571
+  layer0_sa_output                     0.998370
+  hidden_after_layer0                  0.999619
+  hidden_after_layer6                  0.999177
+  hidden_after_layer12                 0.995111
+  hidden_after_layer18                 0.991459
+  hidden_after_layer23                 0.985217
+  dit_step0_vt                         0.946613
+  dit_step0_xt                         0.999883
+  dit_step1_vt                         0.947613
+  dit_step1_xt                         0.999611
+  dit_step2_vt                         0.958491
+  dit_step2_xt                         0.999010
+  dit_step3_vt                         0.962965
+  dit_step3_xt                         0.997773
+  dit_step4_vt                         0.960997
+  dit_step4_xt                         0.994989
+  dit_step5_vt                         0.957636
+  dit_step5_xt                         0.988832
+  dit_step6_vt                         0.952016
+  dit_step6_xt                         0.977196
+  dit_step7_vt                         0.939970
+  dit_x0                               0.959881
+  vae_audio                            0.834993
+  vae_audio (STFT cosine)              0.955098
+[Turbo] Error growth GGML vs Python
+  stage                         cos    max_err   mean_err     mean_A      std_A     mean_B      std_B
+  dit_step0_xt             0.999883   0.167680   0.010319  -0.002256   0.973185  -0.002342   0.972003
+  dit_step1_xt             0.999611   0.268237   0.018204  -0.005104   0.943179  -0.005313   0.941730
+  dit_step2_xt             0.999010   0.434671   0.027774  -0.009029   0.910147  -0.009311   0.908527
+  dit_step3_xt             0.997773   0.601206   0.039926  -0.014325   0.875171  -0.014577   0.873624
+  dit_step4_xt             0.994989   0.892883   0.057385  -0.021274   0.843615  -0.021660   0.841995
+  dit_step5_xt             0.988832   1.381146   0.083605  -0.031218   0.827061  -0.032109   0.824593
+  dit_step6_xt             0.977196   2.021005   0.123750  -0.045473   0.858175  -0.046482   0.855546
diff --git a/tests/CPU-Q5_K_M.log b/tests/CPU-Q5_K_M.log
new file mode 100644
index 0000000..e0d9936
--- /dev/null
+++ b/tests/CPU-Q5_K_M.log
@@ -0,0 +1,257 @@
+[Load] DiT backend: CPU (CPU threads: 16)
+[Load] Backend init: 1.6 ms
+[GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864
+[DiT] Self-attn: Q+K fused, V separate
+[DiT] Cross-attn: all separate
+[DiT] MLP: gate+up fused
+[Load] null_condition_emb found (CFG available)
+[WeightCtx] Loaded 478 tensors, 1061.2 MB into backend
+[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128
+[Load] DiT weight load: 140.3 ms
+[GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864
+[Load] silence_latent: [15000, 64] from GGUF
+[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048
+[Load] VAE backend: CPU (CPU threads: 16)
+[VAE] Backend: CPU, Weight buffer: 161.1 MB
+[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations
+[Load] VAE weights: 699.1 ms
+[Request 1/1] ggml-turbo/request0.json (batch=1)
+[Request] parsed ggml-turbo/request0.json (18 fields)
+[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0)
+[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s
+[Pipeline] 434 audio codes (86.8s @ 5Hz)
+[Pipeline] T=2170, S=1085
+[BPE] Loaded from GGUF: 151643 vocab, 151387 merges
+[Load] BPE tokenizer: 33.4 ms
+[Pipeline] caption: 70 tokens, lyrics: 167 tokens
+[Load] TextEncoder backend: CPU (CPU threads: 16)
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[Load] TextEncoder: 28L, H=1024, Nh=16/8
+[Qwen3] Attn: Q+K+V fused
+[Qwen3] MLP: gate+up fused
+[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend
+[Load] TextEncoder: 149.7 ms
+[Encode] TextEncoder (70 tokens): 57.3 ms
+[Debug] text_hidden: [70, 1024] first4: 3.704526 2.436253 0.222853 -13.131872
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[Encode] Lyric vocab lookup (167 tokens): 12.5 ms
+[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809
+[Load] CondEncoder backend: CPU (CPU threads: 16)
+[GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864
+[Load] LyricEncoder: 8L
+[Qwen3] Attn: Q+K fused, V separate
+[Qwen3] MLP: gate+up fused
+[Load] TimbreEncoder: 4L
+[Qwen3] Attn: Q+K fused, V separate
+[Qwen3] MLP: gate+up fused
+[WeightCtx] Loaded 140 tensors, 412.5 MB into backend
+[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond
+[Load] ConditionEncoder: 45.1 ms
+[CondEnc] Lyric sliding mask: 167x167, window=128
+[CondEnc] Timbre sliding mask: 750x750, window=128
+[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens
+[Encode] ConditionEncoder: 387.5 ms, enc_S=238
+[Debug] enc_hidden: [238, 2048] first4: 1.760901 -0.053445 -0.132760 0.058505
+[GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864
+[WeightCtx] Loaded 30 tensors, 73.2 MB into backend
+[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64)
+[Load] Detokenizer: 11.3 ms
+[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz)
+[Context] Detokenizer: 447.0 ms
+[Debug] detok_output: [2170, 64] first4: -0.129311 1.458194 0.298132 -0.651512
+[Context Batch0] Philox noise seed=42, [2170, 64]
+[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656
+[Debug] context: [2170, 128] first4: -0.129311 1.458194 0.298132 -0.651512
+[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1
+[DiT] Batch N=1, T=2170, S=1085, enc_S=238
+[DiT] Graph: 2063 nodes
+[Debug] tproj: [12288] first4: 0.261152 -0.161305 -0.103153 0.050892
+[Debug] temb: [2048] first4: -0.000119 -0.132132 -0.035650 0.065085
+[Debug] temb_t: [2048] first4: 0.000588 0.026848 -0.052924 0.063878
+[Debug] temb_r: [2048] first4: -0.000708 -0.158980 0.017274 0.001208
+[Debug] sinusoidal_t: [256] first4: 0.562379 0.789627 0.439928 -0.023645
+[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000
+[Debug] temb_lin1_t: [2048] first4: -0.051319 -0.053246 -0.011899 -0.038818
+[Debug] temb_lin1_r: [2048] first4: -0.016165 -0.021121 -0.015801 -0.000525
+[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.048950 -0.942691 0.537616 0.450821
+[Debug] proj_in_input: [192, 2170] first4: -0.129311 1.458194 0.298132 -0.651512
+[Debug] enc_after_cond_emb: [2048, 238] first4: -0.151010 0.749188 0.347886 -0.528254
+[Debug] layer0_sa_input: [2048, 1085] first4: -0.726623 -0.748099 -0.053174 0.262053
+[Debug] layer0_q_after_rope: [128, 16] first4: -0.151010 0.749188 0.347886 -0.528254
+[Debug] layer0_k_after_rope: [128, 8] first4: -1.551637 -1.002339 0.163270 0.462290
+[Debug] layer0_sa_output: [2048, 1085] first4: -1.510043 0.134910 -0.385166 0.487419
+[Debug] layer0_attn_out: [2048, 1085] first4: -1.551637 -1.002339 0.163270 0.462290
+[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.551637 -1.002339 0.163270 0.462290
+[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.601043 -0.768895 -0.323166 0.504161
+[Debug] hidden_after_layer0: [2048, 1085] first4: -9.313718 0.740223 52.142769 -0.880804
+[Debug] hidden_after_layer6: [2048, 1085] first4: -21.028343 0.455638 29.972351 -4.651019
+[Debug] hidden_after_layer12: [2048, 1085] first4: -17.875141 -17.099358 67.074074 24.887821
+[Debug] hidden_after_layer18: [2048, 1085] first4: -24.271315 11.994616 56.276474 19.815941
+[Debug] hidden_after_layer23: [2048, 1085] first4: -9.757540 40.914558 193.229523 152.458817
+[Debug] dit_step0_vt: [2170, 64] first4: -0.008601 1.160695 0.325083 2.395968
+[Debug] dit_step0_xt: [2170, 64] first4: 0.194727 2.103491 -0.186652 0.738749
+[DiT] step 1/8 t=1.000
+[Debug] dit_step1_vt: [2170, 64] first4: -0.246968 1.361296 -0.140900 1.930280
+[Debug] dit_step1_xt: [2170, 64] first4: 0.208198 2.029238 -0.178966 0.633461
+[DiT] step 2/8 t=0.955
+[Debug] dit_step2_vt: [2170, 64] first4: -0.093393 1.253966 0.122121 2.387282
+[Debug] dit_step2_xt: [2170, 64] first4: 0.214424 1.945641 -0.187107 0.474308
+[DiT] step 3/8 t=0.900
+[Debug] dit_step3_vt: [2170, 64] first4: 0.283676 1.140476 0.250461 2.641533
+[Debug] dit_step3_xt: [2170, 64] first4: 0.190784 1.850601 -0.207979 0.254181
+[DiT] step 4/8 t=0.833
+[Debug] dit_step4_vt: [2170, 64] first4: 0.314606 0.873225 0.069223 2.711446
+[Debug] dit_step4_xt: [2170, 64] first4: 0.157077 1.757041 -0.215396 -0.036331
+[DiT] step 5/8 t=0.750
+[Debug] dit_step5_vt: [2170, 64] first4: 0.377209 0.828215 -0.406894 2.727257
+[Debug] dit_step5_xt: [2170, 64] first4: 0.103190 1.638725 -0.157268 -0.425940
+[DiT] step 6/8 t=0.643
+[Debug] dit_step6_vt: [2170, 64] first4: 0.230187 0.630044 -0.936850 2.799204
+[Debug] dit_step6_xt: [2170, 64] first4: 0.057152 1.512716 0.030102 -0.985780
+[DiT] step 7/8 t=0.500
+[Debug] dit_step7_vt: [2170, 64] first4: -0.003599 0.325174 -1.377289 3.053612
+[Debug] dit_x0: [2170, 64] first4: 0.058232 1.415164 0.443289 -1.901864
+[DiT] step 8/8 t=0.300
+[DiT] Total generation: 27970.1 ms (27970.1 ms/sample)
+[Debug] dit_output: [2170, 64] first4: 0.058232 1.415164 0.443289 -1.901864
+[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128)
+[VAE] Graph: 417 nodes, T_latent=192
+[VAE] Upsample factor: 1920.00 (expected ~1920)
+[VAE] Graph: 417 nodes, T_latent=256
+[VAE] Graph: 417 nodes, T_latent=186
+[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz)
+[VAE Batch0] Decode: 51966.1 ms
+[Debug] vae_audio: [2, 4166400] first4: 0.000740 0.001305 0.001083 0.001434
+[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo)
+[Request 1/1] Done
+[Pipeline] All done
+2026-03-01 20:01:55.226 | WARNING  | acestep.training.lora_utils:<module>:29 - PEFT library not installed. LoRA training will not be available.
+2026-03-01 20:01:55.226 | WARNING  | acestep.training.lokr_utils:<module>:24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora
+2026-03-01 20:01:55.226 | WARNING  | acestep.training.data_module:<module>:25 - Lightning not installed. Training module will not be available.
+2026-03-01 20:01:55.226 | WARNING  | acestep.training.trainer:<module>:28 - Lightning Fabric not installed. Training will use basic training loop.
+2026-03-01 20:01:55.226 | WARNING  | acestep.training.trainer:<module>:36 - bitsandbytes not installed. Using standard AdamW.
+2026-03-01 20:01:56.032 | INFO     | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa
+`torch_dtype` is deprecated! Use `dtype` instead!
+2026-03-01 20:01:57.576 | INFO     | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation...
+2026-03-01 20:01:57.577 | INFO     | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs...
+2026-03-01 20:01:57.581 | INFO     | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0...
+2026-03-01 20:01:57.747 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0...
+2026-03-01 20:01:57.749 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - 
+======================================================================
+2026-03-01 20:01:57.749 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference)
+2026-03-01 20:01:57.749 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ======================================================================
+2026-03-01 20:01:57.749 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt:
+# Instruction
+Generate audio semantic tokens based on the given conditions:
+
+# Caption
+An upbeat and anthemic pop-rock track driven by bright, slightly overdriven
+
+# Metas
+- bpm: 83
+- timesignature: 4
+- keyscale: G major
+- duration: 88 seconds
+<|endoftext|>
+
+2026-03-01 20:01:57.749 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ======================================================================
+2026-03-01 20:01:57.749 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text:
+# Languages
+fr
+
+# Lyric
+# Lyric
+[Intro - Guitar Riff]
+[Verse 1]
+Dans le monde des tutos virtuels
+G ta toise en nouvelle passion
+Avec Ggendoline et Pumbé à midi
+La communauté, c'est l'unité
+Quel joie, une clé
+
+[Chorus]
+Dans le monde des tutos virtuels
+Gândoline et Pumbé à midi
+Une famille à connecter, c'est vrai
+D'un enfant qui voit toi fusionner
+
+[Guitar Solo]
+
+[Verse 2]
+Dans le monde des tutos virtuels
+Gândoline, Pumbé à midi
+Une famille à connecter, c'est vrai
+D'un enfant qui voit toi fusionner<|endoftext|>
+2026-03-01 20:01:57.749 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ======================================================================
+
+2026-03-01 20:01:57.755 | INFO     | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings...
+2026-03-01 20:01:57.768 | INFO     | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings...
+2026-03-01 20:01:57.768 | INFO     | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda))
+2026-03-01 20:01:57.801 | INFO     | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)...
+2026-03-01 20:01:58.109 | INFO     | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents...
+2026-03-01 20:01:58.109 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16
+2026-03-01 20:01:58.109 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.007002353668212891, 'diffusion_time_cost': 0.30033254623413086, 'diffusion_per_step_time_cost': 0.03754156827926636, 'total_time_cost': 0.30733489990234375, 'offload_time_cost': 0.0}
+2026-03-01 20:01:58.124 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE...
+2026-03-01 20:01:58.126 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB
+2026-03-01 20:01:58.126 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB
+2026-03-01 20:01:58.126 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage...
+2026-03-01 20:01:58.126 | DEBUG    | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB
+2026-03-01 20:01:58.126 | DEBUG    | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB
+2026-03-01 20:01:58.126 | INFO     | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170])
+2026-03-01 20:01:58.401 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB
+2026-03-01 20:01:58.403 | INFO     | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors...
+2026-03-01 20:01:58.406 | INFO     | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors.
+[Request] Loaded request0.json
+[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q5_K_M.gguf
+[GGML] Running acestep-v15-turbo-Q5_K_M.gguf...
+[GGML] Done, 47 dump files
+[Python] Initializing acestep-v15-turbo...
+[Python] Generating (acestep-v15-turbo, 8 steps)...
+Using precomputed LM hints
+Using precomputed LM hints
+[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo)
+[Python] Done, 40 dump files
+[Turbo] Cosine similarities GGML vs Python
+  stage                          GGML vs Python
+  text_hidden                          0.999816
+  lyric_embed                          1.000000
+  enc_hidden                           0.999099
+  detok_output                         0.999843
+  context                              0.999900
+  noise                                1.000000
+  temb_t                               0.999968
+  hidden_after_proj_in                 0.999954
+  enc_after_cond_emb                   0.999196
+  layer0_sa_output                     0.999388
+  hidden_after_layer0                  0.999773
+  hidden_after_layer6                  0.999687
+  hidden_after_layer12                 0.998560
+  hidden_after_layer18                 0.995178
+  hidden_after_layer23                 0.990907
+  dit_step0_vt                         0.966084
+  dit_step0_xt                         0.999926
+  dit_step1_vt                         0.972329
+  dit_step1_xt                         0.999780
+  dit_step2_vt                         0.971107
+  dit_step2_xt                         0.999383
+  dit_step3_vt                         0.973886
+  dit_step3_xt                         0.998543
+  dit_step4_vt                         0.971976
+  dit_step4_xt                         0.996642
+  dit_step5_vt                         0.967575
+  dit_step5_xt                         0.992211
+  dit_step6_vt                         0.962964
+  dit_step6_xt                         0.983513
+  dit_step7_vt                         0.954349
+  dit_x0                               0.970379
+  vae_audio                            0.874800
+  vae_audio (STFT cosine)              0.967703
+[Turbo] Error growth GGML vs Python
+  stage                         cos    max_err   mean_err     mean_A      std_A     mean_B      std_B
+  dit_step0_xt             0.999926   0.135378   0.008030  -0.002303   0.973012  -0.002342   0.972003
+  dit_step1_xt             0.999780   0.276712   0.013491  -0.005310   0.942849  -0.005313   0.941730
+  dit_step2_xt             0.999383   0.460420   0.021261  -0.009337   0.909465  -0.009311   0.908527
+  dit_step3_xt             0.998543   0.681684   0.031463  -0.014739   0.874175  -0.014577   0.873624
+  dit_step4_xt             0.996642   0.853164   0.045737  -0.021967   0.842445  -0.021660   0.841995
+  dit_step5_xt             0.992211   1.314129   0.067657  -0.032346   0.825989  -0.032109   0.824593
+  dit_step6_xt             0.983513   2.191432   0.101363  -0.046949   0.857195  -0.046482   0.855546
diff --git a/tests/CPU-Q6_K.log b/tests/CPU-Q6_K.log
new file mode 100644
index 0000000..7d4c411
--- /dev/null
+++ b/tests/CPU-Q6_K.log
@@ -0,0 +1,257 @@
+[Load] DiT backend: CPU (CPU threads: 16)
+[Load] Backend init: 1.6 ms
+[GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864
+[DiT] Self-attn: Q+K+V fused
+[DiT] Cross-attn: Q+K+V fused
+[DiT] MLP: gate+up fused
+[Load] null_condition_emb found (CFG available)
+[WeightCtx] Loaded 478 tensors, 1237.2 MB into backend
+[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128
+[Load] DiT weight load: 169.4 ms
+[GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864
+[Load] silence_latent: [15000, 64] from GGUF
+[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048
+[Load] VAE backend: CPU (CPU threads: 16)
+[VAE] Backend: CPU, Weight buffer: 161.1 MB
+[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations
+[Load] VAE weights: 699.2 ms
+[Request 1/1] ggml-turbo/request0.json (batch=1)
+[Request] parsed ggml-turbo/request0.json (18 fields)
+[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0)
+[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s
+[Pipeline] 434 audio codes (86.8s @ 5Hz)
+[Pipeline] T=2170, S=1085
+[BPE] Loaded from GGUF: 151643 vocab, 151387 merges
+[Load] BPE tokenizer: 32.5 ms
+[Pipeline] caption: 70 tokens, lyrics: 167 tokens
+[Load] TextEncoder backend: CPU (CPU threads: 16)
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[Load] TextEncoder: 28L, H=1024, Nh=16/8
+[Qwen3] Attn: Q+K+V fused
+[Qwen3] MLP: gate+up fused
+[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend
+[Load] TextEncoder: 148.3 ms
+[Encode] TextEncoder (70 tokens): 57.5 ms
+[Debug] text_hidden: [70, 1024] first4: 3.704526 2.436253 0.222853 -13.131872
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[Encode] Lyric vocab lookup (167 tokens): 12.6 ms
+[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809
+[Load] CondEncoder backend: CPU (CPU threads: 16)
+[GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864
+[Load] LyricEncoder: 8L
+[Qwen3] Attn: Q+K+V fused
+[Qwen3] MLP: gate+up fused
+[Load] TimbreEncoder: 4L
+[Qwen3] Attn: Q+K+V fused
+[Qwen3] MLP: gate+up fused
+[WeightCtx] Loaded 140 tensors, 476.3 MB into backend
+[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond
+[Load] ConditionEncoder: 52.6 ms
+[CondEnc] Lyric sliding mask: 167x167, window=128
+[CondEnc] Timbre sliding mask: 750x750, window=128
+[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens
+[Encode] ConditionEncoder: 348.9 ms, enc_S=238
+[Debug] enc_hidden: [238, 2048] first4: 1.761694 -0.052035 -0.131773 0.058231
+[GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864
+[WeightCtx] Loaded 30 tensors, 82.2 MB into backend
+[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64)
+[Load] Detokenizer: 12.3 ms
+[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz)
+[Context] Detokenizer: 414.3 ms
+[Debug] detok_output: [2170, 64] first4: -0.151355 1.462444 0.326907 -0.627213
+[Context Batch0] Philox noise seed=42, [2170, 64]
+[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656
+[Debug] context: [2170, 128] first4: -0.151355 1.462444 0.326907 -0.627213
+[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1
+[DiT] Batch N=1, T=2170, S=1085, enc_S=238
+[DiT] Graph: 2129 nodes
+[Debug] tproj: [12288] first4: 0.261809 -0.161156 -0.099489 0.050901
+[Debug] temb: [2048] first4: 0.000441 -0.132284 -0.035603 0.064823
+[Debug] temb_t: [2048] first4: 0.001519 0.026983 -0.052936 0.063921
+[Debug] temb_r: [2048] first4: -0.001078 -0.159268 0.017333 0.000903
+[Debug] sinusoidal_t: [256] first4: 0.562379 0.789627 0.439928 -0.023645
+[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000
+[Debug] temb_lin1_t: [2048] first4: -0.049242 -0.050737 -0.017494 -0.036973
+[Debug] temb_lin1_r: [2048] first4: -0.014408 -0.020609 -0.015729 0.003875
+[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.041706 -0.935163 0.543316 0.447904
+[Debug] proj_in_input: [192, 2170] first4: -0.151355 1.462444 0.326907 -0.627213
+[Debug] enc_after_cond_emb: [2048, 238] first4: -0.170483 0.826965 0.338536 -0.581525
+[Debug] layer0_sa_input: [2048, 1085] first4: -0.719262 -0.743265 -0.048909 0.260726
+[Debug] layer0_q_after_rope: [128, 16] first4: -1.546578 -1.031349 0.213821 0.458892
+[Debug] layer0_k_after_rope: [128, 8] first4: -0.170483 0.826965 0.338536 -0.581525
+[Debug] layer0_sa_output: [2048, 1085] first4: -1.510827 0.216662 -0.337830 0.522569
+[Debug] layer0_attn_out: [2048, 1085] first4: -1.546578 -1.031349 0.213821 0.458892
+[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.546578 -1.031349 0.213821 0.458892
+[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.610117 -0.795587 -0.288174 0.502934
+[Debug] hidden_after_layer0: [2048, 1085] first4: -9.220036 0.587352 53.159882 -0.942435
+[Debug] hidden_after_layer6: [2048, 1085] first4: -21.447939 -0.975549 35.157303 -4.845882
+[Debug] hidden_after_layer12: [2048, 1085] first4: -16.561256 -16.121094 76.819672 30.808043
+[Debug] hidden_after_layer18: [2048, 1085] first4: -29.809811 13.925017 66.285889 19.847790
+[Debug] hidden_after_layer23: [2048, 1085] first4: -21.918661 46.159637 204.710663 138.480270
+[Debug] dit_step0_vt: [2170, 64] first4: 0.100316 1.102248 0.318693 2.394090
+[Debug] dit_step0_xt: [2170, 64] first4: 0.189776 2.106148 -0.186361 0.738834
+[DiT] step 1/8 t=1.000
+[Debug] dit_step1_vt: [2170, 64] first4: -0.077579 1.336049 -0.205877 1.979667
+[Debug] dit_step1_xt: [2170, 64] first4: 0.194008 2.033272 -0.175131 0.630852
+[DiT] step 2/8 t=0.955
+[Debug] dit_step2_vt: [2170, 64] first4: 0.089277 1.192314 0.088705 2.392204
+[Debug] dit_step2_xt: [2170, 64] first4: 0.188056 1.953785 -0.181045 0.471372
+[DiT] step 3/8 t=0.900
+[Debug] dit_step3_vt: [2170, 64] first4: 0.306248 1.088670 0.212184 2.674479
+[Debug] dit_step3_xt: [2170, 64] first4: 0.162535 1.863062 -0.198727 0.248499
+[DiT] step 4/8 t=0.833
+[Debug] dit_step4_vt: [2170, 64] first4: 0.330824 1.012156 0.074096 2.759729
+[Debug] dit_step4_xt: [2170, 64] first4: 0.127090 1.754617 -0.206666 -0.047187
+[DiT] step 5/8 t=0.750
+[Debug] dit_step5_vt: [2170, 64] first4: 0.330529 0.879730 -0.335447 2.785841
+[Debug] dit_step5_xt: [2170, 64] first4: 0.079871 1.628941 -0.158745 -0.445164
+[DiT] step 6/8 t=0.643
+[Debug] dit_step6_vt: [2170, 64] first4: 0.198573 0.657394 -0.886720 2.779941
+[Debug] dit_step6_xt: [2170, 64] first4: 0.040157 1.497462 0.018599 -1.001152
+[DiT] step 7/8 t=0.500
+[Debug] dit_step7_vt: [2170, 64] first4: 0.118016 0.207620 -1.266971 2.955565
+[Debug] dit_x0: [2170, 64] first4: 0.004752 1.435176 0.398691 -1.887822
+[DiT] step 8/8 t=0.300
+[DiT] Total generation: 25398.3 ms (25398.3 ms/sample)
+[Debug] dit_output: [2170, 64] first4: 0.004752 1.435176 0.398691 -1.887822
+[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128)
+[VAE] Graph: 417 nodes, T_latent=192
+[VAE] Upsample factor: 1920.00 (expected ~1920)
+[VAE] Graph: 417 nodes, T_latent=256
+[VAE] Graph: 417 nodes, T_latent=186
+[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz)
+[VAE Batch0] Decode: 52074.7 ms
+[Debug] vae_audio: [2, 4166400] first4: 0.000467 0.001015 0.000873 0.001303
+[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo)
+[Request 1/1] Done
+[Pipeline] All done
+2026-03-01 20:00:28.298 | WARNING  | acestep.training.lora_utils:<module>:29 - PEFT library not installed. LoRA training will not be available.
+2026-03-01 20:00:28.298 | WARNING  | acestep.training.lokr_utils:<module>:24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora
+2026-03-01 20:00:28.298 | WARNING  | acestep.training.data_module:<module>:25 - Lightning not installed. Training module will not be available.
+2026-03-01 20:00:28.298 | WARNING  | acestep.training.trainer:<module>:28 - Lightning Fabric not installed. Training will use basic training loop.
+2026-03-01 20:00:28.298 | WARNING  | acestep.training.trainer:<module>:36 - bitsandbytes not installed. Using standard AdamW.
+2026-03-01 20:00:29.103 | INFO     | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa
+`torch_dtype` is deprecated! Use `dtype` instead!
+2026-03-01 20:00:30.690 | INFO     | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation...
+2026-03-01 20:00:30.690 | INFO     | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs...
+2026-03-01 20:00:30.695 | INFO     | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0...
+2026-03-01 20:00:30.860 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0...
+2026-03-01 20:00:30.862 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - 
+======================================================================
+2026-03-01 20:00:30.862 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference)
+2026-03-01 20:00:30.862 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ======================================================================
+2026-03-01 20:00:30.862 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt:
+# Instruction
+Generate audio semantic tokens based on the given conditions:
+
+# Caption
+An upbeat and anthemic pop-rock track driven by bright, slightly overdriven
+
+# Metas
+- bpm: 83
+- timesignature: 4
+- keyscale: G major
+- duration: 88 seconds
+<|endoftext|>
+
+2026-03-01 20:00:30.862 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ======================================================================
+2026-03-01 20:00:30.862 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text:
+# Languages
+fr
+
+# Lyric
+# Lyric
+[Intro - Guitar Riff]
+[Verse 1]
+Dans le monde des tutos virtuels
+G ta toise en nouvelle passion
+Avec Ggendoline et Pumbé à midi
+La communauté, c'est l'unité
+Quel joie, une clé
+
+[Chorus]
+Dans le monde des tutos virtuels
+Gândoline et Pumbé à midi
+Une famille à connecter, c'est vrai
+D'un enfant qui voit toi fusionner
+
+[Guitar Solo]
+
+[Verse 2]
+Dans le monde des tutos virtuels
+Gândoline, Pumbé à midi
+Une famille à connecter, c'est vrai
+D'un enfant qui voit toi fusionner<|endoftext|>
+2026-03-01 20:00:30.862 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ======================================================================
+
+2026-03-01 20:00:30.869 | INFO     | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings...
+2026-03-01 20:00:30.881 | INFO     | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings...
+2026-03-01 20:00:30.882 | INFO     | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda))
+2026-03-01 20:00:30.914 | INFO     | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)...
+2026-03-01 20:00:31.231 | INFO     | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents...
+2026-03-01 20:00:31.232 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16
+2026-03-01 20:00:31.232 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006938934326171875, 'diffusion_time_cost': 0.31071925163269043, 'diffusion_per_step_time_cost': 0.038839906454086304, 'total_time_cost': 0.3176581859588623, 'offload_time_cost': 0.0}
+2026-03-01 20:00:31.246 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE...
+2026-03-01 20:00:31.249 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB
+2026-03-01 20:00:31.249 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.40 GB
+2026-03-01 20:00:31.249 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage...
+2026-03-01 20:00:31.249 | DEBUG    | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.40 GB
+2026-03-01 20:00:31.249 | DEBUG    | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.40 GB
+2026-03-01 20:00:31.249 | INFO     | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170])
+2026-03-01 20:00:31.524 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB
+2026-03-01 20:00:31.527 | INFO     | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors...
+2026-03-01 20:00:31.531 | INFO     | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors.
+[Request] Loaded request0.json
+[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q6_K.gguf
+[GGML] Running acestep-v15-turbo-Q6_K.gguf...
+[GGML] Done, 47 dump files
+[Python] Initializing acestep-v15-turbo...
+[Python] Generating (acestep-v15-turbo, 8 steps)...
+Using precomputed LM hints
+Using precomputed LM hints
+[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo)
+[Python] Done, 40 dump files
+[Turbo] Cosine similarities GGML vs Python
+  stage                          GGML vs Python
+  text_hidden                          0.999816
+  lyric_embed                          1.000000
+  enc_hidden                           0.999634
+  detok_output                         0.999927
+  context                              0.999954
+  noise                                1.000000
+  temb_t                               0.999986
+  hidden_after_proj_in                 0.999975
+  enc_after_cond_emb                   0.999619
+  layer0_sa_output                     0.999718
+  hidden_after_layer0                  0.999827
+  hidden_after_layer6                  0.999788
+  hidden_after_layer12                 0.998843
+  hidden_after_layer18                 0.995848
+  hidden_after_layer23                 0.992196
+  dit_step0_vt                         0.971124
+  dit_step0_xt                         0.999936
+  dit_step1_vt                         0.975111
+  dit_step1_xt                         0.999802
+  dit_step2_vt                         0.978218
+  dit_step2_xt                         0.999477
+  dit_step3_vt                         0.977576
+  dit_step3_xt                         0.998723
+  dit_step4_vt                         0.973938
+  dit_step4_xt                         0.996945
+  dit_step5_vt                         0.969356
+  dit_step5_xt                         0.992753
+  dit_step6_vt                         0.965671
+  dit_step6_xt                         0.984569
+  dit_step7_vt                         0.958147
+  dit_x0                               0.972312
+  vae_audio                            0.891761
+  vae_audio (STFT cosine)              0.969080
+[Turbo] Error growth GGML vs Python
+  stage                         cos    max_err   mean_err     mean_A      std_A     mean_B      std_B
+  dit_step0_xt             0.999936   0.151952   0.007283  -0.002271   0.972870  -0.002342   0.972003
+  dit_step1_xt             0.999802   0.296519   0.012516  -0.005212   0.942575  -0.005313   0.941730
+  dit_step2_xt             0.999477   0.478400   0.019283  -0.009184   0.908992  -0.009311   0.908527
+  dit_step3_xt             0.998723   0.734609   0.028810  -0.014535   0.873457  -0.014577   0.873624
+  dit_step4_xt             0.996945   1.045720   0.042804  -0.021712   0.841447  -0.021660   0.841995
+  dit_step5_xt             0.992753   1.512605   0.064324  -0.032020   0.824620  -0.032109   0.824593
+  dit_step6_xt             0.984569   2.166596   0.096699  -0.046604   0.855715  -0.046482   0.855546
diff --git a/tests/CPU-Q8_0.log b/tests/CPU-Q8_0.log
new file mode 100644
index 0000000..76183ea
--- /dev/null
+++ b/tests/CPU-Q8_0.log
@@ -0,0 +1,257 @@
+[Load] DiT backend: CPU (CPU threads: 16)
+[Load] Backend init: 1.6 ms
+[GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864
+[DiT] Self-attn: Q+K+V fused
+[DiT] Cross-attn: Q+K+V fused
+[DiT] MLP: gate+up fused
+[Load] null_condition_emb found (CFG available)
+[WeightCtx] Loaded 478 tensors, 1600.7 MB into backend
+[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128
+[Load] DiT weight load: 188.0 ms
+[GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864
+[Load] silence_latent: [15000, 64] from GGUF
+[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048
+[Load] VAE backend: CPU (CPU threads: 16)
+[VAE] Backend: CPU, Weight buffer: 161.1 MB
+[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations
+[Load] VAE weights: 690.8 ms
+[Request 1/1] ggml-turbo/request0.json (batch=1)
+[Request] parsed ggml-turbo/request0.json (18 fields)
+[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0)
+[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s
+[Pipeline] 434 audio codes (86.8s @ 5Hz)
+[Pipeline] T=2170, S=1085
+[BPE] Loaded from GGUF: 151643 vocab, 151387 merges
+[Load] BPE tokenizer: 32.8 ms
+[Pipeline] caption: 70 tokens, lyrics: 167 tokens
+[Load] TextEncoder backend: CPU (CPU threads: 16)
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[Load] TextEncoder: 28L, H=1024, Nh=16/8
+[Qwen3] Attn: Q+K+V fused
+[Qwen3] MLP: gate+up fused
+[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend
+[Load] TextEncoder: 160.0 ms
+[Encode] TextEncoder (70 tokens): 57.9 ms
+[Debug] text_hidden: [70, 1024] first4: 3.704526 2.436253 0.222853 -13.131872
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[Encode] Lyric vocab lookup (167 tokens): 13.0 ms
+[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809
+[Load] CondEncoder backend: CPU (CPU threads: 16)
+[GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864
+[Load] LyricEncoder: 8L
+[Qwen3] Attn: Q+K+V fused
+[Qwen3] MLP: gate+up fused
+[Load] TimbreEncoder: 4L
+[Qwen3] Attn: Q+K+V fused
+[Qwen3] MLP: gate+up fused
+[WeightCtx] Loaded 140 tensors, 616.6 MB into backend
+[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond
+[Load] ConditionEncoder: 126.4 ms
+[CondEnc] Lyric sliding mask: 167x167, window=128
+[CondEnc] Timbre sliding mask: 750x750, window=128
+[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens
+[Encode] ConditionEncoder: 390.3 ms, enc_S=238
+[Debug] enc_hidden: [238, 2048] first4: 1.758873 -0.049568 -0.132802 0.057792
+[GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864
+[WeightCtx] Loaded 30 tensors, 106.5 MB into backend
+[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64)
+[Load] Detokenizer: 13.6 ms
+[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz)
+[Context] Detokenizer: 447.8 ms
+[Debug] detok_output: [2170, 64] first4: -0.126218 1.441045 0.305219 -0.629688
+[Context Batch0] Philox noise seed=42, [2170, 64]
+[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656
+[Debug] context: [2170, 128] first4: -0.126218 1.441045 0.305219 -0.629688
+[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1
+[DiT] Batch N=1, T=2170, S=1085, enc_S=238
+[DiT] Graph: 2129 nodes
+[Debug] tproj: [12288] first4: 0.259485 -0.161550 -0.096885 0.051766
+[Debug] temb: [2048] first4: 0.000214 -0.132557 -0.035428 0.064847
+[Debug] temb_t: [2048] first4: 0.001194 0.026823 -0.052744 0.063762
+[Debug] temb_r: [2048] first4: -0.000980 -0.159380 0.017316 0.001084
+[Debug] sinusoidal_t: [256] first4: 0.562379 0.789627 0.439928 -0.023645
+[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000
+[Debug] temb_lin1_t: [2048] first4: -0.049228 -0.051913 -0.015026 -0.038076
+[Debug] temb_lin1_r: [2048] first4: -0.013066 -0.018835 -0.015731 0.008462
+[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.038249 -0.957445 0.537078 0.447006
+[Debug] proj_in_input: [192, 2170] first4: -0.126218 1.441045 0.305219 -0.629688
+[Debug] enc_after_cond_emb: [2048, 238] first4: -0.176016 0.814970 0.334600 -0.563971
+[Debug] layer0_sa_input: [2048, 1085] first4: -0.718529 -0.757126 -0.047071 0.261381
+[Debug] layer0_q_after_rope: [128, 16] first4: -1.545586 -1.032032 0.192079 0.456504
+[Debug] layer0_k_after_rope: [128, 8] first4: -0.176016 0.814970 0.334600 -0.563971
+[Debug] layer0_sa_output: [2048, 1085] first4: -1.519029 0.168016 -0.353233 0.508560
+[Debug] layer0_attn_out: [2048, 1085] first4: -1.545586 -1.032032 0.192079 0.456504
+[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.545586 -1.032032 0.192079 0.456504
+[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.604149 -0.815843 -0.286884 0.491781
+[Debug] hidden_after_layer0: [2048, 1085] first4: -9.102718 0.576853 52.433601 -0.866220
+[Debug] hidden_after_layer6: [2048, 1085] first4: -21.554432 0.201925 34.636509 -4.160976
+[Debug] hidden_after_layer12: [2048, 1085] first4: -15.075979 -18.545254 72.497665 28.997612
+[Debug] hidden_after_layer18: [2048, 1085] first4: -26.391603 14.396175 61.327370 20.126297
+[Debug] hidden_after_layer23: [2048, 1085] first4: -4.878841 39.642975 194.063141 143.022125
+[Debug] dit_step0_vt: [2170, 64] first4: 0.030129 1.134737 0.345365 2.365999
+[Debug] dit_step0_xt: [2170, 64] first4: 0.192966 2.104671 -0.187573 0.740111
+[DiT] step 1/8 t=1.000
+[Debug] dit_step1_vt: [2170, 64] first4: -0.191913 1.346320 -0.134135 1.880714
+[Debug] dit_step1_xt: [2170, 64] first4: 0.203434 2.031235 -0.180257 0.637526
+[DiT] step 2/8 t=0.955
+[Debug] dit_step2_vt: [2170, 64] first4: -0.032953 1.239032 0.099210 2.371356
+[Debug] dit_step2_xt: [2170, 64] first4: 0.205631 1.948633 -0.186871 0.479436
+[DiT] step 3/8 t=0.900
+[Debug] dit_step3_vt: [2170, 64] first4: 0.254387 1.085867 0.272314 2.643562
+[Debug] dit_step3_xt: [2170, 64] first4: 0.184432 1.858144 -0.209564 0.259139
+[DiT] step 4/8 t=0.833
+[Debug] dit_step4_vt: [2170, 64] first4: 0.255440 1.003537 0.102939 2.722830
+[Debug] dit_step4_xt: [2170, 64] first4: 0.157064 1.750623 -0.220593 -0.032593
+[DiT] step 5/8 t=0.750
+[Debug] dit_step5_vt: [2170, 64] first4: 0.281173 0.936761 -0.295195 2.736938
+[Debug] dit_step5_xt: [2170, 64] first4: 0.116896 1.616800 -0.178422 -0.423584
+[DiT] step 6/8 t=0.643
+[Debug] dit_step6_vt: [2170, 64] first4: 0.167723 0.621779 -0.826056 2.808025
+[Debug] dit_step6_xt: [2170, 64] first4: 0.083352 1.492444 -0.013211 -0.985189
+[DiT] step 7/8 t=0.500
+[Debug] dit_step7_vt: [2170, 64] first4: -0.037024 0.233524 -1.487499 3.098410
+[Debug] dit_x0: [2170, 64] first4: 0.094459 1.422387 0.433039 -1.914712
+[DiT] step 8/8 t=0.300
+[DiT] Total generation: 26043.3 ms (26043.3 ms/sample)
+[Debug] dit_output: [2170, 64] first4: 0.094459 1.422387 0.433039 -1.914712
+[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128)
+[VAE] Graph: 417 nodes, T_latent=192
+[VAE] Upsample factor: 1920.00 (expected ~1920)
+[VAE] Graph: 417 nodes, T_latent=256
+[VAE] Graph: 417 nodes, T_latent=186
+[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz)
+[VAE Batch0] Decode: 52114.7 ms
+[Debug] vae_audio: [2, 4166400] first4: 0.000455 0.000930 0.000816 0.001121
+[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo)
+[Request 1/1] Done
+[Pipeline] All done
+2026-03-01 19:59:03.882 | WARNING  | acestep.training.lora_utils:<module>:29 - PEFT library not installed. LoRA training will not be available.
+2026-03-01 19:59:03.882 | WARNING  | acestep.training.lokr_utils:<module>:24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora
+2026-03-01 19:59:03.882 | WARNING  | acestep.training.data_module:<module>:25 - Lightning not installed. Training module will not be available.
+2026-03-01 19:59:03.883 | WARNING  | acestep.training.trainer:<module>:28 - Lightning Fabric not installed. Training will use basic training loop.
+2026-03-01 19:59:03.883 | WARNING  | acestep.training.trainer:<module>:36 - bitsandbytes not installed. Using standard AdamW.
+2026-03-01 19:59:04.691 | INFO     | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa
+`torch_dtype` is deprecated! Use `dtype` instead!
+2026-03-01 19:59:06.262 | INFO     | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation...
+2026-03-01 19:59:06.262 | INFO     | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs...
+2026-03-01 19:59:06.268 | INFO     | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0...
+2026-03-01 19:59:06.433 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0...
+2026-03-01 19:59:06.436 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - 
+======================================================================
+2026-03-01 19:59:06.436 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference)
+2026-03-01 19:59:06.436 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ======================================================================
+2026-03-01 19:59:06.436 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt:
+# Instruction
+Generate audio semantic tokens based on the given conditions:
+
+# Caption
+An upbeat and anthemic pop-rock track driven by bright, slightly overdriven
+
+# Metas
+- bpm: 83
+- timesignature: 4
+- keyscale: G major
+- duration: 88 seconds
+<|endoftext|>
+
+2026-03-01 19:59:06.436 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ======================================================================
+2026-03-01 19:59:06.436 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text:
+# Languages
+fr
+
+# Lyric
+# Lyric
+[Intro - Guitar Riff]
+[Verse 1]
+Dans le monde des tutos virtuels
+G ta toise en nouvelle passion
+Avec Ggendoline et Pumbé à midi
+La communauté, c'est l'unité
+Quel joie, une clé
+
+[Chorus]
+Dans le monde des tutos virtuels
+Gândoline et Pumbé à midi
+Une famille à connecter, c'est vrai
+D'un enfant qui voit toi fusionner
+
+[Guitar Solo]
+
+[Verse 2]
+Dans le monde des tutos virtuels
+Gândoline, Pumbé à midi
+Une famille à connecter, c'est vrai
+D'un enfant qui voit toi fusionner<|endoftext|>
+2026-03-01 19:59:06.436 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ======================================================================
+
+2026-03-01 19:59:06.443 | INFO     | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings...
+2026-03-01 19:59:06.457 | INFO     | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings...
+2026-03-01 19:59:06.457 | INFO     | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda))
+2026-03-01 19:59:06.478 | INFO     | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)...
+2026-03-01 19:59:06.802 | INFO     | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents...
+2026-03-01 19:59:06.803 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16
+2026-03-01 19:59:06.803 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006929874420166016, 'diffusion_time_cost': 0.3164329528808594, 'diffusion_per_step_time_cost': 0.03955411911010742, 'total_time_cost': 0.3233628273010254, 'offload_time_cost': 0.0}
+2026-03-01 19:59:06.817 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE...
+2026-03-01 19:59:06.819 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB
+2026-03-01 19:59:06.819 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB
+2026-03-01 19:59:06.819 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage...
+2026-03-01 19:59:06.819 | DEBUG    | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB
+2026-03-01 19:59:06.819 | DEBUG    | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB
+2026-03-01 19:59:06.819 | INFO     | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170])
+2026-03-01 19:59:07.095 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB
+2026-03-01 19:59:07.098 | INFO     | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors...
+2026-03-01 19:59:07.101 | INFO     | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors.
+[Request] Loaded request0.json
+[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q8_0.gguf
+[GGML] Running acestep-v15-turbo-Q8_0.gguf...
+[GGML] Done, 47 dump files
+[Python] Initializing acestep-v15-turbo...
+[Python] Generating (acestep-v15-turbo, 8 steps)...
+Using precomputed LM hints
+Using precomputed LM hints
+[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo)
+[Python] Done, 40 dump files
+[Turbo] Cosine similarities GGML vs Python
+  stage                          GGML vs Python
+  text_hidden                          0.999816
+  lyric_embed                          1.000000
+  enc_hidden                           0.999814
+  detok_output                         0.999983
+  context                              0.999990
+  noise                                1.000000
+  temb_t                               0.999997
+  hidden_after_proj_in                 0.999985
+  enc_after_cond_emb                   0.999791
+  layer0_sa_output                     0.999925
+  hidden_after_layer0                  0.999955
+  hidden_after_layer6                  0.999892
+  hidden_after_layer12                 0.999219
+  hidden_after_layer18                 0.996644
+  hidden_after_layer23                 0.993707
+  dit_step0_vt                         0.975605
+  dit_step0_xt                         0.999946
+  dit_step1_vt                         0.978928
+  dit_step1_xt                         0.999831
+  dit_step2_vt                         0.981129
+  dit_step2_xt                         0.999551
+  dit_step3_vt                         0.982813
+  dit_step3_xt                         0.998932
+  dit_step4_vt                         0.981292
+  dit_step4_xt                         0.997544
+  dit_step5_vt                         0.979091
+  dit_step5_xt                         0.994467
+  dit_step6_vt                         0.976152
+  dit_step6_xt                         0.988647
+  dit_step7_vt                         0.970238
+  dit_x0                               0.980014
+  vae_audio                            0.903408
+  vae_audio (STFT cosine)              0.976427
+[Turbo] Error growth GGML vs Python
+  stage                         cos    max_err   mean_err     mean_A      std_A     mean_B      std_B
+  dit_step0_xt             0.999946   0.139652   0.006645  -0.002330   0.972930  -0.002342   0.972003
+  dit_step1_xt             0.999831   0.267117   0.011368  -0.005325   0.942659  -0.005313   0.941730
+  dit_step2_xt             0.999551   0.452101   0.017578  -0.009369   0.909163  -0.009311   0.908527
+  dit_step3_xt             0.998932   0.629880   0.025911  -0.014735   0.873792  -0.014577   0.873624
+  dit_step4_xt             0.997544   0.759572   0.037583  -0.021796   0.841987  -0.021660   0.841995
+  dit_step5_xt             0.994467   1.235701   0.054893  -0.031886   0.825306  -0.032109   0.824593
+  dit_step6_xt             0.988647   2.096131   0.081207  -0.046181   0.856264  -0.046482   0.855546
diff --git a/tests/CUDA-BF16.log b/tests/CUDA-BF16.log
new file mode 100644
index 0000000..d73a934
--- /dev/null
+++ b/tests/CUDA-BF16.log
@@ -0,0 +1,259 @@
+ggml_cuda_init: found 1 CUDA devices:
+  Device 0: NVIDIA RTX PRO 6000 Blackwell Workstation Edition, compute capability 12.0, VMM: yes
+[Load] DiT backend: CUDA0 (CPU threads: 16)
+[Load] Backend init: 70.8 ms
+[GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024
+[DiT] Self-attn: Q+K+V fused
+[DiT] Cross-attn: Q+K+V fused
+[DiT] MLP: gate+up fused
+[Load] null_condition_emb found (CFG available)
+[WeightCtx] Loaded 478 tensors, 3007.9 MB into backend
+[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128
+[Load] DiT weight load: 375.6 ms
+[GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024
+[Load] silence_latent: [15000, 64] from GGUF
+[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048
+[Load] VAE backend: CUDA0 (CPU threads: 16)
+[VAE] Backend: CUDA0, Weight buffer: 161.1 MB
+[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations
+[Load] VAE weights: 661.0 ms
+[Request 1/1] ggml-turbo/request0.json (batch=1)
+[Request] parsed ggml-turbo/request0.json (18 fields)
+[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0)
+[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s
+[Pipeline] 434 audio codes (86.8s @ 5Hz)
+[Pipeline] T=2170, S=1085
+[BPE] Loaded from GGUF: 151643 vocab, 151387 merges
+[Load] BPE tokenizer: 32.8 ms
+[Pipeline] caption: 70 tokens, lyrics: 167 tokens
+[Load] TextEncoder backend: CUDA0 (CPU threads: 16)
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[Load] TextEncoder: 28L, H=1024, Nh=16/8
+[Qwen3] Attn: Q+K+V fused
+[Qwen3] MLP: gate+up fused
+[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend
+[Load] TextEncoder: 128.5 ms
+[Encode] TextEncoder (70 tokens): 50.6 ms
+[Debug] text_hidden: [70, 1024] first4: 3.652014 1.047935 0.228532 -12.907304
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[Encode] Lyric vocab lookup (167 tokens): 12.5 ms
+[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809
+[Load] CondEncoder backend: CUDA0 (CPU threads: 16)
+[GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024
+[Load] LyricEncoder: 8L
+[Qwen3] Attn: Q+K+V fused
+[Qwen3] MLP: gate+up fused
+[Load] TimbreEncoder: 4L
+[Qwen3] Attn: Q+K+V fused
+[Qwen3] MLP: gate+up fused
+[WeightCtx] Loaded 140 tensors, 1160.5 MB into backend
+[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond
+[Load] ConditionEncoder: 127.1 ms
+[CondEnc] Lyric sliding mask: 167x167, window=128
+[CondEnc] Timbre sliding mask: 750x750, window=128
+[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens
+[Encode] ConditionEncoder: 7.9 ms, enc_S=238
+[Debug] enc_hidden: [238, 2048] first4: 1.758648 -0.049409 -0.132412 0.058372
+[GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024
+[WeightCtx] Loaded 30 tensors, 200.3 MB into backend
+[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64)
+[Load] Detokenizer: 24.2 ms
+[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz)
+[Context] Detokenizer: 141.9 ms
+[Debug] detok_output: [2170, 64] first4: -0.124204 1.435425 0.309963 -0.624679
+[Context Batch0] Philox noise seed=42, [2170, 64]
+[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656
+[Debug] context: [2170, 128] first4: -0.124204 1.435425 0.309963 -0.624679
+[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1
+[DiT] Batch N=1, T=2170, S=1085, enc_S=238
+[DiT] Graph: 1841 nodes
+[Debug] tproj: [12288] first4: 0.260062 -0.161562 -0.097030 0.052313
+[Debug] temb: [2048] first4: 0.000069 -0.132499 -0.035430 0.064753
+[Debug] temb_t: [2048] first4: 0.001065 0.026818 -0.052754 0.063717
+[Debug] temb_r: [2048] first4: -0.000996 -0.159317 0.017323 0.001036
+[Debug] sinusoidal_t: [256] first4: 0.562486 0.789701 0.439822 -0.023583
+[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000
+[Debug] temb_lin1_t: [2048] first4: -0.049318 -0.051829 -0.014251 -0.038444
+[Debug] temb_lin1_r: [2048] first4: -0.013266 -0.018319 -0.016375 0.008532
+[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.039547 -0.969737 0.533554 0.446556
+[Debug] proj_in_input: [192, 2170] first4: -0.124204 1.435425 0.309963 -0.624679
+[Debug] enc_after_cond_emb: [2048, 238] first4: -0.166382 0.814621 0.325745 -0.561218
+[Debug] layer0_sa_input: [2048, 1085] first4: -0.719041 -0.764240 -0.047643 0.261711
+[Debug] layer0_q_after_rope: [128, 16] first4: -1.600161 -0.822879 -0.294099 0.491351
+[Debug] layer0_k_after_rope: [128, 8] first4: -0.166382 0.814621 0.325745 -0.561218
+[Debug] layer0_sa_output: [2048, 1085] first4: -1.500000 0.170898 -0.351562 0.515625
+[Debug] layer0_attn_out: [2048, 1085] first4: -1.540346 -1.045535 0.190276 0.455950
+[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.540346 -1.045535 0.190276 0.455950
+[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.600161 -0.822879 -0.294099 0.491351
+[Debug] hidden_after_layer0: [2048, 1085] first4: -9.073158 0.560212 52.141960 -0.912522
+[Debug] hidden_after_layer6: [2048, 1085] first4: -21.385975 0.074876 33.328918 -4.446253
+[Debug] hidden_after_layer12: [2048, 1085] first4: -15.000174 -17.960159 71.364281 28.422548
+[Debug] hidden_after_layer18: [2048, 1085] first4: -27.019310 15.715343 59.139381 20.656757
+[Debug] hidden_after_layer23: [2048, 1085] first4: -9.519342 45.743378 195.522568 144.389435
+[Debug] dit_step0_vt: [2170, 64] first4: 0.016157 1.119429 0.348312 2.379197
+[Debug] dit_step0_xt: [2170, 64] first4: 0.193602 2.105367 -0.187707 0.739511
+[DiT] step 1/8 t=1.000
+[Debug] dit_step1_vt: [2170, 64] first4: -0.224607 1.308204 -0.126253 1.900889
+[Debug] dit_step1_xt: [2170, 64] first4: 0.205853 2.034010 -0.180821 0.635826
+[DiT] step 2/8 t=0.955
+[Debug] dit_step2_vt: [2170, 64] first4: -0.011260 1.217733 0.098172 2.384965
+[Debug] dit_step2_xt: [2170, 64] first4: 0.206604 1.952828 -0.187366 0.476828
+[DiT] step 3/8 t=0.900
+[Debug] dit_step3_vt: [2170, 64] first4: 0.242402 1.085806 0.261774 2.646892
+[Debug] dit_step3_xt: [2170, 64] first4: 0.186403 1.862344 -0.209180 0.256254
+[DiT] step 4/8 t=0.833
+[Debug] dit_step4_vt: [2170, 64] first4: 0.281105 1.015777 0.102466 2.709046
+[Debug] dit_step4_xt: [2170, 64] first4: 0.156285 1.753511 -0.220159 -0.034001
+[DiT] step 5/8 t=0.750
+[Debug] dit_step5_vt: [2170, 64] first4: 0.265994 0.916073 -0.297680 2.755516
+[Debug] dit_step5_xt: [2170, 64] first4: 0.118286 1.622644 -0.177633 -0.427646
+[DiT] step 6/8 t=0.643
+[Debug] dit_step6_vt: [2170, 64] first4: 0.172145 0.636800 -0.808572 2.809288
+[Debug] dit_step6_xt: [2170, 64] first4: 0.083857 1.495284 -0.015919 -0.989503
+[DiT] step 7/8 t=0.500
+[Debug] dit_step7_vt: [2170, 64] first4: -0.004009 0.190141 -1.466879 3.103273
+[Debug] dit_x0: [2170, 64] first4: 0.085060 1.438241 0.424145 -1.920485
+[DiT] step 8/8 t=0.300
+[DiT] Total generation: 248.3 ms (248.3 ms/sample)
+[Debug] dit_output: [2170, 64] first4: 0.085060 1.438241 0.424145 -1.920485
+[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128)
+[VAE] Graph: 417 nodes, T_latent=192
+[VAE] Upsample factor: 1920.00 (expected ~1920)
+[VAE] Graph: 417 nodes, T_latent=256
+[VAE] Graph: 417 nodes, T_latent=186
+[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz)
+[VAE Batch0] Decode: 812.8 ms
+[Debug] vae_audio: [2, 4166400] first4: 0.000547 0.000898 0.000798 0.001064
+[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo)
+[Request 1/1] Done
+[Pipeline] All done
+2026-03-01 19:54:08.539 | WARNING  | acestep.training.lora_utils:<module>:29 - PEFT library not installed. LoRA training will not be available.
+2026-03-01 19:54:08.540 | WARNING  | acestep.training.lokr_utils:<module>:24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora
+2026-03-01 19:54:08.540 | WARNING  | acestep.training.data_module:<module>:25 - Lightning not installed. Training module will not be available.
+2026-03-01 19:54:08.540 | WARNING  | acestep.training.trainer:<module>:28 - Lightning Fabric not installed. Training will use basic training loop.
+2026-03-01 19:54:08.540 | WARNING  | acestep.training.trainer:<module>:36 - bitsandbytes not installed. Using standard AdamW.
+2026-03-01 19:54:09.277 | INFO     | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa
+`torch_dtype` is deprecated! Use `dtype` instead!
+2026-03-01 19:54:10.804 | INFO     | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation...
+2026-03-01 19:54:10.804 | INFO     | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs...
+2026-03-01 19:54:10.810 | INFO     | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0...
+2026-03-01 19:54:10.970 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0...
+2026-03-01 19:54:10.972 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - 
+======================================================================
+2026-03-01 19:54:10.972 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference)
+2026-03-01 19:54:10.972 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ======================================================================
+2026-03-01 19:54:10.972 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt:
+# Instruction
+Generate audio semantic tokens based on the given conditions:
+
+# Caption
+An upbeat and anthemic pop-rock track driven by bright, slightly overdriven
+
+# Metas
+- bpm: 83
+- timesignature: 4
+- keyscale: G major
+- duration: 88 seconds
+<|endoftext|>
+
+2026-03-01 19:54:10.972 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ======================================================================
+2026-03-01 19:54:10.972 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text:
+# Languages
+fr
+
+# Lyric
+# Lyric
+[Intro - Guitar Riff]
+[Verse 1]
+Dans le monde des tutos virtuels
+G ta toise en nouvelle passion
+Avec Ggendoline et Pumbé à midi
+La communauté, c'est l'unité
+Quel joie, une clé
+
+[Chorus]
+Dans le monde des tutos virtuels
+Gândoline et Pumbé à midi
+Une famille à connecter, c'est vrai
+D'un enfant qui voit toi fusionner
+
+[Guitar Solo]
+
+[Verse 2]
+Dans le monde des tutos virtuels
+Gândoline, Pumbé à midi
+Une famille à connecter, c'est vrai
+D'un enfant qui voit toi fusionner<|endoftext|>
+2026-03-01 19:54:10.972 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ======================================================================
+
+2026-03-01 19:54:10.978 | INFO     | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings...
+2026-03-01 19:54:10.991 | INFO     | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings...
+2026-03-01 19:54:10.991 | INFO     | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda))
+2026-03-01 19:54:11.023 | INFO     | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)...
+2026-03-01 19:54:11.329 | INFO     | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents...
+2026-03-01 19:54:11.330 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16
+2026-03-01 19:54:11.330 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.0068187713623046875, 'diffusion_time_cost': 0.2986173629760742, 'diffusion_per_step_time_cost': 0.03732717037200928, 'total_time_cost': 0.3054361343383789, 'offload_time_cost': 0.0}
+2026-03-01 19:54:11.344 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE...
+2026-03-01 19:54:11.349 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB
+2026-03-01 19:54:11.349 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB
+2026-03-01 19:54:11.349 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage...
+2026-03-01 19:54:11.349 | DEBUG    | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB
+2026-03-01 19:54:11.349 | DEBUG    | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB
+2026-03-01 19:54:11.349 | INFO     | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170])
+2026-03-01 19:54:11.625 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB
+2026-03-01 19:54:11.628 | INFO     | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors...
+2026-03-01 19:54:11.632 | INFO     | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors.
+[Request] Loaded request0.json
+[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-BF16.gguf
+[GGML] Running acestep-v15-turbo-BF16.gguf...
+[GGML] Done, 47 dump files
+[Python] Initializing acestep-v15-turbo...
+[Python] Generating (acestep-v15-turbo, 8 steps)...
+Using precomputed LM hints
+Using precomputed LM hints
+[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo)
+[Python] Done, 40 dump files
+[Turbo] Cosine similarities GGML vs Python
+  stage                          GGML vs Python
+  text_hidden                          0.999805
+  lyric_embed                          1.000000
+  enc_hidden                           0.999830
+  detok_output                         0.999996
+  context                              0.999998
+  noise                                1.000000
+  temb_t                               0.999999
+  hidden_after_proj_in                 0.999988
+  enc_after_cond_emb                   0.999818
+  layer0_sa_output                     0.999951
+  hidden_after_layer0                  0.999978
+  hidden_after_layer6                  0.999916
+  hidden_after_layer12                 0.999234
+  hidden_after_layer18                 0.996570
+  hidden_after_layer23                 0.993528
+  dit_step0_vt                         0.974876
+  dit_step0_xt                         0.999945
+  dit_step1_vt                         0.980053
+  dit_step1_xt                         0.999834
+  dit_step2_vt                         0.981541
+  dit_step2_xt                         0.999553
+  dit_step3_vt                         0.982418
+  dit_step3_xt                         0.998924
+  dit_step4_vt                         0.980811
+  dit_step4_xt                         0.997503
+  dit_step5_vt                         0.977877
+  dit_step5_xt                         0.994298
+  dit_step6_vt                         0.974930
+  dit_step6_xt                         0.988188
+  dit_step7_vt                         0.969375
+  dit_x0                               0.979213
+  vae_audio                            0.901377
+  vae_audio (STFT cosine)              0.975525
+[Turbo] Error growth GGML vs Python
+  stage                         cos    max_err   mean_err     mean_A      std_A     mean_B      std_B
+  dit_step0_xt             0.999945   0.135628   0.006709  -0.002312   0.972932  -0.002342   0.972003
+  dit_step1_xt             0.999834   0.266762   0.011267  -0.005306   0.942657  -0.005313   0.941730
+  dit_step2_xt             0.999553   0.453190   0.017486  -0.009350   0.909152  -0.009311   0.908527
+  dit_step3_xt             0.998924   0.643865   0.025962  -0.014715   0.873769  -0.014577   0.873624
+  dit_step4_xt             0.997503   0.790038   0.037807  -0.021768   0.841938  -0.021660   0.841995
+  dit_step5_xt             0.994298   1.239881   0.055598  -0.031834   0.825214  -0.032109   0.824593
+  dit_step6_xt             0.988188   2.076383   0.082565  -0.046121   0.856115  -0.046482   0.855546
diff --git a/tests/CUDA-Q4_K_M.log b/tests/CUDA-Q4_K_M.log
new file mode 100644
index 0000000..189cb71
--- /dev/null
+++ b/tests/CUDA-Q4_K_M.log
@@ -0,0 +1,259 @@
+ggml_cuda_init: found 1 CUDA devices:
+  Device 0: NVIDIA RTX PRO 6000 Blackwell Workstation Edition, compute capability 12.0, VMM: yes
+[Load] DiT backend: CUDA0 (CPU threads: 16)
+[Load] Backend init: 11.2 ms
+[GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864
+[DiT] Self-attn: Q+K fused, V separate
+[DiT] Cross-attn: all separate
+[DiT] MLP: gate+up fused
+[Load] null_condition_emb found (CFG available)
+[WeightCtx] Loaded 478 tensors, 895.6 MB into backend
+[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128
+[Load] DiT weight load: 403.0 ms
+[GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864
+[Load] silence_latent: [15000, 64] from GGUF
+[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048
+[Load] VAE backend: CUDA0 (CPU threads: 16)
+[VAE] Backend: CUDA0, Weight buffer: 161.1 MB
+[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations
+[Load] VAE weights: 655.9 ms
+[Request 1/1] ggml-turbo/request0.json (batch=1)
+[Request] parsed ggml-turbo/request0.json (18 fields)
+[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0)
+[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s
+[Pipeline] 434 audio codes (86.8s @ 5Hz)
+[Pipeline] T=2170, S=1085
+[BPE] Loaded from GGUF: 151643 vocab, 151387 merges
+[Load] BPE tokenizer: 31.4 ms
+[Pipeline] caption: 70 tokens, lyrics: 167 tokens
+[Load] TextEncoder backend: CUDA0 (CPU threads: 16)
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[Load] TextEncoder: 28L, H=1024, Nh=16/8
+[Qwen3] Attn: Q+K+V fused
+[Qwen3] MLP: gate+up fused
+[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend
+[Load] TextEncoder: 126.3 ms
+[Encode] TextEncoder (70 tokens): 52.7 ms
+[Debug] text_hidden: [70, 1024] first4: 3.652014 1.047935 0.228532 -12.907304
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[Encode] Lyric vocab lookup (167 tokens): 12.1 ms
+[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809
+[Load] CondEncoder backend: CUDA0 (CPU threads: 16)
+[GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864
+[Load] LyricEncoder: 8L
+[Qwen3] Attn: Q+K fused, V separate
+[Qwen3] MLP: gate+up fused
+[Load] TimbreEncoder: 4L
+[Qwen3] Attn: Q+K fused, V separate
+[Qwen3] MLP: gate+up fused
+[WeightCtx] Loaded 140 tensors, 352.5 MB into backend
+[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond
+[Load] ConditionEncoder: 118.9 ms
+[CondEnc] Lyric sliding mask: 167x167, window=128
+[CondEnc] Timbre sliding mask: 750x750, window=128
+[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens
+[Encode] ConditionEncoder: 12.7 ms, enc_S=238
+[Debug] enc_hidden: [238, 2048] first4: 1.759848 -0.046220 -0.129361 0.057668
+[GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864
+[WeightCtx] Loaded 30 tensors, 64.7 MB into backend
+[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64)
+[Load] Detokenizer: 22.1 ms
+[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz)
+[Context] Detokenizer: 124.0 ms
+[Debug] detok_output: [2170, 64] first4: -0.098446 1.438721 0.299255 -0.646500
+[Context Batch0] Philox noise seed=42, [2170, 64]
+[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656
+[Debug] context: [2170, 128] first4: -0.098446 1.438721 0.299255 -0.646500
+[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1
+[DiT] Batch N=1, T=2170, S=1085, enc_S=238
+[DiT] Graph: 1775 nodes
+[Debug] tproj: [12288] first4: 0.260848 -0.159996 -0.090771 0.048441
+[Debug] temb: [2048] first4: 0.000246 -0.134045 -0.034408 0.064910
+[Debug] temb_t: [2048] first4: 0.001029 0.025591 -0.052085 0.063187
+[Debug] temb_r: [2048] first4: -0.000783 -0.159636 0.017677 0.001723
+[Debug] sinusoidal_t: [256] first4: 0.562486 0.789701 0.439822 -0.023583
+[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000
+[Debug] temb_lin1_t: [2048] first4: -0.049559 -0.053563 -0.011978 -0.047026
+[Debug] temb_lin1_r: [2048] first4: -0.015462 -0.031532 -0.021258 0.006134
+[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.048604 -0.990237 0.529252 0.453491
+[Debug] proj_in_input: [192, 2170] first4: -0.098446 1.438721 0.299255 -0.646500
+[Debug] enc_after_cond_emb: [2048, 238] first4: -0.164939 0.740011 0.286775 -0.551167
+[Debug] layer0_sa_input: [2048, 1085] first4: -0.724411 -0.771269 -0.042124 0.260209
+[Debug] layer0_q_after_rope: [128, 16] first4: -26.611641 -0.173146 0.216591 0.344494
+[Debug] layer0_k_after_rope: [128, 8] first4: -3.965077 0.386751 0.211083 0.672416
+[Debug] layer0_sa_output: [2048, 1085] first4: -1.608527 0.164282 -0.474735 0.450532
+[Debug] layer0_attn_out: [2048, 1085] first4: -26.943256 -0.119716 0.379954 0.343082
+[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.581287 -1.062661 0.069874 0.462384
+[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.708075 -0.853060 -0.446424 0.497258
+[Debug] hidden_after_layer0: [2048, 1085] first4: -8.841661 0.391934 47.472157 -0.764472
+[Debug] hidden_after_layer6: [2048, 1085] first4: -21.532463 -0.603226 30.787485 -3.431937
+[Debug] hidden_after_layer12: [2048, 1085] first4: -17.481373 -13.959963 61.344299 28.807806
+[Debug] hidden_after_layer18: [2048, 1085] first4: -15.247349 10.312581 47.860855 16.436914
+[Debug] hidden_after_layer23: [2048, 1085] first4: -13.968861 1.714361 170.159424 132.288422
+[Debug] dit_step0_vt: [2170, 64] first4: -0.165321 1.077570 0.220752 2.218085
+[Debug] dit_step0_xt: [2170, 64] first4: 0.201851 2.107270 -0.181909 0.746834
+[DiT] step 1/8 t=1.000
+[Debug] dit_step1_vt: [2170, 64] first4: -0.110858 1.235523 -0.287918 1.796672
+[Debug] dit_step1_xt: [2170, 64] first4: 0.207897 2.039877 -0.166205 0.648834
+[DiT] step 2/8 t=0.955
+[Debug] dit_step2_vt: [2170, 64] first4: -0.030571 1.208156 0.092450 2.195761
+[Debug] dit_step2_xt: [2170, 64] first4: 0.209935 1.959334 -0.172368 0.502450
+[DiT] step 3/8 t=0.900
+[Debug] dit_step3_vt: [2170, 64] first4: 0.247537 1.164770 0.276511 2.503829
+[Debug] dit_step3_xt: [2170, 64] first4: 0.189307 1.862270 -0.195410 0.293797
+[DiT] step 4/8 t=0.833
+[Debug] dit_step4_vt: [2170, 64] first4: 0.384617 1.107927 0.073075 2.612695
+[Debug] dit_step4_xt: [2170, 64] first4: 0.148098 1.743563 -0.203240 0.013866
+[DiT] step 5/8 t=0.750
+[Debug] dit_step5_vt: [2170, 64] first4: 0.180515 0.944257 -0.458470 2.697840
+[Debug] dit_step5_xt: [2170, 64] first4: 0.122310 1.608669 -0.137744 -0.371540
+[DiT] step 6/8 t=0.643
+[Debug] dit_step6_vt: [2170, 64] first4: -0.245520 0.941769 -1.135058 2.750750
+[Debug] dit_step6_xt: [2170, 64] first4: 0.171414 1.420316 0.089267 -0.921690
+[DiT] step 7/8 t=0.500
+[Debug] dit_step7_vt: [2170, 64] first4: -0.488470 0.849564 -1.659694 3.185843
+[Debug] dit_x0: [2170, 64] first4: 0.317955 1.165446 0.587176 -1.877443
+[DiT] step 8/8 t=0.300
+[DiT] Total generation: 249.1 ms (249.1 ms/sample)
+[Debug] dit_output: [2170, 64] first4: 0.317955 1.165446 0.587176 -1.877443
+[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128)
+[VAE] Graph: 417 nodes, T_latent=192
+[VAE] Upsample factor: 1920.00 (expected ~1920)
+[VAE] Graph: 417 nodes, T_latent=256
+[VAE] Graph: 417 nodes, T_latent=186
+[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz)
+[VAE Batch0] Decode: 820.0 ms
+[Debug] vae_audio: [2, 4166400] first4: 0.000325 0.000812 0.000671 0.000911
+[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo)
+[Request 1/1] Done
+[Pipeline] All done
+2026-03-01 19:54:39.264 | WARNING  | acestep.training.lora_utils:<module>:29 - PEFT library not installed. LoRA training will not be available.
+2026-03-01 19:54:39.265 | WARNING  | acestep.training.lokr_utils:<module>:24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora
+2026-03-01 19:54:39.265 | WARNING  | acestep.training.data_module:<module>:25 - Lightning not installed. Training module will not be available.
+2026-03-01 19:54:39.265 | WARNING  | acestep.training.trainer:<module>:28 - Lightning Fabric not installed. Training will use basic training loop.
+2026-03-01 19:54:39.265 | WARNING  | acestep.training.trainer:<module>:36 - bitsandbytes not installed. Using standard AdamW.
+2026-03-01 19:54:40.025 | INFO     | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa
+`torch_dtype` is deprecated! Use `dtype` instead!
+2026-03-01 19:54:41.587 | INFO     | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation...
+2026-03-01 19:54:41.587 | INFO     | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs...
+2026-03-01 19:54:41.592 | INFO     | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0...
+2026-03-01 19:54:41.751 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0...
+2026-03-01 19:54:41.753 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - 
+======================================================================
+2026-03-01 19:54:41.753 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference)
+2026-03-01 19:54:41.753 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ======================================================================
+2026-03-01 19:54:41.753 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt:
+# Instruction
+Generate audio semantic tokens based on the given conditions:
+
+# Caption
+An upbeat and anthemic pop-rock track driven by bright, slightly overdriven
+
+# Metas
+- bpm: 83
+- timesignature: 4
+- keyscale: G major
+- duration: 88 seconds
+<|endoftext|>
+
+2026-03-01 19:54:41.753 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ======================================================================
+2026-03-01 19:54:41.753 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text:
+# Languages
+fr
+
+# Lyric
+# Lyric
+[Intro - Guitar Riff]
+[Verse 1]
+Dans le monde des tutos virtuels
+G ta toise en nouvelle passion
+Avec Ggendoline et Pumbé à midi
+La communauté, c'est l'unité
+Quel joie, une clé
+
+[Chorus]
+Dans le monde des tutos virtuels
+Gândoline et Pumbé à midi
+Une famille à connecter, c'est vrai
+D'un enfant qui voit toi fusionner
+
+[Guitar Solo]
+
+[Verse 2]
+Dans le monde des tutos virtuels
+Gândoline, Pumbé à midi
+Une famille à connecter, c'est vrai
+D'un enfant qui voit toi fusionner<|endoftext|>
+2026-03-01 19:54:41.753 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ======================================================================
+
+2026-03-01 19:54:41.759 | INFO     | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings...
+2026-03-01 19:54:41.771 | INFO     | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings...
+2026-03-01 19:54:41.772 | INFO     | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda))
+2026-03-01 19:54:41.805 | INFO     | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)...
+2026-03-01 19:54:42.113 | INFO     | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents...
+2026-03-01 19:54:42.114 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16
+2026-03-01 19:54:42.114 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006765604019165039, 'diffusion_time_cost': 0.3010725975036621, 'diffusion_per_step_time_cost': 0.037634074687957764, 'total_time_cost': 0.30783820152282715, 'offload_time_cost': 0.0}
+2026-03-01 19:54:42.128 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE...
+2026-03-01 19:54:42.131 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB
+2026-03-01 19:54:42.131 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB
+2026-03-01 19:54:42.131 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage...
+2026-03-01 19:54:42.131 | DEBUG    | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB
+2026-03-01 19:54:42.131 | DEBUG    | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB
+2026-03-01 19:54:42.131 | INFO     | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170])
+2026-03-01 19:54:42.405 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB
+2026-03-01 19:54:42.408 | INFO     | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors...
+2026-03-01 19:54:42.411 | INFO     | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors.
+[Request] Loaded request0.json
+[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q4_K_M.gguf
+[GGML] Running acestep-v15-turbo-Q4_K_M.gguf...
+[GGML] Done, 47 dump files
+[Python] Initializing acestep-v15-turbo...
+[Python] Generating (acestep-v15-turbo, 8 steps)...
+Using precomputed LM hints
+Using precomputed LM hints
+[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo)
+[Python] Done, 40 dump files
+[Turbo] Cosine similarities GGML vs Python
+  stage                          GGML vs Python
+  text_hidden                          0.999805
+  lyric_embed                          1.000000
+  enc_hidden                           0.997032
+  detok_output                         0.999610
+  context                              0.999750
+  noise                                1.000000
+  temb_t                               0.999902
+  hidden_after_proj_in                 0.999908
+  enc_after_cond_emb                   0.997517
+  layer0_sa_output                     0.998371
+  hidden_after_layer0                  0.999675
+  hidden_after_layer6                  0.999257
+  hidden_after_layer12                 0.995500
+  hidden_after_layer18                 0.991597
+  hidden_after_layer23                 0.985460
+  dit_step0_vt                         0.947383
+  dit_step0_xt                         0.999885
+  dit_step1_vt                         0.947784
+  dit_step1_xt                         0.999617
+  dit_step2_vt                         0.957305
+  dit_step2_xt                         0.999014
+  dit_step3_vt                         0.961931
+  dit_step3_xt                         0.997757
+  dit_step4_vt                         0.959773
+  dit_step4_xt                         0.994900
+  dit_step5_vt                         0.956611
+  dit_step5_xt                         0.988539
+  dit_step6_vt                         0.950669
+  dit_step6_xt                         0.976494
+  dit_step7_vt                         0.938658
+  dit_x0                               0.958725
+  vae_audio                            0.837763
+  vae_audio (STFT cosine)              0.954448
+[Turbo] Error growth GGML vs Python
+  stage                         cos    max_err   mean_err     mean_A      std_A     mean_B      std_B
+  dit_step0_xt             0.999885   0.165835   0.010206  -0.002260   0.973133  -0.002342   0.972003
+  dit_step1_xt             0.999617   0.269038   0.018058  -0.005119   0.943095  -0.005313   0.941730
+  dit_step2_xt             0.999014   0.433553   0.027847  -0.009033   0.910111  -0.009311   0.908527
+  dit_step3_xt             0.997757   0.593449   0.040253  -0.014301   0.875156  -0.014577   0.873624
+  dit_step4_xt             0.994900   0.889597   0.058068  -0.021205   0.843622  -0.021660   0.841995
+  dit_step5_xt             0.988539   1.371047   0.084767  -0.031100   0.827136  -0.032109   0.824593
+  dit_step6_xt             0.976494   1.997185   0.125556  -0.045244   0.858177  -0.046482   0.855546
diff --git a/tests/CUDA-Q5_K_M.log b/tests/CUDA-Q5_K_M.log
new file mode 100644
index 0000000..00b9652
--- /dev/null
+++ b/tests/CUDA-Q5_K_M.log
@@ -0,0 +1,259 @@
+ggml_cuda_init: found 1 CUDA devices:
+  Device 0: NVIDIA RTX PRO 6000 Blackwell Workstation Edition, compute capability 12.0, VMM: yes
+[Load] DiT backend: CUDA0 (CPU threads: 16)
+[Load] Backend init: 25.7 ms
+[GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864
+[DiT] Self-attn: Q+K fused, V separate
+[DiT] Cross-attn: all separate
+[DiT] MLP: gate+up fused
+[Load] null_condition_emb found (CFG available)
+[WeightCtx] Loaded 478 tensors, 1061.2 MB into backend
+[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128
+[Load] DiT weight load: 465.4 ms
+[GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864
+[Load] silence_latent: [15000, 64] from GGUF
+[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048
+[Load] VAE backend: CUDA0 (CPU threads: 16)
+[VAE] Backend: CUDA0, Weight buffer: 161.1 MB
+[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations
+[Load] VAE weights: 656.4 ms
+[Request 1/1] ggml-turbo/request0.json (batch=1)
+[Request] parsed ggml-turbo/request0.json (18 fields)
+[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0)
+[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s
+[Pipeline] 434 audio codes (86.8s @ 5Hz)
+[Pipeline] T=2170, S=1085
+[BPE] Loaded from GGUF: 151643 vocab, 151387 merges
+[Load] BPE tokenizer: 31.3 ms
+[Pipeline] caption: 70 tokens, lyrics: 167 tokens
+[Load] TextEncoder backend: CUDA0 (CPU threads: 16)
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[Load] TextEncoder: 28L, H=1024, Nh=16/8
+[Qwen3] Attn: Q+K+V fused
+[Qwen3] MLP: gate+up fused
+[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend
+[Load] TextEncoder: 127.3 ms
+[Encode] TextEncoder (70 tokens): 49.5 ms
+[Debug] text_hidden: [70, 1024] first4: 3.652014 1.047935 0.228532 -12.907304
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[Encode] Lyric vocab lookup (167 tokens): 12.4 ms
+[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809
+[Load] CondEncoder backend: CUDA0 (CPU threads: 16)
+[GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864
+[Load] LyricEncoder: 8L
+[Qwen3] Attn: Q+K fused, V separate
+[Qwen3] MLP: gate+up fused
+[Load] TimbreEncoder: 4L
+[Qwen3] Attn: Q+K fused, V separate
+[Qwen3] MLP: gate+up fused
+[WeightCtx] Loaded 140 tensors, 412.5 MB into backend
+[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond
+[Load] ConditionEncoder: 138.7 ms
+[CondEnc] Lyric sliding mask: 167x167, window=128
+[CondEnc] Timbre sliding mask: 750x750, window=128
+[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens
+[Encode] ConditionEncoder: 13.1 ms, enc_S=238
+[Debug] enc_hidden: [238, 2048] first4: 1.760389 -0.050879 -0.130835 0.059141
+[GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864
+[WeightCtx] Loaded 30 tensors, 73.2 MB into backend
+[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64)
+[Load] Detokenizer: 24.2 ms
+[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz)
+[Context] Detokenizer: 121.7 ms
+[Debug] detok_output: [2170, 64] first4: -0.125017 1.460327 0.292545 -0.654237
+[Context Batch0] Philox noise seed=42, [2170, 64]
+[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656
+[Debug] context: [2170, 128] first4: -0.125017 1.460327 0.292545 -0.654237
+[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1
+[DiT] Batch N=1, T=2170, S=1085, enc_S=238
+[DiT] Graph: 1775 nodes
+[Debug] tproj: [12288] first4: 0.260130 -0.161413 -0.102271 0.051211
+[Debug] temb: [2048] first4: -0.000033 -0.132307 -0.035515 0.064775
+[Debug] temb_t: [2048] first4: 0.000653 0.026699 -0.052806 0.063542
+[Debug] temb_r: [2048] first4: -0.000685 -0.159005 0.017290 0.001234
+[Debug] sinusoidal_t: [256] first4: 0.562486 0.789701 0.439822 -0.023583
+[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000
+[Debug] temb_lin1_t: [2048] first4: -0.051436 -0.053873 -0.011918 -0.038393
+[Debug] temb_lin1_r: [2048] first4: -0.016164 -0.021120 -0.015800 -0.000525
+[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.043269 -0.943395 0.541080 0.455623
+[Debug] proj_in_input: [192, 2170] first4: -0.125017 1.460327 0.292545 -0.654237
+[Debug] enc_after_cond_emb: [2048, 238] first4: -0.158078 0.738352 0.324930 -0.519564
+[Debug] layer0_sa_input: [2048, 1085] first4: -0.721699 -0.748479 -0.051910 0.264453
+[Debug] layer0_q_after_rope: [128, 16] first4: -26.700098 -0.191763 0.241664 0.327243
+[Debug] layer0_k_after_rope: [128, 8] first4: -3.876794 0.412444 0.096899 0.724944
+[Debug] layer0_sa_output: [2048, 1085] first4: -1.497476 0.145466 -0.380354 0.485316
+[Debug] layer0_attn_out: [2048, 1085] first4: -27.034651 -0.125372 0.405539 0.333085
+[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.540176 -1.007621 0.171218 0.466798
+[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.603106 -0.810148 -0.307159 0.493001
+[Debug] hidden_after_layer0: [2048, 1085] first4: -9.100931 0.548624 50.178547 -0.840484
+[Debug] hidden_after_layer6: [2048, 1085] first4: -20.448851 0.734318 29.757233 -4.634385
+[Debug] hidden_after_layer12: [2048, 1085] first4: -18.620174 -17.772619 67.315002 24.878105
+[Debug] hidden_after_layer18: [2048, 1085] first4: -25.252079 10.759434 60.574448 19.297585
+[Debug] hidden_after_layer23: [2048, 1085] first4: -3.474268 32.243759 194.636520 160.608047
+[Debug] dit_step0_vt: [2170, 64] first4: 0.008642 1.131305 0.289193 2.355634
+[Debug] dit_step0_xt: [2170, 64] first4: 0.193943 2.104827 -0.185020 0.740582
+[DiT] step 1/8 t=1.000
+[Debug] dit_step1_vt: [2170, 64] first4: -0.205228 1.406502 -0.196234 1.800572
+[Debug] dit_step1_xt: [2170, 64] first4: 0.205137 2.028109 -0.174316 0.642369
+[DiT] step 2/8 t=0.955
+[Debug] dit_step2_vt: [2170, 64] first4: -0.122410 1.295395 0.084284 2.386701
+[Debug] dit_step2_xt: [2170, 64] first4: 0.213298 1.941749 -0.179935 0.483256
+[DiT] step 3/8 t=0.900
+[Debug] dit_step3_vt: [2170, 64] first4: 0.323829 1.081727 0.260844 2.578709
+[Debug] dit_step3_xt: [2170, 64] first4: 0.186312 1.851605 -0.201672 0.268363
+[DiT] step 4/8 t=0.833
+[Debug] dit_step4_vt: [2170, 64] first4: 0.355370 0.943008 0.097293 2.745308
+[Debug] dit_step4_xt: [2170, 64] first4: 0.148237 1.750569 -0.212097 -0.025777
+[DiT] step 5/8 t=0.750
+[Debug] dit_step5_vt: [2170, 64] first4: 0.370461 0.859429 -0.430240 2.691899
+[Debug] dit_step5_xt: [2170, 64] first4: 0.095314 1.627793 -0.150634 -0.410334
+[DiT] step 6/8 t=0.643
+[Debug] dit_step6_vt: [2170, 64] first4: 0.268117 0.608156 -0.982653 2.831516
+[Debug] dit_step6_xt: [2170, 64] first4: 0.041691 1.506162 0.045897 -0.976637
+[DiT] step 7/8 t=0.500
+[Debug] dit_step7_vt: [2170, 64] first4: 0.031181 0.378487 -1.509792 3.095486
+[Debug] dit_x0: [2170, 64] first4: 0.032336 1.392616 0.498835 -1.905283
+[DiT] step 8/8 t=0.300
+[DiT] Total generation: 251.1 ms (251.1 ms/sample)
+[Debug] dit_output: [2170, 64] first4: 0.032336 1.392616 0.498835 -1.905283
+[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128)
+[VAE] Graph: 417 nodes, T_latent=192
+[VAE] Upsample factor: 1920.00 (expected ~1920)
+[VAE] Graph: 417 nodes, T_latent=256
+[VAE] Graph: 417 nodes, T_latent=186
+[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz)
+[VAE Batch0] Decode: 804.2 ms
+[Debug] vae_audio: [2, 4166400] first4: 0.000692 0.001098 0.000938 0.001230
+[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo)
+[Request 1/1] Done
+[Pipeline] All done
+2026-03-01 19:54:31.395 | WARNING  | acestep.training.lora_utils:<module>:29 - PEFT library not installed. LoRA training will not be available.
+2026-03-01 19:54:31.395 | WARNING  | acestep.training.lokr_utils:<module>:24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora
+2026-03-01 19:54:31.395 | WARNING  | acestep.training.data_module:<module>:25 - Lightning not installed. Training module will not be available.
+2026-03-01 19:54:31.395 | WARNING  | acestep.training.trainer:<module>:28 - Lightning Fabric not installed. Training will use basic training loop.
+2026-03-01 19:54:31.395 | WARNING  | acestep.training.trainer:<module>:36 - bitsandbytes not installed. Using standard AdamW.
+2026-03-01 19:54:32.168 | INFO     | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa
+`torch_dtype` is deprecated! Use `dtype` instead!
+2026-03-01 19:54:33.881 | INFO     | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation...
+2026-03-01 19:54:33.882 | INFO     | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs...
+2026-03-01 19:54:33.887 | INFO     | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0...
+2026-03-01 19:54:34.060 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0...
+2026-03-01 19:54:34.062 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - 
+======================================================================
+2026-03-01 19:54:34.062 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference)
+2026-03-01 19:54:34.062 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ======================================================================
+2026-03-01 19:54:34.062 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt:
+# Instruction
+Generate audio semantic tokens based on the given conditions:
+
+# Caption
+An upbeat and anthemic pop-rock track driven by bright, slightly overdriven
+
+# Metas
+- bpm: 83
+- timesignature: 4
+- keyscale: G major
+- duration: 88 seconds
+<|endoftext|>
+
+2026-03-01 19:54:34.062 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ======================================================================
+2026-03-01 19:54:34.062 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text:
+# Languages
+fr
+
+# Lyric
+# Lyric
+[Intro - Guitar Riff]
+[Verse 1]
+Dans le monde des tutos virtuels
+G ta toise en nouvelle passion
+Avec Ggendoline et Pumbé à midi
+La communauté, c'est l'unité
+Quel joie, une clé
+
+[Chorus]
+Dans le monde des tutos virtuels
+Gândoline et Pumbé à midi
+Une famille à connecter, c'est vrai
+D'un enfant qui voit toi fusionner
+
+[Guitar Solo]
+
+[Verse 2]
+Dans le monde des tutos virtuels
+Gândoline, Pumbé à midi
+Une famille à connecter, c'est vrai
+D'un enfant qui voit toi fusionner<|endoftext|>
+2026-03-01 19:54:34.062 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ======================================================================
+
+2026-03-01 19:54:34.068 | INFO     | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings...
+2026-03-01 19:54:34.081 | INFO     | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings...
+2026-03-01 19:54:34.081 | INFO     | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda))
+2026-03-01 19:54:34.105 | INFO     | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)...
+2026-03-01 19:54:34.415 | INFO     | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents...
+2026-03-01 19:54:34.416 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16
+2026-03-01 19:54:34.416 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006921052932739258, 'diffusion_time_cost': 0.3029003143310547, 'diffusion_per_step_time_cost': 0.037862539291381836, 'total_time_cost': 0.30982136726379395, 'offload_time_cost': 0.0}
+2026-03-01 19:54:34.431 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE...
+2026-03-01 19:54:34.436 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB
+2026-03-01 19:54:34.436 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB
+2026-03-01 19:54:34.436 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage...
+2026-03-01 19:54:34.436 | DEBUG    | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB
+2026-03-01 19:54:34.436 | DEBUG    | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB
+2026-03-01 19:54:34.436 | INFO     | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170])
+2026-03-01 19:54:34.714 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB
+2026-03-01 19:54:34.716 | INFO     | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors...
+2026-03-01 19:54:34.720 | INFO     | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors.
+[Request] Loaded request0.json
+[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q5_K_M.gguf
+[GGML] Running acestep-v15-turbo-Q5_K_M.gguf...
+[GGML] Done, 47 dump files
+[Python] Initializing acestep-v15-turbo...
+[Python] Generating (acestep-v15-turbo, 8 steps)...
+Using precomputed LM hints
+Using precomputed LM hints
+[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo)
+[Python] Done, 40 dump files
+[Turbo] Cosine similarities GGML vs Python
+  stage                          GGML vs Python
+  text_hidden                          0.999805
+  lyric_embed                          1.000000
+  enc_hidden                           0.999038
+  detok_output                         0.999875
+  context                              0.999920
+  noise                                1.000000
+  temb_t                               0.999972
+  hidden_after_proj_in                 0.999960
+  enc_after_cond_emb                   0.999148
+  layer0_sa_output                     0.999386
+  hidden_after_layer0                  0.999829
+  hidden_after_layer6                  0.999741
+  hidden_after_layer12                 0.998654
+  hidden_after_layer18                 0.995432
+  hidden_after_layer23                 0.991374
+  dit_step0_vt                         0.968035
+  dit_step0_xt                         0.999930
+  dit_step1_vt                         0.971217
+  dit_step1_xt                         0.999785
+  dit_step2_vt                         0.970740
+  dit_step2_xt                         0.999391
+  dit_step3_vt                         0.973678
+  dit_step3_xt                         0.998557
+  dit_step4_vt                         0.972169
+  dit_step4_xt                         0.996665
+  dit_step5_vt                         0.967356
+  dit_step5_xt                         0.992218
+  dit_step6_vt                         0.962469
+  dit_step6_xt                         0.983446
+  dit_step7_vt                         0.953383
+  dit_x0                               0.970119
+  vae_audio                            0.883226
+  vae_audio (STFT cosine)              0.968463
+[Turbo] Error growth GGML vs Python
+  stage                         cos    max_err   mean_err     mean_A      std_A     mean_B      std_B
+  dit_step0_xt             0.999930   0.139407   0.007818  -0.002306   0.973025  -0.002342   0.972003
+  dit_step1_xt             0.999785   0.264377   0.013418  -0.005299   0.942885  -0.005313   0.941730
+  dit_step2_xt             0.999391   0.455966   0.021259  -0.009285   0.909477  -0.009311   0.908527
+  dit_step3_xt             0.998557   0.657160   0.031461  -0.014661   0.874187  -0.014577   0.873624
+  dit_step4_xt             0.996665   0.973354   0.045708  -0.021890   0.842366  -0.021660   0.841995
+  dit_step5_xt             0.992218   1.446589   0.067697  -0.032248   0.825911  -0.032109   0.824593
+  dit_step6_xt             0.983446   2.092730   0.101558  -0.046788   0.857148  -0.046482   0.855546
diff --git a/tests/CUDA-Q6_K.log b/tests/CUDA-Q6_K.log
new file mode 100644
index 0000000..10b9a7a
--- /dev/null
+++ b/tests/CUDA-Q6_K.log
@@ -0,0 +1,259 @@
+ggml_cuda_init: found 1 CUDA devices:
+  Device 0: NVIDIA RTX PRO 6000 Blackwell Workstation Edition, compute capability 12.0, VMM: yes
+[Load] DiT backend: CUDA0 (CPU threads: 16)
+[Load] Backend init: 9.5 ms
+[GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864
+[DiT] Self-attn: Q+K+V fused
+[DiT] Cross-attn: Q+K+V fused
+[DiT] MLP: gate+up fused
+[Load] null_condition_emb found (CFG available)
+[WeightCtx] Loaded 478 tensors, 1237.2 MB into backend
+[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128
+[Load] DiT weight load: 514.8 ms
+[GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864
+[Load] silence_latent: [15000, 64] from GGUF
+[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048
+[Load] VAE backend: CUDA0 (CPU threads: 16)
+[VAE] Backend: CUDA0, Weight buffer: 161.1 MB
+[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations
+[Load] VAE weights: 657.3 ms
+[Request 1/1] ggml-turbo/request0.json (batch=1)
+[Request] parsed ggml-turbo/request0.json (18 fields)
+[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0)
+[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s
+[Pipeline] 434 audio codes (86.8s @ 5Hz)
+[Pipeline] T=2170, S=1085
+[BPE] Loaded from GGUF: 151643 vocab, 151387 merges
+[Load] BPE tokenizer: 30.7 ms
+[Pipeline] caption: 70 tokens, lyrics: 167 tokens
+[Load] TextEncoder backend: CUDA0 (CPU threads: 16)
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[Load] TextEncoder: 28L, H=1024, Nh=16/8
+[Qwen3] Attn: Q+K+V fused
+[Qwen3] MLP: gate+up fused
+[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend
+[Load] TextEncoder: 125.7 ms
+[Encode] TextEncoder (70 tokens): 49.2 ms
+[Debug] text_hidden: [70, 1024] first4: 3.652014 1.047935 0.228532 -12.907304
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[Encode] Lyric vocab lookup (167 tokens): 12.3 ms
+[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809
+[Load] CondEncoder backend: CUDA0 (CPU threads: 16)
+[GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864
+[Load] LyricEncoder: 8L
+[Qwen3] Attn: Q+K+V fused
+[Qwen3] MLP: gate+up fused
+[Load] TimbreEncoder: 4L
+[Qwen3] Attn: Q+K+V fused
+[Qwen3] MLP: gate+up fused
+[WeightCtx] Loaded 140 tensors, 476.3 MB into backend
+[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond
+[Load] ConditionEncoder: 145.8 ms
+[CondEnc] Lyric sliding mask: 167x167, window=128
+[CondEnc] Timbre sliding mask: 750x750, window=128
+[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens
+[Encode] ConditionEncoder: 11.0 ms, enc_S=238
+[Debug] enc_hidden: [238, 2048] first4: 1.760759 -0.050104 -0.133269 0.058044
+[GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864
+[WeightCtx] Loaded 30 tensors, 82.2 MB into backend
+[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64)
+[Load] Detokenizer: 26.4 ms
+[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz)
+[Context] Detokenizer: 123.5 ms
+[Debug] detok_output: [2170, 64] first4: -0.140341 1.456987 0.310602 -0.632665
+[Context Batch0] Philox noise seed=42, [2170, 64]
+[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656
+[Debug] context: [2170, 128] first4: -0.140341 1.456987 0.310602 -0.632665
+[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1
+[DiT] Batch N=1, T=2170, S=1085, enc_S=238
+[DiT] Graph: 1841 nodes
+[Debug] tproj: [12288] first4: 0.259936 -0.161027 -0.098424 0.051532
+[Debug] temb: [2048] first4: 0.000362 -0.132329 -0.035400 0.064685
+[Debug] temb_t: [2048] first4: 0.001493 0.026964 -0.052786 0.063738
+[Debug] temb_r: [2048] first4: -0.001131 -0.159293 0.017385 0.000947
+[Debug] sinusoidal_t: [256] first4: 0.562486 0.789701 0.439822 -0.023583
+[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000
+[Debug] temb_lin1_t: [2048] first4: -0.049350 -0.051345 -0.017496 -0.036550
+[Debug] temb_lin1_r: [2048] first4: -0.014407 -0.020607 -0.015728 0.003874
+[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.035398 -0.945894 0.539823 0.447660
+[Debug] proj_in_input: [192, 2170] first4: -0.140341 1.456987 0.310602 -0.632665
+[Debug] enc_after_cond_emb: [2048, 238] first4: -0.173062 0.808074 0.315076 -0.565566
+[Debug] layer0_sa_input: [2048, 1085] first4: -0.714711 -0.749357 -0.048320 0.261221
+[Debug] layer0_q_after_rope: [128, 16] first4: -1.602913 -0.815329 -0.317055 0.489857
+[Debug] layer0_k_after_rope: [128, 8] first4: -0.173062 0.808074 0.315076 -0.565566
+[Debug] layer0_sa_output: [2048, 1085] first4: -1.503780 0.189824 -0.364929 0.517029
+[Debug] layer0_attn_out: [2048, 1085] first4: -1.537518 -1.029960 0.183371 0.458036
+[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.537518 -1.029960 0.183371 0.458036
+[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.602913 -0.815329 -0.317055 0.489857
+[Debug] hidden_after_layer0: [2048, 1085] first4: -9.163809 0.540625 51.895596 -0.846802
+[Debug] hidden_after_layer6: [2048, 1085] first4: -21.398865 0.172627 33.376564 -4.390195
+[Debug] hidden_after_layer12: [2048, 1085] first4: -14.881160 -16.518404 74.148743 29.243643
+[Debug] hidden_after_layer18: [2048, 1085] first4: -27.662983 14.134428 61.787987 20.210526
+[Debug] hidden_after_layer23: [2048, 1085] first4: -15.642601 51.246216 194.762726 138.743362
+[Debug] dit_step0_vt: [2170, 64] first4: 0.094566 1.115330 0.308673 2.389967
+[Debug] dit_step0_xt: [2170, 64] first4: 0.190037 2.105553 -0.185906 0.739021
+[DiT] step 1/8 t=1.000
+[Debug] dit_step1_vt: [2170, 64] first4: -0.145169 1.334249 -0.184111 1.908013
+[Debug] dit_step1_xt: [2170, 64] first4: 0.197956 2.032776 -0.175863 0.634948
+[DiT] step 2/8 t=0.955
+[Debug] dit_step2_vt: [2170, 64] first4: 0.039341 1.248196 0.097777 2.389248
+[Debug] dit_step2_xt: [2170, 64] first4: 0.195333 1.949563 -0.182382 0.475665
+[DiT] step 3/8 t=0.900
+[Debug] dit_step3_vt: [2170, 64] first4: 0.285024 1.101088 0.266534 2.655225
+[Debug] dit_step3_xt: [2170, 64] first4: 0.171581 1.857805 -0.204593 0.254396
+[DiT] step 4/8 t=0.833
+[Debug] dit_step4_vt: [2170, 64] first4: 0.327536 1.017564 0.096598 2.731005
+[Debug] dit_step4_xt: [2170, 64] first4: 0.136488 1.748781 -0.214943 -0.038212
+[DiT] step 5/8 t=0.750
+[Debug] dit_step5_vt: [2170, 64] first4: 0.307848 0.903341 -0.319663 2.789687
+[Debug] dit_step5_xt: [2170, 64] first4: 0.092510 1.619732 -0.169276 -0.436738
+[DiT] step 6/8 t=0.643
+[Debug] dit_step6_vt: [2170, 64] first4: 0.196603 0.584326 -0.838176 2.772917
+[Debug] dit_step6_xt: [2170, 64] first4: 0.053189 1.502867 -0.001641 -0.991322
+[DiT] step 7/8 t=0.500
+[Debug] dit_step7_vt: [2170, 64] first4: 0.081321 0.135461 -1.397063 2.986206
+[Debug] dit_x0: [2170, 64] first4: 0.028793 1.462229 0.417478 -1.887184
+[DiT] step 8/8 t=0.300
+[DiT] Total generation: 273.2 ms (273.2 ms/sample)
+[Debug] dit_output: [2170, 64] first4: 0.028793 1.462229 0.417478 -1.887184
+[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128)
+[VAE] Graph: 417 nodes, T_latent=192
+[VAE] Upsample factor: 1920.00 (expected ~1920)
+[VAE] Graph: 417 nodes, T_latent=256
+[VAE] Graph: 417 nodes, T_latent=186
+[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz)
+[VAE Batch0] Decode: 804.3 ms
+[Debug] vae_audio: [2, 4166400] first4: 0.000481 0.000872 0.000838 0.001216
+[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo)
+[Request 1/1] Done
+[Pipeline] All done
+2026-03-01 19:54:23.682 | WARNING  | acestep.training.lora_utils:<module>:29 - PEFT library not installed. LoRA training will not be available.
+2026-03-01 19:54:23.683 | WARNING  | acestep.training.lokr_utils:<module>:24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora
+2026-03-01 19:54:23.683 | WARNING  | acestep.training.data_module:<module>:25 - Lightning not installed. Training module will not be available.
+2026-03-01 19:54:23.683 | WARNING  | acestep.training.trainer:<module>:28 - Lightning Fabric not installed. Training will use basic training loop.
+2026-03-01 19:54:23.683 | WARNING  | acestep.training.trainer:<module>:36 - bitsandbytes not installed. Using standard AdamW.
+2026-03-01 19:54:24.419 | INFO     | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa
+`torch_dtype` is deprecated! Use `dtype` instead!
+2026-03-01 19:54:25.992 | INFO     | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation...
+2026-03-01 19:54:25.992 | INFO     | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs...
+2026-03-01 19:54:25.998 | INFO     | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0...
+2026-03-01 19:54:26.157 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0...
+2026-03-01 19:54:26.159 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - 
+======================================================================
+2026-03-01 19:54:26.159 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference)
+2026-03-01 19:54:26.159 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ======================================================================
+2026-03-01 19:54:26.159 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt:
+# Instruction
+Generate audio semantic tokens based on the given conditions:
+
+# Caption
+An upbeat and anthemic pop-rock track driven by bright, slightly overdriven
+
+# Metas
+- bpm: 83
+- timesignature: 4
+- keyscale: G major
+- duration: 88 seconds
+<|endoftext|>
+
+2026-03-01 19:54:26.159 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ======================================================================
+2026-03-01 19:54:26.159 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text:
+# Languages
+fr
+
+# Lyric
+# Lyric
+[Intro - Guitar Riff]
+[Verse 1]
+Dans le monde des tutos virtuels
+G ta toise en nouvelle passion
+Avec Ggendoline et Pumbé à midi
+La communauté, c'est l'unité
+Quel joie, une clé
+
+[Chorus]
+Dans le monde des tutos virtuels
+Gândoline et Pumbé à midi
+Une famille à connecter, c'est vrai
+D'un enfant qui voit toi fusionner
+
+[Guitar Solo]
+
+[Verse 2]
+Dans le monde des tutos virtuels
+Gândoline, Pumbé à midi
+Une famille à connecter, c'est vrai
+D'un enfant qui voit toi fusionner<|endoftext|>
+2026-03-01 19:54:26.159 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ======================================================================
+
+2026-03-01 19:54:26.166 | INFO     | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings...
+2026-03-01 19:54:26.178 | INFO     | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings...
+2026-03-01 19:54:26.178 | INFO     | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda))
+2026-03-01 19:54:26.214 | INFO     | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)...
+2026-03-01 19:54:26.528 | INFO     | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents...
+2026-03-01 19:54:26.528 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16
+2026-03-01 19:54:26.528 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.00680994987487793, 'diffusion_time_cost': 0.30716919898986816, 'diffusion_per_step_time_cost': 0.03839614987373352, 'total_time_cost': 0.3139791488647461, 'offload_time_cost': 0.0}
+2026-03-01 19:54:26.543 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE...
+2026-03-01 19:54:26.545 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB
+2026-03-01 19:54:26.545 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.44 GB
+2026-03-01 19:54:26.545 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage...
+2026-03-01 19:54:26.545 | DEBUG    | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.44 GB
+2026-03-01 19:54:26.545 | DEBUG    | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.44 GB
+2026-03-01 19:54:26.545 | INFO     | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170])
+2026-03-01 19:54:26.821 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB
+2026-03-01 19:54:26.824 | INFO     | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors...
+2026-03-01 19:54:26.828 | INFO     | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors.
+[Request] Loaded request0.json
+[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q6_K.gguf
+[GGML] Running acestep-v15-turbo-Q6_K.gguf...
+[GGML] Done, 47 dump files
+[Python] Initializing acestep-v15-turbo...
+[Python] Generating (acestep-v15-turbo, 8 steps)...
+Using precomputed LM hints
+Using precomputed LM hints
+[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo)
+[Python] Done, 40 dump files
+[Turbo] Cosine similarities GGML vs Python
+  stage                          GGML vs Python
+  text_hidden                          0.999805
+  lyric_embed                          1.000000
+  enc_hidden                           0.999638
+  detok_output                         0.999962
+  context                              0.999976
+  noise                                1.000000
+  temb_t                               0.999990
+  hidden_after_proj_in                 0.999980
+  enc_after_cond_emb                   0.999648
+  layer0_sa_output                     0.999763
+  hidden_after_layer0                  0.999888
+  hidden_after_layer6                  0.999853
+  hidden_after_layer12                 0.998917
+  hidden_after_layer18                 0.995924
+  hidden_after_layer23                 0.992281
+  dit_step0_vt                         0.971207
+  dit_step0_xt                         0.999937
+  dit_step1_vt                         0.975354
+  dit_step1_xt                         0.999803
+  dit_step2_vt                         0.978312
+  dit_step2_xt                         0.999479
+  dit_step3_vt                         0.977879
+  dit_step3_xt                         0.998730
+  dit_step4_vt                         0.976291
+  dit_step4_xt                         0.997040
+  dit_step5_vt                         0.973193
+  dit_step5_xt                         0.993208
+  dit_step6_vt                         0.969738
+  dit_step6_xt                         0.985862
+  dit_step7_vt                         0.962454
+  dit_x0                               0.974866
+  vae_audio                            0.893678
+  vae_audio (STFT cosine)              0.969663
+[Turbo] Error growth GGML vs Python
+  stage                         cos    max_err   mean_err     mean_A      std_A     mean_B      std_B
+  dit_step0_xt             0.999937   0.147590   0.007252  -0.002265   0.972930  -0.002342   0.972003
+  dit_step1_xt             0.999803   0.291665   0.012432  -0.005192   0.942660  -0.005313   0.941730
+  dit_step2_xt             0.999479   0.474224   0.019215  -0.009147   0.909068  -0.009311   0.908527
+  dit_step3_xt             0.998730   0.730810   0.028734  -0.014438   0.873565  -0.014577   0.873624
+  dit_step4_xt             0.997040   1.058607   0.042049  -0.021507   0.841532  -0.021660   0.841995
+  dit_step5_xt             0.993208   1.534989   0.062024  -0.031604   0.824595  -0.032109   0.824593
+  dit_step6_xt             0.985862   2.188862   0.092252  -0.045920   0.855268  -0.046482   0.855546
diff --git a/tests/CUDA-Q8_0.log b/tests/CUDA-Q8_0.log
new file mode 100644
index 0000000..3a84ce1
--- /dev/null
+++ b/tests/CUDA-Q8_0.log
@@ -0,0 +1,259 @@
+ggml_cuda_init: found 1 CUDA devices:
+  Device 0: NVIDIA RTX PRO 6000 Blackwell Workstation Edition, compute capability 12.0, VMM: yes
+[Load] DiT backend: CUDA0 (CPU threads: 16)
+[Load] Backend init: 9.5 ms
+[GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864
+[DiT] Self-attn: Q+K+V fused
+[DiT] Cross-attn: Q+K+V fused
+[DiT] MLP: gate+up fused
+[Load] null_condition_emb found (CFG available)
+[WeightCtx] Loaded 478 tensors, 1600.7 MB into backend
+[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128
+[Load] DiT weight load: 221.9 ms
+[GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864
+[Load] silence_latent: [15000, 64] from GGUF
+[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048
+[Load] VAE backend: CUDA0 (CPU threads: 16)
+[VAE] Backend: CUDA0, Weight buffer: 161.1 MB
+[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations
+[Load] VAE weights: 658.9 ms
+[Request 1/1] ggml-turbo/request0.json (batch=1)
+[Request] parsed ggml-turbo/request0.json (18 fields)
+[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0)
+[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s
+[Pipeline] 434 audio codes (86.8s @ 5Hz)
+[Pipeline] T=2170, S=1085
+[BPE] Loaded from GGUF: 151643 vocab, 151387 merges
+[Load] BPE tokenizer: 31.2 ms
+[Pipeline] caption: 70 tokens, lyrics: 167 tokens
+[Load] TextEncoder backend: CUDA0 (CPU threads: 16)
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[Load] TextEncoder: 28L, H=1024, Nh=16/8
+[Qwen3] Attn: Q+K+V fused
+[Qwen3] MLP: gate+up fused
+[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend
+[Load] TextEncoder: 127.0 ms
+[Encode] TextEncoder (70 tokens): 68.2 ms
+[Debug] text_hidden: [70, 1024] first4: 3.652014 1.047935 0.228532 -12.907304
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[Encode] Lyric vocab lookup (167 tokens): 12.3 ms
+[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809
+[Load] CondEncoder backend: CUDA0 (CPU threads: 16)
+[GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864
+[Load] LyricEncoder: 8L
+[Qwen3] Attn: Q+K+V fused
+[Qwen3] MLP: gate+up fused
+[Load] TimbreEncoder: 4L
+[Qwen3] Attn: Q+K+V fused
+[Qwen3] MLP: gate+up fused
+[WeightCtx] Loaded 140 tensors, 616.6 MB into backend
+[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond
+[Load] ConditionEncoder: 65.2 ms
+[CondEnc] Lyric sliding mask: 167x167, window=128
+[CondEnc] Timbre sliding mask: 750x750, window=128
+[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens
+[Encode] ConditionEncoder: 8.9 ms, enc_S=238
+[Debug] enc_hidden: [238, 2048] first4: 1.759220 -0.049559 -0.133467 0.058389
+[GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864
+[WeightCtx] Loaded 30 tensors, 106.5 MB into backend
+[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64)
+[Load] Detokenizer: 12.1 ms
+[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz)
+[Context] Detokenizer: 104.8 ms
+[Debug] detok_output: [2170, 64] first4: -0.120490 1.436288 0.301594 -0.632564
+[Context Batch0] Philox noise seed=42, [2170, 64]
+[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656
+[Debug] context: [2170, 128] first4: -0.120490 1.436288 0.301594 -0.632564
+[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1
+[DiT] Batch N=1, T=2170, S=1085, enc_S=238
+[DiT] Graph: 1841 nodes
+[Debug] tproj: [12288] first4: 0.259485 -0.161550 -0.096885 0.051766
+[Debug] temb: [2048] first4: 0.000214 -0.132557 -0.035428 0.064847
+[Debug] temb_t: [2048] first4: 0.001194 0.026823 -0.052744 0.063762
+[Debug] temb_r: [2048] first4: -0.000980 -0.159380 0.017316 0.001084
+[Debug] sinusoidal_t: [256] first4: 0.562486 0.789701 0.439822 -0.023583
+[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000
+[Debug] temb_lin1_t: [2048] first4: -0.049228 -0.051913 -0.015026 -0.038076
+[Debug] temb_lin1_r: [2048] first4: -0.013066 -0.018835 -0.015731 0.008462
+[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.038152 -0.959088 0.538689 0.447583
+[Debug] proj_in_input: [192, 2170] first4: -0.120490 1.436288 0.301594 -0.632564
+[Debug] enc_after_cond_emb: [2048, 238] first4: -0.179956 0.813643 0.335613 -0.560954
+[Debug] layer0_sa_input: [2048, 1085] first4: -0.718369 -0.758056 -0.046880 0.261627
+[Debug] layer0_q_after_rope: [128, 16] first4: -1.602359 -0.824703 -0.282831 0.487491
+[Debug] layer0_k_after_rope: [128, 8] first4: -0.179956 0.813643 0.335613 -0.560954
+[Debug] layer0_sa_output: [2048, 1085] first4: -1.515045 0.163439 -0.354657 0.502281
+[Debug] layer0_attn_out: [2048, 1085] first4: -1.544158 -1.031644 0.192299 0.456963
+[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.544158 -1.031644 0.192299 0.456963
+[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.602359 -0.824703 -0.282831 0.487491
+[Debug] hidden_after_layer0: [2048, 1085] first4: -9.065077 0.563297 52.194237 -0.851381
+[Debug] hidden_after_layer6: [2048, 1085] first4: -21.390320 0.130250 33.949810 -4.149052
+[Debug] hidden_after_layer12: [2048, 1085] first4: -15.173199 -18.820404 72.616402 28.693943
+[Debug] hidden_after_layer18: [2048, 1085] first4: -25.768595 14.047658 61.759544 20.186539
+[Debug] hidden_after_layer23: [2048, 1085] first4: -4.011688 41.168625 196.180222 144.774246
+[Debug] dit_step0_vt: [2170, 64] first4: 0.018630 1.127245 0.345143 2.384104
+[Debug] dit_step0_xt: [2170, 64] first4: 0.193489 2.105012 -0.187563 0.739288
+[DiT] step 1/8 t=1.000
+[Debug] dit_step1_vt: [2170, 64] first4: -0.199466 1.323973 -0.114465 1.890695
+[Debug] dit_step1_xt: [2170, 64] first4: 0.204369 2.032795 -0.181320 0.636159
+[DiT] step 2/8 t=0.955
+[Debug] dit_step2_vt: [2170, 64] first4: -0.009733 1.241250 0.116473 2.389213
+[Debug] dit_step2_xt: [2170, 64] first4: 0.205018 1.950045 -0.189085 0.476878
+[DiT] step 3/8 t=0.900
+[Debug] dit_step3_vt: [2170, 64] first4: 0.246129 1.078655 0.270095 2.675214
+[Debug] dit_step3_xt: [2170, 64] first4: 0.184507 1.860157 -0.211593 0.253944
+[DiT] step 4/8 t=0.833
+[Debug] dit_step4_vt: [2170, 64] first4: 0.271080 1.036363 0.114070 2.726085
+[Debug] dit_step4_xt: [2170, 64] first4: 0.155463 1.749118 -0.223814 -0.038137
+[DiT] step 5/8 t=0.750
+[Debug] dit_step5_vt: [2170, 64] first4: 0.276045 0.944946 -0.294077 2.780135
+[Debug] dit_step5_xt: [2170, 64] first4: 0.116028 1.614126 -0.181803 -0.435299
+[DiT] step 6/8 t=0.643
+[Debug] dit_step6_vt: [2170, 64] first4: 0.156088 0.649257 -0.836919 2.794098
+[Debug] dit_step6_xt: [2170, 64] first4: 0.084810 1.484275 -0.014420 -0.994119
+[DiT] step 7/8 t=0.500
+[Debug] dit_step7_vt: [2170, 64] first4: -0.007394 0.229067 -1.488817 3.083439
+[Debug] dit_x0: [2170, 64] first4: 0.087028 1.415554 0.432225 -1.919150
+[DiT] step 8/8 t=0.300
+[DiT] Total generation: 242.9 ms (242.9 ms/sample)
+[Debug] dit_output: [2170, 64] first4: 0.087028 1.415554 0.432225 -1.919150
+[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128)
+[VAE] Graph: 417 nodes, T_latent=192
+[VAE] Upsample factor: 1920.00 (expected ~1920)
+[VAE] Graph: 417 nodes, T_latent=256
+[VAE] Graph: 417 nodes, T_latent=186
+[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz)
+[VAE Batch0] Decode: 822.6 ms
+[Debug] vae_audio: [2, 4166400] first4: 0.000524 0.000859 0.000752 0.001056
+[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo)
+[Request 1/1] Done
+[Pipeline] All done
+2026-03-01 19:54:15.905 | WARNING  | acestep.training.lora_utils:<module>:29 - PEFT library not installed. LoRA training will not be available.
+2026-03-01 19:54:15.906 | WARNING  | acestep.training.lokr_utils:<module>:24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora
+2026-03-01 19:54:15.906 | WARNING  | acestep.training.data_module:<module>:25 - Lightning not installed. Training module will not be available.
+2026-03-01 19:54:15.906 | WARNING  | acestep.training.trainer:<module>:28 - Lightning Fabric not installed. Training will use basic training loop.
+2026-03-01 19:54:15.906 | WARNING  | acestep.training.trainer:<module>:36 - bitsandbytes not installed. Using standard AdamW.
+2026-03-01 19:54:16.672 | INFO     | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa
+`torch_dtype` is deprecated! Use `dtype` instead!
+2026-03-01 19:54:18.198 | INFO     | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation...
+2026-03-01 19:54:18.198 | INFO     | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs...
+2026-03-01 19:54:18.207 | INFO     | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0...
+2026-03-01 19:54:18.371 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0...
+2026-03-01 19:54:18.373 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - 
+======================================================================
+2026-03-01 19:54:18.373 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference)
+2026-03-01 19:54:18.373 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ======================================================================
+2026-03-01 19:54:18.373 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt:
+# Instruction
+Generate audio semantic tokens based on the given conditions:
+
+# Caption
+An upbeat and anthemic pop-rock track driven by bright, slightly overdriven
+
+# Metas
+- bpm: 83
+- timesignature: 4
+- keyscale: G major
+- duration: 88 seconds
+<|endoftext|>
+
+2026-03-01 19:54:18.373 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ======================================================================
+2026-03-01 19:54:18.373 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text:
+# Languages
+fr
+
+# Lyric
+# Lyric
+[Intro - Guitar Riff]
+[Verse 1]
+Dans le monde des tutos virtuels
+G ta toise en nouvelle passion
+Avec Ggendoline et Pumbé à midi
+La communauté, c'est l'unité
+Quel joie, une clé
+
+[Chorus]
+Dans le monde des tutos virtuels
+Gândoline et Pumbé à midi
+Une famille à connecter, c'est vrai
+D'un enfant qui voit toi fusionner
+
+[Guitar Solo]
+
+[Verse 2]
+Dans le monde des tutos virtuels
+Gândoline, Pumbé à midi
+Une famille à connecter, c'est vrai
+D'un enfant qui voit toi fusionner<|endoftext|>
+2026-03-01 19:54:18.373 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ======================================================================
+
+2026-03-01 19:54:18.380 | INFO     | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings...
+2026-03-01 19:54:18.392 | INFO     | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings...
+2026-03-01 19:54:18.392 | INFO     | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda))
+2026-03-01 19:54:18.418 | INFO     | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)...
+2026-03-01 19:54:18.724 | INFO     | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents...
+2026-03-01 19:54:18.724 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16
+2026-03-01 19:54:18.724 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006882190704345703, 'diffusion_time_cost': 0.298403263092041, 'diffusion_per_step_time_cost': 0.03730040788650513, 'total_time_cost': 0.3052854537963867, 'offload_time_cost': 0.0}
+2026-03-01 19:54:18.739 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE...
+2026-03-01 19:54:18.741 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB
+2026-03-01 19:54:18.741 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.44 GB
+2026-03-01 19:54:18.741 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage...
+2026-03-01 19:54:18.741 | DEBUG    | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.44 GB
+2026-03-01 19:54:18.741 | DEBUG    | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.44 GB
+2026-03-01 19:54:18.741 | INFO     | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170])
+2026-03-01 19:54:19.031 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB
+2026-03-01 19:54:19.034 | INFO     | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors...
+2026-03-01 19:54:19.037 | INFO     | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors.
+[Request] Loaded request0.json
+[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q8_0.gguf
+[GGML] Running acestep-v15-turbo-Q8_0.gguf...
+[GGML] Done, 47 dump files
+[Python] Initializing acestep-v15-turbo...
+[Python] Generating (acestep-v15-turbo, 8 steps)...
+Using precomputed LM hints
+Using precomputed LM hints
+[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo)
+[Python] Done, 40 dump files
+[Turbo] Cosine similarities GGML vs Python
+  stage                          GGML vs Python
+  text_hidden                          0.999805
+  lyric_embed                          1.000000
+  enc_hidden                           0.999784
+  detok_output                         0.999983
+  context                              0.999990
+  noise                                1.000000
+  temb_t                               0.999997
+  hidden_after_proj_in                 0.999986
+  enc_after_cond_emb                   0.999765
+  layer0_sa_output                     0.999924
+  hidden_after_layer0                  0.999957
+  hidden_after_layer6                  0.999892
+  hidden_after_layer12                 0.999346
+  hidden_after_layer18                 0.996758
+  hidden_after_layer23                 0.993881
+  dit_step0_vt                         0.976421
+  dit_step0_xt                         0.999948
+  dit_step1_vt                         0.979128
+  dit_step1_xt                         0.999834
+  dit_step2_vt                         0.982059
+  dit_step2_xt                         0.999561
+  dit_step3_vt                         0.983029
+  dit_step3_xt                         0.998948
+  dit_step4_vt                         0.981353
+  dit_step4_xt                         0.997565
+  dit_step5_vt                         0.978860
+  dit_step5_xt                         0.994480
+  dit_step6_vt                         0.976051
+  dit_step6_xt                         0.988641
+  dit_step7_vt                         0.970144
+  dit_x0                               0.979969
+  vae_audio                            0.905525
+  vae_audio (STFT cosine)              0.976530
+[Turbo] Error growth GGML vs Python
+  stage                         cos    max_err   mean_err     mean_A      std_A     mean_B      std_B
+  dit_step0_xt             0.999948   0.134961   0.006551  -0.002307   0.972901  -0.002342   0.972003
+  dit_step1_xt             0.999834   0.262688   0.011280  -0.005306   0.942604  -0.005313   0.941730
+  dit_step2_xt             0.999561   0.448301   0.017428  -0.009351   0.909110  -0.009311   0.908527
+  dit_step3_xt             0.998948   0.617858   0.025766  -0.014708   0.873709  -0.014577   0.873624
+  dit_step4_xt             0.997565   0.740504   0.037507  -0.021763   0.841873  -0.021660   0.841995
+  dit_step5_xt             0.994480   1.211945   0.054863  -0.031844   0.825164  -0.032109   0.824593
+  dit_step6_xt             0.988641   2.056566   0.081142  -0.046105   0.856063  -0.046482   0.855546
diff --git a/tests/Metal-Q4_K_M.log b/tests/Metal-Q4_K_M.log
new file mode 100644
index 0000000..e1ad24f
--- /dev/null
+++ b/tests/Metal-Q4_K_M.log
@@ -0,0 +1,835 @@
+ggml_metal_device_init: tensor API disabled for pre-M5 and pre-A19 devices
+ggml_metal_library_init: using embedded metal library
+ggml_metal_library_init: loaded in 0.006 sec
+ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
+ggml_metal_device_init: GPU name:   MTL0
+ggml_metal_device_init: GPU family: MTLGPUFamilyApple8  (1008)
+ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
+ggml_metal_device_init: GPU family: MTLGPUFamilyMetal3  (5001)
+ggml_metal_device_init: simdgroup reduction   = true
+ggml_metal_device_init: simdgroup matrix mul. = true
+ggml_metal_device_init: has unified memory    = true
+ggml_metal_device_init: has bfloat            = true
+ggml_metal_device_init: has tensor            = false
+ggml_metal_device_init: use residency sets    = true
+ggml_metal_device_init: use shared buffers    = true
+ggml_metal_device_init: recommendedMaxWorkingSetSize  = 11453.25 MB
+ggml_metal_init: allocating
+ggml_metal_init: found device: Apple M2 Pro
+ggml_metal_init: picking default device: Apple M2 Pro
+ggml_metal_init: use fusion         = true
+ggml_metal_init: use concurrency    = true
+ggml_metal_init: use graph optimize = true
+[Load] DiT backend: MTL0 (CPU threads: 5)
+[Load] Backend init: 20.9 ms
+[GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864
+[DiT] Self-attn: Q+K fused, V separate
+[DiT] Cross-attn: all separate
+[DiT] MLP: gate+up fused
+[Load] null_condition_emb found (CFG available)
+[WeightCtx] Loaded 478 tensors, 895.6 MB into backend
+[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128
+[Load] DiT weight load: 1421.5 ms
+[GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864
+[Load] silence_latent: [15000, 64] from GGUF
+[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048
+ggml_metal_init: allocating
+ggml_metal_init: found device: Apple M2 Pro
+ggml_metal_init: picking default device: Apple M2 Pro
+ggml_metal_init: use fusion         = true
+ggml_metal_init: use concurrency    = true
+ggml_metal_init: use graph optimize = true
+[Load] VAE backend: MTL0 (CPU threads: 5)
+[VAE] Backend: MTL0, Weight buffer: 255.7 MB
+[VAE] Loaded: 5 blocks, upsample=1920x
+[Load] VAE weights: 337.8 ms
+[Request 1/1] ggml-turbo/request0.json (batch=1)
+[Request] parsed ggml-turbo/request0.json (18 fields)
+[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0)
+[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s
+[Pipeline] 434 audio codes (86.8s @ 5Hz)
+[Pipeline] T=2170, S=1085
+[BPE] Loaded from GGUF: 151643 vocab, 151387 merges
+[Load] BPE tokenizer: 42.3 ms
+[Pipeline] caption: 70 tokens, lyrics: 167 tokens
+ggml_metal_init: allocating
+ggml_metal_init: found device: Apple M2 Pro
+ggml_metal_init: picking default device: Apple M2 Pro
+ggml_metal_init: use fusion         = true
+ggml_metal_init: use concurrency    = true
+ggml_metal_init: use graph optimize = true
+[Load] TextEncoder backend: MTL0 (CPU threads: 5)
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend
+[Load] TextEncoder: 28L, H=1024, Nh=16/8
+[Load] TextEncoder: 593.9 ms
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_get_rows_bf16', name = 'kernel_get_rows_bf16'
+ggml_metal_library_compile_pipeline: loaded kernel_get_rows_bf16                          0x11de0dee0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rms_norm_mul_f32_4', name = 'kernel_rms_norm_mul_f32_4'
+ggml_metal_library_compile_pipeline: loaded kernel_rms_norm_mul_f32_4                     0x11de0e340 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_bf16_f32', name = 'kernel_mul_mm_bf16_f32_bci=0_bco=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_bf16_f32_bci=0_bco=1            0x11de0ebb0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rope_neox_f32', name = 'kernel_rope_neox_f32_imrope=0'
+ggml_metal_library_compile_pipeline: loaded kernel_rope_neox_f32_imrope=0                 0x11de0f030 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=1_ncpsg=64'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=1_ncpsg=64      0x11de0f8a0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_blk', name = 'kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64      0x11de0fed0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4      0x11de107b0 | th_max =  576 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0      0x11de11170 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_swiglu_f32', name = 'kernel_swiglu_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_swiglu_f32                             0x11de10350 | th_max = 1024 | th_width =   32
+[Encode] TextEncoder (70 tokens): 44.4 ms
+[Debug] text_hidden: [70, 1024] first4: 3.703506 2.446818 0.222721 -13.133463
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[Encode] Lyric vocab lookup (167 tokens): 33.8 ms
+[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809
+ggml_metal_init: allocating
+ggml_metal_init: found device: Apple M2 Pro
+ggml_metal_init: picking default device: Apple M2 Pro
+ggml_metal_init: use fusion         = true
+ggml_metal_init: use concurrency    = true
+ggml_metal_init: use graph optimize = true
+[Load] CondEncoder backend: MTL0 (CPU threads: 5)
+[GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864
+[WeightCtx] Loaded 140 tensors, 352.5 MB into backend
+[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond
+[Load] ConditionEncoder: 543.9 ms
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q6_K_f32', name = 'kernel_mul_mm_q6_K_f32_bci=0_bco=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q6_K_f32_bci=0_bco=1            0x11de1b4b0 | th_max =  896 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q4_K_f32', name = 'kernel_mul_mm_q4_K_f32_bci=0_bco=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q4_K_f32_bci=0_bco=1            0x11de1ba60 | th_max =  896 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=64'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=64      0x11de1bea0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4      0x11de1c500 | th_max =  640 | th_width =   32
+[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens
+[Encode] ConditionEncoder: 149.3 ms, enc_S=238
+ggml_metal_free: deallocating
+ggml_metal_free: deallocating
+[Debug] enc_hidden: [238, 2048] first4: 1.751263 -0.045978 -0.129705 0.058765
+[GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864
+[WeightCtx] Loaded 30 tensors, 64.7 MB into backend
+[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64)
+[Load] Detokenizer: 113.4 ms
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_bf16_f32_short', name = 'kernel_mul_mv_bf16_f32_short_nsg=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_bf16_f32_short_nsg=1            0x11de10d70 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1      0x11de0aff0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q6_K_f32', name = 'kernel_mul_mv_q6_K_f32_nsg=2'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q6_K_f32_nsg=2                  0x11de0b950 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_repeat_f32', name = 'kernel_repeat_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_repeat_f32                             0x11de1c9a0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q4_K_f32_r1_5', name = 'kernel_mul_mv_ext_q4_K_f32_r1_5_nsg=2_nxpsg=8'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q4_K_f32_r1_5_nsg=2_nxpsg=8      0x11de1d9f0 | th_max =  640 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q6_K_f32_r1_5', name = 'kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8      0x11de1dfa0 | th_max =  640 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=32'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=32      0x11de1e320 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_f32_dk128_dv128', name = 'kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32      0x11de1e580 | th_max =  448 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_reduce', name = 'kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32      0x11de1ef20 | th_max = 1024 | th_width =   32
+[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz)
+[Context] Detokenizer: 1044.0 ms
+[Debug] detok_output: [2170, 64] first4: -0.105288 1.440285 0.304742 -0.636920
+[Context] loaded noise from rng_philox_seed42.bf16: [2170, 64] bf16
+[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656
+[Debug] context: [2170, 128] first4: -0.105288 1.440285 0.304742 -0.636920
+[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1
+[DiT] Batch N=1, T=2170, S=1085, enc_S=238
+[DiT] Graph: 1775 nodes
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f32_f32', name = 'kernel_mul_mm_f32_f32_bci=0_bco=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f32_f32_bci=0_bco=1             0x11f008d70 | th_max =  832 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_scale_f32', name = 'kernel_scale_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_scale_f32                              0x11f009830 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_timestep_embedding_f32', name = 'kernel_timestep_embedding_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_timestep_embedding_f32                 0x11f009c40 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q4_K_f32', name = 'kernel_mul_mv_q4_K_f32_nsg=2'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q4_K_f32_nsg=2                  0x11f00ac80 | th_max =  768 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_silu_f32_4', name = 'kernel_silu_f32_4'
+ggml_metal_library_compile_pipeline: loaded kernel_silu_f32_4                             0x11f00b000 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1      0x11f00b6c0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1      0x11f00b920 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0      0x11f00bec0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_cpy_f32_f32', name = 'kernel_cpy_f32_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_cpy_f32_f32                            0x11f00c450 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4      0x11f00cd60 | th_max =  640 | th_width =   32
+[Debug] tproj: [12288] first4: 0.260912 -0.160417 -0.090199 0.048634
+[Debug] temb: [2048] first4: 0.000215 -0.133911 -0.034469 0.065007
+[Debug] temb_t: [2048] first4: 0.000971 0.025677 -0.052124 0.063327
+[Debug] temb_r: [2048] first4: -0.000756 -0.159588 0.017655 0.001680
+[Debug] sinusoidal_t: [256] first4: 0.562407 0.789701 0.439822 -0.023583
+[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000
+[Debug] temb_lin1_t: [2048] first4: -0.049286 -0.053324 -0.012254 -0.047666
+[Debug] temb_lin1_r: [2048] first4: -0.015463 -0.031534 -0.021259 0.006135
+[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.050396 -0.992003 0.526498 0.458000
+[Debug] proj_in_input: [192, 2170] first4: -0.105288 1.440285 0.304742 -0.636920
+[Debug] enc_after_cond_emb: [2048, 238] first4: -0.174268 0.781178 0.275122 -0.515942
+[Debug] layer0_sa_input: [2048, 1085] first4: -0.726228 -0.772737 -0.041859 0.262417
+[Debug] layer0_q_after_rope: [128, 16] first4: -12.136272 0.820533 1.509364 1.799582
+[Debug] layer0_k_after_rope: [128, 8] first4: -0.174268 0.781178 0.275122 -0.515942
+[Debug] layer0_sa_output: [2048, 1085] first4: -1.599759 0.160940 -0.480259 0.455996
+[Debug] layer0_attn_out: [2048, 1085] first4: -12.315464 1.144032 1.760677 1.796125
+[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.579560 -1.062863 0.061853 0.466855
+[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.703488 -0.838320 -0.450424 0.503514
+[Debug] hidden_after_layer0: [2048, 1085] first4: -8.870923 0.423529 48.381233 -0.778579
+[Debug] hidden_after_layer6: [2048, 1085] first4: -21.397562 -1.526012 29.991730 -3.928804
+[Debug] hidden_after_layer12: [2048, 1085] first4: -17.419617 -13.309786 66.317848 28.914410
+[Debug] hidden_after_layer18: [2048, 1085] first4: -16.562674 9.657765 55.222641 17.661957
+[Debug] hidden_after_layer23: [2048, 1085] first4: -19.112629 7.039753 181.464966 133.927719
+[Debug] dit_step0_vt: [2170, 64] first4: -0.112419 1.107940 0.244994 2.200569
+[Debug] dit_step0_xt: [2170, 64] first4: 0.199446 2.105889 -0.183011 0.747630
+[DiT] step 1/8 t=1.000
+[Debug] dit_step1_vt: [2170, 64] first4: -0.082195 1.204432 -0.273788 1.824850
+[Debug] dit_step1_xt: [2170, 64] first4: 0.203929 2.040193 -0.168077 0.648093
+[DiT] step 2/8 t=0.955
+[Debug] dit_step2_vt: [2170, 64] first4: -0.043690 1.209513 0.074423 2.191977
+[Debug] dit_step2_xt: [2170, 64] first4: 0.206842 1.959559 -0.173039 0.501961
+[DiT] step 3/8 t=0.900
+[Debug] dit_step3_vt: [2170, 64] first4: 0.238132 1.171738 0.272480 2.506455
+[Debug] dit_step3_xt: [2170, 64] first4: 0.186998 1.861914 -0.195745 0.293090
+[DiT] step 4/8 t=0.833
+[Debug] dit_step4_vt: [2170, 64] first4: 0.293275 1.147521 0.096848 2.639339
+[Debug] dit_step4_xt: [2170, 64] first4: 0.155575 1.738965 -0.206122 0.010304
+[DiT] step 5/8 t=0.750
+[Debug] dit_step5_vt: [2170, 64] first4: 0.200179 1.089847 -0.403776 2.739777
+[Debug] dit_step5_xt: [2170, 64] first4: 0.126978 1.583273 -0.148440 -0.381093
+[DiT] step 6/8 t=0.643
+[Debug] dit_step6_vt: [2170, 64] first4: -0.078240 0.999644 -1.058107 2.768797
+[Debug] dit_step6_xt: [2170, 64] first4: 0.142626 1.383344 0.063182 -0.934852
+[DiT] step 7/8 t=0.500
+[Debug] dit_step7_vt: [2170, 64] first4: -0.417903 0.862772 -1.662739 3.246292
+[Debug] dit_x0: [2170, 64] first4: 0.267997 1.124512 0.562003 -1.908740
+[DiT] step 8/8 t=0.300
+[DiT] Total generation: 7809.5 ms (7809.5 ms/sample)
+[Debug] dit_output: [2170, 64] first4: 0.267997 1.124512 0.562003 -1.908740
+[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128)
+[VAE] Graph: 474 nodes, T_latent=192
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_im2col_f16', name = 'kernel_im2col_f16'
+ggml_metal_library_compile_pipeline: loaded kernel_im2col_f16                             0x11de1ab80 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=0'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=0             0x11de1ceb0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0      0x11de1f410 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0      0x11de1f670 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sin_f32_4', name = 'kernel_sin_f32_4'
+ggml_metal_library_compile_pipeline: loaded kernel_sin_f32_4                              0x11de1fa20 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sqr_f32_4', name = 'kernel_sqr_f32_4'
+ggml_metal_library_compile_pipeline: loaded kernel_sqr_f32_4                              0x11de20200 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_conv_transpose_1d_f32_f32', name = 'kernel_conv_transpose_1d_f32_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_conv_transpose_1d_f32_f32              0x11de20760 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_f16_f16_4', name = 'kernel_mul_mv_f16_f16_4_nsg=4'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_f16_f16_4_nsg=4                 0x11de216c0 | th_max = 1024 | th_width =   32
+[VAE] Upsample factor: 1920.00 (expected ~1920)
+[VAE] Graph: 474 nodes, T_latent=256
+[VAE] Graph: 474 nodes, T_latent=186
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=1             0x11de21920 | th_max =  896 | th_width =   32
+[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz)
+[VAE Batch0] Decode: 609663.4 ms
+[Debug] vae_audio: [2, 4166400] first4: 0.000307 0.000830 0.000664 0.001050
+[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo)
+[Request 1/1] Done
+ggml_metal_free: deallocating
+ggml_metal_free: deallocating
+[Pipeline] All done
+ggml_metal_device_init: tensor API disabled for pre-M5 and pre-A19 devices
+ggml_metal_library_init: using embedded metal library
+ggml_metal_library_init: loaded in 0.006 sec
+ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
+ggml_metal_device_init: GPU name:   MTL0
+ggml_metal_device_init: GPU family: MTLGPUFamilyApple8  (1008)
+ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
+ggml_metal_device_init: GPU family: MTLGPUFamilyMetal3  (5001)
+ggml_metal_device_init: simdgroup reduction   = true
+ggml_metal_device_init: simdgroup matrix mul. = true
+ggml_metal_device_init: has unified memory    = true
+ggml_metal_device_init: has bfloat            = true
+ggml_metal_device_init: has tensor            = false
+ggml_metal_device_init: use residency sets    = true
+ggml_metal_device_init: use shared buffers    = true
+ggml_metal_device_init: recommendedMaxWorkingSetSize  = 11453.25 MB
+ggml_metal_init: allocating
+ggml_metal_init: found device: Apple M2 Pro
+ggml_metal_init: picking default device: Apple M2 Pro
+ggml_metal_init: use fusion         = true
+ggml_metal_init: use concurrency    = true
+ggml_metal_init: use graph optimize = true
+[Load] DiT backend: MTL0 (CPU threads: 5)
+[Load] Backend init: 18.8 ms
+[GGUF] ../models/acestep-v15-sft-Q4_K_M.gguf: 678 tensors, data at offset 56800
+[DiT] Self-attn: Q+K fused, V separate
+[DiT] Cross-attn: all separate
+[DiT] MLP: gate+up fused
+[Load] null_condition_emb found (CFG available)
+[WeightCtx] Loaded 478 tensors, 895.6 MB into backend
+[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128
+[Load] DiT weight load: 1269.3 ms
+[GGUF] ../models/acestep-v15-sft-Q4_K_M.gguf: 678 tensors, data at offset 56800
+[Load] silence_latent: [15000, 64] from GGUF
+[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048
+ggml_metal_init: allocating
+ggml_metal_init: found device: Apple M2 Pro
+ggml_metal_init: picking default device: Apple M2 Pro
+ggml_metal_init: use fusion         = true
+ggml_metal_init: use concurrency    = true
+ggml_metal_init: use graph optimize = true
+[Load] VAE backend: MTL0 (CPU threads: 5)
+[VAE] Backend: MTL0, Weight buffer: 255.7 MB
+[VAE] Loaded: 5 blocks, upsample=1920x
+[Load] VAE weights: 272.1 ms
+[Request 1/1] ggml-sft/request0.json (batch=1)
+[Request] parsed ggml-sft/request0.json (18 fields)
+[Pipeline] seed=42, steps=50, guidance=7.0, shift=1.0, duration=88.0s
+[Pipeline] 434 audio codes (86.8s @ 5Hz)
+[Pipeline] T=2170, S=1085
+[BPE] Loaded from GGUF: 151643 vocab, 151387 merges
+[Load] BPE tokenizer: 41.8 ms
+[Pipeline] caption: 70 tokens, lyrics: 167 tokens
+ggml_metal_init: allocating
+ggml_metal_init: found device: Apple M2 Pro
+ggml_metal_init: picking default device: Apple M2 Pro
+ggml_metal_init: use fusion         = true
+ggml_metal_init: use concurrency    = true
+ggml_metal_init: use graph optimize = true
+[Load] TextEncoder backend: MTL0 (CPU threads: 5)
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend
+[Load] TextEncoder: 28L, H=1024, Nh=16/8
+[Load] TextEncoder: 231.9 ms
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_get_rows_bf16', name = 'kernel_get_rows_bf16'
+ggml_metal_library_compile_pipeline: loaded kernel_get_rows_bf16                          0x15570a490 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rms_norm_mul_f32_4', name = 'kernel_rms_norm_mul_f32_4'
+ggml_metal_library_compile_pipeline: loaded kernel_rms_norm_mul_f32_4                     0x15570a8f0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_bf16_f32', name = 'kernel_mul_mm_bf16_f32_bci=0_bco=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_bf16_f32_bci=0_bco=1            0x15570b160 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rope_neox_f32', name = 'kernel_rope_neox_f32_imrope=0'
+ggml_metal_library_compile_pipeline: loaded kernel_rope_neox_f32_imrope=0                 0x15570b5e0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=1_ncpsg=64'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=1_ncpsg=64      0x15570be50 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_blk', name = 'kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64      0x15570c480 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4      0x15570cd60 | th_max =  576 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0      0x15570d170 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_swiglu_f32', name = 'kernel_swiglu_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_swiglu_f32                             0x15570d3d0 | th_max = 1024 | th_width =   32
+[Encode] TextEncoder (70 tokens): 48.9 ms
+[Debug] text_hidden: [70, 1024] first4: 3.703506 2.446818 0.222721 -13.133463
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[Encode] Lyric vocab lookup (167 tokens): 33.9 ms
+[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809
+ggml_metal_init: allocating
+ggml_metal_init: found device: Apple M2 Pro
+ggml_metal_init: picking default device: Apple M2 Pro
+ggml_metal_init: use fusion         = true
+ggml_metal_init: use concurrency    = true
+ggml_metal_init: use graph optimize = true
+[Load] CondEncoder backend: MTL0 (CPU threads: 5)
+[GGUF] ../models/acestep-v15-sft-Q4_K_M.gguf: 678 tensors, data at offset 56800
+[WeightCtx] Loaded 140 tensors, 352.5 MB into backend
+[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond
+[Load] ConditionEncoder: 601.2 ms
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q6_K_f32', name = 'kernel_mul_mm_q6_K_f32_bci=0_bco=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q6_K_f32_bci=0_bco=1            0x155717100 | th_max =  896 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q4_K_f32', name = 'kernel_mul_mm_q4_K_f32_bci=0_bco=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q4_K_f32_bci=0_bco=1            0x1557176b0 | th_max =  896 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=64'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=64      0x155717a30 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4      0x155718090 | th_max =  640 | th_width =   32
+[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens
+[Encode] ConditionEncoder: 151.9 ms, enc_S=238
+ggml_metal_free: deallocating
+ggml_metal_free: deallocating
+[Debug] enc_hidden: [238, 2048] first4: 1.751314 -0.046022 -0.129862 0.058756
+[GGUF] ../models/acestep-v15-sft-Q4_K_M.gguf: 678 tensors, data at offset 56800
+[WeightCtx] Loaded 30 tensors, 64.7 MB into backend
+[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64)
+[Load] Detokenizer: 101.7 ms
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_bf16_f32_short', name = 'kernel_mul_mv_bf16_f32_short_nsg=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_bf16_f32_short_nsg=1            0x15570ebf0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1      0x155707790 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q6_K_f32', name = 'kernel_mul_mv_q6_K_f32_nsg=2'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q6_K_f32_nsg=2                  0x155707dc0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_repeat_f32', name = 'kernel_repeat_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_repeat_f32                             0x1557074e0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q4_K_f32_r1_5', name = 'kernel_mul_mv_ext_q4_K_f32_r1_5_nsg=2_nxpsg=8'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q4_K_f32_r1_5_nsg=2_nxpsg=8      0x1557192f0 | th_max =  640 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q6_K_f32_r1_5', name = 'kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8      0x1557198a0 | th_max =  640 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=32'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=32      0x155719c20 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_f32_dk128_dv128', name = 'kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32      0x155719e80 | th_max =  448 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_reduce', name = 'kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32      0x15571a8c0 | th_max = 1024 | th_width =   32
+[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz)
+[Context] Detokenizer: 1040.2 ms
+[Debug] detok_output: [2170, 64] first4: -0.105274 1.439665 0.307319 -0.637002
+[Context] loaded noise from rng_philox_seed42.bf16: [2170, 64] bf16
+[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656
+[Debug] context: [2170, 128] first4: -0.105274 1.439665 0.307319 -0.637002
+[DiT] Starting: T=2170, S=1085, enc_S=238, steps=50, batch=1
+[DiT] Batch N=1, T=2170, S=1085, enc_S=238
+[DiT] Graph: 1775 nodes
+[Debug] null_condition_emb: [2048] first4: 0.018066 -0.000360 0.005096 -0.000683
+[Debug] null_enc_hidden: [238, 2048] first4: 0.018066 -0.000360 0.005096 -0.000683
+[DiT] CFG enabled: guidance_scale=7.0, 2x forward per step, N=1
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f32_f32', name = 'kernel_mul_mm_f32_f32_bci=0_bco=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f32_f32_bci=0_bco=1             0x15560cd80 | th_max =  832 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_scale_f32', name = 'kernel_scale_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_scale_f32                              0x15560d720 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_timestep_embedding_f32', name = 'kernel_timestep_embedding_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_timestep_embedding_f32                 0x15560db30 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q4_K_f32', name = 'kernel_mul_mv_q4_K_f32_nsg=2'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q4_K_f32_nsg=2                  0x15560eb70 | th_max =  768 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_silu_f32_4', name = 'kernel_silu_f32_4'
+ggml_metal_library_compile_pipeline: loaded kernel_silu_f32_4                             0x15560eef0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1      0x15560f5b0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1      0x15560f810 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0      0x15560fdb0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_cpy_f32_f32', name = 'kernel_cpy_f32_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_cpy_f32_f32                            0x155610340 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4      0x155610d60 | th_max =  640 | th_width =   32
+[Debug] tproj: [12288] first4: 0.154826 -0.114975 -0.093002 0.082122
+[Debug] temb: [2048] first4: -0.003593 -0.176168 0.003892 -0.001352
+[Debug] temb_t: [2048] first4: -0.002002 0.003482 -0.013423 -0.001611
+[Debug] temb_r: [2048] first4: -0.001591 -0.179650 0.017315 0.000259
+[Debug] sinusoidal_t: [256] first4: 0.562407 0.789701 0.439822 -0.023583
+[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000
+[Debug] temb_lin1_t: [2048] first4: -0.026166 0.013606 0.032789 -0.028782
+[Debug] temb_lin1_r: [2048] first4: -0.001795 -0.011535 -0.006725 -0.011136
+[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.101326 -0.948224 0.490204 0.449757
+[Debug] proj_in_input: [192, 2170] first4: -0.105274 1.439665 0.307319 -0.637002
+[Debug] enc_after_cond_emb: [2048, 238] first4: -0.189214 0.805478 0.284418 -0.472295
+[Debug] layer0_sa_input: [2048, 1085] first4: -0.923880 -0.725952 -0.044805 0.297821
+[Debug] layer0_q_after_rope: [128, 16] first4: -12.125128 0.516320 1.460617 1.783048
+[Debug] layer0_k_after_rope: [128, 8] first4: -0.189214 0.805478 0.284418 -0.472295
+[Debug] layer0_sa_output: [2048, 1085] first4: -1.642741 0.751692 -0.708185 0.515940
+[Debug] layer0_attn_out: [2048, 1085] first4: -11.610563 1.032188 1.685498 1.814675
+[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.803507 -1.373816 -0.306776 0.394307
+[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.998315 -1.012332 -0.558752 0.397301
+[Debug] hidden_after_layer0: [2048, 1085] first4: -9.408201 1.261657 57.661659 -1.674409
+[Debug] hidden_after_layer6: [2048, 1085] first4: -13.125732 4.401457 57.923130 1.593087
+[Debug] hidden_after_layer12: [2048, 1085] first4: -12.760151 8.784775 -27.576780 1.266083
+[Debug] hidden_after_layer18: [2048, 1085] first4: -3.029438 18.924910 -37.522003 -17.408060
+[Debug] hidden_after_layer23: [2048, 1085] first4: 25.718348 50.253456 58.487469 -24.616550
+[Debug] dit_step0_vt_cond: [2170, 64] first4: -0.549879 2.587143 -0.199758 1.525680
+[Debug] dit_step0_vt_uncond: [2170, 64] first4: -0.221552 2.068977 -0.854510 1.731250
+[Debug] dit_step0_vt: [2170, 64] first4: -0.770128 3.170936 0.103367 1.213956
+[Debug] dit_step0_xt: [2170, 64] first4: 0.209738 2.092831 -0.173942 0.823377
+[DiT] step 1/50 t=1.000
+[Debug] dit_step1_vt_cond: [2170, 64] first4: -0.553963 2.540515 -0.004453 1.412831
+[Debug] dit_step1_vt_uncond: [2170, 64] first4: -0.507386 2.385438 -0.093360 1.515296
+[Debug] dit_step1_vt: [2170, 64] first4: -0.244245 1.996188 -0.241419 1.557151
+[Debug] dit_step1_xt: [2170, 64] first4: 0.214623 2.052907 -0.169114 0.792234
+[DiT] step 2/50 t=0.980
+[Debug] dit_step2_vt_cond: [2170, 64] first4: -0.537810 2.506870 -0.002615 1.406658
+[Debug] dit_step2_vt_uncond: [2170, 64] first4: -0.493937 2.362162 -0.101829 1.455003
+[Debug] dit_step2_vt: [2170, 64] first4: -0.705231 2.991064 0.252674 1.183649
+[Debug] dit_step2_xt: [2170, 64] first4: 0.228728 1.993086 -0.174167 0.768561
+[DiT] step 3/50 t=0.960
+[Debug] dit_step3_vt_cond: [2170, 64] first4: -0.501613 2.438805 -0.019274 1.410215
+[Debug] dit_step3_vt_uncond: [2170, 64] first4: -0.521661 2.364079 -0.095044 1.376828
+[Debug] dit_step3_vt: [2170, 64] first4: -0.201260 2.055526 -0.239553 1.689172
+[Debug] dit_step3_xt: [2170, 64] first4: 0.232753 1.951976 -0.169376 0.734778
+[DiT] step 4/50 t=0.940
+[Debug] dit_step4_vt_cond: [2170, 64] first4: -0.465795 2.359768 -0.032364 1.399407
+[Debug] dit_step4_vt_uncond: [2170, 64] first4: -0.532122 2.334485 -0.099644 1.341739
+[Debug] dit_step4_vt: [2170, 64] first4: -0.511269 2.624130 0.214392 1.268924
+[Debug] dit_step4_xt: [2170, 64] first4: 0.242979 1.899493 -0.173664 0.709399
+[DiT] step 5/50 t=0.920
+[Debug] dit_step5_vt_cond: [2170, 64] first4: -0.416940 2.273875 -0.055556 1.387350
+[Debug] dit_step5_vt_uncond: [2170, 64] first4: -0.572103 2.299005 -0.092359 1.353066
+[Debug] dit_step5_vt: [2170, 64] first4: 0.057514 1.863401 -0.254107 1.537004
+[Debug] dit_step5_xt: [2170, 64] first4: 0.241828 1.862225 -0.168582 0.678659
+[DiT] step 6/50 t=0.900
+[Debug] dit_step6_vt_cond: [2170, 64] first4: -0.399254 2.210152 -0.071076 1.369134
+[Debug] dit_step6_vt_uncond: [2170, 64] first4: -0.539425 2.227666 -0.114236 1.361075
+[Debug] dit_step6_vt: [2170, 64] first4: -0.380751 2.356979 0.167812 1.213706
+[Debug] dit_step6_xt: [2170, 64] first4: 0.249443 1.815086 -0.171938 0.654385
+[DiT] step 7/50 t=0.880
+[Debug] dit_step7_vt_cond: [2170, 64] first4: -0.339429 2.118412 -0.091855 1.350106
+[Debug] dit_step7_vt_uncond: [2170, 64] first4: -0.475619 2.122700 -0.120878 1.360558
+[Debug] dit_step7_vt: [2170, 64] first4: 0.052995 1.858614 -0.256165 1.379718
+[Debug] dit_step7_xt: [2170, 64] first4: 0.248383 1.777913 -0.166815 0.626791
+[DiT] step 8/50 t=0.860
+[Debug] dit_step8_vt_cond: [2170, 64] first4: -0.274483 2.023758 -0.093816 1.332238
+[Debug] dit_step8_vt_uncond: [2170, 64] first4: -0.393477 2.023790 -0.130114 1.332444
+[Debug] dit_step8_vt: [2170, 64] first4: -0.218486 2.105614 0.134615 1.230365
+[Debug] dit_step8_xt: [2170, 64] first4: 0.252753 1.735801 -0.169507 0.602183
+[DiT] step 9/50 t=0.840
+[Debug] dit_step9_vt_cond: [2170, 64] first4: -0.208702 1.940720 -0.100297 1.317338
+[Debug] dit_step9_vt_uncond: [2170, 64] first4: -0.302713 1.942014 -0.150935 1.306566
+[Debug] dit_step9_vt: [2170, 64] first4: 0.068625 1.756381 -0.163156 1.360642
+[Debug] dit_step9_xt: [2170, 64] first4: 0.251381 1.700673 -0.166244 0.574971
+[DiT] step 10/50 t=0.820
+[Debug] dit_step10_vt_cond: [2170, 64] first4: -0.162154 1.880021 -0.110640 1.303073
+[Debug] dit_step10_vt_uncond: [2170, 64] first4: -0.210358 1.886162 -0.152094 1.303815
+[Debug] dit_step10_vt: [2170, 64] first4: -0.200484 1.879984 0.061434 1.187651
+[Debug] dit_step10_xt: [2170, 64] first4: 0.255390 1.663074 -0.167473 0.551217
+[DiT] step 11/50 t=0.800
+[Debug] dit_step11_vt_cond: [2170, 64] first4: -0.132763 1.841353 -0.152935 1.280443
+[Debug] dit_step11_vt_uncond: [2170, 64] first4: -0.156466 1.839952 -0.166283 1.309973
+[Debug] dit_step11_vt: [2170, 64] first4: -0.006319 1.715424 -0.248815 1.180641
+[Debug] dit_step11_xt: [2170, 64] first4: 0.255517 1.628765 -0.162497 0.527605
+[DiT] step 12/50 t=0.780
+[Debug] dit_step12_vt_cond: [2170, 64] first4: -0.108732 1.804132 -0.204569 1.271017
+[Debug] dit_step12_vt_uncond: [2170, 64] first4: -0.137749 1.799717 -0.174060 1.349185
+[Debug] dit_step12_vt: [2170, 64] first4: -0.093850 1.775385 -0.218540 0.972914
+[Debug] dit_step12_xt: [2170, 64] first4: 0.257394 1.593257 -0.158126 0.508146
+[DiT] step 13/50 t=0.760
+[Debug] dit_step13_vt_cond: [2170, 64] first4: -0.084325 1.755919 -0.251734 1.253830
+[Debug] dit_step13_vt_uncond: [2170, 64] first4: -0.116151 1.744928 -0.223829 1.345488
+[Debug] dit_step13_vt: [2170, 64] first4: 0.034148 1.681178 -0.334965 1.042164
+[Debug] dit_step13_xt: [2170, 64] first4: 0.256711 1.559634 -0.151426 0.487303
+[DiT] step 14/50 t=0.740
+[Debug] dit_step14_vt_cond: [2170, 64] first4: -0.062454 1.706585 -0.275264 1.242871
+[Debug] dit_step14_vt_uncond: [2170, 64] first4: -0.092396 1.687153 -0.270903 1.319513
+[Debug] dit_step14_vt: [2170, 64] first4: -0.030339 1.704105 -0.218537 1.004399
+[Debug] dit_step14_xt: [2170, 64] first4: 0.257318 1.525552 -0.147056 0.467215
+[DiT] step 15/50 t=0.720
+[Debug] dit_step15_vt_cond: [2170, 64] first4: -0.039531 1.653934 -0.274129 1.244472
+[Debug] dit_step15_vt_uncond: [2170, 64] first4: -0.065533 1.623524 -0.308950 1.280105
+[Debug] dit_step15_vt: [2170, 64] first4: 0.042593 1.646848 -0.174753 1.192683
+[Debug] dit_step15_xt: [2170, 64] first4: 0.256466 1.492615 -0.143561 0.443362
+[DiT] step 16/50 t=0.700
+[Debug] dit_step16_vt_cond: [2170, 64] first4: -0.024221 1.582624 -0.288380 1.229998
+[Debug] dit_step16_vt_uncond: [2170, 64] first4: -0.041512 1.552975 -0.330420 1.243577
+[Debug] dit_step16_vt: [2170, 64] first4: -0.014702 1.584471 -0.181940 1.121346
+[Debug] dit_step16_xt: [2170, 64] first4: 0.256760 1.460925 -0.139922 0.420935
+[DiT] step 17/50 t=0.680
+[Debug] dit_step17_vt_cond: [2170, 64] first4: -0.016144 1.507916 -0.306446 1.209517
+[Debug] dit_step17_vt_uncond: [2170, 64] first4: -0.023216 1.483080 -0.342848 1.208134
+[Debug] dit_step17_vt: [2170, 64] first4: 0.010192 1.492126 -0.218166 1.213425
+[Debug] dit_step17_xt: [2170, 64] first4: 0.256556 1.431083 -0.135559 0.396666
+[DiT] step 18/50 t=0.660
+[Debug] dit_step18_vt_cond: [2170, 64] first4: -0.011327 1.429419 -0.322466 1.189975
+[Debug] dit_step18_vt_uncond: [2170, 64] first4: -0.006504 1.414708 -0.351011 1.186830
+[Debug] dit_step18_vt: [2170, 64] first4: -0.055648 1.401301 -0.242752 1.127735
+[Debug] dit_step18_xt: [2170, 64] first4: 0.257669 1.403057 -0.130704 0.374111
+[DiT] step 19/50 t=0.640
+[Debug] dit_step19_vt_cond: [2170, 64] first4: -0.008919 1.352955 -0.336887 1.164963
+[Debug] dit_step19_vt_uncond: [2170, 64] first4: 0.006420 1.358623 -0.354804 1.168313
+[Debug] dit_step19_vt: [2170, 64] first4: -0.054127 1.236317 -0.295143 1.130394
+[Debug] dit_step19_xt: [2170, 64] first4: 0.258751 1.378330 -0.124801 0.351504
+[DiT] step 20/50 t=0.620
+[Debug] dit_step20_vt_cond: [2170, 64] first4: -0.004449 1.272026 -0.345863 1.142193
+[Debug] dit_step20_vt_uncond: [2170, 64] first4: 0.019787 1.305161 -0.354228 1.148333
+[Debug] dit_step20_vt: [2170, 64] first4: -0.100401 1.079987 -0.320124 1.076506
+[Debug] dit_step20_xt: [2170, 64] first4: 0.260759 1.356731 -0.118398 0.329973
+[DiT] step 21/50 t=0.600
+[Debug] dit_step21_vt_cond: [2170, 64] first4: -0.002161 1.194354 -0.356476 1.115376
+[Debug] dit_step21_vt_uncond: [2170, 64] first4: 0.027066 1.258520 -0.355503 1.123235
+[Debug] dit_step21_vt: [2170, 64] first4: -0.089629 0.890893 -0.367078 1.066256
+[Debug] dit_step21_xt: [2170, 64] first4: 0.262552 1.338913 -0.111057 0.308648
+[DiT] step 22/50 t=0.580
+[Debug] dit_step22_vt_cond: [2170, 64] first4: 0.001542 1.116787 -0.366798 1.082653
+[Debug] dit_step22_vt_uncond: [2170, 64] first4: 0.034784 1.215104 -0.359348 1.094688
+[Debug] dit_step22_vt: [2170, 64] first4: -0.114017 0.710875 -0.381058 1.001636
+[Debug] dit_step22_xt: [2170, 64] first4: 0.264832 1.324695 -0.103435 0.288616
+[DiT] step 23/50 t=0.560
+[Debug] dit_step23_vt_cond: [2170, 64] first4: 0.004356 1.043939 -0.376088 1.054782
+[Debug] dit_step23_vt_uncond: [2170, 64] first4: 0.040331 1.176215 -0.358597 1.069999
+[Debug] dit_step23_vt: [2170, 64] first4: -0.106657 0.513238 -0.439613 0.976581
+[Debug] dit_step23_xt: [2170, 64] first4: 0.266965 1.314431 -0.094643 0.269084
+[DiT] step 24/50 t=0.540
+[Debug] dit_step24_vt_cond: [2170, 64] first4: 0.004404 0.961254 -0.387939 1.015311
+[Debug] dit_step24_vt_uncond: [2170, 64] first4: 0.043793 1.129819 -0.356263 1.035491
+[Debug] dit_step24_vt: [2170, 64] first4: -0.131273 0.309370 -0.487982 0.900439
+[Debug] dit_step24_xt: [2170, 64] first4: 0.269591 1.308243 -0.084884 0.251075
+[DiT] step 25/50 t=0.520
+[Debug] dit_step25_vt_cond: [2170, 64] first4: -0.001606 0.858703 -0.396162 0.970976
+[Debug] dit_step25_vt_uncond: [2170, 64] first4: 0.045187 1.067146 -0.350258 0.994534
+[Debug] dit_step25_vt: [2170, 64] first4: -0.160841 0.082930 -0.542274 0.862474
+[Debug] dit_step25_xt: [2170, 64] first4: 0.272808 1.306585 -0.074038 0.233826
+[DiT] step 26/50 t=0.500
+[Debug] dit_step26_vt_cond: [2170, 64] first4: -0.011834 0.743138 -0.406478 0.912916
+[Debug] dit_step26_vt_uncond: [2170, 64] first4: 0.044098 0.988983 -0.348666 0.943761
+[Debug] dit_step26_vt: [2170, 64] first4: -0.203731 -0.135469 -0.575882 0.759197
+[Debug] dit_step26_xt: [2170, 64] first4: 0.276882 1.309294 -0.062520 0.218642
+[DiT] step 27/50 t=0.480
+[Debug] dit_step27_vt_cond: [2170, 64] first4: -0.028043 0.640231 -0.413465 0.856122
+[Debug] dit_step27_vt_uncond: [2170, 64] first4: 0.038067 0.910543 -0.350117 0.887872
+[Debug] dit_step27_vt: [2170, 64] first4: -0.249926 -0.275849 -0.588337 0.733838
+[Debug] dit_step27_xt: [2170, 64] first4: 0.281881 1.314811 -0.050754 0.203965
+[DiT] step 28/50 t=0.460
+[Debug] dit_step28_vt_cond: [2170, 64] first4: -0.048697 0.519480 -0.427048 0.785924
+[Debug] dit_step28_vt_uncond: [2170, 64] first4: 0.029577 0.811304 -0.356754 0.820204
+[Debug] dit_step28_vt: [2170, 64] first4: -0.313111 -0.465662 -0.625360 0.626629
+[Debug] dit_step28_xt: [2170, 64] first4: 0.288143 1.324124 -0.038247 0.191432
+[DiT] step 29/50 t=0.440
+[Debug] dit_step29_vt_cond: [2170, 64] first4: -0.073682 0.390412 -0.435695 0.713586
+[Debug] dit_step29_vt_uncond: [2170, 64] first4: 0.022755 0.688592 -0.366629 0.750458
+[Debug] dit_step29_vt: [2170, 64] first4: -0.404692 -0.558608 -0.601264 0.570632
+[Debug] dit_step29_xt: [2170, 64] first4: 0.296237 1.335296 -0.026221 0.180020
+[DiT] step 30/50 t=0.420
+[Debug] dit_step30_vt_cond: [2170, 64] first4: -0.100612 0.256910 -0.442863 0.643070
+[Debug] dit_step30_vt_uncond: [2170, 64] first4: 0.014270 0.550700 -0.380145 0.680719
+[Debug] dit_step30_vt: [2170, 64] first4: -0.477652 -0.675684 -0.591087 0.486411
+[Debug] dit_step30_xt: [2170, 64] first4: 0.305790 1.348810 -0.014400 0.170292
+[DiT] step 31/50 t=0.400
+[Debug] dit_step31_vt_cond: [2170, 64] first4: -0.127005 0.130974 -0.446946 0.576489
+[Debug] dit_step31_vt_uncond: [2170, 64] first4: 0.003612 0.415976 -0.399074 0.614345
+[Debug] dit_step31_vt: [2170, 64] first4: -0.549710 -0.743030 -0.526327 0.431312
+[Debug] dit_step31_xt: [2170, 64] first4: 0.316784 1.363671 -0.003873 0.161665
+[DiT] step 32/50 t=0.380
+[Debug] dit_step32_vt_cond: [2170, 64] first4: -0.154932 -0.000795 -0.447535 0.511295
+[Debug] dit_step32_vt_uncond: [2170, 64] first4: -0.007317 0.275916 -0.413101 0.549311
+[Debug] dit_step32_vt: [2170, 64] first4: -0.628125 -0.848536 -0.505066 0.360242
+[Debug] dit_step32_xt: [2170, 64] first4: 0.329347 1.380641 0.006228 0.154460
+[DiT] step 33/50 t=0.360
+[Debug] dit_step33_vt_cond: [2170, 64] first4: -0.183072 -0.130801 -0.438493 0.449678
+[Debug] dit_step33_vt_uncond: [2170, 64] first4: -0.021971 0.136892 -0.420384 0.490091
+[Debug] dit_step33_vt: [2170, 64] first4: -0.685087 -0.931651 -0.428386 0.294226
+[Debug] dit_step33_xt: [2170, 64] first4: 0.343048 1.399274 0.014796 0.148576
+[DiT] step 34/50 t=0.340
+[Debug] dit_step34_vt_cond: [2170, 64] first4: -0.207282 -0.251064 -0.429462 0.399560
+[Debug] dit_step34_vt_uncond: [2170, 64] first4: -0.035614 0.010201 -0.426610 0.442224
+[Debug] dit_step34_vt: [2170, 64] first4: -0.740469 -1.039289 -0.393755 0.238626
+[Debug] dit_step34_xt: [2170, 64] first4: 0.357858 1.420060 0.022671 0.143803
+[DiT] step 35/50 t=0.320
+[Debug] dit_step35_vt_cond: [2170, 64] first4: -0.234011 -0.373429 -0.414613 0.349351
+[Debug] dit_step35_vt_uncond: [2170, 64] first4: -0.051328 -0.116322 -0.423153 0.392585
+[Debug] dit_step35_vt: [2170, 64] first4: -0.800518 -1.139187 -0.342183 0.192528
+[Debug] dit_step35_xt: [2170, 64] first4: 0.373868 1.442844 0.029515 0.139953
+[DiT] step 36/50 t=0.300
+[Debug] dit_step36_vt_cond: [2170, 64] first4: -0.261591 -0.503509 -0.392160 0.303680
+[Debug] dit_step36_vt_uncond: [2170, 64] first4: -0.072050 -0.249828 -0.410849 0.351470
+[Debug] dit_step36_vt: [2170, 64] first4: -0.838416 -1.260836 -0.298992 0.122180
+[Debug] dit_step36_xt: [2170, 64] first4: 0.390637 1.468061 0.035495 0.137509
+[DiT] step 37/50 t=0.280
+[Debug] dit_step37_vt_cond: [2170, 64] first4: -0.290611 -0.615966 -0.361295 0.261135
+[Debug] dit_step37_vt_uncond: [2170, 64] first4: -0.095822 -0.367916 -0.388325 0.310791
+[Debug] dit_step37_vt: [2170, 64] first4: -0.893251 -1.349895 -0.245346 0.089192
+[Debug] dit_step37_xt: [2170, 64] first4: 0.408502 1.495059 0.040402 0.135725
+[DiT] step 38/50 t=0.260
+[Debug] dit_step38_vt_cond: [2170, 64] first4: -0.316862 -0.724614 -0.326989 0.221074
+[Debug] dit_step38_vt_uncond: [2170, 64] first4: -0.120406 -0.482601 -0.361356 0.272140
+[Debug] dit_step38_vt: [2170, 64] first4: -0.917953 -1.452874 -0.195436 0.033767
+[Debug] dit_step38_xt: [2170, 64] first4: 0.426861 1.524116 0.044310 0.135050
+[DiT] step 39/50 t=0.240
+[Debug] dit_step39_vt_cond: [2170, 64] first4: -0.344701 -0.840724 -0.280406 0.181682
+[Debug] dit_step39_vt_uncond: [2170, 64] first4: -0.151500 -0.605403 -0.318787 0.232017
+[Debug] dit_step39_vt: [2170, 64] first4: -0.945851 -1.537027 -0.144223 0.008566
+[Debug] dit_step39_xt: [2170, 64] first4: 0.445778 1.554857 0.047195 0.134879
+[DiT] step 40/50 t=0.220
+[Debug] dit_step40_vt_cond: [2170, 64] first4: -0.369051 -0.939547 -0.228334 0.139823
+[Debug] dit_step40_vt_uncond: [2170, 64] first4: -0.182335 -0.713639 -0.270236 0.191739
+[Debug] dit_step40_vt: [2170, 64] first4: -0.958883 -1.593756 -0.082150 -0.057526
+[Debug] dit_step40_xt: [2170, 64] first4: 0.464955 1.586732 0.048838 0.136029
+[DiT] step 41/50 t=0.200
+[Debug] dit_step41_vt_cond: [2170, 64] first4: -0.388759 -1.034758 -0.170808 0.098079
+[Debug] dit_step41_vt_uncond: [2170, 64] first4: -0.215027 -0.818665 -0.212783 0.153622
+[Debug] dit_step41_vt: [2170, 64] first4: -0.929079 -1.656826 -0.036355 -0.101313
+[Debug] dit_step41_xt: [2170, 64] first4: 0.483537 1.619868 0.049565 0.138056
+[DiT] step 42/50 t=0.180
+[Debug] dit_step42_vt_cond: [2170, 64] first4: -0.404481 -1.121373 -0.110304 0.048469
+[Debug] dit_step42_vt_uncond: [2170, 64] first4: -0.250394 -0.918649 -0.148512 0.113292
+[Debug] dit_step42_vt: [2170, 64] first4: -0.870129 -1.689520 0.009394 -0.198920
+[Debug] dit_step42_xt: [2170, 64] first4: 0.500939 1.653659 0.049377 0.142034
+[DiT] step 43/50 t=0.160
+[Debug] dit_step43_vt_cond: [2170, 64] first4: -0.416518 -1.199422 -0.047277 -0.004303
+[Debug] dit_step43_vt_uncond: [2170, 64] first4: -0.285961 -1.014739 -0.080642 0.076449
+[Debug] dit_step43_vt: [2170, 64] first4: -0.799869 -1.709703 0.054214 -0.305297
+[Debug] dit_step43_xt: [2170, 64] first4: 0.516937 1.687853 0.048293 0.148140
+[DiT] step 44/50 t=0.140
+[Debug] dit_step44_vt_cond: [2170, 64] first4: -0.422798 -1.270758 0.022277 -0.058297
+[Debug] dit_step44_vt_uncond: [2170, 64] first4: -0.318056 -1.108378 -0.007512 0.042141
+[Debug] dit_step44_vt: [2170, 64] first4: -0.718613 -1.710690 0.113612 -0.432909
+[Debug] dit_step44_xt: [2170, 64] first4: 0.531309 1.722067 0.046020 0.156798
+[DiT] step 45/50 t=0.120
+[Debug] dit_step45_vt_cond: [2170, 64] first4: -0.430461 -1.334901 0.090295 -0.107751
+[Debug] dit_step45_vt_uncond: [2170, 64] first4: -0.346132 -1.190932 0.060499 0.012419
+[Debug] dit_step45_vt: [2170, 64] first4: -0.676233 -1.740750 0.184198 -0.543741
+[Debug] dit_step45_xt: [2170, 64] first4: 0.544834 1.756882 0.042336 0.167673
+[DiT] step 46/50 t=0.100
+[Debug] dit_step46_vt_cond: [2170, 64] first4: -0.442548 -1.408986 0.177202 -0.124432
+[Debug] dit_step46_vt_uncond: [2170, 64] first4: -0.366053 -1.276834 0.124694 -0.015436
+[Debug] dit_step46_vt: [2170, 64] first4: -0.689058 -1.805405 0.393872 -0.448936
+[Debug] dit_step46_xt: [2170, 64] first4: 0.558615 1.792990 0.034459 0.176652
+[DiT] step 47/50 t=0.080
+[Debug] dit_step47_vt_cond: [2170, 64] first4: -0.439555 -1.466634 0.226367 -0.147289
+[Debug] dit_step47_vt_uncond: [2170, 64] first4: -0.380429 -1.352640 0.169813 -0.038167
+[Debug] dit_step47_vt: [2170, 64] first4: -0.594441 -1.800792 0.366639 -0.559854
+[Debug] dit_step47_xt: [2170, 64] first4: 0.570504 1.829005 0.027126 0.187849
+[DiT] step 48/50 t=0.060
+[Debug] dit_step48_vt_cond: [2170, 64] first4: -0.421519 -1.502992 0.243896 -0.165260
+[Debug] dit_step48_vt_uncond: [2170, 64] first4: -0.386849 -1.417176 0.200885 -0.065191
+[Debug] dit_step48_vt: [2170, 64] first4: -0.516278 -1.762812 0.360980 -0.463950
+[Debug] dit_step48_xt: [2170, 64] first4: 0.580829 1.864262 0.019907 0.197128
+[DiT] step 49/50 t=0.040
+[Debug] dit_step49_vt_cond: [2170, 64] first4: -0.442348 -1.531937 0.237906 -0.192473
+[Debug] dit_step49_vt_uncond: [2170, 64] first4: -0.399571 -1.435245 0.199709 -0.084932
+[Debug] dit_step49_vt: [2170, 64] first4: -0.632891 -1.901084 0.347748 -0.622644
+[Debug] dit_x0: [2170, 64] first4: 0.593487 1.902283 0.012952 0.209581
+[DiT] step 50/50 t=0.020
+[DiT] Total generation: 97237.2 ms (97237.2 ms/sample)
+[Debug] dit_output: [2170, 64] first4: 0.593487 1.902283 0.012952 0.209581
+[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128)
+[VAE] Graph: 474 nodes, T_latent=192
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_im2col_f16', name = 'kernel_im2col_f16'
+ggml_metal_library_compile_pipeline: loaded kernel_im2col_f16                             0x1556105a0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=0'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=0             0x1556166d0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0      0x155616930 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0      0x155616fc0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sin_f32_4', name = 'kernel_sin_f32_4'
+ggml_metal_library_compile_pipeline: loaded kernel_sin_f32_4                              0x155617400 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sqr_f32_4', name = 'kernel_sqr_f32_4'
+ggml_metal_library_compile_pipeline: loaded kernel_sqr_f32_4                              0x155617a00 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_conv_transpose_1d_f32_f32', name = 'kernel_conv_transpose_1d_f32_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_conv_transpose_1d_f32_f32              0x155617f60 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_f16_f16_4', name = 'kernel_mul_mv_f16_f16_4_nsg=4'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_f16_f16_4_nsg=4                 0x155618e40 | th_max = 1024 | th_width =   32
+[VAE] Upsample factor: 1920.00 (expected ~1920)
+[VAE] Graph: 474 nodes, T_latent=256
+[VAE] Graph: 474 nodes, T_latent=186
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=1             0x10b705130 | th_max =  896 | th_width =   32
+[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz)
+[VAE Batch0] Decode: 609639.3 ms
+[Debug] vae_audio: [2, 4166400] first4: -0.001780 -0.001606 -0.001703 -0.001406
+[VAE Batch0] Wrote ggml-sft/request00.wav: 4166400 samples (86.80s @ 48kHz stereo)
+[Request 1/1] Done
+ggml_metal_free: deallocating
+ggml_metal_free: deallocating
+[Pipeline] All done
+[Request] Loaded request0.json
+[Noise] Reusing existing rng_philox_seed42.bf16
+[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q4_K_M.gguf
+[GGML] Running acestep-v15-turbo-Q4_K_M.gguf...
+[GGML] Done, 47 dump files
+[Turbo] Reusing existing Python dumps: python-turbo
+[Turbo] Cosine similarities GGML vs Python
+  stage                          GGML vs Python
+  text_hidden                          0.999813
+  lyric_embed                          1.000000
+  enc_hidden                           0.997096
+  detok_output                         0.999629
+  context                              0.999763
+  noise                                1.000000
+  temb_t                               0.999906
+  hidden_after_proj_in                 0.999918
+  enc_after_cond_emb                   0.997606
+  layer0_sa_output                     0.998452
+  hidden_after_layer0                  0.999696
+  hidden_after_layer6                  0.999330
+  hidden_after_layer12                 0.995408
+  hidden_after_layer18                 0.991270
+  hidden_after_layer23                 0.984826
+  dit_step0_vt                         0.944528
+  dit_step0_xt                         0.999878
+  dit_step1_vt                         0.947871
+  dit_step1_xt                         0.999609
+  dit_step2_vt                         0.956355
+  dit_step2_xt                         0.998980
+  dit_step3_vt                         0.961293
+  dit_step3_xt                         0.997669
+  dit_step4_vt                         0.958834
+  dit_step4_xt                         0.994713
+  dit_step5_vt                         0.956132
+  dit_step5_xt                         0.988221
+  dit_step6_vt                         0.950838
+  dit_step6_xt                         0.976124
+  dit_step7_vt                         0.938802
+  dit_x0                               0.958347
+  vae_audio                            0.832313
+  vae_audio (log spectral)             0.999533
+[Turbo] Error growth GGML vs Python
+  stage                         cos    max_err   mean_err     mean_A      std_A     mean_B      std_B
+  dit_step0_xt             0.999877   0.165977   0.010464  -0.002251   0.973155  -0.002342   0.972003
+  dit_step1_xt             0.999608   0.266862   0.018170  -0.005108   0.943161  -0.005313   0.941730
+  dit_step2_xt             0.998979   0.448963   0.028101  -0.009001   0.910184  -0.009311   0.908527
+  dit_step3_xt             0.997667   0.610427   0.040689  -0.014279   0.875248  -0.014577   0.873624
+  dit_step4_xt             0.994712   0.903635   0.058677  -0.021196   0.843722  -0.021660   0.841995
+  dit_step5_xt             0.988220   1.370464   0.085448  -0.031128   0.827283  -0.032109   0.824593
+  dit_step6_xt             0.976123   1.998804   0.126069  -0.045345   0.858424  -0.046482   0.855546
+[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-Q4_K_M.gguf
+[GGML] Running acestep-v15-sft-Q4_K_M.gguf...
+[GGML] Done, 233 dump files
+[SFT] Reusing existing Python dumps: python-sft
+[SFT] Cosine similarities GGML vs Python
+  stage                          GGML vs Python
+  text_hidden                          0.999813
+  lyric_embed                          1.000000
+  enc_hidden                           0.997097
+  detok_output                         0.999629
+  context                              0.999763
+  noise                                1.000000
+  temb_t                               0.999673
+  hidden_after_proj_in                 0.999917
+  enc_after_cond_emb                   0.997598
+  layer0_sa_output                     0.998569
+  hidden_after_layer0                  0.999686
+  hidden_after_layer6                  0.999172
+  hidden_after_layer12                 0.997776
+  hidden_after_layer18                 0.996818
+  hidden_after_layer23                 0.997039
+  null_condition_emb                   1.000000
+  null_enc_hidden                      1.000000
+  dit_step0_vt_cond                    0.996934
+  dit_step0_vt_uncond                  0.996212
+  dit_step0_vt                         0.990566
+  dit_step0_xt                         0.999995
+  dit_step5_vt_cond                    0.995434
+  dit_step5_vt                         0.980046
+  dit_step5_xt                         0.999823
+  dit_step10_vt_cond                   0.991133
+  dit_step10_vt                        0.971906
+  dit_step10_xt                        0.999207
+  dit_step15_vt_cond                   0.982704
+  dit_step15_vt                        0.950629
+  dit_step15_xt                        0.997454
+  dit_step20_vt_cond                   0.968600
+  dit_step20_vt                        0.929360
+  dit_step20_xt                        0.993412
+  dit_step25_vt_cond                   0.951686
+  dit_step25_vt                        0.903442
+  dit_step25_xt                        0.986280
+  dit_step30_vt_cond                   0.931805
+  dit_step30_vt                        0.881992
+  dit_step30_xt                        0.976117
+  dit_step35_vt_cond                   0.911309
+  dit_step35_vt                        0.858516
+  dit_step35_xt                        0.964745
+  dit_step40_vt_cond                   0.898448
+  dit_step40_vt                        0.843064
+  dit_step40_xt                        0.954421
+  dit_step45_vt_cond                   0.908747
+  dit_step45_vt                        0.865504
+  dit_step45_xt                        0.947533
+  dit_step49_vt_cond                   0.927312
+  dit_step49_vt                        0.885368
+  dit_x0                               0.945292
+  vae_audio                            0.825801
+  vae_audio (log spectral)             0.999459
+[SFT] Error growth GGML vs Python
+  stage                         cos    max_err   mean_err     mean_A      std_A     mean_B      std_B
+  dit_step0_xt             0.999994   0.035677   0.002825  -0.001840   0.980345  -0.001741   0.980402
+  dit_step5_xt             0.999822   0.191921   0.012992  -0.007283   0.890515  -0.007143   0.887999
+  dit_step10_xt            0.999206   0.526469   0.024282  -0.012946   0.812557  -0.012603   0.811299
+  dit_step15_xt            0.997453   0.836399   0.039177  -0.018559   0.748243  -0.018114   0.745269
+  dit_step20_xt            0.993411   1.152330   0.058726  -0.024275   0.703300  -0.023808   0.699582
+  dit_step25_xt            0.986279   1.542745   0.081991  -0.030177   0.682229  -0.029311   0.679278
+  dit_step30_xt            0.976117   1.915049   0.109049  -0.036245   0.688533  -0.035027   0.685262
+  dit_step35_xt            0.964744   2.242426   0.138946  -0.042318   0.720837  -0.040716   0.717196
+  dit_step40_xt            0.954421   2.562076   0.170565  -0.048389   0.775001  -0.046462   0.771853
+  dit_step45_xt            0.947532   2.889421   0.200672  -0.054787   0.846930  -0.052475   0.843036
diff --git a/tests/Metal-Q5_K_M.log b/tests/Metal-Q5_K_M.log
new file mode 100644
index 0000000..a25afc6
--- /dev/null
+++ b/tests/Metal-Q5_K_M.log
@@ -0,0 +1,835 @@
+ggml_metal_device_init: tensor API disabled for pre-M5 and pre-A19 devices
+ggml_metal_library_init: using embedded metal library
+ggml_metal_library_init: loaded in 0.007 sec
+ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
+ggml_metal_device_init: GPU name:   MTL0
+ggml_metal_device_init: GPU family: MTLGPUFamilyApple8  (1008)
+ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
+ggml_metal_device_init: GPU family: MTLGPUFamilyMetal3  (5001)
+ggml_metal_device_init: simdgroup reduction   = true
+ggml_metal_device_init: simdgroup matrix mul. = true
+ggml_metal_device_init: has unified memory    = true
+ggml_metal_device_init: has bfloat            = true
+ggml_metal_device_init: has tensor            = false
+ggml_metal_device_init: use residency sets    = true
+ggml_metal_device_init: use shared buffers    = true
+ggml_metal_device_init: recommendedMaxWorkingSetSize  = 11453.25 MB
+ggml_metal_init: allocating
+ggml_metal_init: found device: Apple M2 Pro
+ggml_metal_init: picking default device: Apple M2 Pro
+ggml_metal_init: use fusion         = true
+ggml_metal_init: use concurrency    = true
+ggml_metal_init: use graph optimize = true
+[Load] DiT backend: MTL0 (CPU threads: 5)
+[Load] Backend init: 21.7 ms
+[GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864
+[DiT] Self-attn: Q+K fused, V separate
+[DiT] Cross-attn: all separate
+[DiT] MLP: gate+up fused
+[Load] null_condition_emb found (CFG available)
+[WeightCtx] Loaded 478 tensors, 1061.2 MB into backend
+[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128
+[Load] DiT weight load: 1538.7 ms
+[GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864
+[Load] silence_latent: [15000, 64] from GGUF
+[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048
+ggml_metal_init: allocating
+ggml_metal_init: found device: Apple M2 Pro
+ggml_metal_init: picking default device: Apple M2 Pro
+ggml_metal_init: use fusion         = true
+ggml_metal_init: use concurrency    = true
+ggml_metal_init: use graph optimize = true
+[Load] VAE backend: MTL0 (CPU threads: 5)
+[VAE] Backend: MTL0, Weight buffer: 255.7 MB
+[VAE] Loaded: 5 blocks, upsample=1920x
+[Load] VAE weights: 275.7 ms
+[Request 1/1] ggml-turbo/request0.json (batch=1)
+[Request] parsed ggml-turbo/request0.json (18 fields)
+[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0)
+[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s
+[Pipeline] 434 audio codes (86.8s @ 5Hz)
+[Pipeline] T=2170, S=1085
+[BPE] Loaded from GGUF: 151643 vocab, 151387 merges
+[Load] BPE tokenizer: 42.1 ms
+[Pipeline] caption: 70 tokens, lyrics: 167 tokens
+ggml_metal_init: allocating
+ggml_metal_init: found device: Apple M2 Pro
+ggml_metal_init: picking default device: Apple M2 Pro
+ggml_metal_init: use fusion         = true
+ggml_metal_init: use concurrency    = true
+ggml_metal_init: use graph optimize = true
+[Load] TextEncoder backend: MTL0 (CPU threads: 5)
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend
+[Load] TextEncoder: 28L, H=1024, Nh=16/8
+[Load] TextEncoder: 230.3 ms
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_get_rows_bf16', name = 'kernel_get_rows_bf16'
+ggml_metal_library_compile_pipeline: loaded kernel_get_rows_bf16                          0x11cf0b930 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rms_norm_mul_f32_4', name = 'kernel_rms_norm_mul_f32_4'
+ggml_metal_library_compile_pipeline: loaded kernel_rms_norm_mul_f32_4                     0x11cf0bd90 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_bf16_f32', name = 'kernel_mul_mm_bf16_f32_bci=0_bco=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_bf16_f32_bci=0_bco=1            0x11cf0c600 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rope_neox_f32', name = 'kernel_rope_neox_f32_imrope=0'
+ggml_metal_library_compile_pipeline: loaded kernel_rope_neox_f32_imrope=0                 0x11cf0ca80 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=1_ncpsg=64'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=1_ncpsg=64      0x11cf0d2f0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_blk', name = 'kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64      0x11cf0d920 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4      0x11cf0e200 | th_max =  576 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0      0x11cf0e610 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_swiglu_f32', name = 'kernel_swiglu_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_swiglu_f32                             0x11cf0e870 | th_max = 1024 | th_width =   32
+[Encode] TextEncoder (70 tokens): 44.0 ms
+[Debug] text_hidden: [70, 1024] first4: 3.703506 2.446818 0.222721 -13.133463
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[Encode] Lyric vocab lookup (167 tokens): 33.7 ms
+[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809
+ggml_metal_init: allocating
+ggml_metal_init: found device: Apple M2 Pro
+ggml_metal_init: picking default device: Apple M2 Pro
+ggml_metal_init: use fusion         = true
+ggml_metal_init: use concurrency    = true
+ggml_metal_init: use graph optimize = true
+[Load] CondEncoder backend: MTL0 (CPU threads: 5)
+[GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864
+[WeightCtx] Loaded 140 tensors, 412.5 MB into backend
+[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond
+[Load] ConditionEncoder: 572.5 ms
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q6_K_f32', name = 'kernel_mul_mm_q6_K_f32_bci=0_bco=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q6_K_f32_bci=0_bco=1            0x11ce0c140 | th_max =  896 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q5_K_f32', name = 'kernel_mul_mm_q5_K_f32_bci=0_bco=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q5_K_f32_bci=0_bco=1            0x11cf17e80 | th_max =  832 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=64'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=64      0x11cf18860 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4      0x11cf18ec0 | th_max =  640 | th_width =   32
+[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens
+[Encode] ConditionEncoder: 158.6 ms, enc_S=238
+ggml_metal_free: deallocating
+ggml_metal_free: deallocating
+[Debug] enc_hidden: [238, 2048] first4: 1.751803 -0.051174 -0.133188 0.058982
+[GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864
+[WeightCtx] Loaded 30 tensors, 73.2 MB into backend
+[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64)
+[Load] Detokenizer: 113.6 ms
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_bf16_f32_short', name = 'kernel_mul_mv_bf16_f32_short_nsg=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_bf16_f32_short_nsg=1            0x11ce0cf30 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1      0x11ce0d840 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q6_K_f32', name = 'kernel_mul_mv_q6_K_f32_nsg=2'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q6_K_f32_nsg=2                  0x11ce0ddf0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_repeat_f32', name = 'kernel_repeat_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_repeat_f32                             0x11ce0e050 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q5_K_f32_r1_5', name = 'kernel_mul_mv_ext_q5_K_f32_r1_5_nsg=2_nxpsg=8'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q5_K_f32_r1_5_nsg=2_nxpsg=8      0x11ce0ea30 | th_max =  640 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q6_K_f32_r1_5', name = 'kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8      0x11ce0efe0 | th_max =  640 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=32'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=32      0x11ce0f360 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_f32_dk128_dv128', name = 'kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32      0x11ce0f5c0 | th_max =  448 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_reduce', name = 'kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32      0x11ce10000 | th_max = 1024 | th_width =   32
+[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz)
+[Context] Detokenizer: 1065.0 ms
+[Debug] detok_output: [2170, 64] first4: -0.124883 1.453879 0.292856 -0.646204
+[Context] loaded noise from rng_philox_seed42.bf16: [2170, 64] bf16
+[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656
+[Debug] context: [2170, 128] first4: -0.124883 1.453879 0.292856 -0.646204
+[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1
+[DiT] Batch N=1, T=2170, S=1085, enc_S=238
+[DiT] Graph: 1775 nodes
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f32_f32', name = 'kernel_mul_mm_f32_f32_bci=0_bco=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f32_f32_bci=0_bco=1             0x11cf09240 | th_max =  832 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_scale_f32', name = 'kernel_scale_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_scale_f32                              0x11cf19120 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_timestep_embedding_f32', name = 'kernel_timestep_embedding_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_timestep_embedding_f32                 0x11cf19380 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q5_K_f32', name = 'kernel_mul_mv_q5_K_f32_nsg=2'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q5_K_f32_nsg=2                  0x11cf1a3c0 | th_max =  576 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_silu_f32_4', name = 'kernel_silu_f32_4'
+ggml_metal_library_compile_pipeline: loaded kernel_silu_f32_4                             0x11cf1a740 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1      0x11cf1ae00 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1      0x11cf1b060 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0      0x11cf1b600 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_cpy_f32_f32', name = 'kernel_cpy_f32_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_cpy_f32_f32                            0x11cf1bb90 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4      0x11cf1c5b0 | th_max =  640 | th_width =   32
+[Debug] tproj: [12288] first4: 0.260409 -0.161609 -0.102203 0.051602
+[Debug] temb: [2048] first4: -0.000151 -0.132293 -0.035516 0.064751
+[Debug] temb_t: [2048] first4: 0.000578 0.026708 -0.052786 0.063514
+[Debug] temb_r: [2048] first4: -0.000729 -0.159001 0.017269 0.001237
+[Debug] sinusoidal_t: [256] first4: 0.562407 0.789701 0.439822 -0.023583
+[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000
+[Debug] temb_lin1_t: [2048] first4: -0.051153 -0.053631 -0.012192 -0.039024
+[Debug] temb_lin1_r: [2048] first4: -0.016165 -0.021121 -0.015801 -0.000525
+[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.044511 -0.951831 0.540187 0.457322
+[Debug] proj_in_input: [192, 2170] first4: -0.124883 1.453879 0.292856 -0.646204
+[Debug] enc_after_cond_emb: [2048, 238] first4: -0.153168 0.787275 0.319340 -0.492001
+[Debug] layer0_sa_input: [2048, 1085] first4: -0.722961 -0.753736 -0.051927 0.265661
+[Debug] layer0_q_after_rope: [128, 16] first4: -12.602057 0.798570 1.518488 1.778495
+[Debug] layer0_k_after_rope: [128, 8] first4: -0.153168 0.787275 0.319340 -0.492001
+[Debug] layer0_sa_output: [2048, 1085] first4: -1.498292 0.150378 -0.398807 0.484326
+[Debug] layer0_attn_out: [2048, 1085] first4: -12.773369 1.105118 1.773309 1.768943
+[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.542001 -1.018193 0.152304 0.468235
+[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.605642 -0.786551 -0.346129 0.499558
+[Debug] hidden_after_layer0: [2048, 1085] first4: -9.171107 0.593998 51.751106 -0.883031
+[Debug] hidden_after_layer6: [2048, 1085] first4: -20.936150 0.582827 29.989494 -4.872031
+[Debug] hidden_after_layer12: [2048, 1085] first4: -18.277052 -17.088211 71.559052 24.992846
+[Debug] hidden_after_layer18: [2048, 1085] first4: -25.915581 10.692349 65.928192 19.066517
+[Debug] hidden_after_layer23: [2048, 1085] first4: -6.799564 38.425339 203.467468 153.140854
+[Debug] dit_step0_vt: [2170, 64] first4: 0.015160 1.163890 0.353989 2.352075
+[Debug] dit_step0_xt: [2170, 64] first4: 0.193647 2.103346 -0.187965 0.740744
+[DiT] step 1/8 t=1.000
+[Debug] dit_step1_vt: [2170, 64] first4: -0.238755 1.372093 -0.135596 1.879695
+[Debug] dit_step1_xt: [2170, 64] first4: 0.206670 2.028504 -0.180569 0.638215
+[DiT] step 2/8 t=0.955
+[Debug] dit_step2_vt: [2170, 64] first4: -0.034453 1.243445 0.102498 2.382742
+[Debug] dit_step2_xt: [2170, 64] first4: 0.208967 1.945608 -0.187402 0.479365
+[DiT] step 3/8 t=0.900
+[Debug] dit_step3_vt: [2170, 64] first4: 0.286766 1.110088 0.263285 2.616079
+[Debug] dit_step3_xt: [2170, 64] first4: 0.185070 1.853101 -0.209343 0.261359
+[DiT] step 4/8 t=0.833
+[Debug] dit_step4_vt: [2170, 64] first4: 0.359540 0.909711 0.077998 2.701789
+[Debug] dit_step4_xt: [2170, 64] first4: 0.146547 1.755632 -0.217700 -0.028118
+[DiT] step 5/8 t=0.750
+[Debug] dit_step5_vt: [2170, 64] first4: 0.361413 0.800675 -0.393300 2.731152
+[Debug] dit_step5_xt: [2170, 64] first4: 0.094917 1.641250 -0.161514 -0.418283
+[DiT] step 6/8 t=0.643
+[Debug] dit_step6_vt: [2170, 64] first4: 0.247442 0.617176 -0.960503 2.789753
+[Debug] dit_step6_xt: [2170, 64] first4: 0.045429 1.517814 0.030587 -0.976234
+[DiT] step 7/8 t=0.500
+[Debug] dit_step7_vt: [2170, 64] first4: 0.019480 0.316869 -1.427597 3.153955
+[Debug] dit_x0: [2170, 64] first4: 0.039585 1.422753 0.458866 -1.922420
+[DiT] step 8/8 t=0.300
+[DiT] Total generation: 8546.5 ms (8546.5 ms/sample)
+[Debug] dit_output: [2170, 64] first4: 0.039585 1.422753 0.458866 -1.922420
+[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128)
+[VAE] Graph: 474 nodes, T_latent=192
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_im2col_f16', name = 'kernel_im2col_f16'
+ggml_metal_library_compile_pipeline: loaded kernel_im2col_f16                             0x11ce0b610 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=0'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=0             0x11ce10380 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0      0x11ce10870 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0      0x11ce10ad0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sin_f32_4', name = 'kernel_sin_f32_4'
+ggml_metal_library_compile_pipeline: loaded kernel_sin_f32_4                              0x11ce10e80 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sqr_f32_4', name = 'kernel_sqr_f32_4'
+ggml_metal_library_compile_pipeline: loaded kernel_sqr_f32_4                              0x11ce11660 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_conv_transpose_1d_f32_f32', name = 'kernel_conv_transpose_1d_f32_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_conv_transpose_1d_f32_f32              0x11ce11bc0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_f16_f16_4', name = 'kernel_mul_mv_f16_f16_4_nsg=4'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_f16_f16_4_nsg=4                 0x11ce12b20 | th_max = 1024 | th_width =   32
+[VAE] Upsample factor: 1920.00 (expected ~1920)
+[VAE] Graph: 474 nodes, T_latent=256
+[VAE] Graph: 474 nodes, T_latent=186
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=1             0x11ce12d80 | th_max =  896 | th_width =   32
+[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz)
+[VAE Batch0] Decode: 609611.2 ms
+[Debug] vae_audio: [2, 4166400] first4: 0.000665 0.001184 0.001013 0.001406
+[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo)
+[Request 1/1] Done
+ggml_metal_free: deallocating
+ggml_metal_free: deallocating
+[Pipeline] All done
+ggml_metal_device_init: tensor API disabled for pre-M5 and pre-A19 devices
+ggml_metal_library_init: using embedded metal library
+ggml_metal_library_init: loaded in 0.006 sec
+ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
+ggml_metal_device_init: GPU name:   MTL0
+ggml_metal_device_init: GPU family: MTLGPUFamilyApple8  (1008)
+ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
+ggml_metal_device_init: GPU family: MTLGPUFamilyMetal3  (5001)
+ggml_metal_device_init: simdgroup reduction   = true
+ggml_metal_device_init: simdgroup matrix mul. = true
+ggml_metal_device_init: has unified memory    = true
+ggml_metal_device_init: has bfloat            = true
+ggml_metal_device_init: has tensor            = false
+ggml_metal_device_init: use residency sets    = true
+ggml_metal_device_init: use shared buffers    = true
+ggml_metal_device_init: recommendedMaxWorkingSetSize  = 11453.25 MB
+ggml_metal_init: allocating
+ggml_metal_init: found device: Apple M2 Pro
+ggml_metal_init: picking default device: Apple M2 Pro
+ggml_metal_init: use fusion         = true
+ggml_metal_init: use concurrency    = true
+ggml_metal_init: use graph optimize = true
+[Load] DiT backend: MTL0 (CPU threads: 5)
+[Load] Backend init: 21.5 ms
+[GGUF] ../models/acestep-v15-sft-Q5_K_M.gguf: 678 tensors, data at offset 56800
+[DiT] Self-attn: Q+K fused, V separate
+[DiT] Cross-attn: all separate
+[DiT] MLP: gate+up fused
+[Load] null_condition_emb found (CFG available)
+[WeightCtx] Loaded 478 tensors, 1061.2 MB into backend
+[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128
+[Load] DiT weight load: 1513.9 ms
+[GGUF] ../models/acestep-v15-sft-Q5_K_M.gguf: 678 tensors, data at offset 56800
+[Load] silence_latent: [15000, 64] from GGUF
+[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048
+ggml_metal_init: allocating
+ggml_metal_init: found device: Apple M2 Pro
+ggml_metal_init: picking default device: Apple M2 Pro
+ggml_metal_init: use fusion         = true
+ggml_metal_init: use concurrency    = true
+ggml_metal_init: use graph optimize = true
+[Load] VAE backend: MTL0 (CPU threads: 5)
+[VAE] Backend: MTL0, Weight buffer: 255.7 MB
+[VAE] Loaded: 5 blocks, upsample=1920x
+[Load] VAE weights: 272.4 ms
+[Request 1/1] ggml-sft/request0.json (batch=1)
+[Request] parsed ggml-sft/request0.json (18 fields)
+[Pipeline] seed=42, steps=50, guidance=7.0, shift=1.0, duration=88.0s
+[Pipeline] 434 audio codes (86.8s @ 5Hz)
+[Pipeline] T=2170, S=1085
+[BPE] Loaded from GGUF: 151643 vocab, 151387 merges
+[Load] BPE tokenizer: 42.2 ms
+[Pipeline] caption: 70 tokens, lyrics: 167 tokens
+ggml_metal_init: allocating
+ggml_metal_init: found device: Apple M2 Pro
+ggml_metal_init: picking default device: Apple M2 Pro
+ggml_metal_init: use fusion         = true
+ggml_metal_init: use concurrency    = true
+ggml_metal_init: use graph optimize = true
+[Load] TextEncoder backend: MTL0 (CPU threads: 5)
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend
+[Load] TextEncoder: 28L, H=1024, Nh=16/8
+[Load] TextEncoder: 225.9 ms
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_get_rows_bf16', name = 'kernel_get_rows_bf16'
+ggml_metal_library_compile_pipeline: loaded kernel_get_rows_bf16                          0x12de0dc30 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rms_norm_mul_f32_4', name = 'kernel_rms_norm_mul_f32_4'
+ggml_metal_library_compile_pipeline: loaded kernel_rms_norm_mul_f32_4                     0x12de0e090 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_bf16_f32', name = 'kernel_mul_mm_bf16_f32_bci=0_bco=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_bf16_f32_bci=0_bco=1            0x12de0e900 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rope_neox_f32', name = 'kernel_rope_neox_f32_imrope=0'
+ggml_metal_library_compile_pipeline: loaded kernel_rope_neox_f32_imrope=0                 0x12de0ed80 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=1_ncpsg=64'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=1_ncpsg=64      0x12de0f5f0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_blk', name = 'kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64      0x12de0fc20 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4      0x12de10500 | th_max =  576 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0      0x12de10910 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_swiglu_f32', name = 'kernel_swiglu_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_swiglu_f32                             0x12de10b70 | th_max = 1024 | th_width =   32
+[Encode] TextEncoder (70 tokens): 49.1 ms
+[Debug] text_hidden: [70, 1024] first4: 3.703506 2.446818 0.222721 -13.133463
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[Encode] Lyric vocab lookup (167 tokens): 42.5 ms
+[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809
+ggml_metal_init: allocating
+ggml_metal_init: found device: Apple M2 Pro
+ggml_metal_init: picking default device: Apple M2 Pro
+ggml_metal_init: use fusion         = true
+ggml_metal_init: use concurrency    = true
+ggml_metal_init: use graph optimize = true
+[Load] CondEncoder backend: MTL0 (CPU threads: 5)
+[GGUF] ../models/acestep-v15-sft-Q5_K_M.gguf: 678 tensors, data at offset 56800
+[WeightCtx] Loaded 140 tensors, 412.5 MB into backend
+[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond
+[Load] ConditionEncoder: 760.1 ms
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q6_K_f32', name = 'kernel_mul_mm_q6_K_f32_bci=0_bco=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q6_K_f32_bci=0_bco=1            0x103e0be70 | th_max =  896 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q5_K_f32', name = 'kernel_mul_mm_q5_K_f32_bci=0_bco=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q5_K_f32_bci=0_bco=1            0x103e0c420 | th_max =  832 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=64'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=64      0x103e0c860 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4      0x103e0cfd0 | th_max =  640 | th_width =   32
+[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens
+[Encode] ConditionEncoder: 162.8 ms, enc_S=238
+ggml_metal_free: deallocating
+ggml_metal_free: deallocating
+[Debug] enc_hidden: [238, 2048] first4: 1.751803 -0.051174 -0.133188 0.058982
+[GGUF] ../models/acestep-v15-sft-Q5_K_M.gguf: 678 tensors, data at offset 56800
+[WeightCtx] Loaded 30 tensors, 73.2 MB into backend
+[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64)
+[Load] Detokenizer: 115.2 ms
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_bf16_f32_short', name = 'kernel_mul_mv_bf16_f32_short_nsg=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_bf16_f32_short_nsg=1            0x103f05ca0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1      0x103f065b0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q6_K_f32', name = 'kernel_mul_mv_q6_K_f32_nsg=2'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q6_K_f32_nsg=2                  0x103f06b60 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_repeat_f32', name = 'kernel_repeat_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_repeat_f32                             0x103f06dc0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q5_K_f32_r1_5', name = 'kernel_mul_mv_ext_q5_K_f32_r1_5_nsg=2_nxpsg=8'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q5_K_f32_r1_5_nsg=2_nxpsg=8      0x103f07ca0 | th_max =  640 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q6_K_f32_r1_5', name = 'kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8      0x103f08250 | th_max =  640 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=32'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=32      0x103f085d0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_f32_dk128_dv128', name = 'kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32      0x103f08830 | th_max =  448 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_reduce', name = 'kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32      0x103f091d0 | th_max = 1024 | th_width =   32
+[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz)
+[Context] Detokenizer: 1055.5 ms
+[Debug] detok_output: [2170, 64] first4: -0.124883 1.453879 0.292856 -0.646204
+[Context] loaded noise from rng_philox_seed42.bf16: [2170, 64] bf16
+[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656
+[Debug] context: [2170, 128] first4: -0.124883 1.453879 0.292856 -0.646204
+[DiT] Starting: T=2170, S=1085, enc_S=238, steps=50, batch=1
+[DiT] Batch N=1, T=2170, S=1085, enc_S=238
+[DiT] Graph: 1775 nodes
+[Debug] null_condition_emb: [2048] first4: 0.018066 -0.000360 0.005096 -0.000683
+[Debug] null_enc_hidden: [238, 2048] first4: 0.018066 -0.000360 0.005096 -0.000683
+[DiT] CFG enabled: guidance_scale=7.0, 2x forward per step, N=1
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f32_f32', name = 'kernel_mul_mm_f32_f32_bci=0_bco=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f32_f32_bci=0_bco=1             0x103e0dfd0 | th_max =  832 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_scale_f32', name = 'kernel_scale_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_scale_f32                              0x103e0e530 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_timestep_embedding_f32', name = 'kernel_timestep_embedding_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_timestep_embedding_f32                 0x103e0e940 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q5_K_f32', name = 'kernel_mul_mv_q5_K_f32_nsg=2'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q5_K_f32_nsg=2                  0x103e0f980 | th_max =  576 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_silu_f32_4', name = 'kernel_silu_f32_4'
+ggml_metal_library_compile_pipeline: loaded kernel_silu_f32_4                             0x103e0fd00 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1      0x103e103c0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1      0x103e10620 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0      0x103e10bc0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_cpy_f32_f32', name = 'kernel_cpy_f32_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_cpy_f32_f32                            0x103e11150 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4      0x103e11b70 | th_max =  640 | th_width =   32
+[Debug] tproj: [12288] first4: 0.155137 -0.118305 -0.084248 0.082623
+[Debug] temb: [2048] first4: -0.002843 -0.176820 0.004745 -0.001924
+[Debug] temb_t: [2048] first4: -0.001351 0.003023 -0.012552 -0.001712
+[Debug] temb_r: [2048] first4: -0.001491 -0.179843 0.017298 -0.000212
+[Debug] sinusoidal_t: [256] first4: 0.562407 0.789701 0.439822 -0.023583
+[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000
+[Debug] temb_lin1_t: [2048] first4: -0.034754 0.028817 0.027120 -0.031729
+[Debug] temb_lin1_r: [2048] first4: -0.002680 0.004202 0.000655 -0.002088
+[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.095122 -0.908724 0.502637 0.450925
+[Debug] proj_in_input: [192, 2170] first4: -0.124883 1.453879 0.292856 -0.646204
+[Debug] enc_after_cond_emb: [2048, 238] first4: -0.173051 0.876023 0.351566 -0.532545
+[Debug] layer0_sa_input: [2048, 1085] first4: -0.920384 -0.707757 -0.034391 0.299813
+[Debug] layer0_q_after_rope: [128, 16] first4: -12.596228 0.535827 1.482060 1.773901
+[Debug] layer0_k_after_rope: [128, 8] first4: -0.173051 0.876023 0.351566 -0.532545
+[Debug] layer0_sa_output: [2048, 1085] first4: -1.637092 0.777700 -0.593592 0.534410
+[Debug] layer0_attn_out: [2048, 1085] first4: -12.115236 1.011131 1.711030 1.787191
+[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.803575 -1.350455 -0.166552 0.391822
+[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.983645 -0.994904 -0.442195 0.398258
+[Debug] hidden_after_layer0: [2048, 1085] first4: -9.639871 1.314413 59.265984 -1.678902
+[Debug] hidden_after_layer6: [2048, 1085] first4: -15.443645 3.665146 59.001129 -0.430717
+[Debug] hidden_after_layer12: [2048, 1085] first4: -13.175318 0.713974 -24.768734 -0.709223
+[Debug] hidden_after_layer18: [2048, 1085] first4: -5.504215 14.850023 -44.686668 -9.688757
+[Debug] hidden_after_layer23: [2048, 1085] first4: 32.081551 63.384781 49.094582 -14.152830
+[Debug] dit_step0_vt_cond: [2170, 64] first4: -0.659668 2.541722 -0.175072 1.431705
+[Debug] dit_step0_vt_uncond: [2170, 64] first4: -0.363007 2.087823 -0.714051 1.721254
+[Debug] dit_step0_vt: [2170, 64] first4: -0.859789 3.013237 0.095956 1.039126
+[Debug] dit_step0_xt: [2170, 64] first4: 0.211532 2.095985 -0.173794 0.826874
+[DiT] step 1/50 t=1.000
+[Debug] dit_step1_vt_cond: [2170, 64] first4: -0.721169 2.535385 -0.028817 1.265576
+[Debug] dit_step1_vt_uncond: [2170, 64] first4: -0.587318 2.330142 -0.122353 1.467132
+[Debug] dit_step1_vt: [2170, 64] first4: -0.560584 2.181154 -0.231214 1.319647
+[Debug] dit_step1_xt: [2170, 64] first4: 0.222743 2.052362 -0.169170 0.800481
+[DiT] step 2/50 t=0.980
+[Debug] dit_step2_vt_cond: [2170, 64] first4: -0.714072 2.499362 0.020078 1.278304
+[Debug] dit_step2_vt_uncond: [2170, 64] first4: -0.600958 2.309783 -0.102115 1.400073
+[Debug] dit_step2_vt: [2170, 64] first4: -0.861209 2.897427 0.258299 0.998091
+[Debug] dit_step2_xt: [2170, 64] first4: 0.239968 1.994414 -0.174336 0.780519
+[DiT] step 3/50 t=0.960
+[Debug] dit_step3_vt_cond: [2170, 64] first4: -0.674846 2.427423 0.055491 1.297147
+[Debug] dit_step3_vt_uncond: [2170, 64] first4: -0.635843 2.332050 -0.021028 1.341071
+[Debug] dit_step3_vt: [2170, 64] first4: -0.498860 2.133466 -0.141608 1.438968
+[Debug] dit_step3_xt: [2170, 64] first4: 0.249945 1.951744 -0.171504 0.751740
+[DiT] step 4/50 t=0.940
+[Debug] dit_step4_vt_cond: [2170, 64] first4: -0.605752 2.344337 0.051579 1.306835
+[Debug] dit_step4_vt_uncond: [2170, 64] first4: -0.617271 2.293072 -0.026830 1.327208
+[Debug] dit_step4_vt: [2170, 64] first4: -0.645479 2.581793 0.280999 1.082338
+[Debug] dit_step4_xt: [2170, 64] first4: 0.262854 1.900108 -0.177124 0.730093
+[DiT] step 5/50 t=0.920
+[Debug] dit_step5_vt_cond: [2170, 64] first4: -0.529691 2.265976 0.024912 1.308485
+[Debug] dit_step5_vt_uncond: [2170, 64] first4: -0.613722 2.254884 0.007785 1.353816
+[Debug] dit_step5_vt: [2170, 64] first4: -0.220550 1.958779 -0.197940 1.314650
+[Debug] dit_step5_xt: [2170, 64] first4: 0.267265 1.860933 -0.173165 0.703800
+[DiT] step 6/50 t=0.900
+[Debug] dit_step6_vt_cond: [2170, 64] first4: -0.492980 2.220784 0.013978 1.314708
+[Debug] dit_step6_vt_uncond: [2170, 64] first4: -0.574326 2.194118 0.017516 1.377784
+[Debug] dit_step6_vt: [2170, 64] first4: -0.484828 2.424557 0.188767 1.065213
+[Debug] dit_step6_xt: [2170, 64] first4: 0.276962 1.812442 -0.176940 0.682496
+[DiT] step 7/50 t=0.880
+[Debug] dit_step7_vt_cond: [2170, 64] first4: -0.442502 2.140999 -0.004548 1.307788
+[Debug] dit_step7_vt_uncond: [2170, 64] first4: -0.527369 2.112375 0.025084 1.375065
+[Debug] dit_step7_vt: [2170, 64] first4: -0.162851 1.943518 -0.225018 1.235321
+[Debug] dit_step7_xt: [2170, 64] first4: 0.280219 1.773571 -0.172440 0.657789
+[DiT] step 8/50 t=0.860
+[Debug] dit_step8_vt_cond: [2170, 64] first4: -0.402943 2.052812 -0.006712 1.290008
+[Debug] dit_step8_vt_uncond: [2170, 64] first4: -0.459439 2.025463 0.030814 1.342298
+[Debug] dit_step8_vt: [2170, 64] first4: -0.414733 2.160834 0.094455 1.088282
+[Debug] dit_step8_xt: [2170, 64] first4: 0.288514 1.730355 -0.174329 0.636023
+[DiT] step 9/50 t=0.840
+[Debug] dit_step9_vt_cond: [2170, 64] first4: -0.369760 1.969441 -0.010690 1.271576
+[Debug] dit_step9_vt_uncond: [2170, 64] first4: -0.396671 1.951135 0.030780 1.310076
+[Debug] dit_step9_vt: [2170, 64] first4: -0.235488 1.803751 -0.198291 1.206838
+[Debug] dit_step9_xt: [2170, 64] first4: 0.293223 1.694280 -0.170363 0.611887
+[DiT] step 10/50 t=0.820
+[Debug] dit_step10_vt_cond: [2170, 64] first4: -0.344175 1.892885 0.003662 1.257559
+[Debug] dit_step10_vt_uncond: [2170, 64] first4: -0.342085 1.891713 0.047752 1.301667
+[Debug] dit_step10_vt: [2170, 64] first4: -0.420278 1.858245 0.037965 1.052360
+[Debug] dit_step10_xt: [2170, 64] first4: 0.301629 1.657115 -0.171122 0.590840
+[DiT] step 11/50 t=0.800
+[Debug] dit_step11_vt_cond: [2170, 64] first4: -0.321399 1.822005 0.005853 1.241717
+[Debug] dit_step11_vt_uncond: [2170, 64] first4: -0.313690 1.840958 0.057351 1.308882
+[Debug] dit_step11_vt: [2170, 64] first4: -0.256015 1.588119 -0.173446 1.051672
+[Debug] dit_step11_xt: [2170, 64] first4: 0.306749 1.625353 -0.167653 0.569806
+[DiT] step 12/50 t=0.780
+[Debug] dit_step12_vt_cond: [2170, 64] first4: -0.300376 1.758913 0.017258 1.234417
+[Debug] dit_step12_vt_uncond: [2170, 64] first4: -0.284777 1.790303 0.055927 1.316604
+[Debug] dit_step12_vt: [2170, 64] first4: -0.382294 1.631086 0.039774 0.961382
+[Debug] dit_step12_xt: [2170, 64] first4: 0.314395 1.592731 -0.168449 0.550578
+[DiT] step 13/50 t=0.760
+[Debug] dit_step13_vt_cond: [2170, 64] first4: -0.282512 1.699357 0.023821 1.222209
+[Debug] dit_step13_vt_uncond: [2170, 64] first4: -0.256755 1.739260 0.045941 1.313994
+[Debug] dit_step13_vt: [2170, 64] first4: -0.292858 1.451734 -0.071923 0.964463
+[Debug] dit_step13_xt: [2170, 64] first4: 0.320252 1.563696 -0.167011 0.531289
+[DiT] step 14/50 t=0.740
+[Debug] dit_step14_vt_cond: [2170, 64] first4: -0.268355 1.643249 0.035884 1.219110
+[Debug] dit_step14_vt_uncond: [2170, 64] first4: -0.234205 1.685310 0.038125 1.306284
+[Debug] dit_step14_vt: [2170, 64] first4: -0.380481 1.484198 0.105917 0.956716
+[Debug] dit_step14_xt: [2170, 64] first4: 0.327862 1.534012 -0.169129 0.512155
+[DiT] step 15/50 t=0.720
+[Debug] dit_step15_vt_cond: [2170, 64] first4: -0.257674 1.591244 0.046474 1.215229
+[Debug] dit_step15_vt_uncond: [2170, 64] first4: -0.215398 1.632289 0.020497 1.288409
+[Debug] dit_step15_vt: [2170, 64] first4: -0.335985 1.372340 0.098681 1.003970
+[Debug] dit_step15_xt: [2170, 64] first4: 0.334582 1.506565 -0.171102 0.492076
+[DiT] step 16/50 t=0.700
+[Debug] dit_step16_vt_cond: [2170, 64] first4: -0.250808 1.534114 0.052932 1.211927
+[Debug] dit_step16_vt_uncond: [2170, 64] first4: -0.205743 1.580690 0.005785 1.270423
+[Debug] dit_step16_vt: [2170, 64] first4: -0.382949 1.342146 0.222679 1.022127
+[Debug] dit_step16_xt: [2170, 64] first4: 0.342241 1.479722 -0.175556 0.471633
+[DiT] step 17/50 t=0.680
+[Debug] dit_step17_vt_cond: [2170, 64] first4: -0.246631 1.471787 0.045976 1.197702
+[Debug] dit_step17_vt_uncond: [2170, 64] first4: -0.202947 1.526995 -0.016021 1.248045
+[Debug] dit_step17_vt: [2170, 64] first4: -0.343461 1.218771 0.212458 1.019693
+[Debug] dit_step17_xt: [2170, 64] first4: 0.349110 1.455347 -0.179805 0.451239
+[DiT] step 18/50 t=0.660
+[Debug] dit_step18_vt_cond: [2170, 64] first4: -0.243157 1.409443 0.036330 1.184456
+[Debug] dit_step18_vt_uncond: [2170, 64] first4: -0.204529 1.477009 -0.037415 1.231383
+[Debug] dit_step18_vt: [2170, 64] first4: -0.354021 1.150632 0.298290 1.001465
+[Debug] dit_step18_xt: [2170, 64] first4: 0.356190 1.432334 -0.185771 0.431210
+[DiT] step 19/50 t=0.640
+[Debug] dit_step19_vt_cond: [2170, 64] first4: -0.237715 1.343371 0.017683 1.161345
+[Debug] dit_step19_vt_uncond: [2170, 64] first4: -0.204017 1.427205 -0.062926 1.210816
+[Debug] dit_step19_vt: [2170, 64] first4: -0.318427 0.994659 0.274882 0.959663
+[Debug] dit_step19_xt: [2170, 64] first4: 0.362559 1.412441 -0.191269 0.412017
+[DiT] step 20/50 t=0.620
+[Debug] dit_step20_vt_cond: [2170, 64] first4: -0.232230 1.263409 -0.001007 1.136674
+[Debug] dit_step20_vt_uncond: [2170, 64] first4: -0.200904 1.370160 -0.090913 1.187652
+[Debug] dit_step20_vt: [2170, 64] first4: -0.327181 0.859238 0.330215 0.939772
+[Debug] dit_step20_xt: [2170, 64] first4: 0.369102 1.395257 -0.197873 0.393221
+[DiT] step 21/50 t=0.600
+[Debug] dit_step21_vt_cond: [2170, 64] first4: -0.230263 1.180950 -0.026976 1.107196
+[Debug] dit_step21_vt_uncond: [2170, 64] first4: -0.199895 1.312066 -0.122788 1.160427
+[Debug] dit_step21_vt: [2170, 64] first4: -0.309986 0.682107 0.288212 0.884258
+[Debug] dit_step21_xt: [2170, 64] first4: 0.375302 1.381614 -0.203637 0.375536
+[DiT] step 22/50 t=0.580
+[Debug] dit_step22_vt_cond: [2170, 64] first4: -0.226975 1.087024 -0.049604 1.072087
+[Debug] dit_step22_vt_uncond: [2170, 64] first4: -0.197016 1.244136 -0.149074 1.129456
+[Debug] dit_step22_vt: [2170, 64] first4: -0.315666 0.502494 0.302365 0.836432
+[Debug] dit_step22_xt: [2170, 64] first4: 0.381615 1.371564 -0.209685 0.358807
+[DiT] step 23/50 t=0.560
+[Debug] dit_step23_vt_cond: [2170, 64] first4: -0.229945 0.987764 -0.068857 1.041486
+[Debug] dit_step23_vt_uncond: [2170, 64] first4: -0.196528 1.172394 -0.166018 1.101852
+[Debug] dit_step23_vt: [2170, 64] first4: -0.331807 0.301117 0.239278 0.795295
+[Debug] dit_step23_xt: [2170, 64] first4: 0.388252 1.365542 -0.214470 0.342901
+[DiT] step 24/50 t=0.540
+[Debug] dit_step24_vt_cond: [2170, 64] first4: -0.234566 0.878375 -0.086414 0.999502
+[Debug] dit_step24_vt_uncond: [2170, 64] first4: -0.196025 1.088575 -0.178235 1.066777
+[Debug] dit_step24_vt: [2170, 64] first4: -0.353102 0.129063 0.219470 0.719129
+[Debug] dit_step24_xt: [2170, 64] first4: 0.395314 1.362961 -0.218859 0.328519
+[DiT] step 25/50 t=0.520
+[Debug] dit_step25_vt_cond: [2170, 64] first4: -0.244918 0.762549 -0.099009 0.962606
+[Debug] dit_step25_vt_uncond: [2170, 64] first4: -0.200310 0.995220 -0.184174 1.033471
+[Debug] dit_step25_vt: [2170, 64] first4: -0.384523 -0.079339 0.165061 0.686562
+[Debug] dit_step25_xt: [2170, 64] first4: 0.403004 1.364548 -0.222161 0.314788
+[DiT] step 26/50 t=0.500
+[Debug] dit_step26_vt_cond: [2170, 64] first4: -0.256133 0.634451 -0.110018 0.919318
+[Debug] dit_step26_vt_uncond: [2170, 64] first4: -0.204624 0.887617 -0.187237 0.997615
+[Debug] dit_step26_vt: [2170, 64] first4: -0.416561 -0.257153 0.136664 0.594071
+[Debug] dit_step26_xt: [2170, 64] first4: 0.411335 1.369691 -0.224894 0.302906
+[DiT] step 27/50 t=0.480
+[Debug] dit_step27_vt_cond: [2170, 64] first4: -0.269545 0.510911 -0.116178 0.879919
+[Debug] dit_step27_vt_uncond: [2170, 64] first4: -0.210466 0.778243 -0.183774 0.961990
+[Debug] dit_step27_vt: [2170, 64] first4: -0.454081 -0.397413 0.085143 0.568775
+[Debug] dit_step27_xt: [2170, 64] first4: 0.420417 1.377639 -0.226597 0.291531
+[DiT] step 28/50 t=0.460
+[Debug] dit_step28_vt_cond: [2170, 64] first4: -0.282172 0.372450 -0.120048 0.831178
+[Debug] dit_step28_vt_uncond: [2170, 64] first4: -0.216011 0.655762 -0.179339 0.918627
+[Debug] dit_step28_vt: [2170, 64] first4: -0.483525 -0.575904 0.063843 0.478002
+[Debug] dit_step28_xt: [2170, 64] first4: 0.430087 1.389157 -0.227874 0.281971
+[DiT] step 29/50 t=0.440
+[Debug] dit_step29_vt_cond: [2170, 64] first4: -0.293168 0.231214 -0.124586 0.781733
+[Debug] dit_step29_vt_uncond: [2170, 64] first4: -0.219691 0.525349 -0.170166 0.873745
+[Debug] dit_step29_vt: [2170, 64] first4: -0.518206 -0.710899 -0.005456 0.435657
+[Debug] dit_step29_xt: [2170, 64] first4: 0.440451 1.403375 -0.227765 0.273257
+[DiT] step 30/50 t=0.420
+[Debug] dit_step30_vt_cond: [2170, 64] first4: -0.302506 0.086124 -0.124549 0.731757
+[Debug] dit_step30_vt_uncond: [2170, 64] first4: -0.221950 0.385721 -0.158065 0.830404
+[Debug] dit_step30_vt: [2170, 64] first4: -0.544895 -0.854978 -0.023964 0.343625
+[Debug] dit_step30_xt: [2170, 64] first4: 0.451349 1.420475 -0.227285 0.266385
+[DiT] step 31/50 t=0.400
+[Debug] dit_step31_vt_cond: [2170, 64] first4: -0.310194 -0.052540 -0.126955 0.680247
+[Debug] dit_step31_vt_uncond: [2170, 64] first4: -0.222645 0.246419 -0.151881 0.786437
+[Debug] dit_step31_vt: [2170, 64] first4: -0.574628 -0.958966 -0.058730 0.282347
+[Debug] dit_step31_xt: [2170, 64] first4: 0.462842 1.439654 -0.226111 0.260738
+[DiT] step 32/50 t=0.380
+[Debug] dit_step32_vt_cond: [2170, 64] first4: -0.314911 -0.194979 -0.126385 0.629081
+[Debug] dit_step32_vt_uncond: [2170, 64] first4: -0.220352 0.102109 -0.144540 0.743076
+[Debug] dit_step32_vt: [2170, 64] first4: -0.598710 -1.085776 -0.069698 0.196720
+[Debug] dit_step32_xt: [2170, 64] first4: 0.474816 1.461369 -0.224717 0.256804
+[DiT] step 33/50 t=0.360
+[Debug] dit_step33_vt_cond: [2170, 64] first4: -0.318386 -0.334863 -0.120002 0.581452
+[Debug] dit_step33_vt_uncond: [2170, 64] first4: -0.218793 -0.045891 -0.135709 0.701808
+[Debug] dit_step33_vt: [2170, 64] first4: -0.615695 -1.169131 -0.067985 0.140436
+[Debug] dit_step33_xt: [2170, 64] first4: 0.487130 1.484752 -0.223357 0.253995
+[DiT] step 34/50 t=0.340
+[Debug] dit_step34_vt_cond: [2170, 64] first4: -0.319302 -0.465754 -0.110709 0.538110
+[Debug] dit_step34_vt_uncond: [2170, 64] first4: -0.211723 -0.189661 -0.128844 0.664668
+[Debug] dit_step34_vt: [2170, 64] first4: -0.653452 -1.253376 -0.044227 0.068009
+[Debug] dit_step34_xt: [2170, 64] first4: 0.500199 1.509819 -0.222472 0.252635
+[DiT] step 35/50 t=0.320
+[Debug] dit_step35_vt_cond: [2170, 64] first4: -0.319083 -0.598486 -0.097674 0.492459
+[Debug] dit_step35_vt_uncond: [2170, 64] first4: -0.205768 -0.335506 -0.119003 0.623259
+[Debug] dit_step35_vt: [2170, 64] first4: -0.661011 -1.330961 -0.027972 0.020086
+[Debug] dit_step35_xt: [2170, 64] first4: 0.513419 1.536439 -0.221913 0.252233
+[DiT] step 36/50 t=0.300
+[Debug] dit_step36_vt_cond: [2170, 64] first4: -0.318070 -0.730204 -0.080886 0.449659
+[Debug] dit_step36_vt_uncond: [2170, 64] first4: -0.199361 -0.482989 -0.108305 0.581833
+[Debug] dit_step36_vt: [2170, 64] first4: -0.690251 -1.401439 0.013562 -0.032488
+[Debug] dit_step36_xt: [2170, 64] first4: 0.527224 1.564467 -0.222184 0.252883
+[DiT] step 37/50 t=0.280
+[Debug] dit_step37_vt_cond: [2170, 64] first4: -0.316356 -0.845106 -0.054751 0.408449
+[Debug] dit_step37_vt_uncond: [2170, 64] first4: -0.194137 -0.614114 -0.090670 0.540072
+[Debug] dit_step37_vt: [2170, 64] first4: -0.694876 -1.462489 0.059738 -0.070205
+[Debug] dit_step37_xt: [2170, 64] first4: 0.541122 1.593717 -0.223379 0.254287
+[DiT] step 38/50 t=0.260
+[Debug] dit_step38_vt_cond: [2170, 64] first4: -0.317038 -0.954626 -0.027545 0.366891
+[Debug] dit_step38_vt_uncond: [2170, 64] first4: -0.190420 -0.739252 -0.071602 0.495985
+[Debug] dit_step38_vt: [2170, 64] first4: -0.728120 -1.522043 0.107842 -0.107965
+[Debug] dit_step38_xt: [2170, 64] first4: 0.555684 1.624158 -0.225536 0.256446
+[DiT] step 39/50 t=0.240
+[Debug] dit_step39_vt_cond: [2170, 64] first4: -0.317547 -1.061431 0.009151 0.320700
+[Debug] dit_step39_vt_uncond: [2170, 64] first4: -0.189703 -0.862508 -0.047778 0.446086
+[Debug] dit_step39_vt: [2170, 64] first4: -0.718521 -1.567608 0.187925 -0.137347
+[Debug] dit_step39_xt: [2170, 64] first4: 0.570055 1.655510 -0.229294 0.259193
+[DiT] step 40/50 t=0.220
+[Debug] dit_step40_vt_cond: [2170, 64] first4: -0.321349 -1.147692 0.054264 0.273095
+[Debug] dit_step40_vt_uncond: [2170, 64] first4: -0.194555 -0.963666 -0.014199 0.392062
+[Debug] dit_step40_vt: [2170, 64] first4: -0.725549 -1.599213 0.254155 -0.152648
+[Debug] dit_step40_xt: [2170, 64] first4: 0.584566 1.687495 -0.234377 0.262246
+[DiT] step 41/50 t=0.200
+[Debug] dit_step41_vt_cond: [2170, 64] first4: -0.323505 -1.229047 0.104885 0.223181
+[Debug] dit_step41_vt_uncond: [2170, 64] first4: -0.200140 -1.061003 0.024296 0.333073
+[Debug] dit_step41_vt: [2170, 64] first4: -0.706612 -1.620246 0.347929 -0.161344
+[Debug] dit_step41_xt: [2170, 64] first4: 0.598698 1.719899 -0.241336 0.265473
+[DiT] step 42/50 t=0.180
+[Debug] dit_step42_vt_cond: [2170, 64] first4: -0.325457 -1.305037 0.162371 0.163510
+[Debug] dit_step42_vt_uncond: [2170, 64] first4: -0.210729 -1.152184 0.073317 0.265414
+[Debug] dit_step42_vt: [2170, 64] first4: -0.675503 -1.653079 0.417194 -0.191713
+[Debug] dit_step42_xt: [2170, 64] first4: 0.612208 1.752961 -0.249680 0.269307
+[DiT] step 43/50 t=0.160
+[Debug] dit_step43_vt_cond: [2170, 64] first4: -0.327040 -1.367895 0.222307 0.103005
+[Debug] dit_step43_vt_uncond: [2170, 64] first4: -0.223734 -1.229896 0.124878 0.195856
+[Debug] dit_step43_vt: [2170, 64] first4: -0.637198 -1.669221 0.519679 -0.207779
+[Debug] dit_step43_xt: [2170, 64] first4: 0.624952 1.786345 -0.260074 0.273463
+[DiT] step 44/50 t=0.140
+[Debug] dit_step44_vt_cond: [2170, 64] first4: -0.324606 -1.422529 0.282540 0.041568
+[Debug] dit_step44_vt_uncond: [2170, 64] first4: -0.235813 -1.298376 0.179092 0.128338
+[Debug] dit_step44_vt: [2170, 64] first4: -0.582920 -1.697035 0.593491 -0.255212
+[Debug] dit_step44_xt: [2170, 64] first4: 0.636610 1.820286 -0.271943 0.278567
+[DiT] step 45/50 t=0.120
+[Debug] dit_step45_vt_cond: [2170, 64] first4: -0.322754 -1.471227 0.337328 -0.019488
+[Debug] dit_step45_vt_uncond: [2170, 64] first4: -0.253130 -1.357642 0.232497 0.062397
+[Debug] dit_step45_vt: [2170, 64] first4: -0.511094 -1.728875 0.663531 -0.289321
+[Debug] dit_step45_xt: [2170, 64] first4: 0.646832 1.854864 -0.285214 0.284353
+[DiT] step 46/50 t=0.100
+[Debug] dit_step46_vt_cond: [2170, 64] first4: -0.317589 -1.518562 0.387160 -0.074592
+[Debug] dit_step46_vt_uncond: [2170, 64] first4: -0.269982 -1.418198 0.282772 0.001510
+[Debug] dit_step46_vt: [2170, 64] first4: -0.434923 -1.750584 0.706043 -0.325901
+[Debug] dit_step46_xt: [2170, 64] first4: 0.655531 1.889875 -0.299335 0.290871
+[DiT] step 47/50 t=0.080
+[Debug] dit_step47_vt_cond: [2170, 64] first4: -0.316530 -1.561702 0.429371 -0.113134
+[Debug] dit_step47_vt_uncond: [2170, 64] first4: -0.290275 -1.474672 0.330953 -0.045588
+[Debug] dit_step47_vt: [2170, 64] first4: -0.369515 -1.780826 0.721569 -0.327625
+[Debug] dit_step47_xt: [2170, 64] first4: 0.662921 1.925492 -0.313766 0.297424
+[DiT] step 48/50 t=0.060
+[Debug] dit_step48_vt_cond: [2170, 64] first4: -0.304095 -1.593375 0.469693 -0.135493
+[Debug] dit_step48_vt_uncond: [2170, 64] first4: -0.298372 -1.526686 0.379661 -0.093868
+[Debug] dit_step48_vt: [2170, 64] first4: -0.296147 -1.763528 0.744123 -0.229345
+[Debug] dit_step48_xt: [2170, 64] first4: 0.668844 1.960763 -0.328649 0.302011
+[DiT] step 49/50 t=0.040
+[Debug] dit_step49_vt_cond: [2170, 64] first4: -0.330730 -1.622756 0.480628 -0.154374
+[Debug] dit_step49_vt_uncond: [2170, 64] first4: -0.324303 -1.543248 0.396626 -0.105187
+[Debug] dit_step49_vt: [2170, 64] first4: -0.371182 -1.888395 0.739232 -0.345413
+[Debug] dit_x0: [2170, 64] first4: 0.676268 1.998530 -0.343433 0.308919
+[DiT] step 50/50 t=0.020
+[DiT] Total generation: 106456.5 ms (106456.5 ms/sample)
+[Debug] dit_output: [2170, 64] first4: 0.676268 1.998530 -0.343433 0.308919
+[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128)
+[VAE] Graph: 474 nodes, T_latent=192
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_im2col_f16', name = 'kernel_im2col_f16'
+ggml_metal_library_compile_pipeline: loaded kernel_im2col_f16                             0x103f04c20 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=0'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=0             0x103f072d0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0      0x103f09950 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0      0x103f05240 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sin_f32_4', name = 'kernel_sin_f32_4'
+ggml_metal_library_compile_pipeline: loaded kernel_sin_f32_4                              0x103f09cd0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sqr_f32_4', name = 'kernel_sqr_f32_4'
+ggml_metal_library_compile_pipeline: loaded kernel_sqr_f32_4                              0x103f0a8c0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_conv_transpose_1d_f32_f32', name = 'kernel_conv_transpose_1d_f32_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_conv_transpose_1d_f32_f32              0x103f0ab20 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_f16_f16_4', name = 'kernel_mul_mv_f16_f16_4_nsg=4'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_f16_f16_4_nsg=4                 0x103f0beb0 | th_max = 1024 | th_width =   32
+[VAE] Upsample factor: 1920.00 (expected ~1920)
+[VAE] Graph: 474 nodes, T_latent=256
+[VAE] Graph: 474 nodes, T_latent=186
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=1             0x103f04080 | th_max =  896 | th_width =   32
+[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz)
+[VAE Batch0] Decode: 609604.9 ms
+[Debug] vae_audio: [2, 4166400] first4: -0.002491 -0.002402 -0.002394 -0.002024
+[VAE Batch0] Wrote ggml-sft/request00.wav: 4166400 samples (86.80s @ 48kHz stereo)
+[Request 1/1] Done
+ggml_metal_free: deallocating
+ggml_metal_free: deallocating
+[Pipeline] All done
+[Request] Loaded request0.json
+[Noise] Reusing existing rng_philox_seed42.bf16
+[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q5_K_M.gguf
+[GGML] Running acestep-v15-turbo-Q5_K_M.gguf...
+[GGML] Done, 47 dump files
+[Turbo] Reusing existing Python dumps: python-turbo
+[Turbo] Cosine similarities GGML vs Python
+  stage                          GGML vs Python
+  text_hidden                          0.999813
+  lyric_embed                          1.000000
+  enc_hidden                           0.999083
+  detok_output                         0.999884
+  context                              0.999927
+  noise                                1.000000
+  temb_t                               0.999972
+  hidden_after_proj_in                 0.999966
+  enc_after_cond_emb                   0.999209
+  layer0_sa_output                     0.999459
+  hidden_after_layer0                  0.999838
+  hidden_after_layer6                  0.999790
+  hidden_after_layer12                 0.998662
+  hidden_after_layer18                 0.995505
+  hidden_after_layer23                 0.991560
+  dit_step0_vt                         0.968885
+  dit_step0_xt                         0.999932
+  dit_step1_vt                         0.972718
+  dit_step1_xt                         0.999793
+  dit_step2_vt                         0.970980
+  dit_step2_xt                         0.999392
+  dit_step3_vt                         0.974057
+  dit_step3_xt                         0.998550
+  dit_step4_vt                         0.972601
+  dit_step4_xt                         0.996666
+  dit_step5_vt                         0.967840
+  dit_step5_xt                         0.992262
+  dit_step6_vt                         0.963419
+  dit_step6_xt                         0.983648
+  dit_step7_vt                         0.954759
+  dit_x0                               0.970661
+  vae_audio                            0.881689
+  vae_audio (log spectral)             0.999788
+[Turbo] Error growth GGML vs Python
+  stage                         cos    max_err   mean_err     mean_A      std_A     mean_B      std_B
+  dit_step0_xt             0.999930   0.140512   0.007718  -0.002317   0.973035  -0.002342   0.972003
+  dit_step1_xt             0.999791   0.264415   0.013154  -0.005313   0.942911  -0.005313   0.941730
+  dit_step2_xt             0.999391   0.457878   0.021002  -0.009296   0.909537  -0.009311   0.908527
+  dit_step3_xt             0.998548   0.672565   0.031169  -0.014659   0.874300  -0.014577   0.873624
+  dit_step4_xt             0.996664   0.977397   0.045289  -0.021867   0.842610  -0.021660   0.841995
+  dit_step5_xt             0.992261   1.456099   0.067099  -0.032222   0.826249  -0.032109   0.824593
+  dit_step6_xt             0.983647   2.128287   0.100579  -0.046802   0.857538  -0.046482   0.855546
+[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-Q5_K_M.gguf
+[GGML] Running acestep-v15-sft-Q5_K_M.gguf...
+[GGML] Done, 233 dump files
+[SFT] Reusing existing Python dumps: python-sft
+[SFT] Cosine similarities GGML vs Python
+  stage                          GGML vs Python
+  text_hidden                          0.999813
+  lyric_embed                          1.000000
+  enc_hidden                           0.999083
+  detok_output                         0.999884
+  context                              0.999927
+  noise                                1.000000
+  temb_t                               0.999900
+  hidden_after_proj_in                 0.999966
+  enc_after_cond_emb                   0.999209
+  layer0_sa_output                     0.999536
+  hidden_after_layer0                  0.999891
+  hidden_after_layer6                  0.999626
+  hidden_after_layer12                 0.998995
+  hidden_after_layer18                 0.998026
+  hidden_after_layer23                 0.998535
+  null_condition_emb                   1.000000
+  null_enc_hidden                      1.000000
+  dit_step0_vt_cond                    0.998436
+  dit_step0_vt_uncond                  0.998344
+  dit_step0_vt                         0.994668
+  dit_step0_xt                         0.999997
+  dit_step5_vt_cond                    0.998676
+  dit_step5_vt                         0.989830
+  dit_step5_xt                         0.999935
+  dit_step10_vt_cond                   0.996806
+  dit_step10_vt                        0.987585
+  dit_step10_xt                        0.999744
+  dit_step15_vt_cond                   0.992244
+  dit_step15_vt                        0.973038
+  dit_step15_xt                        0.999108
+  dit_step20_vt_cond                   0.984474
+  dit_step20_vt                        0.958153
+  dit_step20_xt                        0.997397
+  dit_step25_vt_cond                   0.974096
+  dit_step25_vt                        0.945640
+  dit_step25_xt                        0.994154
+  dit_step30_vt_cond                   0.962790
+  dit_step30_vt                        0.934107
+  dit_step30_xt                        0.989253
+  dit_step35_vt_cond                   0.951958
+  dit_step35_vt                        0.920426
+  dit_step35_xt                        0.983572
+  dit_step40_vt_cond                   0.945880
+  dit_step40_vt                        0.910054
+  dit_step40_xt                        0.978292
+  dit_step45_vt_cond                   0.952542
+  dit_step45_vt                        0.924831
+  dit_step45_xt                        0.974685
+  dit_step49_vt_cond                   0.963084
+  dit_step49_vt                        0.916267
+  dit_x0                               0.973449
+  vae_audio                            0.878623
+  vae_audio (log spectral)             0.999566
+[SFT] Error growth GGML vs Python
+  stage                         cos    max_err   mean_err     mean_A      std_A     mean_B      std_B
+  dit_step0_xt             0.999996   0.038422   0.002252  -0.001777   0.980099  -0.001741   0.980402
+  dit_step5_xt             0.999933   0.110159   0.007862  -0.006926   0.889483  -0.007143   0.887999
+  dit_step10_xt            0.999743   0.216004   0.013603  -0.012446   0.811152  -0.012603   0.811299
+  dit_step15_xt            0.999108   0.433603   0.022505  -0.017845   0.746187  -0.018114   0.745269
+  dit_step20_xt            0.997397   0.645668   0.035020  -0.023481   0.700583  -0.023808   0.699582
+  dit_step25_xt            0.994154   0.894286   0.050478  -0.029250   0.679073  -0.029311   0.679278
+  dit_step30_xt            0.989253   1.155794   0.069043  -0.035037   0.684973  -0.035027   0.685262
+  dit_step35_xt            0.983572   1.518599   0.089822  -0.040808   0.717172  -0.040716   0.717196
+  dit_step40_xt            0.978291   1.917882   0.111854  -0.046581   0.771460  -0.046462   0.771853
+  dit_step45_xt            0.974684   2.279520   0.132709  -0.052804   0.843506  -0.052475   0.843036
diff --git a/tests/Metal-Q6_K.log b/tests/Metal-Q6_K.log
new file mode 100644
index 0000000..19a2f5c
--- /dev/null
+++ b/tests/Metal-Q6_K.log
@@ -0,0 +1,819 @@
+ggml_metal_device_init: tensor API disabled for pre-M5 and pre-A19 devices
+ggml_metal_library_init: using embedded metal library
+ggml_metal_library_init: loaded in 0.006 sec
+ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
+ggml_metal_device_init: GPU name:   MTL0
+ggml_metal_device_init: GPU family: MTLGPUFamilyApple8  (1008)
+ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
+ggml_metal_device_init: GPU family: MTLGPUFamilyMetal3  (5001)
+ggml_metal_device_init: simdgroup reduction   = true
+ggml_metal_device_init: simdgroup matrix mul. = true
+ggml_metal_device_init: has unified memory    = true
+ggml_metal_device_init: has bfloat            = true
+ggml_metal_device_init: has tensor            = false
+ggml_metal_device_init: use residency sets    = true
+ggml_metal_device_init: use shared buffers    = true
+ggml_metal_device_init: recommendedMaxWorkingSetSize  = 11453.25 MB
+ggml_metal_init: allocating
+ggml_metal_init: found device: Apple M2 Pro
+ggml_metal_init: picking default device: Apple M2 Pro
+ggml_metal_init: use fusion         = true
+ggml_metal_init: use concurrency    = true
+ggml_metal_init: use graph optimize = true
+[Load] DiT backend: MTL0 (CPU threads: 5)
+[Load] Backend init: 18.8 ms
+[GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864
+[DiT] MLP: gate+up fused
+[Load] null_condition_emb found (CFG available)
+[WeightCtx] Loaded 478 tensors, 1237.2 MB into backend
+[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128
+[Load] DiT weight load: 1759.5 ms
+[GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864
+[Load] silence_latent: [15000, 64] from GGUF
+[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048
+ggml_metal_init: allocating
+ggml_metal_init: found device: Apple M2 Pro
+ggml_metal_init: picking default device: Apple M2 Pro
+ggml_metal_init: use fusion         = true
+ggml_metal_init: use concurrency    = true
+ggml_metal_init: use graph optimize = true
+[Load] VAE backend: MTL0 (CPU threads: 5)
+[VAE] Backend: MTL0, Weight buffer: 255.7 MB
+[VAE] Loaded: 5 blocks, upsample=1920x
+[Load] VAE weights: 335.9 ms
+[Request 1/1] ggml-turbo/request0.json (batch=1)
+[Request] parsed ggml-turbo/request0.json (18 fields)
+[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0)
+[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s
+[Pipeline] 434 audio codes (86.8s @ 5Hz)
+[Pipeline] T=2170, S=1085
+[BPE] Loaded from GGUF: 151643 vocab, 151387 merges
+[Load] BPE tokenizer: 42.1 ms
+[Pipeline] caption: 70 tokens, lyrics: 167 tokens
+ggml_metal_init: allocating
+ggml_metal_init: found device: Apple M2 Pro
+ggml_metal_init: picking default device: Apple M2 Pro
+ggml_metal_init: use fusion         = true
+ggml_metal_init: use concurrency    = true
+ggml_metal_init: use graph optimize = true
+[Load] TextEncoder backend: MTL0 (CPU threads: 5)
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend
+[Load] TextEncoder: 28L, H=1024, Nh=16/8
+[Load] TextEncoder: 294.2 ms
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_get_rows_bf16', name = 'kernel_get_rows_bf16'
+ggml_metal_library_compile_pipeline: loaded kernel_get_rows_bf16                          0x13a80b9e0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rms_norm_mul_f32_4', name = 'kernel_rms_norm_mul_f32_4'
+ggml_metal_library_compile_pipeline: loaded kernel_rms_norm_mul_f32_4                     0x13a80be40 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_bf16_f32', name = 'kernel_mul_mm_bf16_f32_bci=0_bco=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_bf16_f32_bci=0_bco=1            0x13a80c6b0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rope_neox_f32', name = 'kernel_rope_neox_f32_imrope=0'
+ggml_metal_library_compile_pipeline: loaded kernel_rope_neox_f32_imrope=0                 0x13a80cb30 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=1_ncpsg=64'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=1_ncpsg=64      0x13a80d3a0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_blk', name = 'kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64      0x13a80d9d0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4      0x13a80e2b0 | th_max =  576 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0      0x13a80e6c0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_swiglu_f32', name = 'kernel_swiglu_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_swiglu_f32                             0x13a80e920 | th_max = 1024 | th_width =   32
+[Encode] TextEncoder (70 tokens): 46.1 ms
+[Debug] text_hidden: [70, 1024] first4: 3.703506 2.446818 0.222721 -13.133463
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[Encode] Lyric vocab lookup (167 tokens): 33.8 ms
+[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809
+ggml_metal_init: allocating
+ggml_metal_init: found device: Apple M2 Pro
+ggml_metal_init: picking default device: Apple M2 Pro
+ggml_metal_init: use fusion         = true
+ggml_metal_init: use concurrency    = true
+ggml_metal_init: use graph optimize = true
+[Load] CondEncoder backend: MTL0 (CPU threads: 5)
+[GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864
+[WeightCtx] Loaded 140 tensors, 476.3 MB into backend
+[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond
+[Load] ConditionEncoder: 652.0 ms
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q6_K_f32', name = 'kernel_mul_mm_q6_K_f32_bci=0_bco=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q6_K_f32_bci=0_bco=1            0x13a818c40 | th_max =  896 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=64'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=64      0x13a819080 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4      0x13a8197f0 | th_max =  640 | th_width =   32
+[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens
+[Encode] ConditionEncoder: 146.8 ms, enc_S=238
+ggml_metal_free: deallocating
+ggml_metal_free: deallocating
+[Debug] enc_hidden: [238, 2048] first4: 1.752129 -0.050073 -0.134015 0.059631
+[GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864
+[WeightCtx] Loaded 30 tensors, 82.2 MB into backend
+[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64)
+[Load] Detokenizer: 125.6 ms
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_bf16_f32_short', name = 'kernel_mul_mv_bf16_f32_short_nsg=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_bf16_f32_short_nsg=1            0x13a817f60 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1      0x13a818470 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q6_K_f32', name = 'kernel_mul_mv_q6_K_f32_nsg=2'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q6_K_f32_nsg=2                  0x13a808aa0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_repeat_f32', name = 'kernel_repeat_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_repeat_f32                             0x13a808d00 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q6_K_f32_r1_5', name = 'kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8      0x13a81a190 | th_max =  640 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=32'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=32      0x13a81a3f0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_f32_dk128_dv128', name = 'kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32      0x13a81adc0 | th_max =  448 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_reduce', name = 'kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32      0x13a81b250 | th_max = 1024 | th_width =   32
+[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz)
+[Context] Detokenizer: 1009.6 ms
+[Debug] detok_output: [2170, 64] first4: -0.141063 1.454431 0.315142 -0.623566
+[Context] loaded noise from rng_philox_seed42.bf16: [2170, 64] bf16
+[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656
+[Debug] context: [2170, 128] first4: -0.141063 1.454431 0.315142 -0.623566
+[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1
+[DiT] Batch N=1, T=2170, S=1085, enc_S=238
+[DiT] Graph: 1841 nodes
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f32_f32', name = 'kernel_mul_mm_f32_f32_bci=0_bco=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f32_f32_bci=0_bco=1             0x13970a020 | th_max =  832 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_scale_f32', name = 'kernel_scale_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_scale_f32                              0x13970a280 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_timestep_embedding_f32', name = 'kernel_timestep_embedding_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_timestep_embedding_f32                 0x13970a4e0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_silu_f32_4', name = 'kernel_silu_f32_4'
+ggml_metal_library_compile_pipeline: loaded kernel_silu_f32_4                             0x13970b610 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1      0x13970be80 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1      0x13970c0e0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0      0x13970c7b0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_cpy_f32_f32', name = 'kernel_cpy_f32_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_cpy_f32_f32                            0x13970cc30 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4      0x13970d300 | th_max =  640 | th_width =   32
+[Debug] tproj: [12288] first4: 0.261089 -0.161223 -0.098728 0.051901
+[Debug] temb: [2048] first4: 0.000236 -0.132397 -0.035348 0.064653
+[Debug] temb_t: [2048] first4: 0.001398 0.026957 -0.052741 0.063660
+[Debug] temb_r: [2048] first4: -0.001162 -0.159353 0.017394 0.000993
+[Debug] sinusoidal_t: [256] first4: 0.562407 0.789701 0.439822 -0.023583
+[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000
+[Debug] temb_lin1_t: [2048] first4: -0.049071 -0.051112 -0.017769 -0.037193
+[Debug] temb_lin1_r: [2048] first4: -0.014408 -0.020609 -0.015729 0.003875
+[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.037692 -0.956719 0.540867 0.451860
+[Debug] proj_in_input: [192, 2170] first4: -0.141063 1.454431 0.315142 -0.623566
+[Debug] enc_after_cond_emb: [2048, 238] first4: -0.167564 0.852700 0.309671 -0.538299
+[Debug] layer0_sa_input: [2048, 1085] first4: -0.716202 -0.756050 -0.048455 0.263529
+[Debug] layer0_q_after_rope: [128, 16] first4: -0.167564 0.852700 0.309671 -0.538299
+[Debug] layer0_k_after_rope: [128, 8] first4: -1.214772 -0.856039 -1.908578 -2.256124
+[Debug] layer0_sa_output: [2048, 1085] first4: -1.502833 0.209946 -0.367812 0.520536
+[Debug] layer0_attn_out: [2048, 1085] first4: -1.134820 -0.084089 -34.867664 -0.724257
+[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.540231 -1.049932 0.181504 0.461969
+[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.609974 -0.819551 -0.333653 0.497179
+[Debug] hidden_after_layer0: [2048, 1085] first4: -9.196066 0.534182 52.334564 -0.880322
+[Debug] hidden_after_layer6: [2048, 1085] first4: -21.325979 -0.207006 34.129318 -4.337931
+[Debug] hidden_after_layer12: [2048, 1085] first4: -15.411194 -16.311844 76.549057 29.816362
+[Debug] hidden_after_layer18: [2048, 1085] first4: -28.025963 13.209218 65.994347 20.087559
+[Debug] hidden_after_layer23: [2048, 1085] first4: -19.575611 48.863052 201.092041 136.881271
+[Debug] dit_step0_vt: [2170, 64] first4: 0.099154 1.132388 0.349667 2.375307
+[Debug] dit_step0_xt: [2170, 64] first4: 0.189829 2.104778 -0.187769 0.739688
+[DiT] step 1/8 t=1.000
+[Debug] dit_step1_vt: [2170, 64] first4: -0.140258 1.329038 -0.173978 1.924904
+[Debug] dit_step1_xt: [2170, 64] first4: 0.197479 2.032285 -0.178279 0.634693
+[DiT] step 2/8 t=0.955
+[Debug] dit_step2_vt: [2170, 64] first4: 0.064148 1.236530 0.118618 2.406788
+[Debug] dit_step2_xt: [2170, 64] first4: 0.193203 1.949849 -0.186187 0.474240
+[DiT] step 3/8 t=0.900
+[Debug] dit_step3_vt: [2170, 64] first4: 0.305678 1.101620 0.246811 2.656265
+[Debug] dit_step3_xt: [2170, 64] first4: 0.167730 1.858048 -0.206755 0.252885
+[DiT] step 4/8 t=0.833
+[Debug] dit_step4_vt: [2170, 64] first4: 0.333444 1.032630 0.077940 2.735898
+[Debug] dit_step4_xt: [2170, 64] first4: 0.132004 1.747409 -0.215105 -0.040247
+[DiT] step 5/8 t=0.750
+[Debug] dit_step5_vt: [2170, 64] first4: 0.310135 0.905818 -0.324717 2.786166
+[Debug] dit_step5_xt: [2170, 64] first4: 0.087699 1.618006 -0.168717 -0.438271
+[DiT] step 6/8 t=0.643
+[Debug] dit_step6_vt: [2170, 64] first4: 0.184311 0.624224 -0.863634 2.781863
+[Debug] dit_step6_xt: [2170, 64] first4: 0.050836 1.493161 0.004009 -0.994643
+[DiT] step 7/8 t=0.500
+[Debug] dit_step7_vt: [2170, 64] first4: 0.049488 0.168290 -1.435298 3.015385
+[Debug] dit_x0: [2170, 64] first4: 0.035990 1.442675 0.434599 -1.899259
+[DiT] step 8/8 t=0.300
+[DiT] Total generation: 8015.7 ms (8015.7 ms/sample)
+[Debug] dit_output: [2170, 64] first4: 0.035990 1.442675 0.434599 -1.899259
+[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128)
+[VAE] Graph: 474 nodes, T_latent=192
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_im2col_f16', name = 'kernel_im2col_f16'
+ggml_metal_library_compile_pipeline: loaded kernel_im2col_f16                             0x13a81b7d0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=0'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=0             0x13a81c1a0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0      0x13a81c400 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0      0x13a81ca90 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sin_f32_4', name = 'kernel_sin_f32_4'
+ggml_metal_library_compile_pipeline: loaded kernel_sin_f32_4                              0x13a81ccf0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sqr_f32_4', name = 'kernel_sqr_f32_4'
+ggml_metal_library_compile_pipeline: loaded kernel_sqr_f32_4                              0x13a81d4d0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_conv_transpose_1d_f32_f32', name = 'kernel_conv_transpose_1d_f32_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_conv_transpose_1d_f32_f32              0x13a81da30 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_f16_f16_4', name = 'kernel_mul_mv_f16_f16_4_nsg=4'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_f16_f16_4_nsg=4                 0x13a81e910 | th_max = 1024 | th_width =   32
+[VAE] Upsample factor: 1920.00 (expected ~1920)
+[VAE] Graph: 474 nodes, T_latent=256
+[VAE] Graph: 474 nodes, T_latent=186
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=1             0x13a81eb70 | th_max =  896 | th_width =   32
+[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz)
+[VAE Batch0] Decode: 609630.7 ms
+[Debug] vae_audio: [2, 4166400] first4: 0.000453 0.000980 0.000826 0.001209
+[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo)
+[Request 1/1] Done
+ggml_metal_free: deallocating
+ggml_metal_free: deallocating
+[Pipeline] All done
+ggml_metal_device_init: tensor API disabled for pre-M5 and pre-A19 devices
+ggml_metal_library_init: using embedded metal library
+ggml_metal_library_init: loaded in 0.006 sec
+ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
+ggml_metal_device_init: GPU name:   MTL0
+ggml_metal_device_init: GPU family: MTLGPUFamilyApple8  (1008)
+ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
+ggml_metal_device_init: GPU family: MTLGPUFamilyMetal3  (5001)
+ggml_metal_device_init: simdgroup reduction   = true
+ggml_metal_device_init: simdgroup matrix mul. = true
+ggml_metal_device_init: has unified memory    = true
+ggml_metal_device_init: has bfloat            = true
+ggml_metal_device_init: has tensor            = false
+ggml_metal_device_init: use residency sets    = true
+ggml_metal_device_init: use shared buffers    = true
+ggml_metal_device_init: recommendedMaxWorkingSetSize  = 11453.25 MB
+ggml_metal_init: allocating
+ggml_metal_init: found device: Apple M2 Pro
+ggml_metal_init: picking default device: Apple M2 Pro
+ggml_metal_init: use fusion         = true
+ggml_metal_init: use concurrency    = true
+ggml_metal_init: use graph optimize = true
+[Load] DiT backend: MTL0 (CPU threads: 5)
+[Load] Backend init: 20.5 ms
+[GGUF] ../models/acestep-v15-sft-Q6_K.gguf: 678 tensors, data at offset 56800
+[DiT] MLP: gate+up fused
+[Load] null_condition_emb found (CFG available)
+[WeightCtx] Loaded 478 tensors, 1237.2 MB into backend
+[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128
+[Load] DiT weight load: 1781.2 ms
+[GGUF] ../models/acestep-v15-sft-Q6_K.gguf: 678 tensors, data at offset 56800
+[Load] silence_latent: [15000, 64] from GGUF
+[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048
+ggml_metal_init: allocating
+ggml_metal_init: found device: Apple M2 Pro
+ggml_metal_init: picking default device: Apple M2 Pro
+ggml_metal_init: use fusion         = true
+ggml_metal_init: use concurrency    = true
+ggml_metal_init: use graph optimize = true
+[Load] VAE backend: MTL0 (CPU threads: 5)
+[VAE] Backend: MTL0, Weight buffer: 255.7 MB
+[VAE] Loaded: 5 blocks, upsample=1920x
+[Load] VAE weights: 275.0 ms
+[Request 1/1] ggml-sft/request0.json (batch=1)
+[Request] parsed ggml-sft/request0.json (18 fields)
+[Pipeline] seed=42, steps=50, guidance=7.0, shift=1.0, duration=88.0s
+[Pipeline] 434 audio codes (86.8s @ 5Hz)
+[Pipeline] T=2170, S=1085
+[BPE] Loaded from GGUF: 151643 vocab, 151387 merges
+[Load] BPE tokenizer: 42.4 ms
+[Pipeline] caption: 70 tokens, lyrics: 167 tokens
+ggml_metal_init: allocating
+ggml_metal_init: found device: Apple M2 Pro
+ggml_metal_init: picking default device: Apple M2 Pro
+ggml_metal_init: use fusion         = true
+ggml_metal_init: use concurrency    = true
+ggml_metal_init: use graph optimize = true
+[Load] TextEncoder backend: MTL0 (CPU threads: 5)
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend
+[Load] TextEncoder: 28L, H=1024, Nh=16/8
+[Load] TextEncoder: 361.7 ms
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_get_rows_bf16', name = 'kernel_get_rows_bf16'
+ggml_metal_library_compile_pipeline: loaded kernel_get_rows_bf16                          0x106006bc0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rms_norm_mul_f32_4', name = 'kernel_rms_norm_mul_f32_4'
+ggml_metal_library_compile_pipeline: loaded kernel_rms_norm_mul_f32_4                     0x106007320 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_bf16_f32', name = 'kernel_mul_mm_bf16_f32_bci=0_bco=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_bf16_f32_bci=0_bco=1            0x106008160 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rope_neox_f32', name = 'kernel_rope_neox_f32_imrope=0'
+ggml_metal_library_compile_pipeline: loaded kernel_rope_neox_f32_imrope=0                 0x1060083c0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=1_ncpsg=64'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=1_ncpsg=64      0x106008e40 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_blk', name = 'kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64      0x1060093a0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4      0x106009600 | th_max =  576 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0      0x10600a090 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_swiglu_f32', name = 'kernel_swiglu_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_swiglu_f32                             0x10600a2f0 | th_max = 1024 | th_width =   32
+[Encode] TextEncoder (70 tokens): 45.5 ms
+[Debug] text_hidden: [70, 1024] first4: 3.703506 2.446818 0.222721 -13.133463
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[Encode] Lyric vocab lookup (167 tokens): 35.7 ms
+[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809
+ggml_metal_init: allocating
+ggml_metal_init: found device: Apple M2 Pro
+ggml_metal_init: picking default device: Apple M2 Pro
+ggml_metal_init: use fusion         = true
+ggml_metal_init: use concurrency    = true
+ggml_metal_init: use graph optimize = true
+[Load] CondEncoder backend: MTL0 (CPU threads: 5)
+[GGUF] ../models/acestep-v15-sft-Q6_K.gguf: 678 tensors, data at offset 56800
+[WeightCtx] Loaded 140 tensors, 476.3 MB into backend
+[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond
+[Load] ConditionEncoder: 850.5 ms
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q6_K_f32', name = 'kernel_mul_mm_q6_K_f32_bci=0_bco=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q6_K_f32_bci=0_bco=1            0x120606720 | th_max =  896 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=64'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=64      0x120606ba0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4      0x1206071d0 | th_max =  640 | th_width =   32
+[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens
+[Encode] ConditionEncoder: 151.8 ms, enc_S=238
+ggml_metal_free: deallocating
+ggml_metal_free: deallocating
+[Debug] enc_hidden: [238, 2048] first4: 1.752129 -0.050073 -0.134015 0.059631
+[GGUF] ../models/acestep-v15-sft-Q6_K.gguf: 678 tensors, data at offset 56800
+[WeightCtx] Loaded 30 tensors, 82.2 MB into backend
+[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64)
+[Load] Detokenizer: 134.7 ms
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_bf16_f32_short', name = 'kernel_mul_mv_bf16_f32_short_nsg=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_bf16_f32_short_nsg=1            0x106012a70 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1      0x106013260 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q6_K_f32', name = 'kernel_mul_mv_q6_K_f32_nsg=2'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q6_K_f32_nsg=2                  0x106013b60 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_repeat_f32', name = 'kernel_repeat_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_repeat_f32                             0x1060134c0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q6_K_f32_r1_5', name = 'kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8      0x1206079b0 | th_max =  640 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=32'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=32      0x120608040 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_f32_dk128_dv128', name = 'kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32      0x1206082a0 | th_max =  448 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_reduce', name = 'kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32      0x120608730 | th_max = 1024 | th_width =   32
+[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz)
+[Context] Detokenizer: 1004.2 ms
+[Debug] detok_output: [2170, 64] first4: -0.141063 1.454431 0.315142 -0.623566
+[Context] loaded noise from rng_philox_seed42.bf16: [2170, 64] bf16
+[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656
+[Debug] context: [2170, 128] first4: -0.141063 1.454431 0.315142 -0.623566
+[DiT] Starting: T=2170, S=1085, enc_S=238, steps=50, batch=1
+[DiT] Batch N=1, T=2170, S=1085, enc_S=238
+[DiT] Graph: 1841 nodes
+[Debug] null_condition_emb: [2048] first4: 0.018066 -0.000360 0.005096 -0.000683
+[Debug] null_enc_hidden: [238, 2048] first4: 0.018066 -0.000360 0.005096 -0.000683
+[DiT] CFG enabled: guidance_scale=7.0, 2x forward per step, N=1
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f32_f32', name = 'kernel_mul_mm_f32_f32_bci=0_bco=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f32_f32_bci=0_bco=1             0x120608bb0 | th_max =  832 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_scale_f32', name = 'kernel_scale_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_scale_f32                              0x120608e10 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_timestep_embedding_f32', name = 'kernel_timestep_embedding_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_timestep_embedding_f32                 0x120609070 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_silu_f32_4', name = 'kernel_silu_f32_4'
+ggml_metal_library_compile_pipeline: loaded kernel_silu_f32_4                             0x120609f80 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1      0x12060a7f0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1      0x12060aa50 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0      0x12060b0d0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_cpy_f32_f32', name = 'kernel_cpy_f32_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_cpy_f32_f32                            0x12060b550 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4      0x12060bc10 | th_max =  640 | th_width =   32
+[Debug] tproj: [12288] first4: 0.153861 -0.117528 -0.090110 0.080834
+[Debug] temb: [2048] first4: -0.002466 -0.176370 0.004369 -0.002069
+[Debug] temb_t: [2048] first4: -0.000999 0.003474 -0.013219 -0.002130
+[Debug] temb_r: [2048] first4: -0.001467 -0.179844 0.017589 0.000062
+[Debug] sinusoidal_t: [256] first4: 0.562407 0.789701 0.439822 -0.023583
+[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000
+[Debug] temb_lin1_t: [2048] first4: -0.041140 0.030274 0.027836 -0.025460
+[Debug] temb_lin1_r: [2048] first4: 0.004272 0.006720 0.000208 -0.005103
+[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.088484 -0.913424 0.502796 0.445566
+[Debug] proj_in_input: [192, 2170] first4: -0.141063 1.454431 0.315142 -0.623566
+[Debug] enc_after_cond_emb: [2048, 238] first4: -0.194042 0.920094 0.309464 -0.544236
+[Debug] layer0_sa_input: [2048, 1085] first4: -0.914448 -0.710483 -0.040214 0.295227
+[Debug] layer0_q_after_rope: [128, 16] first4: -0.194042 0.920094 0.309464 -0.544236
+[Debug] layer0_k_after_rope: [128, 8] first4: -1.341203 -0.993715 -1.828661 -2.252987
+[Debug] layer0_sa_output: [2048, 1085] first4: -1.651907 0.800758 -0.600550 0.531539
+[Debug] layer0_attn_out: [2048, 1085] first4: -1.094031 -0.025671 -33.031021 -0.629337
+[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.804741 -1.365866 -0.176846 0.385942
+[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.975780 -1.029709 -0.454110 0.391604
+[Debug] hidden_after_layer0: [2048, 1085] first4: -9.417660 1.271689 57.716125 -1.718801
+[Debug] hidden_after_layer6: [2048, 1085] first4: -17.205166 2.402088 59.038250 -1.336451
+[Debug] hidden_after_layer12: [2048, 1085] first4: -10.471869 6.708532 -25.396618 -2.966099
+[Debug] hidden_after_layer18: [2048, 1085] first4: -4.594971 20.646416 -42.849018 -14.024486
+[Debug] hidden_after_layer23: [2048, 1085] first4: 34.838955 64.575096 51.865501 -11.288853
+[Debug] dit_step0_vt_cond: [2170, 64] first4: -0.621944 2.533786 -0.220703 1.441472
+[Debug] dit_step0_vt_uncond: [2170, 64] first4: -0.335422 2.090178 -0.712111 1.749312
+[Debug] dit_step0_vt: [2170, 64] first4: -0.817132 3.005553 0.060651 1.037697
+[Debug] dit_step0_xt: [2170, 64] first4: 0.210679 2.096139 -0.173088 0.826902
+[DiT] step 1/50 t=1.000
+[Debug] dit_step1_vt_cond: [2170, 64] first4: -0.663130 2.509250 -0.047312 1.284879
+[Debug] dit_step1_vt_uncond: [2170, 64] first4: -0.535073 2.303810 -0.132373 1.511020
+[Debug] dit_step1_vt: [2170, 64] first4: -0.505551 2.169540 -0.259578 1.319904
+[Debug] dit_step1_xt: [2170, 64] first4: 0.220790 2.052748 -0.167896 0.800504
+[DiT] step 2/50 t=0.980
+[Debug] dit_step2_vt_cond: [2170, 64] first4: -0.658958 2.478008 -0.004274 1.297306
+[Debug] dit_step2_vt_uncond: [2170, 64] first4: -0.555012 2.295219 -0.109707 1.445456
+[Debug] dit_step2_vt: [2170, 64] first4: -0.800255 2.877401 0.236257 0.991010
+[Debug] dit_step2_xt: [2170, 64] first4: 0.236795 1.995200 -0.172622 0.780684
+[DiT] step 3/50 t=0.960
+[Debug] dit_step3_vt_cond: [2170, 64] first4: -0.623685 2.405478 0.016157 1.310913
+[Debug] dit_step3_vt_uncond: [2170, 64] first4: -0.584147 2.331472 -0.052434 1.362097
+[Debug] dit_step3_vt: [2170, 64] first4: -0.460909 2.085223 -0.179917 1.477093
+[Debug] dit_step3_xt: [2170, 64] first4: 0.246013 1.953496 -0.169023 0.751142
+[DiT] step 4/50 t=0.940
+[Debug] dit_step4_vt_cond: [2170, 64] first4: -0.573136 2.336644 0.011644 1.310671
+[Debug] dit_step4_vt_uncond: [2170, 64] first4: -0.579254 2.305885 -0.063045 1.333517
+[Debug] dit_step4_vt: [2170, 64] first4: -0.615409 2.553339 0.248993 1.073298
+[Debug] dit_step4_xt: [2170, 64] first4: 0.258321 1.902429 -0.174003 0.729676
+[DiT] step 5/50 t=0.920
+[Debug] dit_step5_vt_cond: [2170, 64] first4: -0.517031 2.271256 -0.017464 1.306595
+[Debug] dit_step5_vt_uncond: [2170, 64] first4: -0.593495 2.274611 -0.028605 1.340369
+[Debug] dit_step5_vt: [2170, 64] first4: -0.226837 1.944980 -0.246283 1.356041
+[Debug] dit_step5_xt: [2170, 64] first4: 0.262858 1.863529 -0.169077 0.702555
+[DiT] step 6/50 t=0.900
+[Debug] dit_step6_vt_cond: [2170, 64] first4: -0.492334 2.237296 -0.030848 1.308453
+[Debug] dit_step6_vt_uncond: [2170, 64] first4: -0.559225 2.211779 -0.028865 1.356664
+[Debug] dit_step6_vt: [2170, 64] first4: -0.506265 2.451765 0.152689 1.063906
+[Debug] dit_step6_xt: [2170, 64] first4: 0.272983 1.814494 -0.172131 0.681277
+[DiT] step 7/50 t=0.880
+[Debug] dit_step7_vt_cond: [2170, 64] first4: -0.448929 2.166704 -0.051454 1.296641
+[Debug] dit_step7_vt_uncond: [2170, 64] first4: -0.516741 2.132911 -0.036027 1.347799
+[Debug] dit_step7_vt: [2170, 64] first4: -0.201648 1.970928 -0.247107 1.267663
+[Debug] dit_step7_xt: [2170, 64] first4: 0.277016 1.775075 -0.167189 0.655924
+[DiT] step 8/50 t=0.860
+[Debug] dit_step8_vt_cond: [2170, 64] first4: -0.412205 2.087660 -0.059856 1.279323
+[Debug] dit_step8_vt_uncond: [2170, 64] first4: -0.450006 2.049930 -0.047531 1.315284
+[Debug] dit_step8_vt: [2170, 64] first4: -0.455255 2.220170 0.079066 1.097043
+[Debug] dit_step8_xt: [2170, 64] first4: 0.286121 1.730672 -0.168770 0.633983
+[DiT] step 9/50 t=0.840
+[Debug] dit_step9_vt_cond: [2170, 64] first4: -0.378611 2.005139 -0.068357 1.261302
+[Debug] dit_step9_vt_uncond: [2170, 64] first4: -0.379138 1.976703 -0.055227 1.283708
+[Debug] dit_step9_vt: [2170, 64] first4: -0.302897 1.841593 -0.214058 1.239796
+[Debug] dit_step9_xt: [2170, 64] first4: 0.292179 1.693840 -0.164489 0.609187
+[DiT] step 10/50 t=0.820
+[Debug] dit_step10_vt_cond: [2170, 64] first4: -0.355196 1.928296 -0.064251 1.252096
+[Debug] dit_step10_vt_uncond: [2170, 64] first4: -0.318644 1.912086 -0.042776 1.279346
+[Debug] dit_step10_vt: [2170, 64] first4: -0.498521 1.941743 -0.002070 1.076981
+[Debug] dit_step10_xt: [2170, 64] first4: 0.302150 1.655005 -0.164448 0.587648
+[DiT] step 11/50 t=0.800
+[Debug] dit_step11_vt_cond: [2170, 64] first4: -0.335946 1.853130 -0.070414 1.240474
+[Debug] dit_step11_vt_uncond: [2170, 64] first4: -0.288372 1.862547 -0.022430 1.290233
+[Debug] dit_step11_vt: [2170, 64] first4: -0.354338 1.618477 -0.262413 1.094882
+[Debug] dit_step11_xt: [2170, 64] first4: 0.309236 1.622636 -0.159200 0.565750
+[DiT] step 12/50 t=0.780
+[Debug] dit_step12_vt_cond: [2170, 64] first4: -0.320539 1.785762 -0.070432 1.238213
+[Debug] dit_step12_vt_uncond: [2170, 64] first4: -0.262826 1.816614 -0.015609 1.306050
+[Debug] dit_step12_vt: [2170, 64] first4: -0.491342 1.658213 -0.091632 0.993836
+[Debug] dit_step12_xt: [2170, 64] first4: 0.319063 1.589471 -0.157367 0.545873
+[DiT] step 13/50 t=0.760
+[Debug] dit_step13_vt_cond: [2170, 64] first4: -0.304664 1.719325 -0.071341 1.231198
+[Debug] dit_step13_vt_uncond: [2170, 64] first4: -0.239698 1.764744 -0.021893 1.311546
+[Debug] dit_step13_vt: [2170, 64] first4: -0.399484 1.443110 -0.224588 1.008114
+[Debug] dit_step13_xt: [2170, 64] first4: 0.327053 1.560609 -0.152875 0.525711
+[DiT] step 14/50 t=0.740
+[Debug] dit_step14_vt_cond: [2170, 64] first4: -0.287200 1.663083 -0.055876 1.237650
+[Debug] dit_step14_vt_uncond: [2170, 64] first4: -0.221389 1.709672 -0.033314 1.310252
+[Debug] dit_step14_vt: [2170, 64] first4: -0.462595 1.498639 -0.013459 1.015139
+[Debug] dit_step14_xt: [2170, 64] first4: 0.336305 1.530637 -0.152606 0.505408
+[DiT] step 15/50 t=0.720
+[Debug] dit_step15_vt_cond: [2170, 64] first4: -0.268932 1.604917 -0.048117 1.238302
+[Debug] dit_step15_vt_uncond: [2170, 64] first4: -0.201852 1.656007 -0.050695 1.299129
+[Debug] dit_step15_vt: [2170, 64] first4: -0.405852 1.327548 -0.070650 1.046717
+[Debug] dit_step15_xt: [2170, 64] first4: 0.344422 1.504086 -0.151193 0.484474
+[DiT] step 16/50 t=0.700
+[Debug] dit_step16_vt_cond: [2170, 64] first4: -0.252512 1.546370 -0.039756 1.239714
+[Debug] dit_step16_vt_uncond: [2170, 64] first4: -0.180964 1.599575 -0.068729 1.281288
+[Debug] dit_step16_vt: [2170, 64] first4: -0.465245 1.345990 0.113584 1.099017
+[Debug] dit_step16_xt: [2170, 64] first4: 0.353727 1.477166 -0.153465 0.462493
+[DiT] step 17/50 t=0.680
+[Debug] dit_step17_vt_cond: [2170, 64] first4: -0.235390 1.484958 -0.035930 1.232975
+[Debug] dit_step17_vt_uncond: [2170, 64] first4: -0.159705 1.538287 -0.087686 1.257131
+[Debug] dit_step17_vt: [2170, 64] first4: -0.434245 1.219246 0.101116 1.128435
+[Debug] dit_step17_xt: [2170, 64] first4: 0.362412 1.452781 -0.155487 0.439925
+[DiT] step 18/50 t=0.660
+[Debug] dit_step18_vt_cond: [2170, 64] first4: -0.216501 1.424307 -0.036471 1.226410
+[Debug] dit_step18_vt_uncond: [2170, 64] first4: -0.139621 1.481410 -0.105683 1.236565
+[Debug] dit_step18_vt: [2170, 64] first4: -0.455215 1.183846 0.234644 1.168681
+[Debug] dit_step18_xt: [2170, 64] first4: 0.371516 1.429104 -0.160180 0.416551
+[DiT] step 19/50 t=0.640
+[Debug] dit_step19_vt_cond: [2170, 64] first4: -0.193781 1.365146 -0.052930 1.209294
+[Debug] dit_step19_vt_uncond: [2170, 64] first4: -0.117725 1.431031 -0.131161 1.210289
+[Debug] dit_step19_vt: [2170, 64] first4: -0.413012 1.059516 0.190559 1.170309
+[Debug] dit_step19_xt: [2170, 64] first4: 0.379776 1.407914 -0.163991 0.393145
+[DiT] step 20/50 t=0.620
+[Debug] dit_step20_vt_cond: [2170, 64] first4: -0.171132 1.297945 -0.074904 1.190113
+[Debug] dit_step20_vt_uncond: [2170, 64] first4: -0.094949 1.381890 -0.154719 1.182902
+[Debug] dit_step20_vt: [2170, 64] first4: -0.416916 0.961257 0.217113 1.187745
+[Debug] dit_step20_xt: [2170, 64] first4: 0.388114 1.388689 -0.168333 0.369390
+[DiT] step 21/50 t=0.600
+[Debug] dit_step21_vt_cond: [2170, 64] first4: -0.156818 1.217449 -0.102774 1.165961
+[Debug] dit_step21_vt_uncond: [2170, 64] first4: -0.076230 1.330654 -0.181784 1.153171
+[Debug] dit_step21_vt: [2170, 64] first4: -0.413541 0.733536 0.153392 1.169193
+[Debug] dit_step21_xt: [2170, 64] first4: 0.396385 1.374018 -0.171401 0.346006
+[DiT] step 22/50 t=0.580
+[Debug] dit_step22_vt_cond: [2170, 64] first4: -0.149490 1.124467 -0.129160 1.136317
+[Debug] dit_step22_vt_uncond: [2170, 64] first4: -0.066301 1.272416 -0.203485 1.126914
+[Debug] dit_step22_vt: [2170, 64] first4: -0.414485 0.550591 0.128185 1.113165
+[Debug] dit_step22_xt: [2170, 64] first4: 0.404675 1.363006 -0.173965 0.323743
+[DiT] step 23/50 t=0.560
+[Debug] dit_step23_vt_cond: [2170, 64] first4: -0.151829 1.034549 -0.154075 1.109805
+[Debug] dit_step23_vt_uncond: [2170, 64] first4: -0.064656 1.215351 -0.221178 1.104513
+[Debug] dit_step23_vt: [2170, 64] first4: -0.436045 0.335528 0.059138 1.081705
+[Debug] dit_step23_xt: [2170, 64] first4: 0.413396 1.356296 -0.175148 0.302109
+[DiT] step 24/50 t=0.540
+[Debug] dit_step24_vt_cond: [2170, 64] first4: -0.158201 0.930576 -0.174559 1.077020
+[Debug] dit_step24_vt_uncond: [2170, 64] first4: -0.070645 1.149243 -0.231212 1.077814
+[Debug] dit_step24_vt: [2170, 64] first4: -0.428632 0.115268 0.009393 1.014248
+[Debug] dit_step24_xt: [2170, 64] first4: 0.421968 1.353990 -0.175335 0.281824
+[DiT] step 25/50 t=0.520
+[Debug] dit_step25_vt_cond: [2170, 64] first4: -0.172706 0.820171 -0.187534 1.046402
+[Debug] dit_step25_vt_uncond: [2170, 64] first4: -0.084898 1.070370 -0.233334 1.052725
+[Debug] dit_step25_vt: [2170, 64] first4: -0.451610 -0.083828 -0.046615 0.975977
+[Debug] dit_step25_xt: [2170, 64] first4: 0.431001 1.355667 -0.174403 0.262304
+[DiT] step 26/50 t=0.500
+[Debug] dit_step26_vt_cond: [2170, 64] first4: -0.189538 0.693448 -0.199819 1.008348
+[Debug] dit_step26_vt_uncond: [2170, 64] first4: -0.101718 0.976297 -0.234447 1.021465
+[Debug] dit_step26_vt: [2170, 64] first4: -0.457462 -0.318421 -0.092668 0.902675
+[Debug] dit_step26_xt: [2170, 64] first4: 0.440150 1.362035 -0.172550 0.244251
+[DiT] step 27/50 t=0.480
+[Debug] dit_step27_vt_cond: [2170, 64] first4: -0.208769 0.570284 -0.208205 0.971380
+[Debug] dit_step27_vt_uncond: [2170, 64] first4: -0.121884 0.876404 -0.234553 0.991057
+[Debug] dit_step27_vt: [2170, 64] first4: -0.474894 -0.475314 -0.124745 0.852438
+[Debug] dit_step27_xt: [2170, 64] first4: 0.449648 1.371541 -0.170055 0.227202
+[DiT] step 28/50 t=0.460
+[Debug] dit_step28_vt_cond: [2170, 64] first4: -0.231451 0.438197 -0.217041 0.925335
+[Debug] dit_step28_vt_uncond: [2170, 64] first4: -0.142976 0.763591 -0.236660 0.951937
+[Debug] dit_step28_vt: [2170, 64] first4: -0.503044 -0.662942 -0.154526 0.775360
+[Debug] dit_step28_xt: [2170, 64] first4: 0.459709 1.384800 -0.166964 0.211695
+[DiT] step 29/50 t=0.440
+[Debug] dit_step29_vt_cond: [2170, 64] first4: -0.255602 0.305935 -0.225167 0.878463
+[Debug] dit_step29_vt_uncond: [2170, 64] first4: -0.161997 0.641782 -0.240440 0.915606
+[Debug] dit_step29_vt: [2170, 64] first4: -0.546771 -0.770888 -0.172051 0.693937
+[Debug] dit_step29_xt: [2170, 64] first4: 0.470644 1.400218 -0.163523 0.197816
+[DiT] step 30/50 t=0.420
+[Debug] dit_step30_vt_cond: [2170, 64] first4: -0.279856 0.166208 -0.230015 0.829064
+[Debug] dit_step30_vt_uncond: [2170, 64] first4: -0.179567 0.505654 -0.246080 0.880209
+[Debug] dit_step30_vt: [2170, 64] first4: -0.586251 -0.913478 -0.165068 0.591419
+[Debug] dit_step30_xt: [2170, 64] first4: 0.482369 1.418488 -0.160222 0.185988
+[DiT] step 31/50 t=0.400
+[Debug] dit_step31_vt_cond: [2170, 64] first4: -0.303020 0.026116 -0.234342 0.778238
+[Debug] dit_step31_vt_uncond: [2170, 64] first4: -0.194565 0.361688 -0.252981 0.842483
+[Debug] dit_step31_vt: [2170, 64] first4: -0.634687 -1.004404 -0.163206 0.505303
+[Debug] dit_step31_xt: [2170, 64] first4: 0.495063 1.438576 -0.156958 0.175882
+[DiT] step 32/50 t=0.380
+[Debug] dit_step32_vt_cond: [2170, 64] first4: -0.323948 -0.114179 -0.233989 0.723499
+[Debug] dit_step32_vt_uncond: [2170, 64] first4: -0.205043 0.215050 -0.257018 0.804404
+[Debug] dit_step32_vt: [2170, 64] first4: -0.683278 -1.112943 -0.143677 0.386194
+[Debug] dit_step32_xt: [2170, 64] first4: 0.508728 1.460835 -0.154084 0.168158
+[DiT] step 33/50 t=0.360
+[Debug] dit_step33_vt_cond: [2170, 64] first4: -0.343586 -0.257977 -0.227555 0.671680
+[Debug] dit_step33_vt_uncond: [2170, 64] first4: -0.213907 0.060018 -0.258127 0.766574
+[Debug] dit_step33_vt: [2170, 64] first4: -0.732613 -1.188033 -0.112312 0.307609
+[Debug] dit_step33_xt: [2170, 64] first4: 0.523381 1.484595 -0.151838 0.162006
+[DiT] step 34/50 t=0.340
+[Debug] dit_step34_vt_cond: [2170, 64] first4: -0.359997 -0.390317 -0.220456 0.624948
+[Debug] dit_step34_vt_uncond: [2170, 64] first4: -0.218993 -0.085222 -0.256678 0.732610
+[Debug] dit_step34_vt: [2170, 64] first4: -0.782051 -1.277164 -0.095120 0.209992
+[Debug] dit_step34_xt: [2170, 64] first4: 0.539022 1.510138 -0.149936 0.157806
+[DiT] step 35/50 t=0.320
+[Debug] dit_step35_vt_cond: [2170, 64] first4: -0.376117 -0.519578 -0.205748 0.575179
+[Debug] dit_step35_vt_uncond: [2170, 64] first4: -0.225790 -0.227651 -0.246590 0.695063
+[Debug] dit_step35_vt: [2170, 64] first4: -0.824555 -1.344921 -0.065901 0.128571
+[Debug] dit_step35_xt: [2170, 64] first4: 0.555513 1.537037 -0.148618 0.155234
+[DiT] step 36/50 t=0.300
+[Debug] dit_step36_vt_cond: [2170, 64] first4: -0.388908 -0.645821 -0.189103 0.531176
+[Debug] dit_step36_vt_uncond: [2170, 64] first4: -0.231184 -0.367522 -0.234059 0.657502
+[Debug] dit_step36_vt: [2170, 64] first4: -0.864100 -1.422112 -0.041499 0.066854
+[Debug] dit_step36_xt: [2170, 64] first4: 0.572795 1.565479 -0.147788 0.153897
+[DiT] step 37/50 t=0.280
+[Debug] dit_step37_vt_cond: [2170, 64] first4: -0.403025 -0.763416 -0.163423 0.484469
+[Debug] dit_step37_vt_uncond: [2170, 64] first4: -0.241143 -0.498980 -0.214114 0.616995
+[Debug] dit_step37_vt: [2170, 64] first4: -0.892219 -1.488433 0.005198 -0.010545
+[Debug] dit_step37_xt: [2170, 64] first4: 0.590639 1.595248 -0.147892 0.154108
+[DiT] step 38/50 t=0.260
+[Debug] dit_step38_vt_cond: [2170, 64] first4: -0.413133 -0.876420 -0.134134 0.440490
+[Debug] dit_step38_vt_uncond: [2170, 64] first4: -0.251082 -0.625665 -0.187976 0.573381
+[Debug] dit_step38_vt: [2170, 64] first4: -0.908542 -1.561219 0.031824 -0.045067
+[Debug] dit_step38_xt: [2170, 64] first4: 0.608810 1.626472 -0.148528 0.155009
+[DiT] step 39/50 t=0.240
+[Debug] dit_step39_vt_cond: [2170, 64] first4: -0.423588 -0.987294 -0.096889 0.393114
+[Debug] dit_step39_vt_uncond: [2170, 64] first4: -0.265262 -0.752590 -0.154346 0.525557
+[Debug] dit_step39_vt: [2170, 64] first4: -0.912287 -1.609328 0.086172 -0.106501
+[Debug] dit_step39_xt: [2170, 64] first4: 0.627056 1.658659 -0.150252 0.157140
+[DiT] step 40/50 t=0.220
+[Debug] dit_step40_vt_cond: [2170, 64] first4: -0.432563 -1.079476 -0.052369 0.343707
+[Debug] dit_step40_vt_uncond: [2170, 64] first4: -0.280351 -0.859098 -0.115334 0.472040
+[Debug] dit_step40_vt: [2170, 64] first4: -0.898876 -1.653373 0.143320 -0.117817
+[Debug] dit_step40_xt: [2170, 64] first4: 0.645033 1.691726 -0.153118 0.159496
+[DiT] step 41/50 t=0.200
+[Debug] dit_step41_vt_cond: [2170, 64] first4: -0.437202 -1.169207 -0.002996 0.295201
+[Debug] dit_step41_vt_uncond: [2170, 64] first4: -0.297226 -0.965837 -0.069953 0.414322
+[Debug] dit_step41_vt: [2170, 64] first4: -0.848467 -1.668746 0.201099 -0.128392
+[Debug] dit_step41_xt: [2170, 64] first4: 0.662003 1.725101 -0.157140 0.162064
+[DiT] step 42/50 t=0.180
+[Debug] dit_step42_vt_cond: [2170, 64] first4: -0.440767 -1.252195 0.052645 0.236928
+[Debug] dit_step42_vt_uncond: [2170, 64] first4: -0.315933 -1.066464 -0.015775 0.349839
+[Debug] dit_step42_vt: [2170, 64] first4: -0.805727 -1.702440 0.256392 -0.165544
+[Debug] dit_step42_xt: [2170, 64] first4: 0.678117 1.759150 -0.162268 0.165375
+[DiT] step 43/50 t=0.160
+[Debug] dit_step43_vt_cond: [2170, 64] first4: -0.441682 -1.322971 0.111570 0.178405
+[Debug] dit_step43_vt_uncond: [2170, 64] first4: -0.332472 -1.154346 0.037873 0.279796
+[Debug] dit_step43_vt: [2170, 64] first4: -0.752842 -1.718463 0.345657 -0.158267
+[Debug] dit_step43_xt: [2170, 64] first4: 0.693174 1.793519 -0.169181 0.168540
+[DiT] step 44/50 t=0.140
+[Debug] dit_step44_vt_cond: [2170, 64] first4: -0.436481 -1.389212 0.170327 0.116380
+[Debug] dit_step44_vt_uncond: [2170, 64] first4: -0.347603 -1.238272 0.093730 0.210065
+[Debug] dit_step44_vt: [2170, 64] first4: -0.673781 -1.742915 0.402893 -0.211575
+[Debug] dit_step44_xt: [2170, 64] first4: 0.706650 1.828377 -0.177239 0.172771
+[DiT] step 45/50 t=0.120
+[Debug] dit_step45_vt_cond: [2170, 64] first4: -0.432089 -1.448218 0.222042 0.052259
+[Debug] dit_step45_vt_uncond: [2170, 64] first4: -0.365667 -1.310638 0.145240 0.138891
+[Debug] dit_step45_vt: [2170, 64] first4: -0.598716 -1.780947 0.463354 -0.230193
+[Debug] dit_step45_xt: [2170, 64] first4: 0.718624 1.863996 -0.186506 0.177375
+[DiT] step 46/50 t=0.100
+[Debug] dit_step46_vt_cond: [2170, 64] first4: -0.419096 -1.509236 0.267908 -0.006720
+[Debug] dit_step46_vt_uncond: [2170, 64] first4: -0.381439 -1.387897 0.192344 0.071553
+[Debug] dit_step46_vt: [2170, 64] first4: -0.477449 -1.801432 0.493477 -0.266383
+[Debug] dit_step46_xt: [2170, 64] first4: 0.728173 1.900025 -0.196375 0.182703
+[DiT] step 47/50 t=0.080
+[Debug] dit_step47_vt_cond: [2170, 64] first4: -0.407120 -1.565253 0.302819 -0.051005
+[Debug] dit_step47_vt_uncond: [2170, 64] first4: -0.401163 -1.460867 0.230993 0.012604
+[Debug] dit_step47_vt: [2170, 64] first4: -0.369522 -1.829498 0.516088 -0.236691
+[Debug] dit_step47_xt: [2170, 64] first4: 0.735563 1.936615 -0.206697 0.187437
+[DiT] step 48/50 t=0.060
+[Debug] dit_step48_vt_cond: [2170, 64] first4: -0.382883 -1.607141 0.333374 -0.083609
+[Debug] dit_step48_vt_uncond: [2170, 64] first4: -0.407261 -1.531759 0.269964 -0.041764
+[Debug] dit_step48_vt: [2170, 64] first4: -0.250388 -1.766134 0.508857 -0.194591
+[Debug] dit_step48_xt: [2170, 64] first4: 0.740571 1.971938 -0.216874 0.191329
+[DiT] step 49/50 t=0.040
+[Debug] dit_step49_vt_cond: [2170, 64] first4: -0.416988 -1.643981 0.337042 -0.115695
+[Debug] dit_step49_vt_uncond: [2170, 64] first4: -0.434090 -1.549805 0.279877 -0.060649
+[Debug] dit_step49_vt: [2170, 64] first4: -0.398854 -1.970749 0.508508 -0.360412
+[Debug] dit_x0: [2170, 64] first4: 0.748548 2.011353 -0.227044 0.198537
+[DiT] step 50/50 t=0.020
+[DiT] Total generation: 99823.1 ms (99823.1 ms/sample)
+[Debug] dit_output: [2170, 64] first4: 0.748548 2.011353 -0.227044 0.198537
+[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128)
+[VAE] Graph: 474 nodes, T_latent=192
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_im2col_f16', name = 'kernel_im2col_f16'
+ggml_metal_library_compile_pipeline: loaded kernel_im2col_f16                             0x12060b7b0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=0'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=0             0x1206126e0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0      0x120612940 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0      0x120612fd0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sin_f32_4', name = 'kernel_sin_f32_4'
+ggml_metal_library_compile_pipeline: loaded kernel_sin_f32_4                              0x120613410 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sqr_f32_4', name = 'kernel_sqr_f32_4'
+ggml_metal_library_compile_pipeline: loaded kernel_sqr_f32_4                              0x120613a10 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_conv_transpose_1d_f32_f32', name = 'kernel_conv_transpose_1d_f32_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_conv_transpose_1d_f32_f32              0x120613f70 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_f16_f16_4', name = 'kernel_mul_mv_f16_f16_4_nsg=4'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_f16_f16_4_nsg=4                 0x120614e50 | th_max = 1024 | th_width =   32
+[VAE] Upsample factor: 1920.00 (expected ~1920)
+[VAE] Graph: 474 nodes, T_latent=256
+[VAE] Graph: 474 nodes, T_latent=186
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=1             0x10600ac10 | th_max =  896 | th_width =   32
+[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz)
+[VAE Batch0] Decode: 609612.5 ms
+[Debug] vae_audio: [2, 4166400] first4: -0.003173 -0.003180 -0.003117 -0.002677
+[VAE Batch0] Wrote ggml-sft/request00.wav: 4166400 samples (86.80s @ 48kHz stereo)
+[Request 1/1] Done
+ggml_metal_free: deallocating
+ggml_metal_free: deallocating
+[Pipeline] All done
+[Request] Loaded request0.json
+[Noise] Reusing existing rng_philox_seed42.bf16
+[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q6_K.gguf
+[GGML] Running acestep-v15-turbo-Q6_K.gguf...
+[GGML] Done, 47 dump files
+[Turbo] Reusing existing Python dumps: python-turbo
+[Turbo] Cosine similarities GGML vs Python
+  stage                          GGML vs Python
+  text_hidden                          0.999813
+  lyric_embed                          1.000000
+  enc_hidden                           0.999631
+  detok_output                         0.999969
+  context                              0.999982
+  noise                                1.000000
+  temb_t                               0.999991
+  hidden_after_proj_in                 0.999987
+  enc_after_cond_emb                   0.999648
+  layer0_sa_output                     0.999791
+  hidden_after_layer0                  0.999898
+  hidden_after_layer6                  0.999877
+  hidden_after_layer12                 0.998721
+  hidden_after_layer18                 0.995721
+  hidden_after_layer23                 0.992012
+  dit_step0_vt                         0.970006
+  dit_step0_xt                         0.999934
+  dit_step1_vt                         0.973568
+  dit_step1_xt                         0.999795
+  dit_step2_vt                         0.976942
+  dit_step2_xt                         0.999458
+  dit_step3_vt                         0.977714
+  dit_step3_xt                         0.998700
+  dit_step4_vt                         0.976433
+  dit_step4_xt                         0.997003
+  dit_step5_vt                         0.973498
+  dit_step5_xt                         0.993187
+  dit_step6_vt                         0.970259
+  dit_step6_xt                         0.985910
+  dit_step7_vt                         0.963169
+  dit_x0                               0.975098
+  vae_audio                            0.894235
+  vae_audio (log spectral)             0.999805
+[Turbo] Error growth GGML vs Python
+  stage                         cos    max_err   mean_err     mean_A      std_A     mean_B      std_B
+  dit_step0_xt             0.999933   0.149343   0.007343  -0.002290   0.972926  -0.002342   0.972003
+  dit_step1_xt             0.999794   0.293469   0.012678  -0.005225   0.942670  -0.005313   0.941730
+  dit_step2_xt             0.999456   0.477608   0.019544  -0.009177   0.909085  -0.009311   0.908527
+  dit_step3_xt             0.998699   0.734476   0.028962  -0.014472   0.873547  -0.014577   0.873624
+  dit_step4_xt             0.997001   1.052176   0.042099  -0.021523   0.841546  -0.021660   0.841995
+  dit_step5_xt             0.993185   1.523836   0.061805  -0.031603   0.824654  -0.032109   0.824593
+  dit_step6_xt             0.985908   2.172513   0.091680  -0.045910   0.855362  -0.046482   0.855546
+[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-Q6_K.gguf
+[GGML] Running acestep-v15-sft-Q6_K.gguf...
+[GGML] Done, 233 dump files
+[SFT] Reusing existing Python dumps: python-sft
+[SFT] Cosine similarities GGML vs Python
+  stage                          GGML vs Python
+  text_hidden                          0.999813
+  lyric_embed                          1.000000
+  enc_hidden                           0.999631
+  detok_output                         0.999969
+  context                              0.999982
+  noise                                1.000000
+  temb_t                               0.999973
+  hidden_after_proj_in                 0.999987
+  enc_after_cond_emb                   0.999652
+  layer0_sa_output                     0.999803
+  hidden_after_layer0                  0.999920
+  hidden_after_layer6                  0.999785
+  hidden_after_layer12                 0.999350
+  hidden_after_layer18                 0.998528
+  hidden_after_layer23                 0.998828
+  null_condition_emb                   1.000000
+  null_enc_hidden                      1.000000
+  dit_step0_vt_cond                    0.998729
+  dit_step0_vt_uncond                  0.998412
+  dit_step0_vt                         0.995061
+  dit_step0_xt                         0.999998
+  dit_step5_vt_cond                    0.999147
+  dit_step5_vt                         0.992746
+  dit_step5_xt                         0.999953
+  dit_step10_vt_cond                   0.997986
+  dit_step10_vt                        0.991731
+  dit_step10_xt                        0.999832
+  dit_step15_vt_cond                   0.995896
+  dit_step15_vt                        0.984377
+  dit_step15_xt                        0.999476
+  dit_step20_vt_cond                   0.992581
+  dit_step20_vt                        0.975428
+  dit_step20_xt                        0.998605
+  dit_step25_vt_cond                   0.986288
+  dit_step25_vt                        0.963191
+  dit_step25_xt                        0.996899
+  dit_step30_vt_cond                   0.979478
+  dit_step30_vt                        0.956129
+  dit_step30_xt                        0.994252
+  dit_step35_vt_cond                   0.972963
+  dit_step35_vt                        0.947489
+  dit_step35_xt                        0.991078
+  dit_step40_vt_cond                   0.968903
+  dit_step40_vt                        0.939482
+  dit_step40_xt                        0.987990
+  dit_step45_vt_cond                   0.973091
+  dit_step45_vt                        0.949768
+  dit_step45_xt                        0.985825
+  dit_step49_vt_cond                   0.979346
+  dit_step49_vt                        0.959720
+  dit_x0                               0.985104
+  vae_audio                            0.940564
+  vae_audio (log spectral)             0.999648
+[SFT] Error growth GGML vs Python
+  stage                         cos    max_err   mean_err     mean_A      std_A     mean_B      std_B
+  dit_step0_xt             0.999996   0.039016   0.002154  -0.001750   0.980178  -0.001741   0.980402
+  dit_step5_xt             0.999952   0.136674   0.006709  -0.006940   0.889822  -0.007143   0.887999
+  dit_step10_xt            0.999831   0.203842   0.011045  -0.012357   0.811533  -0.012603   0.811299
+  dit_step15_xt            0.999475   0.335757   0.017566  -0.017603   0.746439  -0.018114   0.745269
+  dit_step20_xt            0.998605   0.555654   0.026541  -0.022932   0.700822  -0.023808   0.699582
+  dit_step25_xt            0.996899   0.830926   0.037973  -0.028358   0.679564  -0.029311   0.679278
+  dit_step30_xt            0.994252   1.135793   0.051746  -0.033803   0.685565  -0.035027   0.685262
+  dit_step35_xt            0.991078   1.467212   0.067373  -0.039173   0.717556  -0.040716   0.717196
+  dit_step40_xt            0.987990   1.880554   0.084328  -0.044527   0.771174  -0.046462   0.771853
+  dit_step45_xt            0.985824   2.238589   0.100473  -0.050335   0.842316  -0.052475   0.843036
diff --git a/tests/Metal-Q8_0.log b/tests/Metal-Q8_0.log
new file mode 100644
index 0000000..3ddbb85
--- /dev/null
+++ b/tests/Metal-Q8_0.log
@@ -0,0 +1,823 @@
+ggml_metal_device_init: tensor API disabled for pre-M5 and pre-A19 devices
+ggml_metal_library_init: using embedded metal library
+ggml_metal_library_init: loaded in 0.006 sec
+ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
+ggml_metal_device_init: GPU name:   MTL0
+ggml_metal_device_init: GPU family: MTLGPUFamilyApple8  (1008)
+ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
+ggml_metal_device_init: GPU family: MTLGPUFamilyMetal3  (5001)
+ggml_metal_device_init: simdgroup reduction   = true
+ggml_metal_device_init: simdgroup matrix mul. = true
+ggml_metal_device_init: has unified memory    = true
+ggml_metal_device_init: has bfloat            = true
+ggml_metal_device_init: has tensor            = false
+ggml_metal_device_init: use residency sets    = true
+ggml_metal_device_init: use shared buffers    = true
+ggml_metal_device_init: recommendedMaxWorkingSetSize  = 11453.25 MB
+ggml_metal_init: allocating
+ggml_metal_init: found device: Apple M2 Pro
+ggml_metal_init: picking default device: Apple M2 Pro
+ggml_metal_init: use fusion         = true
+ggml_metal_init: use concurrency    = true
+ggml_metal_init: use graph optimize = true
+[Load] DiT backend: MTL0 (CPU threads: 5)
+[Load] Backend init: 21.3 ms
+[GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864
+[DiT] MLP: gate+up fused
+[Load] null_condition_emb found (CFG available)
+[WeightCtx] Loaded 478 tensors, 1600.7 MB into backend
+[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128
+[Load] DiT weight load: 1779.3 ms
+[GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864
+[Load] silence_latent: [15000, 64] from GGUF
+[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048
+ggml_metal_init: allocating
+ggml_metal_init: found device: Apple M2 Pro
+ggml_metal_init: picking default device: Apple M2 Pro
+ggml_metal_init: use fusion         = true
+ggml_metal_init: use concurrency    = true
+ggml_metal_init: use graph optimize = true
+[Load] VAE backend: MTL0 (CPU threads: 5)
+[VAE] Backend: MTL0, Weight buffer: 255.7 MB
+[VAE] Loaded: 5 blocks, upsample=1920x
+[Load] VAE weights: 272.0 ms
+[Request 1/1] ggml-turbo/request0.json (batch=1)
+[Request] parsed ggml-turbo/request0.json (18 fields)
+[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0)
+[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s
+[Pipeline] 434 audio codes (86.8s @ 5Hz)
+[Pipeline] T=2170, S=1085
+[BPE] Loaded from GGUF: 151643 vocab, 151387 merges
+[Load] BPE tokenizer: 41.5 ms
+[Pipeline] caption: 70 tokens, lyrics: 167 tokens
+ggml_metal_init: allocating
+ggml_metal_init: found device: Apple M2 Pro
+ggml_metal_init: picking default device: Apple M2 Pro
+ggml_metal_init: use fusion         = true
+ggml_metal_init: use concurrency    = true
+ggml_metal_init: use graph optimize = true
+[Load] TextEncoder backend: MTL0 (CPU threads: 5)
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend
+[Load] TextEncoder: 28L, H=1024, Nh=16/8
+[Load] TextEncoder: 228.7 ms
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_get_rows_bf16', name = 'kernel_get_rows_bf16'
+ggml_metal_library_compile_pipeline: loaded kernel_get_rows_bf16                          0x14170b900 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rms_norm_mul_f32_4', name = 'kernel_rms_norm_mul_f32_4'
+ggml_metal_library_compile_pipeline: loaded kernel_rms_norm_mul_f32_4                     0x14170bd60 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_bf16_f32', name = 'kernel_mul_mm_bf16_f32_bci=0_bco=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_bf16_f32_bci=0_bco=1            0x14170c5d0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rope_neox_f32', name = 'kernel_rope_neox_f32_imrope=0'
+ggml_metal_library_compile_pipeline: loaded kernel_rope_neox_f32_imrope=0                 0x14170ca50 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=1_ncpsg=64'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=1_ncpsg=64      0x14170d2c0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_blk', name = 'kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64      0x14170d8f0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4      0x14170e1d0 | th_max =  576 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0      0x14170e5e0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_swiglu_f32', name = 'kernel_swiglu_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_swiglu_f32                             0x14170e840 | th_max = 1024 | th_width =   32
+[Encode] TextEncoder (70 tokens): 44.9 ms
+[Debug] text_hidden: [70, 1024] first4: 3.703506 2.446818 0.222721 -13.133463
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[Encode] Lyric vocab lookup (167 tokens): 34.4 ms
+[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809
+ggml_metal_init: allocating
+ggml_metal_init: found device: Apple M2 Pro
+ggml_metal_init: picking default device: Apple M2 Pro
+ggml_metal_init: use fusion         = true
+ggml_metal_init: use concurrency    = true
+ggml_metal_init: use graph optimize = true
+[Load] CondEncoder backend: MTL0 (CPU threads: 5)
+[GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864
+[WeightCtx] Loaded 140 tensors, 616.6 MB into backend
+[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond
+[Load] ConditionEncoder: 787.4 ms
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q8_0_f32', name = 'kernel_mul_mm_q8_0_f32_bci=0_bco=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q8_0_f32_bci=0_bco=1            0x141718bd0 | th_max =  896 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=64'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=64      0x141719010 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4      0x141719670 | th_max =  640 | th_width =   32
+[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens
+[Encode] ConditionEncoder: 132.0 ms, enc_S=238
+ggml_metal_free: deallocating
+ggml_metal_free: deallocating
+[Debug] enc_hidden: [238, 2048] first4: 1.750375 -0.049236 -0.134516 0.059443
+[GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864
+[WeightCtx] Loaded 30 tensors, 106.5 MB into backend
+[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64)
+[Load] Detokenizer: 165.9 ms
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_bf16_f32_short', name = 'kernel_mul_mv_bf16_f32_short_nsg=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_bf16_f32_short_nsg=1            0x14160b720 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1      0x14160c030 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q8_0_f32', name = 'kernel_mul_mv_q8_0_f32_nsg=4'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q8_0_f32_nsg=4                  0x14160c760 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_repeat_f32', name = 'kernel_repeat_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_repeat_f32                             0x14160c9c0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_cpy_q8_0_f32', name = 'kernel_cpy_q8_0_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_cpy_q8_0_f32                           0x14160ce00 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q8_0_f32_r1_5', name = 'kernel_mul_mv_ext_q8_0_f32_r1_5_nsg=2_nxpsg=8'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q8_0_f32_r1_5_nsg=2_nxpsg=8      0x14160da60 | th_max =  640 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=32'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=32      0x14160dde0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_f32_dk128_dv128', name = 'kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32      0x14160e040 | th_max =  448 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_reduce', name = 'kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32      0x14160ea80 | th_max = 1024 | th_width =   32
+[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz)
+[Context] Detokenizer: 664.8 ms
+[Debug] detok_output: [2170, 64] first4: -0.124953 1.437660 0.307949 -0.624704
+[Context] loaded noise from rng_philox_seed42.bf16: [2170, 64] bf16
+[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656
+[Debug] context: [2170, 128] first4: -0.124953 1.437660 0.307949 -0.624704
+[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1
+[DiT] Batch N=1, T=2170, S=1085, enc_S=238
+[DiT] Graph: 1841 nodes
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f32_f32', name = 'kernel_mul_mm_f32_f32_bci=0_bco=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f32_f32_bci=0_bco=1             0x14160f030 | th_max =  832 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_scale_f32', name = 'kernel_scale_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_scale_f32                              0x14160aa00 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_timestep_embedding_f32', name = 'kernel_timestep_embedding_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_timestep_embedding_f32                 0x14160f9a0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_silu_f32_4', name = 'kernel_silu_f32_4'
+ggml_metal_library_compile_pipeline: loaded kernel_silu_f32_4                             0x1416102f0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1      0x1416109b0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1      0x1416113c0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0      0x1416118b0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_cpy_f32_f32', name = 'kernel_cpy_f32_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_cpy_f32_f32                            0x141610670 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4      0x141611dc0 | th_max =  640 | th_width =   32
+[Debug] tproj: [12288] first4: 0.260124 -0.161873 -0.097043 0.052039
+[Debug] temb: [2048] first4: 0.000130 -0.132501 -0.035452 0.064788
+[Debug] temb_t: [2048] first4: 0.001146 0.026826 -0.052770 0.063722
+[Debug] temb_r: [2048] first4: -0.001015 -0.159327 0.017318 0.001066
+[Debug] sinusoidal_t: [256] first4: 0.562407 0.789701 0.439822 -0.023583
+[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000
+[Debug] temb_lin1_t: [2048] first4: -0.048950 -0.051683 -0.015299 -0.038721
+[Debug] temb_lin1_r: [2048] first4: -0.013066 -0.018836 -0.015732 0.008463
+[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.039670 -0.968864 0.535370 0.447502
+[Debug] proj_in_input: [192, 2170] first4: -0.124953 1.437660 0.307949 -0.624704
+[Debug] enc_after_cond_emb: [2048, 238] first4: -0.166626 0.855863 0.327675 -0.524847
+[Debug] layer0_sa_input: [2048, 1085] first4: -0.719151 -0.764016 -0.047420 0.261850
+[Debug] layer0_q_after_rope: [128, 16] first4: -0.166626 0.855863 0.327675 -0.524847
+[Debug] layer0_k_after_rope: [128, 8] first4: -1.224136 -0.868663 -1.921617 -2.258156
+[Debug] layer0_sa_output: [2048, 1085] first4: -1.509784 0.173032 -0.350482 0.513236
+[Debug] layer0_attn_out: [2048, 1085] first4: -1.198180 -0.062361 -34.349155 -0.672093
+[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.543300 -1.045637 0.193163 0.457042
+[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.610051 -0.820831 -0.300355 0.492100
+[Debug] hidden_after_layer0: [2048, 1085] first4: -9.086482 0.559607 52.406876 -0.905369
+[Debug] hidden_after_layer6: [2048, 1085] first4: -21.433996 -0.091888 33.781910 -4.433238
+[Debug] hidden_after_layer12: [2048, 1085] first4: -15.201079 -18.070684 72.561172 28.713606
+[Debug] hidden_after_layer18: [2048, 1085] first4: -26.474438 14.961594 62.515419 20.237282
+[Debug] hidden_after_layer23: [2048, 1085] first4: -7.978052 44.256046 198.826355 145.129532
+[Debug] dit_step0_vt: [2170, 64] first4: 0.022187 1.144711 0.357881 2.375370
+[Debug] dit_step0_xt: [2170, 64] first4: 0.193327 2.104218 -0.188142 0.739685
+[DiT] step 1/8 t=1.000
+[Debug] dit_step1_vt: [2170, 64] first4: -0.218329 1.318116 -0.102946 1.902612
+[Debug] dit_step1_xt: [2170, 64] first4: 0.205236 2.032320 -0.182527 0.635906
+[DiT] step 2/8 t=0.955
+[Debug] dit_step2_vt: [2170, 64] first4: 0.013556 1.214101 0.135533 2.387155
+[Debug] dit_step2_xt: [2170, 64] first4: 0.204333 1.951380 -0.191563 0.476762
+[DiT] step 3/8 t=0.900
+[Debug] dit_step3_vt: [2170, 64] first4: 0.268002 1.073703 0.267121 2.643928
+[Debug] dit_step3_xt: [2170, 64] first4: 0.181999 1.861905 -0.213823 0.256435
+[DiT] step 4/8 t=0.833
+[Debug] dit_step4_vt: [2170, 64] first4: 0.304715 1.021377 0.118111 2.720495
+[Debug] dit_step4_xt: [2170, 64] first4: 0.149351 1.752472 -0.226477 -0.035047
+[DiT] step 5/8 t=0.750
+[Debug] dit_step5_vt: [2170, 64] first4: 0.279773 0.924189 -0.283976 2.780081
+[Debug] dit_step5_xt: [2170, 64] first4: 0.109384 1.620445 -0.185910 -0.432201
+[DiT] step 6/8 t=0.643
+[Debug] dit_step6_vt: [2170, 64] first4: 0.163348 0.641980 -0.841978 2.816087
+[Debug] dit_step6_xt: [2170, 64] first4: 0.076714 1.492049 -0.017514 -0.995418
+[DiT] step 7/8 t=0.500
+[Debug] dit_step7_vt: [2170, 64] first4: -0.026257 0.197844 -1.519455 3.080479
+[Debug] dit_x0: [2170, 64] first4: 0.084591 1.432696 0.438323 -1.919562
+[DiT] step 8/8 t=0.300
+[DiT] Total generation: 7097.2 ms (7097.2 ms/sample)
+[Debug] dit_output: [2170, 64] first4: 0.084591 1.432696 0.438323 -1.919562
+[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128)
+[VAE] Graph: 474 nodes, T_latent=192
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_im2col_f16', name = 'kernel_im2col_f16'
+ggml_metal_library_compile_pipeline: loaded kernel_im2col_f16                             0x141717870 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=0'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=0             0x1417100a0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0      0x1417089d0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0      0x141708e50 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sin_f32_4', name = 'kernel_sin_f32_4'
+ggml_metal_library_compile_pipeline: loaded kernel_sin_f32_4                              0x141719b10 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sqr_f32_4', name = 'kernel_sqr_f32_4'
+ggml_metal_library_compile_pipeline: loaded kernel_sqr_f32_4                              0x14171a1d0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_conv_transpose_1d_f32_f32', name = 'kernel_conv_transpose_1d_f32_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_conv_transpose_1d_f32_f32              0x14171a730 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_f16_f16_4', name = 'kernel_mul_mv_f16_f16_4_nsg=4'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_f16_f16_4_nsg=4                 0x14171b660 | th_max = 1024 | th_width =   32
+[VAE] Upsample factor: 1920.00 (expected ~1920)
+[VAE] Graph: 474 nodes, T_latent=256
+[VAE] Graph: 474 nodes, T_latent=186
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=1             0x141618f80 | th_max =  896 | th_width =   32
+[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz)
+[VAE Batch0] Decode: 609553.7 ms
+[Debug] vae_audio: [2, 4166400] first4: 0.000462 0.000971 0.000803 0.001170
+[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo)
+[Request 1/1] Done
+ggml_metal_free: deallocating
+ggml_metal_free: deallocating
+[Pipeline] All done
+ggml_metal_device_init: tensor API disabled for pre-M5 and pre-A19 devices
+ggml_metal_library_init: using embedded metal library
+ggml_metal_library_init: loaded in 0.006 sec
+ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
+ggml_metal_device_init: GPU name:   MTL0
+ggml_metal_device_init: GPU family: MTLGPUFamilyApple8  (1008)
+ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
+ggml_metal_device_init: GPU family: MTLGPUFamilyMetal3  (5001)
+ggml_metal_device_init: simdgroup reduction   = true
+ggml_metal_device_init: simdgroup matrix mul. = true
+ggml_metal_device_init: has unified memory    = true
+ggml_metal_device_init: has bfloat            = true
+ggml_metal_device_init: has tensor            = false
+ggml_metal_device_init: use residency sets    = true
+ggml_metal_device_init: use shared buffers    = true
+ggml_metal_device_init: recommendedMaxWorkingSetSize  = 11453.25 MB
+ggml_metal_init: allocating
+ggml_metal_init: found device: Apple M2 Pro
+ggml_metal_init: picking default device: Apple M2 Pro
+ggml_metal_init: use fusion         = true
+ggml_metal_init: use concurrency    = true
+ggml_metal_init: use graph optimize = true
+[Load] DiT backend: MTL0 (CPU threads: 5)
+[Load] Backend init: 20.2 ms
+[GGUF] ../models/acestep-v15-sft-Q8_0.gguf: 678 tensors, data at offset 56800
+[DiT] MLP: gate+up fused
+[Load] null_condition_emb found (CFG available)
+[WeightCtx] Loaded 478 tensors, 1600.7 MB into backend
+[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128
+[Load] DiT weight load: 2506.1 ms
+[GGUF] ../models/acestep-v15-sft-Q8_0.gguf: 678 tensors, data at offset 56800
+[Load] silence_latent: [15000, 64] from GGUF
+[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048
+ggml_metal_init: allocating
+ggml_metal_init: found device: Apple M2 Pro
+ggml_metal_init: picking default device: Apple M2 Pro
+ggml_metal_init: use fusion         = true
+ggml_metal_init: use concurrency    = true
+ggml_metal_init: use graph optimize = true
+[Load] VAE backend: MTL0 (CPU threads: 5)
+[VAE] Backend: MTL0, Weight buffer: 255.7 MB
+[VAE] Loaded: 5 blocks, upsample=1920x
+[Load] VAE weights: 340.1 ms
+[Request 1/1] ggml-sft/request0.json (batch=1)
+[Request] parsed ggml-sft/request0.json (18 fields)
+[Pipeline] seed=42, steps=50, guidance=7.0, shift=1.0, duration=88.0s
+[Pipeline] 434 audio codes (86.8s @ 5Hz)
+[Pipeline] T=2170, S=1085
+[BPE] Loaded from GGUF: 151643 vocab, 151387 merges
+[Load] BPE tokenizer: 40.9 ms
+[Pipeline] caption: 70 tokens, lyrics: 167 tokens
+ggml_metal_init: allocating
+ggml_metal_init: found device: Apple M2 Pro
+ggml_metal_init: picking default device: Apple M2 Pro
+ggml_metal_init: use fusion         = true
+ggml_metal_init: use concurrency    = true
+ggml_metal_init: use graph optimize = true
+[Load] TextEncoder backend: MTL0 (CPU threads: 5)
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend
+[Load] TextEncoder: 28L, H=1024, Nh=16/8
+[Load] TextEncoder: 238.6 ms
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_get_rows_bf16', name = 'kernel_get_rows_bf16'
+ggml_metal_library_compile_pipeline: loaded kernel_get_rows_bf16                          0x13060e0d0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rms_norm_mul_f32_4', name = 'kernel_rms_norm_mul_f32_4'
+ggml_metal_library_compile_pipeline: loaded kernel_rms_norm_mul_f32_4                     0x13060e830 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_bf16_f32', name = 'kernel_mul_mm_bf16_f32_bci=0_bco=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_bf16_f32_bci=0_bco=1            0x13060f670 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rope_neox_f32', name = 'kernel_rope_neox_f32_imrope=0'
+ggml_metal_library_compile_pipeline: loaded kernel_rope_neox_f32_imrope=0                 0x13060f8d0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=1_ncpsg=64'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=1_ncpsg=64      0x130610350 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_blk', name = 'kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64      0x1306108b0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4      0x130610b10 | th_max =  576 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0      0x1306115a0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_swiglu_f32', name = 'kernel_swiglu_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_swiglu_f32                             0x130611800 | th_max = 1024 | th_width =   32
+[Encode] TextEncoder (70 tokens): 49.2 ms
+[Debug] text_hidden: [70, 1024] first4: 3.703506 2.446818 0.222721 -13.133463
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[Encode] Lyric vocab lookup (167 tokens): 34.1 ms
+[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809
+ggml_metal_init: allocating
+ggml_metal_init: found device: Apple M2 Pro
+ggml_metal_init: picking default device: Apple M2 Pro
+ggml_metal_init: use fusion         = true
+ggml_metal_init: use concurrency    = true
+ggml_metal_init: use graph optimize = true
+[Load] CondEncoder backend: MTL0 (CPU threads: 5)
+[GGUF] ../models/acestep-v15-sft-Q8_0.gguf: 678 tensors, data at offset 56800
+[WeightCtx] Loaded 140 tensors, 616.6 MB into backend
+[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond
+[Load] ConditionEncoder: 615.4 ms
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q8_0_f32', name = 'kernel_mul_mm_q8_0_f32_bci=0_bco=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q8_0_f32_bci=0_bco=1            0x130709710 | th_max =  896 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=64'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=64      0x130709b90 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4      0x13070a1c0 | th_max =  640 | th_width =   32
+[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens
+[Encode] ConditionEncoder: 129.6 ms, enc_S=238
+ggml_metal_free: deallocating
+ggml_metal_free: deallocating
+[Debug] enc_hidden: [238, 2048] first4: 1.750375 -0.049236 -0.134516 0.059443
+[GGUF] ../models/acestep-v15-sft-Q8_0.gguf: 678 tensors, data at offset 56800
+[WeightCtx] Loaded 30 tensors, 106.5 MB into backend
+[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64)
+[Load] Detokenizer: 77.5 ms
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_bf16_f32_short', name = 'kernel_mul_mv_bf16_f32_short_nsg=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_bf16_f32_short_nsg=1            0x130708890 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1      0x13070ab80 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q8_0_f32', name = 'kernel_mul_mv_q8_0_f32_nsg=4'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q8_0_f32_nsg=4                  0x13070ade0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_repeat_f32', name = 'kernel_repeat_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_repeat_f32                             0x13070b260 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_cpy_q8_0_f32', name = 'kernel_cpy_q8_0_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_cpy_q8_0_f32                           0x13070b970 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q8_0_f32_r1_5', name = 'kernel_mul_mv_ext_q8_0_f32_r1_5_nsg=2_nxpsg=8'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q8_0_f32_r1_5_nsg=2_nxpsg=8      0x13070c5e0 | th_max =  640 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=32'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=32      0x13070c840 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_f32_dk128_dv128', name = 'kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32      0x13070cc50 | th_max =  448 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_reduce', name = 'kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32      0x13070d590 | th_max = 1024 | th_width =   32
+[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz)
+[Context] Detokenizer: 663.7 ms
+[Debug] detok_output: [2170, 64] first4: -0.124953 1.437660 0.307949 -0.624704
+[Context] loaded noise from rng_philox_seed42.bf16: [2170, 64] bf16
+[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656
+[Debug] context: [2170, 128] first4: -0.124953 1.437660 0.307949 -0.624704
+[DiT] Starting: T=2170, S=1085, enc_S=238, steps=50, batch=1
+[DiT] Batch N=1, T=2170, S=1085, enc_S=238
+[DiT] Graph: 1841 nodes
+[Debug] null_condition_emb: [2048] first4: 0.018066 -0.000360 0.005096 -0.000683
+[Debug] null_enc_hidden: [238, 2048] first4: 0.018066 -0.000360 0.005096 -0.000683
+[DiT] CFG enabled: guidance_scale=7.0, 2x forward per step, N=1
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f32_f32', name = 'kernel_mul_mm_f32_f32_bci=0_bco=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f32_f32_bci=0_bco=1             0x130612120 | th_max =  832 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_scale_f32', name = 'kernel_scale_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_scale_f32                              0x13061a740 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_timestep_embedding_f32', name = 'kernel_timestep_embedding_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_timestep_embedding_f32                 0x13061af30 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_silu_f32_4', name = 'kernel_silu_f32_4'
+ggml_metal_library_compile_pipeline: loaded kernel_silu_f32_4                             0x13061b880 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1      0x13061bf40 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1      0x13061c950 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0      0x13061ce40 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_cpy_f32_f32', name = 'kernel_cpy_f32_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_cpy_f32_f32                            0x13061bc00 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4'
+ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4      0x13061d350 | th_max =  640 | th_width =   32
+[Debug] tproj: [12288] first4: 0.154891 -0.116111 -0.086457 0.081949
+[Debug] temb: [2048] first4: -0.002756 -0.176432 0.004178 -0.001982
+[Debug] temb_t: [2048] first4: -0.001185 0.003330 -0.013113 -0.002073
+[Debug] temb_r: [2048] first4: -0.001571 -0.179762 0.017291 0.000091
+[Debug] sinusoidal_t: [256] first4: 0.562407 0.789701 0.439822 -0.023583
+[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000
+[Debug] temb_lin1_t: [2048] first4: -0.038370 0.029875 0.028026 -0.024772
+[Debug] temb_lin1_r: [2048] first4: 0.001863 0.003353 -0.000552 -0.000197
+[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.090297 -0.925707 0.497575 0.441158
+[Debug] proj_in_input: [192, 2170] first4: -0.124953 1.437660 0.307949 -0.624704
+[Debug] enc_after_cond_emb: [2048, 238] first4: -0.185352 0.911478 0.323017 -0.548477
+[Debug] layer0_sa_input: [2048, 1085] first4: -0.915448 -0.716620 -0.037252 0.294421
+[Debug] layer0_q_after_rope: [128, 16] first4: -0.185352 0.911478 0.323017 -0.548477
+[Debug] layer0_k_after_rope: [128, 8] first4: -1.352387 -0.993045 -1.790654 -2.255961
+[Debug] layer0_sa_output: [2048, 1085] first4: -1.646956 0.807681 -0.548860 0.541048
+[Debug] layer0_attn_out: [2048, 1085] first4: -1.095906 0.020644 -33.592499 -0.642108
+[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.803416 -1.383010 -0.122776 0.380531
+[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.967658 -1.044807 -0.403243 0.382801
+[Debug] hidden_after_layer0: [2048, 1085] first4: -9.409370 1.156843 57.423218 -1.617135
+[Debug] hidden_after_layer6: [2048, 1085] first4: -17.478519 4.365310 57.869217 -2.590640
+[Debug] hidden_after_layer12: [2048, 1085] first4: -10.105301 4.450487 -23.010748 -1.911694
+[Debug] hidden_after_layer18: [2048, 1085] first4: -3.554647 18.982800 -38.417198 -11.175929
+[Debug] hidden_after_layer23: [2048, 1085] first4: 34.303474 64.211922 58.212040 -9.279413
+[Debug] dit_step0_vt_cond: [2170, 64] first4: -0.583175 2.543502 -0.173930 1.461996
+[Debug] dit_step0_vt_uncond: [2170, 64] first4: -0.311259 2.094594 -0.691959 1.747500
+[Debug] dit_step0_vt: [2170, 64] first4: -0.765032 3.014946 0.101383 1.079777
+[Debug] dit_step0_xt: [2170, 64] first4: 0.209637 2.095951 -0.173903 0.826061
+[DiT] step 1/50 t=1.000
+[Debug] dit_step1_vt_cond: [2170, 64] first4: -0.635693 2.541408 0.005301 1.299802
+[Debug] dit_step1_vt_uncond: [2170, 64] first4: -0.555907 2.342016 -0.102693 1.478369
+[Debug] dit_step1_vt: [2170, 64] first4: -0.416332 2.156285 -0.189682 1.386406
+[Debug] dit_step1_xt: [2170, 64] first4: 0.217963 2.052825 -0.170109 0.798333
+[DiT] step 2/50 t=0.980
+[Debug] dit_step2_vt_cond: [2170, 64] first4: -0.628899 2.502596 0.041825 1.309368
+[Debug] dit_step2_vt_uncond: [2170, 64] first4: -0.545262 2.302532 -0.090023 1.423282
+[Debug] dit_step2_vt: [2170, 64] first4: -0.777333 2.936136 0.278430 1.015660
+[Debug] dit_step2_xt: [2170, 64] first4: 0.233510 1.994103 -0.175678 0.778019
+[DiT] step 3/50 t=0.960
+[Debug] dit_step3_vt_cond: [2170, 64] first4: -0.587814 2.422845 0.053595 1.321287
+[Debug] dit_step3_vt_uncond: [2170, 64] first4: -0.577692 2.332772 -0.037127 1.357753
+[Debug] dit_step3_vt: [2170, 64] first4: -0.357759 2.100169 -0.131042 1.500563
+[Debug] dit_step3_xt: [2170, 64] first4: 0.240665 1.952099 -0.173057 0.748008
+[DiT] step 4/50 t=0.940
+[Debug] dit_step4_vt_cond: [2170, 64] first4: -0.531520 2.346682 0.048538 1.319634
+[Debug] dit_step4_vt_uncond: [2170, 64] first4: -0.572059 2.299766 -0.047866 1.336576
+[Debug] dit_step4_vt: [2170, 64] first4: -0.562290 2.601383 0.287872 1.079526
+[Debug] dit_step4_xt: [2170, 64] first4: 0.251911 1.900071 -0.178814 0.726417
+[DiT] step 5/50 t=0.920
+[Debug] dit_step5_vt_cond: [2170, 64] first4: -0.471980 2.275284 0.016048 1.313642
+[Debug] dit_step5_vt_uncond: [2170, 64] first4: -0.581892 2.263602 -0.014694 1.345006
+[Debug] dit_step5_vt: [2170, 64] first4: -0.118539 1.956480 -0.197979 1.364062
+[Debug] dit_step5_xt: [2170, 64] first4: 0.254282 1.860942 -0.174855 0.699136
+[DiT] step 6/50 t=0.900
+[Debug] dit_step6_vt_cond: [2170, 64] first4: -0.438105 2.231154 0.001016 1.310053
+[Debug] dit_step6_vt_uncond: [2170, 64] first4: -0.544084 2.196794 -0.016910 1.357073
+[Debug] dit_step6_vt: [2170, 64] first4: -0.411409 2.467071 0.199489 1.062841
+[Debug] dit_step6_xt: [2170, 64] first4: 0.262510 1.811600 -0.178844 0.677879
+[DiT] step 7/50 t=0.880
+[Debug] dit_step7_vt_cond: [2170, 64] first4: -0.388773 2.150816 -0.023557 1.299900
+[Debug] dit_step7_vt_uncond: [2170, 64] first4: -0.488913 2.112037 -0.027399 1.344294
+[Debug] dit_step7_vt: [2170, 64] first4: -0.088803 1.961096 -0.200603 1.289382
+[Debug] dit_step7_xt: [2170, 64] first4: 0.264286 1.772379 -0.174832 0.652092
+[DiT] step 8/50 t=0.860
+[Debug] dit_step8_vt_cond: [2170, 64] first4: -0.350721 2.065278 -0.035497 1.282128
+[Debug] dit_step8_vt_uncond: [2170, 64] first4: -0.410401 2.018277 -0.041600 1.310604
+[Debug] dit_step8_vt: [2170, 64] first4: -0.374781 2.238836 0.127401 1.108719
+[Debug] dit_step8_xt: [2170, 64] first4: 0.271781 1.727602 -0.177380 0.629917
+[DiT] step 9/50 t=0.840
+[Debug] dit_step9_vt_cond: [2170, 64] first4: -0.318524 1.978134 -0.051657 1.264737
+[Debug] dit_step9_vt_uncond: [2170, 64] first4: -0.341550 1.941071 -0.051996 1.283970
+[Debug] dit_step9_vt: [2170, 64] first4: -0.191845 1.828466 -0.186632 1.247382
+[Debug] dit_step9_xt: [2170, 64] first4: 0.275618 1.691033 -0.173648 0.604970
+[DiT] step 10/50 t=0.820
+[Debug] dit_step10_vt_cond: [2170, 64] first4: -0.295512 1.899216 -0.053004 1.254855
+[Debug] dit_step10_vt_uncond: [2170, 64] first4: -0.287151 1.875865 -0.046272 1.283265
+[Debug] dit_step10_vt: [2170, 64] first4: -0.389814 1.941252 0.040587 1.075441
+[Debug] dit_step10_xt: [2170, 64] first4: 0.283415 1.652207 -0.174460 0.583461
+[DiT] step 11/50 t=0.800
+[Debug] dit_step11_vt_cond: [2170, 64] first4: -0.275416 1.819571 -0.061343 1.243315
+[Debug] dit_step11_vt_uncond: [2170, 64] first4: -0.256136 1.821522 -0.037205 1.292460
+[Debug] dit_step11_vt: [2170, 64] first4: -0.238914 1.604097 -0.208856 1.109252
+[Debug] dit_step11_xt: [2170, 64] first4: 0.288193 1.620125 -0.170282 0.561276
+[DiT] step 12/50 t=0.780
+[Debug] dit_step12_vt_cond: [2170, 64] first4: -0.256205 1.750170 -0.060178 1.242222
+[Debug] dit_step12_vt_uncond: [2170, 64] first4: -0.227482 1.778021 -0.026911 1.317358
+[Debug] dit_step12_vt: [2170, 64] first4: -0.364171 1.624805 -0.047159 0.969305
+[Debug] dit_step12_xt: [2170, 64] first4: 0.295476 1.587629 -0.169339 0.541890
+[DiT] step 13/50 t=0.760
+[Debug] dit_step13_vt_cond: [2170, 64] first4: -0.235268 1.684496 -0.057297 1.240694
+[Debug] dit_step13_vt_uncond: [2170, 64] first4: -0.197700 1.726529 -0.035090 1.325942
+[Debug] dit_step13_vt: [2170, 64] first4: -0.273492 1.425781 -0.144172 1.021587
+[Debug] dit_step13_xt: [2170, 64] first4: 0.300946 1.559114 -0.166456 0.521458
+[DiT] step 14/50 t=0.740
+[Debug] dit_step14_vt_cond: [2170, 64] first4: -0.215348 1.630674 -0.040594 1.249328
+[Debug] dit_step14_vt_uncond: [2170, 64] first4: -0.174794 1.672403 -0.046957 1.327406
+[Debug] dit_step14_vt: [2170, 64] first4: -0.334811 1.472660 0.055936 0.996336
+[Debug] dit_step14_xt: [2170, 64] first4: 0.307642 1.529660 -0.167575 0.501531
+[DiT] step 15/50 t=0.720
+[Debug] dit_step15_vt_cond: [2170, 64] first4: -0.197163 1.578918 -0.035665 1.254164
+[Debug] dit_step15_vt_uncond: [2170, 64] first4: -0.153038 1.617873 -0.066125 1.317649
+[Debug] dit_step15_vt: [2170, 64] first4: -0.277163 1.353290 0.013043 1.079428
+[Debug] dit_step15_xt: [2170, 64] first4: 0.313186 1.502595 -0.167835 0.479943
+[DiT] step 16/50 t=0.700
+[Debug] dit_step16_vt_cond: [2170, 64] first4: -0.178978 1.530298 -0.037333 1.260345
+[Debug] dit_step16_vt_uncond: [2170, 64] first4: -0.134279 1.564621 -0.087480 1.303262
+[Debug] dit_step16_vt: [2170, 64] first4: -0.306646 1.373590 0.150592 1.110865
+[Debug] dit_step16_xt: [2170, 64] first4: 0.319319 1.475123 -0.170847 0.457726
+[DiT] step 17/50 t=0.680
+[Debug] dit_step17_vt_cond: [2170, 64] first4: -0.160694 1.478322 -0.055287 1.256185
+[Debug] dit_step17_vt_uncond: [2170, 64] first4: -0.120116 1.510010 -0.115773 1.278657
+[Debug] dit_step17_vt: [2170, 64] first4: -0.246079 1.282344 0.092105 1.178406
+[Debug] dit_step17_xt: [2170, 64] first4: 0.324240 1.449476 -0.172689 0.434157
+[DiT] step 18/50 t=0.660
+[Debug] dit_step18_vt_cond: [2170, 64] first4: -0.143064 1.420482 -0.082808 1.247799
+[Debug] dit_step18_vt_uncond: [2170, 64] first4: -0.106988 1.460190 -0.151634 1.257033
+[Debug] dit_step18_vt: [2170, 64] first4: -0.255780 1.219606 0.175261 1.179049
+[Debug] dit_step18_xt: [2170, 64] first4: 0.329356 1.425084 -0.176195 0.410576
+[DiT] step 19/50 t=0.640
+[Debug] dit_step19_vt_cond: [2170, 64] first4: -0.128746 1.354206 -0.117595 1.227311
+[Debug] dit_step19_vt_uncond: [2170, 64] first4: -0.089307 1.410491 -0.191474 1.230714
+[Debug] dit_step19_vt: [2170, 64] first4: -0.249608 1.072434 0.109807 1.183548
+[Debug] dit_step19_xt: [2170, 64] first4: 0.334348 1.403635 -0.178391 0.386906
+[DiT] step 20/50 t=0.620
+[Debug] dit_step20_vt_cond: [2170, 64] first4: -0.119003 1.272067 -0.154096 1.204746
+[Debug] dit_step20_vt_uncond: [2170, 64] first4: -0.074174 1.352716 -0.224701 1.203207
+[Debug] dit_step20_vt: [2170, 64] first4: -0.270581 0.922657 0.097471 1.173580
+[Debug] dit_step20_xt: [2170, 64] first4: 0.339760 1.385182 -0.180340 0.363434
+[DiT] step 21/50 t=0.600
+[Debug] dit_step21_vt_cond: [2170, 64] first4: -0.119830 1.180927 -0.187708 1.173581
+[Debug] dit_step21_vt_uncond: [2170, 64] first4: -0.067616 1.288429 -0.252699 1.168760
+[Debug] dit_step21_vt: [2170, 64] first4: -0.293490 0.743300 0.018199 1.149608
+[Debug] dit_step21_xt: [2170, 64] first4: 0.345629 1.370316 -0.180704 0.340442
+[DiT] step 22/50 t=0.580
+[Debug] dit_step22_vt_cond: [2170, 64] first4: -0.122855 1.082593 -0.215883 1.140040
+[Debug] dit_step22_vt_uncond: [2170, 64] first4: -0.066099 1.222055 -0.270495 1.136248
+[Debug] dit_step22_vt: [2170, 64] first4: -0.307841 0.532235 -0.036651 1.101102
+[Debug] dit_step22_xt: [2170, 64] first4: 0.351786 1.359671 -0.179971 0.318420
+[DiT] step 23/50 t=0.560
+[Debug] dit_step23_vt_cond: [2170, 64] first4: -0.132629 0.986949 -0.241285 1.111620
+[Debug] dit_step23_vt_uncond: [2170, 64] first4: -0.070734 1.155219 -0.283975 1.102751
+[Debug] dit_step23_vt: [2170, 64] first4: -0.338930 0.377693 -0.112170 1.114765
+[Debug] dit_step23_xt: [2170, 64] first4: 0.358565 1.352118 -0.177728 0.296124
+[DiT] step 24/50 t=0.540
+[Debug] dit_step24_vt_cond: [2170, 64] first4: -0.144419 0.884916 -0.261786 1.077981
+[Debug] dit_step24_vt_uncond: [2170, 64] first4: -0.082416 1.084641 -0.291092 1.068368
+[Debug] dit_step24_vt: [2170, 64] first4: -0.333728 0.169552 -0.174635 1.048605
+[Debug] dit_step24_xt: [2170, 64] first4: 0.365239 1.348727 -0.174235 0.275152
+[DiT] step 25/50 t=0.520
+[Debug] dit_step25_vt_cond: [2170, 64] first4: -0.162952 0.778074 -0.275456 1.049087
+[Debug] dit_step25_vt_uncond: [2170, 64] first4: -0.097955 1.005611 -0.292343 1.037690
+[Debug] dit_step25_vt: [2170, 64] first4: -0.381506 0.021539 -0.229765 1.053622
+[Debug] dit_step25_xt: [2170, 64] first4: 0.372869 1.348296 -0.169640 0.254080
+[DiT] step 26/50 t=0.500
+[Debug] dit_step26_vt_cond: [2170, 64] first4: -0.181652 0.659926 -0.286432 1.013943
+[Debug] dit_step26_vt_uncond: [2170, 64] first4: -0.115043 0.914795 -0.293755 1.005103
+[Debug] dit_step26_vt: [2170, 64] first4: -0.384189 -0.243706 -0.263819 0.974856
+[Debug] dit_step26_xt: [2170, 64] first4: 0.380553 1.353170 -0.164363 0.234583
+[DiT] step 27/50 t=0.480
+[Debug] dit_step27_vt_cond: [2170, 64] first4: -0.201740 0.544023 -0.293109 0.975384
+[Debug] dit_step27_vt_uncond: [2170, 64] first4: -0.133396 0.820908 -0.294792 0.973169
+[Debug] dit_step27_vt: [2170, 64] first4: -0.422354 -0.384602 -0.284394 0.929557
+[Debug] dit_step27_xt: [2170, 64] first4: 0.389000 1.360862 -0.158675 0.215992
+[DiT] step 28/50 t=0.460
+[Debug] dit_step28_vt_cond: [2170, 64] first4: -0.224075 0.417509 -0.297748 0.929072
+[Debug] dit_step28_vt_uncond: [2170, 64] first4: -0.152221 0.713812 -0.298235 0.936752
+[Debug] dit_step28_vt: [2170, 64] first4: -0.444730 -0.577905 -0.283219 0.830288
+[Debug] dit_step28_xt: [2170, 64] first4: 0.397895 1.372420 -0.153011 0.199386
+[DiT] step 29/50 t=0.440
+[Debug] dit_step29_vt_cond: [2170, 64] first4: -0.245692 0.290209 -0.302133 0.880189
+[Debug] dit_step29_vt_uncond: [2170, 64] first4: -0.168230 0.597160 -0.303812 0.900573
+[Debug] dit_step29_vt: [2170, 64] first4: -0.492967 -0.691559 -0.283257 0.750519
+[Debug] dit_step29_xt: [2170, 64] first4: 0.407754 1.386251 -0.147346 0.184375
+[DiT] step 30/50 t=0.420
+[Debug] dit_step30_vt_cond: [2170, 64] first4: -0.267059 0.157186 -0.303854 0.831628
+[Debug] dit_step30_vt_uncond: [2170, 64] first4: -0.182529 0.468759 -0.310389 0.867764
+[Debug] dit_step30_vt: [2170, 64] first4: -0.526538 -0.832075 -0.259881 0.638692
+[Debug] dit_step30_xt: [2170, 64] first4: 0.418285 1.402893 -0.142148 0.171602
+[DiT] step 31/50 t=0.400
+[Debug] dit_step31_vt_cond: [2170, 64] first4: -0.287749 0.019218 -0.305370 0.782136
+[Debug] dit_step31_vt_uncond: [2170, 64] first4: -0.194848 0.329909 -0.318059 0.834346
+[Debug] dit_step31_vt: [2170, 64] first4: -0.575041 -0.931991 -0.245392 0.549215
+[Debug] dit_step31_xt: [2170, 64] first4: 0.429786 1.421533 -0.137240 0.160617
+[DiT] step 32/50 t=0.380
+[Debug] dit_step32_vt_cond: [2170, 64] first4: -0.304426 -0.117694 -0.305028 0.730304
+[Debug] dit_step32_vt_uncond: [2170, 64] first4: -0.204242 0.187789 -0.324302 0.800084
+[Debug] dit_step32_vt: [2170, 64] first4: -0.600474 -1.036970 -0.221470 0.433362
+[Debug] dit_step32_xt: [2170, 64] first4: 0.441795 1.442272 -0.132811 0.151950
+[DiT] step 33/50 t=0.360
+[Debug] dit_step33_vt_cond: [2170, 64] first4: -0.319238 -0.254083 -0.299488 0.682753
+[Debug] dit_step33_vt_uncond: [2170, 64] first4: -0.210244 0.042433 -0.326321 0.765284
+[Debug] dit_step33_vt: [2170, 64] first4: -0.649450 -1.121861 -0.194288 0.366186
+[Debug] dit_step33_xt: [2170, 64] first4: 0.454784 1.464709 -0.128925 0.144626
+[DiT] step 34/50 t=0.340
+[Debug] dit_step34_vt_cond: [2170, 64] first4: -0.329936 -0.381831 -0.294650 0.641700
+[Debug] dit_step34_vt_uncond: [2170, 64] first4: -0.211479 -0.098170 -0.329415 0.733958
+[Debug] dit_step34_vt: [2170, 64] first4: -0.683049 -1.194662 -0.165597 0.286680
+[Debug] dit_step34_xt: [2170, 64] first4: 0.468445 1.488602 -0.125613 0.138893
+[DiT] step 35/50 t=0.320
+[Debug] dit_step35_vt_cond: [2170, 64] first4: -0.339163 -0.510193 -0.285358 0.597418
+[Debug] dit_step35_vt_uncond: [2170, 64] first4: -0.213347 -0.237217 -0.326480 0.696784
+[Debug] dit_step35_vt: [2170, 64] first4: -0.716524 -1.285484 -0.142755 0.230145
+[Debug] dit_step35_xt: [2170, 64] first4: 0.482776 1.514312 -0.122758 0.134290
+[DiT] step 36/50 t=0.300
+[Debug] dit_step36_vt_cond: [2170, 64] first4: -0.344282 -0.639077 -0.274660 0.557109
+[Debug] dit_step36_vt_uncond: [2170, 64] first4: -0.213018 -0.378000 -0.321460 0.659633
+[Debug] dit_step36_vt: [2170, 64] first4: -0.737407 -1.359316 -0.114364 0.177555
+[Debug] dit_step36_xt: [2170, 64] first4: 0.497524 1.541498 -0.120471 0.130739
+[DiT] step 37/50 t=0.280
+[Debug] dit_step37_vt_cond: [2170, 64] first4: -0.350530 -0.754109 -0.251615 0.515208
+[Debug] dit_step37_vt_uncond: [2170, 64] first4: -0.215775 -0.505277 -0.306489 0.618527
+[Debug] dit_step37_vt: [2170, 64] first4: -0.758482 -1.436458 -0.064580 0.134758
+[Debug] dit_step37_xt: [2170, 64] first4: 0.512694 1.570228 -0.119179 0.128044
+[DiT] step 38/50 t=0.260
+[Debug] dit_step38_vt_cond: [2170, 64] first4: -0.351878 -0.862931 -0.225801 0.473748
+[Debug] dit_step38_vt_uncond: [2170, 64] first4: -0.216566 -0.629070 -0.288643 0.572433
+[Debug] dit_step38_vt: [2170, 64] first4: -0.762284 -1.485620 -0.020081 0.114390
+[Debug] dit_step38_xt: [2170, 64] first4: 0.527939 1.599940 -0.118778 0.125756
+[DiT] step 39/50 t=0.240
+[Debug] dit_step39_vt_cond: [2170, 64] first4: -0.353231 -0.974004 -0.188379 0.427506
+[Debug] dit_step39_vt_uncond: [2170, 64] first4: -0.220991 -0.753843 -0.259805 0.519853
+[Debug] dit_step39_vt: [2170, 64] first4: -0.758813 -1.561977 0.045217 0.080611
+[Debug] dit_step39_xt: [2170, 64] first4: 0.543115 1.631179 -0.119682 0.124144
+[DiT] step 40/50 t=0.220
+[Debug] dit_step40_vt_cond: [2170, 64] first4: -0.356234 -1.066793 -0.144915 0.378638
+[Debug] dit_step40_vt_uncond: [2170, 64] first4: -0.228754 -0.860472 -0.222172 0.464689
+[Debug] dit_step40_vt: [2170, 64] first4: -0.749040 -1.587260 0.097200 0.055798
+[Debug] dit_step40_xt: [2170, 64] first4: 0.558096 1.662925 -0.121626 0.123028
+[DiT] step 41/50 t=0.200
+[Debug] dit_step41_vt_cond: [2170, 64] first4: -0.355270 -1.157881 -0.092032 0.327957
+[Debug] dit_step41_vt_uncond: [2170, 64] first4: -0.236702 -0.967551 -0.178051 0.403448
+[Debug] dit_step41_vt: [2170, 64] first4: -0.709683 -1.623054 0.190026 0.058188
+[Debug] dit_step41_xt: [2170, 64] first4: 0.572290 1.695386 -0.125427 0.121864
+[DiT] step 42/50 t=0.180
+[Debug] dit_step42_vt_cond: [2170, 64] first4: -0.352479 -1.244785 -0.033309 0.267733
+[Debug] dit_step42_vt_uncond: [2170, 64] first4: -0.244508 -1.070808 -0.121544 0.336131
+[Debug] dit_step42_vt: [2170, 64] first4: -0.676820 -1.651634 0.233357 0.014431
+[Debug] dit_step42_xt: [2170, 64] first4: 0.585826 1.728418 -0.130094 0.121575
+[DiT] step 43/50 t=0.160
+[Debug] dit_step43_vt_cond: [2170, 64] first4: -0.345241 -1.321976 0.026233 0.204406
+[Debug] dit_step43_vt_uncond: [2170, 64] first4: -0.251274 -1.163180 -0.061907 0.264783
+[Debug] dit_step43_vt: [2170, 64] first4: -0.615402 -1.690314 0.311200 0.000887
+[Debug] dit_step43_xt: [2170, 64] first4: 0.598134 1.762225 -0.136318 0.121558
+[DiT] step 44/50 t=0.140
+[Debug] dit_step44_vt_cond: [2170, 64] first4: -0.330899 -1.390417 0.088632 0.138209
+[Debug] dit_step44_vt_uncond: [2170, 64] first4: -0.253711 -1.249830 -0.000380 0.191808
+[Debug] dit_step44_vt: [2170, 64] first4: -0.551020 -1.700038 0.375316 -0.049211
+[Debug] dit_step44_xt: [2170, 64] first4: 0.609155 1.796225 -0.143824 0.122542
+[DiT] step 45/50 t=0.120
+[Debug] dit_step45_vt_cond: [2170, 64] first4: -0.315962 -1.445453 0.152387 0.071900
+[Debug] dit_step45_vt_uncond: [2170, 64] first4: -0.260821 -1.326647 0.053483 0.118309
+[Debug] dit_step45_vt: [2170, 64] first4: -0.444048 -1.697294 0.502791 -0.074117
+[Debug] dit_step45_xt: [2170, 64] first4: 0.618036 1.830171 -0.153880 0.124024
+[DiT] step 46/50 t=0.100
+[Debug] dit_step46_vt_cond: [2170, 64] first4: -0.294778 -1.496297 0.212962 0.010717
+[Debug] dit_step46_vt_uncond: [2170, 64] first4: -0.258755 -1.395077 0.130780 0.026842
+[Debug] dit_step46_vt: [2170, 64] first4: -0.331878 -1.748994 0.398840 0.016598
+[Debug] dit_step46_xt: [2170, 64] first4: 0.624673 1.865151 -0.161857 0.123692
+[DiT] step 47/50 t=0.080
+[Debug] dit_step47_vt_cond: [2170, 64] first4: -0.281226 -1.541478 0.262625 -0.022201
+[Debug] dit_step47_vt_uncond: [2170, 64] first4: -0.279224 -1.453849 0.178946 -0.018997
+[Debug] dit_step47_vt: [2170, 64] first4: -0.212368 -1.759161 0.579829 -0.049871
+[Debug] dit_step47_xt: [2170, 64] first4: 0.628921 1.900334 -0.173453 0.124690
+[DiT] step 48/50 t=0.060
+[Debug] dit_step48_vt_cond: [2170, 64] first4: -0.256958 -1.573266 0.310890 -0.048733
+[Debug] dit_step48_vt_uncond: [2170, 64] first4: -0.281065 -1.517397 0.260529 -0.069026
+[Debug] dit_step48_vt: [2170, 64] first4: -0.191601 -1.679712 0.330919 0.028046
+[Debug] dit_step48_xt: [2170, 64] first4: 0.632753 1.933929 -0.180072 0.124129
+[DiT] step 49/50 t=0.040
+[Debug] dit_step49_vt_cond: [2170, 64] first4: -0.282571 -1.604237 0.314485 -0.067221
+[Debug] dit_step49_vt_uncond: [2170, 64] first4: -0.305600 -1.528070 0.251822 -0.083237
+[Debug] dit_step49_vt: [2170, 64] first4: -0.212302 -1.898327 0.637213 -0.078416
+[Debug] dit_x0: [2170, 64] first4: 0.636999 1.971895 -0.192816 0.125697
+[DiT] step 50/50 t=0.020
+[DiT] Total generation: 88329.8 ms (88329.8 ms/sample)
+[Debug] dit_output: [2170, 64] first4: 0.636999 1.971895 -0.192816 0.125697
+[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128)
+[VAE] Graph: 474 nodes, T_latent=192
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_im2col_f16', name = 'kernel_im2col_f16'
+ggml_metal_library_compile_pipeline: loaded kernel_im2col_f16                             0x13070d7f0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=0'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=0             0x13070e360 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0      0x13070e5c0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0'
+ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0      0x13070eb20 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sin_f32_4', name = 'kernel_sin_f32_4'
+ggml_metal_library_compile_pipeline: loaded kernel_sin_f32_4                              0x13070eea0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sqr_f32_4', name = 'kernel_sqr_f32_4'
+ggml_metal_library_compile_pipeline: loaded kernel_sqr_f32_4                              0x13070f4a0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_conv_transpose_1d_f32_f32', name = 'kernel_conv_transpose_1d_f32_f32'
+ggml_metal_library_compile_pipeline: loaded kernel_conv_transpose_1d_f32_f32              0x13070f8c0 | th_max = 1024 | th_width =   32
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_f16_f16_4', name = 'kernel_mul_mv_f16_f16_4_nsg=4'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_f16_f16_4_nsg=4                 0x130710c50 | th_max = 1024 | th_width =   32
+[VAE] Upsample factor: 1920.00 (expected ~1920)
+[VAE] Graph: 474 nodes, T_latent=256
+[VAE] Graph: 474 nodes, T_latent=186
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=1'
+ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=1             0x130710eb0 | th_max =  896 | th_width =   32
+[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz)
+[VAE Batch0] Decode: 609578.6 ms
+[Debug] vae_audio: [2, 4166400] first4: -0.002759 -0.002685 -0.002611 -0.002214
+[VAE Batch0] Wrote ggml-sft/request00.wav: 4166400 samples (86.80s @ 48kHz stereo)
+[Request 1/1] Done
+ggml_metal_free: deallocating
+ggml_metal_free: deallocating
+[Pipeline] All done
+[Request] Loaded request0.json
+[Noise] Reusing existing rng_philox_seed42.bf16
+[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q8_0.gguf
+[GGML] Running acestep-v15-turbo-Q8_0.gguf...
+[GGML] Done, 47 dump files
+[Turbo] Reusing existing Python dumps: python-turbo
+[Turbo] Cosine similarities GGML vs Python
+  stage                          GGML vs Python
+  text_hidden                          0.999813
+  lyric_embed                          1.000000
+  enc_hidden                           0.999792
+  detok_output                         0.999991
+  context                              0.999996
+  noise                                1.000000
+  temb_t                               0.999998
+  hidden_after_proj_in                 0.999992
+  enc_after_cond_emb                   0.999776
+  layer0_sa_output                     0.999956
+  hidden_after_layer0                  0.999975
+  hidden_after_layer6                  0.999915
+  hidden_after_layer12                 0.999185
+  hidden_after_layer18                 0.996490
+  hidden_after_layer23                 0.993375
+  dit_step0_vt                         0.974442
+  dit_step0_xt                         0.999944
+  dit_step1_vt                         0.976442
+  dit_step1_xt                         0.999818
+  dit_step2_vt                         0.978398
+  dit_step2_xt                         0.999498
+  dit_step3_vt                         0.979729
+  dit_step3_xt                         0.998787
+  dit_step4_vt                         0.979038
+  dit_step4_xt                         0.997189
+  dit_step5_vt                         0.976705
+  dit_step5_xt                         0.993692
+  dit_step6_vt                         0.973710
+  dit_step6_xt                         0.987147
+  dit_step7_vt                         0.967471
+  dit_x0                               0.977589
+  vae_audio                            0.899969
+  vae_audio (log spectral)             0.999797
+[Turbo] Error growth GGML vs Python
+  stage                         cos    max_err   mean_err     mean_A      std_A     mean_B      std_B
+  dit_step0_xt             0.999943   0.142558   0.006797  -0.002331   0.972917  -0.002342   0.972003
+  dit_step1_xt             0.999817   0.272611   0.011800  -0.005326   0.942672  -0.005313   0.941730
+  dit_step2_xt             0.999496   0.463653   0.018437  -0.009355   0.909197  -0.009311   0.908527
+  dit_step3_xt             0.998785   0.659420   0.027271  -0.014709   0.873849  -0.014577   0.873624
+  dit_step4_xt             0.997188   0.977590   0.039587  -0.021771   0.842053  -0.021660   0.841995
+  dit_step5_xt             0.993691   1.450203   0.057714  -0.031846   0.825442  -0.032109   0.824593
+  dit_step6_xt             0.987145   2.144326   0.085260  -0.046128   0.856513  -0.046482   0.855546
+[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-Q8_0.gguf
+[GGML] Running acestep-v15-sft-Q8_0.gguf...
+[GGML] Done, 233 dump files
+[SFT] Reusing existing Python dumps: python-sft
+[SFT] Cosine similarities GGML vs Python
+  stage                          GGML vs Python
+  text_hidden                          0.999813
+  lyric_embed                          1.000000
+  enc_hidden                           0.999792
+  detok_output                         0.999991
+  context                              0.999996
+  noise                                1.000000
+  temb_t                               0.999994
+  hidden_after_proj_in                 0.999993
+  enc_after_cond_emb                   0.999779
+  layer0_sa_output                     0.999948
+  hidden_after_layer0                  0.999975
+  hidden_after_layer6                  0.999842
+  hidden_after_layer12                 0.999467
+  hidden_after_layer18                 0.998721
+  hidden_after_layer23                 0.998987
+  null_condition_emb                   1.000000
+  null_enc_hidden                      1.000000
+  dit_step0_vt_cond                    0.998936
+  dit_step0_vt_uncond                  0.998589
+  dit_step0_vt                         0.995617
+  dit_step0_xt                         0.999998
+  dit_step5_vt_cond                    0.999453
+  dit_step5_vt                         0.993749
+  dit_step5_xt                         0.999962
+  dit_step10_vt_cond                   0.998786
+  dit_step10_vt                        0.993632
+  dit_step10_xt                        0.999883
+  dit_step15_vt_cond                   0.996925
+  dit_step15_vt                        0.985179
+  dit_step15_xt                        0.999650
+  dit_step20_vt_cond                   0.993789
+  dit_step20_vt                        0.978156
+  dit_step20_xt                        0.998994
+  dit_step25_vt_cond                   0.988666
+  dit_step25_vt                        0.968588
+  dit_step25_xt                        0.997635
+  dit_step30_vt_cond                   0.983353
+  dit_step30_vt                        0.963692
+  dit_step30_xt                        0.995502
+  dit_step35_vt_cond                   0.978311
+  dit_step35_vt                        0.954994
+  dit_step35_xt                        0.992900
+  dit_step40_vt_cond                   0.975242
+  dit_step40_vt                        0.949054
+  dit_step40_xt                        0.990408
+  dit_step45_vt_cond                   0.977875
+  dit_step45_vt                        0.949872
+  dit_step45_xt                        0.988656
+  dit_step49_vt_cond                   0.980007
+  dit_step49_vt                        0.943555
+  dit_x0                               0.988056
+  vae_audio                            0.945079
+  vae_audio (log spectral)             0.999659
+[SFT] Error growth GGML vs Python
+  stage                         cos    max_err   mean_err     mean_A      std_A     mean_B      std_B
+  dit_step0_xt             0.999997   0.038313   0.002069  -0.001710   0.980019  -0.001741   0.980402
+  dit_step5_xt             0.999960   0.128136   0.005945  -0.006874   0.889005  -0.007143   0.887999
+  dit_step10_xt            0.999882   0.212035   0.009258  -0.012273   0.810355  -0.012603   0.811299
+  dit_step15_xt            0.999649   0.310457   0.014288  -0.017479   0.745211  -0.018114   0.745269
+  dit_step20_xt            0.998994   0.579346   0.021839  -0.022740   0.699641  -0.023808   0.699582
+  dit_step25_xt            0.997635   0.874619   0.031657  -0.028120   0.678310  -0.029311   0.679278
+  dit_step30_xt            0.995501   1.140020   0.043494  -0.033543   0.684534  -0.035027   0.685262
+  dit_step35_xt            0.992900   1.761304   0.057050  -0.038898   0.716898  -0.040716   0.717196
+  dit_step40_xt            0.990407   2.128224   0.071732  -0.044231   0.770985  -0.046462   0.771853
+  dit_step45_xt            0.988655   2.420490   0.085663  -0.050087   0.842526  -0.052475   0.843036
diff --git a/tests/Vulkan-BF16.log b/tests/Vulkan-BF16.log
new file mode 100644
index 0000000..2d955d7
--- /dev/null
+++ b/tests/Vulkan-BF16.log
@@ -0,0 +1,259 @@
+ggml_vulkan: Found 1 Vulkan devices:
+ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | uma: 0 | fp16: 1 | bf16: 0 | warp size: 32 | shared memory: 49152 | int dot: 1 | matrix cores: NV_coopmat2
+[Load] DiT backend: Vulkan0 (CPU threads: 16)
+[Load] Backend init: 260.3 ms
+[GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024
+[DiT] Self-attn: Q+K+V fused
+[DiT] Cross-attn: Q+K+V fused
+[DiT] MLP: gate+up fused
+[Load] null_condition_emb found (CFG available)
+[WeightCtx] Loaded 478 tensors, 3007.9 MB into backend
+[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128
+[Load] DiT weight load: 397.7 ms
+[GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024
+[Load] silence_latent: [15000, 64] from GGUF
+[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048
+[Load] VAE backend: Vulkan0 (CPU threads: 16)
+[VAE] Backend: Vulkan0, Weight buffer: 161.1 MB
+[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations
+[Load] VAE weights: 672.5 ms
+[Request 1/1] ggml-turbo/request0.json (batch=1)
+[Request] parsed ggml-turbo/request0.json (18 fields)
+[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0)
+[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s
+[Pipeline] 434 audio codes (86.8s @ 5Hz)
+[Pipeline] T=2170, S=1085
+[BPE] Loaded from GGUF: 151643 vocab, 151387 merges
+[Load] BPE tokenizer: 32.1 ms
+[Pipeline] caption: 70 tokens, lyrics: 167 tokens
+[Load] TextEncoder backend: Vulkan0 (CPU threads: 16)
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[Load] TextEncoder: 28L, H=1024, Nh=16/8
+[Qwen3] Attn: Q+K+V fused
+[Qwen3] MLP: gate+up fused
+[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend
+[Load] TextEncoder: 166.9 ms
+[Encode] TextEncoder (70 tokens): 30.9 ms
+[Debug] text_hidden: [70, 1024] first4: 3.705836 2.395382 0.221845 -13.145830
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[Encode] Lyric vocab lookup (167 tokens): 11.2 ms
+[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809
+[Load] CondEncoder backend: Vulkan0 (CPU threads: 16)
+[GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024
+[Load] LyricEncoder: 8L
+[Qwen3] Attn: Q+K+V fused
+[Qwen3] MLP: gate+up fused
+[Load] TimbreEncoder: 4L
+[Qwen3] Attn: Q+K+V fused
+[Qwen3] MLP: gate+up fused
+[WeightCtx] Loaded 140 tensors, 1160.5 MB into backend
+[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond
+[Load] ConditionEncoder: 163.7 ms
+[CondEnc] Lyric sliding mask: 167x167, window=128
+[CondEnc] Timbre sliding mask: 750x750, window=128
+[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens
+[Encode] ConditionEncoder: 22.5 ms, enc_S=238
+[Debug] enc_hidden: [238, 2048] first4: 1.758148 -0.049593 -0.132730 0.058488
+[GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024
+[WeightCtx] Loaded 30 tensors, 200.3 MB into backend
+[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64)
+[Load] Detokenizer: 28.1 ms
+[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz)
+[Context] Detokenizer: 229.8 ms
+[Debug] detok_output: [2170, 64] first4: -0.125193 1.435010 0.308190 -0.624228
+[Context Batch0] Philox noise seed=42, [2170, 64]
+[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656
+[Debug] context: [2170, 128] first4: -0.125193 1.435010 0.308190 -0.624228
+[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1
+[DiT] Batch N=1, T=2170, S=1085, enc_S=238
+[DiT] Graph: 1841 nodes
+[Debug] tproj: [12288] first4: 0.260062 -0.161562 -0.097030 0.052313
+[Debug] temb: [2048] first4: 0.000069 -0.132499 -0.035430 0.064753
+[Debug] temb_t: [2048] first4: 0.001065 0.026818 -0.052754 0.063717
+[Debug] temb_r: [2048] first4: -0.000996 -0.159317 0.017323 0.001036
+[Debug] sinusoidal_t: [256] first4: 0.562486 0.789701 0.439822 -0.023583
+[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000
+[Debug] temb_lin1_t: [2048] first4: -0.049318 -0.051829 -0.014251 -0.038444
+[Debug] temb_lin1_r: [2048] first4: -0.013266 -0.018319 -0.016375 0.008532
+[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.039551 -0.969299 0.536133 0.446747
+[Debug] proj_in_input: [192, 2170] first4: -0.125193 1.435010 0.308190 -0.624228
+[Debug] enc_after_cond_emb: [2048, 238] first4: -0.168464 0.814954 0.327714 -0.561971
+[Debug] layer0_sa_input: [2048, 1085] first4: -0.719110 -0.764019 -0.047328 0.261808
+[Debug] layer0_q_after_rope: [128, 16] first4: -2.424376 -0.094810 -0.411903 1.007324
+[Debug] layer0_k_after_rope: [128, 8] first4: -12.712339 1.106410 1.775920 1.780798
+[Debug] layer0_sa_output: [2048, 1085] first4: -1.501171 0.169176 -0.355798 0.513027
+[Debug] layer0_attn_out: [2048, 1085] first4: -1.540742 -1.044333 0.188720 0.456093
+[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.540742 -1.044333 0.188720 0.456093
+[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.598325 -0.820241 -0.296337 0.493580
+[Debug] hidden_after_layer0: [2048, 1085] first4: -9.091503 0.566892 52.584164 -0.903901
+[Debug] hidden_after_layer6: [2048, 1085] first4: -21.192070 0.040278 33.599442 -4.442998
+[Debug] hidden_after_layer12: [2048, 1085] first4: -15.068191 -18.118078 71.999359 28.597229
+[Debug] hidden_after_layer18: [2048, 1085] first4: -27.132679 15.867422 60.847614 20.940519
+[Debug] hidden_after_layer23: [2048, 1085] first4: -12.584854 45.152912 198.753845 145.517029
+[Debug] dit_step0_vt: [2170, 64] first4: 0.014936 1.119046 0.345802 2.379982
+[Debug] dit_step0_xt: [2170, 64] first4: 0.193657 2.105384 -0.187593 0.739475
+[DiT] step 1/8 t=1.000
+[Debug] dit_step1_vt: [2170, 64] first4: 0.086700 0.854980 -0.273651 1.728149
+[Debug] dit_step1_xt: [2170, 64] first4: 0.188928 2.058749 -0.172667 0.645212
+[DiT] step 2/8 t=0.955
+[Debug] dit_step2_vt: [2170, 64] first4: 0.180420 0.837399 -0.150421 2.056976
+[Debug] dit_step2_xt: [2170, 64] first4: 0.176900 2.002922 -0.162639 0.508081
+[DiT] step 3/8 t=0.900
+[Debug] dit_step3_vt: [2170, 64] first4: 0.130821 0.833313 0.053528 2.193359
+[Debug] dit_step3_xt: [2170, 64] first4: 0.165998 1.933480 -0.167099 0.325301
+[DiT] step 4/8 t=0.833
+[Debug] dit_step4_vt: [2170, 64] first4: 0.273712 0.866425 0.216686 2.274872
+[Debug] dit_step4_xt: [2170, 64] first4: 0.136672 1.840648 -0.190316 0.081565
+[DiT] step 5/8 t=0.750
+[Debug] dit_step5_vt: [2170, 64] first4: 0.347900 0.772171 0.542953 2.248352
+[Debug] dit_step5_xt: [2170, 64] first4: 0.086972 1.730338 -0.267881 -0.239629
+[DiT] step 6/8 t=0.643
+[Debug] dit_step6_vt: [2170, 64] first4: 0.132820 0.664673 0.218246 2.387787
+[Debug] dit_step6_xt: [2170, 64] first4: 0.060408 1.597404 -0.311530 -0.717186
+[DiT] step 7/8 t=0.500
+[Debug] dit_step7_vt: [2170, 64] first4: -0.335976 0.323303 0.198029 2.726624
+[Debug] dit_x0: [2170, 64] first4: 0.161200 1.500413 -0.370938 -1.535173
+[DiT] step 8/8 t=0.300
+[DiT] Total generation: 740.5 ms (740.5 ms/sample)
+[Debug] dit_output: [2170, 64] first4: 0.161200 1.500413 -0.370938 -1.535173
+[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128)
+[VAE] Graph: 417 nodes, T_latent=192
+[VAE] Upsample factor: 1920.00 (expected ~1920)
+[VAE] Graph: 417 nodes, T_latent=256
+[VAE] Graph: 417 nodes, T_latent=186
+[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz)
+[VAE Batch0] Decode: 9812.1 ms
+[Debug] vae_audio: [2, 4166400] first4: 0.000591 0.001078 0.000929 0.001296
+[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo)
+[Request 1/1] Done
+[Pipeline] All done
+2026-03-01 19:55:13.398 | WARNING  | acestep.training.lora_utils:<module>:29 - PEFT library not installed. LoRA training will not be available.
+2026-03-01 19:55:13.398 | WARNING  | acestep.training.lokr_utils:<module>:24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora
+2026-03-01 19:55:13.399 | WARNING  | acestep.training.data_module:<module>:25 - Lightning not installed. Training module will not be available.
+2026-03-01 19:55:13.399 | WARNING  | acestep.training.trainer:<module>:28 - Lightning Fabric not installed. Training will use basic training loop.
+2026-03-01 19:55:13.399 | WARNING  | acestep.training.trainer:<module>:36 - bitsandbytes not installed. Using standard AdamW.
+2026-03-01 19:55:14.155 | INFO     | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa
+`torch_dtype` is deprecated! Use `dtype` instead!
+2026-03-01 19:55:15.664 | INFO     | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation...
+2026-03-01 19:55:15.664 | INFO     | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs...
+2026-03-01 19:55:15.669 | INFO     | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0...
+2026-03-01 19:55:15.830 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0...
+2026-03-01 19:55:15.831 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - 
+======================================================================
+2026-03-01 19:55:15.831 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference)
+2026-03-01 19:55:15.831 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ======================================================================
+2026-03-01 19:55:15.831 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt:
+# Instruction
+Generate audio semantic tokens based on the given conditions:
+
+# Caption
+An upbeat and anthemic pop-rock track driven by bright, slightly overdriven
+
+# Metas
+- bpm: 83
+- timesignature: 4
+- keyscale: G major
+- duration: 88 seconds
+<|endoftext|>
+
+2026-03-01 19:55:15.831 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ======================================================================
+2026-03-01 19:55:15.831 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text:
+# Languages
+fr
+
+# Lyric
+# Lyric
+[Intro - Guitar Riff]
+[Verse 1]
+Dans le monde des tutos virtuels
+G ta toise en nouvelle passion
+Avec Ggendoline et Pumbé à midi
+La communauté, c'est l'unité
+Quel joie, une clé
+
+[Chorus]
+Dans le monde des tutos virtuels
+Gândoline et Pumbé à midi
+Une famille à connecter, c'est vrai
+D'un enfant qui voit toi fusionner
+
+[Guitar Solo]
+
+[Verse 2]
+Dans le monde des tutos virtuels
+Gândoline, Pumbé à midi
+Une famille à connecter, c'est vrai
+D'un enfant qui voit toi fusionner<|endoftext|>
+2026-03-01 19:55:15.831 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ======================================================================
+
+2026-03-01 19:55:15.838 | INFO     | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings...
+2026-03-01 19:55:15.850 | INFO     | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings...
+2026-03-01 19:55:15.851 | INFO     | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda))
+2026-03-01 19:55:15.885 | INFO     | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)...
+2026-03-01 19:55:16.193 | INFO     | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents...
+2026-03-01 19:55:16.193 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16
+2026-03-01 19:55:16.193 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006814241409301758, 'diffusion_time_cost': 0.30007076263427734, 'diffusion_per_step_time_cost': 0.03750884532928467, 'total_time_cost': 0.3068850040435791, 'offload_time_cost': 0.0}
+2026-03-01 19:55:16.208 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE...
+2026-03-01 19:55:16.210 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB
+2026-03-01 19:55:16.210 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB
+2026-03-01 19:55:16.210 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage...
+2026-03-01 19:55:16.210 | DEBUG    | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB
+2026-03-01 19:55:16.210 | DEBUG    | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB
+2026-03-01 19:55:16.210 | INFO     | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170])
+2026-03-01 19:55:16.485 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB
+2026-03-01 19:55:16.488 | INFO     | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors...
+2026-03-01 19:55:16.491 | INFO     | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors.
+[Request] Loaded request0.json
+[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-BF16.gguf
+[GGML] Running acestep-v15-turbo-BF16.gguf...
+[GGML] Done, 47 dump files
+[Python] Initializing acestep-v15-turbo...
+[Python] Generating (acestep-v15-turbo, 8 steps)...
+Using precomputed LM hints
+Using precomputed LM hints
+[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo)
+[Python] Done, 40 dump files
+[Turbo] Cosine similarities GGML vs Python
+  stage                          GGML vs Python
+  text_hidden                          0.999812
+  lyric_embed                          1.000000
+  enc_hidden                           0.999834
+  detok_output                         0.999997
+  context                              0.999998
+  noise                                1.000000
+  temb_t                               0.999999
+  hidden_after_proj_in                 0.999987
+  enc_after_cond_emb                   0.999825
+  layer0_sa_output                     0.999959
+  hidden_after_layer0                  0.999982
+  hidden_after_layer6                  0.999916
+  hidden_after_layer12                 0.999276
+  hidden_after_layer18                 0.996645
+  hidden_after_layer23                 0.993735
+  dit_step0_vt                         0.975502
+  dit_step0_xt                         0.999946
+  dit_step1_vt                         0.898326
+  dit_step1_xt                         0.999578
+  dit_step2_vt                         0.893586
+  dit_step2_xt                         0.998276
+  dit_step3_vt                         0.881101
+  dit_step3_xt                         0.994720
+  dit_step4_vt                         0.869138
+  dit_step4_xt                         0.986137
+  dit_step5_vt                         0.854878
+  dit_step5_xt                         0.965846
+  dit_step6_vt                         0.840298
+  dit_step6_xt                         0.925771
+  dit_step7_vt                         0.818271
+  dit_x0                               0.867399
+  vae_audio                            0.680412
+  vae_audio (STFT cosine)              0.855380
+[Turbo] Error growth GGML vs Python
+  stage                         cos    max_err   mean_err     mean_A      std_A     mean_B      std_B
+  dit_step0_xt             0.999946   0.135811   0.006633  -0.002316   0.972919  -0.002342   0.972003
+  dit_step1_xt             0.999578   0.413265   0.019706  -0.005121   0.942541  -0.005313   0.941730
+  dit_step2_xt             0.998276   0.811472   0.038208  -0.008968   0.908957  -0.009311   0.908527
+  dit_step3_xt             0.994720   1.481150   0.064047  -0.014385   0.872574  -0.014577   0.873624
+  dit_step4_xt             0.986137   1.857148   0.100272  -0.021489   0.837038  -0.021660   0.841995
+  dit_step5_xt             0.965846   1.439633   0.154129  -0.031859   0.812819  -0.032109   0.824593
+  dit_step6_xt             0.925771   2.125688   0.235367  -0.046759   0.832442  -0.046482   0.855546
diff --git a/tests/Vulkan-CPU_Q6_K.log b/tests/Vulkan-CPU_Q6_K.log
new file mode 100644
index 0000000..8912047
--- /dev/null
+++ b/tests/Vulkan-CPU_Q6_K.log
@@ -0,0 +1,54 @@
+[Request] Loaded request0.json
+[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q6_K.gguf
+[GGML] Running acestep-v15-turbo-Q6_K.gguf...
+[GGML] Done, 47 dump files
+[Python] Initializing acestep-v15-turbo...
+[Python] Generating (acestep-v15-turbo, 8 steps)...
+Using precomputed LM hints
+Using precomputed LM hints
+[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo)
+[Python] Done, 40 dump files
+[Turbo] Cosine similarities GGML vs Python
+  stage                          GGML vs Python
+  text_hidden                          0.999812
+  lyric_embed                          1.000000
+  enc_hidden                           0.999665
+  detok_output                         0.999972
+  context                              0.999982
+  noise                                1.000000
+  temb_t                               0.999990
+  hidden_after_proj_in                 0.999982
+  enc_after_cond_emb                   0.999691
+  layer0_sa_output                     0.999774
+  hidden_after_layer0                  0.999710
+  hidden_after_layer6                  0.999855
+  hidden_after_layer12                 0.998856
+  hidden_after_layer18                 0.995803
+  hidden_after_layer23                 0.992072
+  dit_step0_vt                         0.970064
+  dit_step0_xt                         0.999934
+  dit_step1_vt                         0.924564
+  dit_step1_xt                         0.999651
+  dit_step2_vt                         0.915541
+  dit_step2_xt                         0.998650
+  dit_step3_vt                         0.915489
+  dit_step3_xt                         0.996123
+  dit_step4_vt                         0.916835
+  dit_step4_xt                         0.990527
+  dit_step5_vt                         0.909275
+  dit_step5_xt                         0.977470
+  dit_step6_vt                         0.899986
+  dit_step6_xt                         0.952353
+  dit_step7_vt                         0.880023
+  dit_x0                               0.915268
+  vae_audio                            0.753562
+  vae_audio (STFT cosine)              0.882452
+[Turbo] Error growth GGML vs Python
+  stage                         cos    max_err   mean_err     mean_A      std_A     mean_B      std_B
+  dit_step0_xt             0.999934   0.147239   0.007394  -0.002260   0.973056  -0.002342   0.972003
+  dit_step1_xt             0.999651   0.410402   0.017745  -0.005286   0.943565  -0.005313   0.941730
+  dit_step2_xt             0.998650   0.806730   0.033672  -0.009524   0.911097  -0.009311   0.908527
+  dit_step3_xt             0.996123   1.479887   0.054500  -0.015235   0.876469  -0.014577   0.873624
+  dit_step4_xt             0.990527   2.298363   0.081794  -0.022731   0.844225  -0.021660   0.841995
+  dit_step5_xt             0.977470   3.296017   0.123177  -0.033626   0.825405  -0.032109   0.824593
+  dit_step6_xt             0.952353   4.545029   0.185597  -0.049157   0.851892  -0.046482   0.855546
diff --git a/tests/Vulkan-Q4_K_M.log b/tests/Vulkan-Q4_K_M.log
new file mode 100644
index 0000000..011c0c3
--- /dev/null
+++ b/tests/Vulkan-Q4_K_M.log
@@ -0,0 +1,259 @@
+ggml_vulkan: Found 1 Vulkan devices:
+ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | uma: 0 | fp16: 1 | bf16: 0 | warp size: 32 | shared memory: 49152 | int dot: 1 | matrix cores: NV_coopmat2
+[Load] DiT backend: Vulkan0 (CPU threads: 16)
+[Load] Backend init: 115.6 ms
+[GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864
+[DiT] Self-attn: Q+K fused, V separate
+[DiT] Cross-attn: all separate
+[DiT] MLP: gate+up fused
+[Load] null_condition_emb found (CFG available)
+[WeightCtx] Loaded 478 tensors, 895.6 MB into backend
+[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128
+[Load] DiT weight load: 126.7 ms
+[GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864
+[Load] silence_latent: [15000, 64] from GGUF
+[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048
+[Load] VAE backend: Vulkan0 (CPU threads: 16)
+[VAE] Backend: Vulkan0, Weight buffer: 161.1 MB
+[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations
+[Load] VAE weights: 667.9 ms
+[Request 1/1] ggml-turbo/request0.json (batch=1)
+[Request] parsed ggml-turbo/request0.json (18 fields)
+[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0)
+[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s
+[Pipeline] 434 audio codes (86.8s @ 5Hz)
+[Pipeline] T=2170, S=1085
+[BPE] Loaded from GGUF: 151643 vocab, 151387 merges
+[Load] BPE tokenizer: 31.0 ms
+[Pipeline] caption: 70 tokens, lyrics: 167 tokens
+[Load] TextEncoder backend: Vulkan0 (CPU threads: 16)
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[Load] TextEncoder: 28L, H=1024, Nh=16/8
+[Qwen3] Attn: Q+K+V fused
+[Qwen3] MLP: gate+up fused
+[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend
+[Load] TextEncoder: 166.1 ms
+[Encode] TextEncoder (70 tokens): 18.4 ms
+[Debug] text_hidden: [70, 1024] first4: 3.705836 2.395382 0.221845 -13.145830
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[Encode] Lyric vocab lookup (167 tokens): 11.3 ms
+[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809
+[Load] CondEncoder backend: Vulkan0 (CPU threads: 16)
+[GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864
+[Load] LyricEncoder: 8L
+[Qwen3] Attn: Q+K fused, V separate
+[Qwen3] MLP: gate+up fused
+[Load] TimbreEncoder: 4L
+[Qwen3] Attn: Q+K fused, V separate
+[Qwen3] MLP: gate+up fused
+[WeightCtx] Loaded 140 tensors, 352.5 MB into backend
+[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond
+[Load] ConditionEncoder: 43.9 ms
+[CondEnc] Lyric sliding mask: 167x167, window=128
+[CondEnc] Timbre sliding mask: 750x750, window=128
+[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens
+[Encode] ConditionEncoder: 18.2 ms, enc_S=238
+[Debug] enc_hidden: [238, 2048] first4: 1.760519 -0.046675 -0.129011 0.057651
+[GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864
+[WeightCtx] Loaded 30 tensors, 64.7 MB into backend
+[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64)
+[Load] Detokenizer: 8.9 ms
+[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz)
+[Context] Detokenizer: 152.2 ms
+[Debug] detok_output: [2170, 64] first4: -0.107345 1.442038 0.300564 -0.641466
+[Context Batch0] Philox noise seed=42, [2170, 64]
+[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656
+[Debug] context: [2170, 128] first4: -0.107345 1.442038 0.300564 -0.641466
+[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1
+[DiT] Batch N=1, T=2170, S=1085, enc_S=238
+[DiT] Graph: 1775 nodes
+[Debug] tproj: [12288] first4: 0.260934 -0.160421 -0.090493 0.048629
+[Debug] temb: [2048] first4: 0.000206 -0.133914 -0.034444 0.065020
+[Debug] temb_t: [2048] first4: 0.000970 0.025693 -0.052101 0.063331
+[Debug] temb_r: [2048] first4: -0.000764 -0.159607 0.017657 0.001690
+[Debug] sinusoidal_t: [256] first4: 0.562486 0.789701 0.439822 -0.023583
+[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000
+[Debug] temb_lin1_t: [2048] first4: -0.049286 -0.053324 -0.012254 -0.047666
+[Debug] temb_lin1_r: [2048] first4: -0.015463 -0.031534 -0.021259 0.006135
+[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.048340 -0.991272 0.525635 0.454071
+[Debug] proj_in_input: [192, 2170] first4: -0.107345 1.442038 0.300564 -0.641466
+[Debug] enc_after_cond_emb: [2048, 238] first4: -0.176880 0.743576 0.273499 -0.548842
+[Debug] layer0_sa_input: [2048, 1085] first4: -0.723765 -0.772117 -0.042278 0.260597
+[Debug] layer0_q_after_rope: [128, 16] first4: -3.943359 0.398682 0.213257 0.700195
+[Debug] layer0_k_after_rope: [128, 8] first4: -0.176880 0.743576 0.273499 -0.548842
+[Debug] layer0_sa_output: [2048, 1085] first4: -1.613281 0.155151 -0.481201 0.457520
+[Debug] layer0_attn_out: [2048, 1085] first4: -12.139185 0.824881 1.501430 1.799707
+[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.581965 -1.059581 0.060089 0.462956
+[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.685481 -0.828136 -0.442840 0.506230
+[Debug] hidden_after_layer0: [2048, 1085] first4: -8.767639 0.404994 47.213272 -0.751820
+[Debug] hidden_after_layer6: [2048, 1085] first4: -11.862045 -4.874043 33.389240 -6.747426
+[Debug] hidden_after_layer12: [2048, 1085] first4: -0.032505 3.430909 11.062031 -3.459812
+[Debug] hidden_after_layer18: [2048, 1085] first4: -3.097944 5.710473 -3.142628 -23.355347
+[Debug] hidden_after_layer23: [2048, 1085] first4: -48.737732 95.176071 35.848183 73.305969
+[Debug] dit_step0_vt: [2170, 64] first4: 0.669312 0.442215 1.300629 2.101841
+[Debug] dit_step0_xt: [2170, 64] first4: 0.163913 2.136149 -0.230995 0.752118
+[DiT] step 1/8 t=1.000
+[Debug] dit_step1_vt: [2170, 64] first4: 1.120422 0.593113 1.031189 1.813599
+[Debug] dit_step1_xt: [2170, 64] first4: 0.102799 2.103798 -0.287241 0.653194
+[DiT] step 2/8 t=0.955
+[Debug] dit_step2_vt: [2170, 64] first4: 1.381363 0.295410 1.456146 1.949341
+[Debug] dit_step2_xt: [2170, 64] first4: 0.010708 2.084104 -0.384318 0.523238
+[DiT] step 3/8 t=0.900
+[Debug] dit_step3_vt: [2170, 64] first4: 1.440727 0.067017 1.481567 2.158554
+[Debug] dit_step3_xt: [2170, 64] first4: -0.109353 2.078519 -0.507782 0.343359
+[DiT] step 4/8 t=0.833
+[Debug] dit_step4_vt: [2170, 64] first4: 1.377216 0.234177 1.413437 2.181564
+[Debug] dit_step4_xt: [2170, 64] first4: -0.256912 2.053428 -0.659221 0.109620
+[DiT] step 5/8 t=0.750
+[Debug] dit_step5_vt: [2170, 64] first4: 1.135239 0.376801 1.055233 2.272675
+[Debug] dit_step5_xt: [2170, 64] first4: -0.419089 1.999600 -0.809969 -0.215048
+[DiT] step 6/8 t=0.643
+[Debug] dit_step6_vt: [2170, 64] first4: 0.948242 0.399368 0.426941 2.645081
+[Debug] dit_step6_xt: [2170, 64] first4: -0.608737 1.919726 -0.895357 -0.744064
+[DiT] step 7/8 t=0.500
+[Debug] dit_step7_vt: [2170, 64] first4: 0.549133 -0.167076 0.379578 2.984619
+[Debug] dit_x0: [2170, 64] first4: -0.773477 1.969849 -1.009230 -1.639450
+[DiT] step 8/8 t=0.300
+[DiT] Total generation: 263.6 ms (263.6 ms/sample)
+[Debug] dit_output: [2170, 64] first4: -0.773477 1.969849 -1.009230 -1.639450
+[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128)
+[VAE] Graph: 417 nodes, T_latent=192
+[VAE] Upsample factor: 1920.00 (expected ~1920)
+[VAE] Graph: 417 nodes, T_latent=256
+[VAE] Graph: 417 nodes, T_latent=186
+[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz)
+[VAE Batch0] Decode: 9686.3 ms
+[Debug] vae_audio: [2, 4166400] first4: 0.015021 0.018215 0.017495 0.016521
+[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo)
+[Request 1/1] Done
+[Pipeline] All done
+2026-03-01 19:56:19.059 | WARNING  | acestep.training.lora_utils:<module>:29 - PEFT library not installed. LoRA training will not be available.
+2026-03-01 19:56:19.060 | WARNING  | acestep.training.lokr_utils:<module>:24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora
+2026-03-01 19:56:19.060 | WARNING  | acestep.training.data_module:<module>:25 - Lightning not installed. Training module will not be available.
+2026-03-01 19:56:19.060 | WARNING  | acestep.training.trainer:<module>:28 - Lightning Fabric not installed. Training will use basic training loop.
+2026-03-01 19:56:19.060 | WARNING  | acestep.training.trainer:<module>:36 - bitsandbytes not installed. Using standard AdamW.
+2026-03-01 19:56:19.832 | INFO     | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa
+`torch_dtype` is deprecated! Use `dtype` instead!
+2026-03-01 19:56:21.417 | INFO     | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation...
+2026-03-01 19:56:21.417 | INFO     | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs...
+2026-03-01 19:56:21.428 | INFO     | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0...
+2026-03-01 19:56:21.589 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0...
+2026-03-01 19:56:21.591 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - 
+======================================================================
+2026-03-01 19:56:21.591 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference)
+2026-03-01 19:56:21.591 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ======================================================================
+2026-03-01 19:56:21.591 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt:
+# Instruction
+Generate audio semantic tokens based on the given conditions:
+
+# Caption
+An upbeat and anthemic pop-rock track driven by bright, slightly overdriven
+
+# Metas
+- bpm: 83
+- timesignature: 4
+- keyscale: G major
+- duration: 88 seconds
+<|endoftext|>
+
+2026-03-01 19:56:21.591 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ======================================================================
+2026-03-01 19:56:21.591 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text:
+# Languages
+fr
+
+# Lyric
+# Lyric
+[Intro - Guitar Riff]
+[Verse 1]
+Dans le monde des tutos virtuels
+G ta toise en nouvelle passion
+Avec Ggendoline et Pumbé à midi
+La communauté, c'est l'unité
+Quel joie, une clé
+
+[Chorus]
+Dans le monde des tutos virtuels
+Gândoline et Pumbé à midi
+Une famille à connecter, c'est vrai
+D'un enfant qui voit toi fusionner
+
+[Guitar Solo]
+
+[Verse 2]
+Dans le monde des tutos virtuels
+Gândoline, Pumbé à midi
+Une famille à connecter, c'est vrai
+D'un enfant qui voit toi fusionner<|endoftext|>
+2026-03-01 19:56:21.591 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ======================================================================
+
+2026-03-01 19:56:21.597 | INFO     | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings...
+2026-03-01 19:56:21.610 | INFO     | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings...
+2026-03-01 19:56:21.610 | INFO     | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda))
+2026-03-01 19:56:21.642 | INFO     | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)...
+2026-03-01 19:56:21.955 | INFO     | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents...
+2026-03-01 19:56:21.956 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16
+2026-03-01 19:56:21.956 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006905794143676758, 'diffusion_time_cost': 0.3056776523590088, 'diffusion_per_step_time_cost': 0.0382097065448761, 'total_time_cost': 0.31258344650268555, 'offload_time_cost': 0.0}
+2026-03-01 19:56:21.970 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE...
+2026-03-01 19:56:21.973 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB
+2026-03-01 19:56:21.973 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB
+2026-03-01 19:56:21.973 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage...
+2026-03-01 19:56:21.973 | DEBUG    | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB
+2026-03-01 19:56:21.973 | DEBUG    | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB
+2026-03-01 19:56:21.973 | INFO     | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170])
+2026-03-01 19:56:22.249 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB
+2026-03-01 19:56:22.252 | INFO     | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors...
+2026-03-01 19:56:22.255 | INFO     | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors.
+[Request] Loaded request0.json
+[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q4_K_M.gguf
+[GGML] Running acestep-v15-turbo-Q4_K_M.gguf...
+[GGML] Done, 47 dump files
+[Python] Initializing acestep-v15-turbo...
+[Python] Generating (acestep-v15-turbo, 8 steps)...
+Using precomputed LM hints
+Using precomputed LM hints
+[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo)
+[Python] Done, 40 dump files
+[Turbo] Cosine similarities GGML vs Python
+  stage                          GGML vs Python
+  text_hidden                          0.999812
+  lyric_embed                          1.000000
+  enc_hidden                           0.997128
+  detok_output                         0.999611
+  context                              0.999751
+  noise                                1.000000
+  temb_t                               0.999906
+  hidden_after_proj_in                 0.999907
+  enc_after_cond_emb                   0.997645
+  layer0_sa_output                     0.998432
+  hidden_after_layer0                  0.999545
+  hidden_after_layer6                  0.923275
+  hidden_after_layer12                 0.969957
+  hidden_after_layer18                 0.964919
+  hidden_after_layer23                 0.947132
+  dit_step0_vt                         0.790630
+  dit_step0_xt                         0.999550
+  dit_step1_vt                         0.812267
+  dit_step1_xt                         0.998316
+  dit_step2_vt                         0.797855
+  dit_step2_xt                         0.994982
+  dit_step3_vt                         0.785550
+  dit_step3_xt                         0.987155
+  dit_step4_vt                         0.777677
+  dit_step4_xt                         0.969894
+  dit_step5_vt                         0.765554
+  dit_step5_xt                         0.933268
+  dit_step6_vt                         0.748164
+  dit_step6_xt                         0.865654
+  dit_step7_vt                         0.704997
+  dit_x0                               0.768990
+  vae_audio                            0.377954
+  vae_audio (STFT cosine)              0.669489
+[Turbo] Error growth GGML vs Python
+  stage                         cos    max_err   mean_err     mean_A      std_A     mean_B      std_B
+  dit_step0_xt             0.999550   0.201120   0.022082  -0.002496   0.972768  -0.002342   0.972003
+  dit_step1_xt             0.998316   0.415084   0.041258  -0.005641   0.942202  -0.005313   0.941730
+  dit_step2_xt             0.994982   0.710340   0.068500  -0.010236   0.907728  -0.009311   0.908527
+  dit_step3_xt             0.987155   1.070455   0.105302  -0.016404   0.870181  -0.014577   0.873624
+  dit_step4_xt             0.969894   1.456633   0.155292  -0.024587   0.833834  -0.021660   0.841995
+  dit_step5_xt             0.933268   1.997366   0.225911  -0.035903   0.808944  -0.032109   0.824593
+  dit_step6_xt             0.865654   3.020976   0.331484  -0.051668   0.828925  -0.046482   0.855546
diff --git a/tests/Vulkan-Q5_K_M.log b/tests/Vulkan-Q5_K_M.log
new file mode 100644
index 0000000..ec38ab3
--- /dev/null
+++ b/tests/Vulkan-Q5_K_M.log
@@ -0,0 +1,259 @@
+ggml_vulkan: Found 1 Vulkan devices:
+ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | uma: 0 | fp16: 1 | bf16: 0 | warp size: 32 | shared memory: 49152 | int dot: 1 | matrix cores: NV_coopmat2
+[Load] DiT backend: Vulkan0 (CPU threads: 16)
+[Load] Backend init: 114.1 ms
+[GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864
+[DiT] Self-attn: Q+K fused, V separate
+[DiT] Cross-attn: all separate
+[DiT] MLP: gate+up fused
+[Load] null_condition_emb found (CFG available)
+[WeightCtx] Loaded 478 tensors, 1061.2 MB into backend
+[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128
+[Load] DiT weight load: 151.9 ms
+[GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864
+[Load] silence_latent: [15000, 64] from GGUF
+[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048
+[Load] VAE backend: Vulkan0 (CPU threads: 16)
+[VAE] Backend: Vulkan0, Weight buffer: 161.1 MB
+[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations
+[Load] VAE weights: 677.1 ms
+[Request 1/1] ggml-turbo/request0.json (batch=1)
+[Request] parsed ggml-turbo/request0.json (18 fields)
+[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0)
+[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s
+[Pipeline] 434 audio codes (86.8s @ 5Hz)
+[Pipeline] T=2170, S=1085
+[BPE] Loaded from GGUF: 151643 vocab, 151387 merges
+[Load] BPE tokenizer: 32.6 ms
+[Pipeline] caption: 70 tokens, lyrics: 167 tokens
+[Load] TextEncoder backend: Vulkan0 (CPU threads: 16)
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[Load] TextEncoder: 28L, H=1024, Nh=16/8
+[Qwen3] Attn: Q+K+V fused
+[Qwen3] MLP: gate+up fused
+[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend
+[Load] TextEncoder: 167.6 ms
+[Encode] TextEncoder (70 tokens): 18.0 ms
+[Debug] text_hidden: [70, 1024] first4: 3.705836 2.395382 0.221845 -13.145830
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[Encode] Lyric vocab lookup (167 tokens): 11.1 ms
+[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809
+[Load] CondEncoder backend: Vulkan0 (CPU threads: 16)
+[GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864
+[Load] LyricEncoder: 8L
+[Qwen3] Attn: Q+K fused, V separate
+[Qwen3] MLP: gate+up fused
+[Load] TimbreEncoder: 4L
+[Qwen3] Attn: Q+K fused, V separate
+[Qwen3] MLP: gate+up fused
+[WeightCtx] Loaded 140 tensors, 412.5 MB into backend
+[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond
+[Load] ConditionEncoder: 55.7 ms
+[CondEnc] Lyric sliding mask: 167x167, window=128
+[CondEnc] Timbre sliding mask: 750x750, window=128
+[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens
+[Encode] ConditionEncoder: 17.4 ms, enc_S=238
+[Debug] enc_hidden: [238, 2048] first4: 1.760480 -0.051691 -0.132144 0.058144
+[GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864
+[WeightCtx] Loaded 30 tensors, 73.2 MB into backend
+[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64)
+[Load] Detokenizer: 14.2 ms
+[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz)
+[Context] Detokenizer: 176.8 ms
+[Debug] detok_output: [2170, 64] first4: -0.125636 1.455599 0.291766 -0.651349
+[Context Batch0] Philox noise seed=42, [2170, 64]
+[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656
+[Debug] context: [2170, 128] first4: -0.125636 1.455599 0.291766 -0.651349
+[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1
+[DiT] Batch N=1, T=2170, S=1085, enc_S=238
+[DiT] Graph: 1775 nodes
+[Debug] tproj: [12288] first4: 0.260409 -0.161609 -0.102203 0.051602
+[Debug] temb: [2048] first4: -0.000151 -0.132293 -0.035516 0.064751
+[Debug] temb_t: [2048] first4: 0.000578 0.026708 -0.052786 0.063514
+[Debug] temb_r: [2048] first4: -0.000729 -0.159001 0.017269 0.001237
+[Debug] sinusoidal_t: [256] first4: 0.562486 0.789701 0.439822 -0.023583
+[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000
+[Debug] temb_lin1_t: [2048] first4: -0.051153 -0.053631 -0.012192 -0.039024
+[Debug] temb_lin1_r: [2048] first4: -0.016165 -0.021121 -0.015801 -0.000525
+[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.043457 -0.948303 0.538086 0.454315
+[Debug] proj_in_input: [192, 2170] first4: -0.125636 1.455599 0.291766 -0.651349
+[Debug] enc_after_cond_emb: [2048, 238] first4: -0.156174 0.748947 0.319763 -0.524475
+[Debug] layer0_sa_input: [2048, 1085] first4: -0.721755 -0.751598 -0.052189 0.264294
+[Debug] layer0_q_after_rope: [128, 16] first4: -3.849609 0.403564 0.117188 0.729004
+[Debug] layer0_k_after_rope: [128, 8] first4: -0.156174 0.748947 0.319763 -0.524475
+[Debug] layer0_sa_output: [2048, 1085] first4: -1.502930 0.143799 -0.399902 0.485840
+[Debug] layer0_attn_out: [2048, 1085] first4: -12.621027 0.802575 1.516849 1.778620
+[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.542487 -1.011762 0.149138 0.465263
+[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.584631 -0.767133 -0.342805 0.501823
+[Debug] hidden_after_layer0: [2048, 1085] first4: -9.051172 0.588318 50.418579 -0.862462
+[Debug] hidden_after_layer6: [2048, 1085] first4: -17.400093 -1.418044 30.339943 -5.945173
+[Debug] hidden_after_layer12: [2048, 1085] first4: 6.109352 -15.584214 49.778614 -0.069897
+[Debug] hidden_after_layer18: [2048, 1085] first4: -11.684156 5.829335 7.772402 -2.692122
+[Debug] hidden_after_layer23: [2048, 1085] first4: -44.213371 57.440056 122.126839 44.268806
+[Debug] dit_step0_vt: [2170, 64] first4: -0.006317 1.190186 0.280113 2.456451
+[Debug] dit_step0_xt: [2170, 64] first4: 0.194623 2.102151 -0.184607 0.735999
+[DiT] step 1/8 t=1.000
+[Debug] dit_step1_vt: [2170, 64] first4: -0.053368 1.748116 -0.894806 1.618408
+[Debug] dit_step1_xt: [2170, 64] first4: 0.197534 2.006799 -0.135800 0.647723
+[DiT] step 2/8 t=0.955
+[Debug] dit_step2_vt: [2170, 64] first4: -0.025024 1.326050 -0.792084 2.043884
+[Debug] dit_step2_xt: [2170, 64] first4: 0.199202 1.918396 -0.082994 0.511464
+[DiT] step 3/8 t=0.900
+[Debug] dit_step3_vt: [2170, 64] first4: -0.000458 1.126770 -0.795593 2.254120
+[Debug] dit_step3_xt: [2170, 64] first4: 0.199240 1.824498 -0.016695 0.323620
+[DiT] step 4/8 t=0.833
+[Debug] dit_step4_vt: [2170, 64] first4: 0.174652 1.253662 -1.125977 2.441956
+[Debug] dit_step4_xt: [2170, 64] first4: 0.180528 1.690177 0.103946 0.061982
+[DiT] step 5/8 t=0.750
+[Debug] dit_step5_vt: [2170, 64] first4: 0.205261 1.640076 -1.795410 2.452087
+[Debug] dit_step5_xt: [2170, 64] first4: 0.151205 1.455881 0.360433 -0.288316
+[DiT] step 6/8 t=0.643
+[Debug] dit_step6_vt: [2170, 64] first4: -0.158905 1.750122 -2.412979 2.419128
+[Debug] dit_step6_xt: [2170, 64] first4: 0.182986 1.105856 0.843029 -0.772142
+[DiT] step 7/8 t=0.500
+[Debug] dit_step7_vt: [2170, 64] first4: -0.636047 1.672760 -3.485062 2.600891
+[Debug] dit_x0: [2170, 64] first4: 0.373800 0.604028 1.888547 -1.552409
+[DiT] step 8/8 t=0.300
+[DiT] Total generation: 269.9 ms (269.9 ms/sample)
+[Debug] dit_output: [2170, 64] first4: 0.373800 0.604028 1.888547 -1.552409
+[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128)
+[VAE] Graph: 417 nodes, T_latent=192
+[VAE] Upsample factor: 1920.00 (expected ~1920)
+[VAE] Graph: 417 nodes, T_latent=256
+[VAE] Graph: 417 nodes, T_latent=186
+[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz)
+[VAE Batch0] Decode: 9630.7 ms
+[Debug] vae_audio: [2, 4166400] first4: 0.001367 0.001844 0.001533 0.001892
+[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo)
+[Request 1/1] Done
+[Pipeline] All done
+2026-03-01 19:56:02.727 | WARNING  | acestep.training.lora_utils:<module>:29 - PEFT library not installed. LoRA training will not be available.
+2026-03-01 19:56:02.728 | WARNING  | acestep.training.lokr_utils:<module>:24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora
+2026-03-01 19:56:02.728 | WARNING  | acestep.training.data_module:<module>:25 - Lightning not installed. Training module will not be available.
+2026-03-01 19:56:02.728 | WARNING  | acestep.training.trainer:<module>:28 - Lightning Fabric not installed. Training will use basic training loop.
+2026-03-01 19:56:02.728 | WARNING  | acestep.training.trainer:<module>:36 - bitsandbytes not installed. Using standard AdamW.
+2026-03-01 19:56:03.499 | INFO     | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa
+`torch_dtype` is deprecated! Use `dtype` instead!
+2026-03-01 19:56:05.072 | INFO     | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation...
+2026-03-01 19:56:05.072 | INFO     | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs...
+2026-03-01 19:56:05.078 | INFO     | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0...
+2026-03-01 19:56:05.239 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0...
+2026-03-01 19:56:05.241 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - 
+======================================================================
+2026-03-01 19:56:05.241 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference)
+2026-03-01 19:56:05.241 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ======================================================================
+2026-03-01 19:56:05.241 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt:
+# Instruction
+Generate audio semantic tokens based on the given conditions:
+
+# Caption
+An upbeat and anthemic pop-rock track driven by bright, slightly overdriven
+
+# Metas
+- bpm: 83
+- timesignature: 4
+- keyscale: G major
+- duration: 88 seconds
+<|endoftext|>
+
+2026-03-01 19:56:05.241 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ======================================================================
+2026-03-01 19:56:05.241 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text:
+# Languages
+fr
+
+# Lyric
+# Lyric
+[Intro - Guitar Riff]
+[Verse 1]
+Dans le monde des tutos virtuels
+G ta toise en nouvelle passion
+Avec Ggendoline et Pumbé à midi
+La communauté, c'est l'unité
+Quel joie, une clé
+
+[Chorus]
+Dans le monde des tutos virtuels
+Gândoline et Pumbé à midi
+Une famille à connecter, c'est vrai
+D'un enfant qui voit toi fusionner
+
+[Guitar Solo]
+
+[Verse 2]
+Dans le monde des tutos virtuels
+Gândoline, Pumbé à midi
+Une famille à connecter, c'est vrai
+D'un enfant qui voit toi fusionner<|endoftext|>
+2026-03-01 19:56:05.241 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ======================================================================
+
+2026-03-01 19:56:05.247 | INFO     | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings...
+2026-03-01 19:56:05.260 | INFO     | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings...
+2026-03-01 19:56:05.260 | INFO     | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda))
+2026-03-01 19:56:05.285 | INFO     | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)...
+2026-03-01 19:56:05.592 | INFO     | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents...
+2026-03-01 19:56:05.593 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16
+2026-03-01 19:56:05.593 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.00687718391418457, 'diffusion_time_cost': 0.3001282215118408, 'diffusion_per_step_time_cost': 0.0375160276889801, 'total_time_cost': 0.3070054054260254, 'offload_time_cost': 0.0}
+2026-03-01 19:56:05.607 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE...
+2026-03-01 19:56:05.609 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB
+2026-03-01 19:56:05.610 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.40 GB
+2026-03-01 19:56:05.610 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage...
+2026-03-01 19:56:05.610 | DEBUG    | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.40 GB
+2026-03-01 19:56:05.610 | DEBUG    | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.40 GB
+2026-03-01 19:56:05.610 | INFO     | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170])
+2026-03-01 19:56:05.884 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB
+2026-03-01 19:56:05.888 | INFO     | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors...
+2026-03-01 19:56:05.891 | INFO     | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors.
+[Request] Loaded request0.json
+[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q5_K_M.gguf
+[GGML] Running acestep-v15-turbo-Q5_K_M.gguf...
+[GGML] Done, 47 dump files
+[Python] Initializing acestep-v15-turbo...
+[Python] Generating (acestep-v15-turbo, 8 steps)...
+Using precomputed LM hints
+Using precomputed LM hints
+[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo)
+[Python] Done, 40 dump files
+[Turbo] Cosine similarities GGML vs Python
+  stage                          GGML vs Python
+  text_hidden                          0.999812
+  lyric_embed                          1.000000
+  enc_hidden                           0.999132
+  detok_output                         0.999876
+  context                              0.999921
+  noise                                1.000000
+  temb_t                               0.999972
+  hidden_after_proj_in                 0.999959
+  enc_after_cond_emb                   0.999270
+  layer0_sa_output                     0.999442
+  hidden_after_layer0                  0.999638
+  hidden_after_layer6                  0.996691
+  hidden_after_layer12                 0.982345
+  hidden_after_layer18                 0.974400
+  hidden_after_layer23                 0.959738
+  dit_step0_vt                         0.838705
+  dit_step0_xt                         0.999650
+  dit_step1_vt                         0.854589
+  dit_step1_xt                         0.998725
+  dit_step2_vt                         0.841602
+  dit_step2_xt                         0.996217
+  dit_step3_vt                         0.832748
+  dit_step3_xt                         0.990342
+  dit_step4_vt                         0.826828
+  dit_step4_xt                         0.977304
+  dit_step5_vt                         0.815977
+  dit_step5_xt                         0.948497
+  dit_step6_vt                         0.803425
+  dit_step6_xt                         0.895308
+  dit_step7_vt                         0.770195
+  dit_x0                               0.820447
+  vae_audio                            0.478241
+  vae_audio (STFT cosine)              0.753764
+[Turbo] Error growth GGML vs Python
+  stage                         cos    max_err   mean_err     mean_A      std_A     mean_B      std_B
+  dit_step0_xt             0.999650   0.235954   0.018872  -0.002255   0.973213  -0.002342   0.972003
+  dit_step1_xt             0.998725   0.437235   0.034677  -0.005176   0.942982  -0.005313   0.941730
+  dit_step2_xt             0.996217   0.735376   0.057569  -0.009210   0.909169  -0.009311   0.908527
+  dit_step3_xt             0.990342   1.115564   0.088544  -0.014811   0.872820  -0.014577   0.873624
+  dit_step4_xt             0.977304   1.463506   0.131044  -0.022213   0.838526  -0.021660   0.841995
+  dit_step5_xt             0.948497   2.208427   0.193557  -0.032833   0.817339  -0.032109   0.824593
+  dit_step6_xt             0.895308   3.287671   0.286241  -0.047639   0.842369  -0.046482   0.855546
diff --git a/tests/Vulkan-Q6_K.log b/tests/Vulkan-Q6_K.log
new file mode 100644
index 0000000..eff680f
--- /dev/null
+++ b/tests/Vulkan-Q6_K.log
@@ -0,0 +1,259 @@
+ggml_vulkan: Found 1 Vulkan devices:
+ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | uma: 0 | fp16: 1 | bf16: 0 | warp size: 32 | shared memory: 49152 | int dot: 1 | matrix cores: NV_coopmat2
+[Load] DiT backend: Vulkan0 (CPU threads: 16)
+[Load] Backend init: 114.2 ms
+[GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864
+[DiT] Self-attn: Q+K+V fused
+[DiT] Cross-attn: Q+K+V fused
+[DiT] MLP: gate+up fused
+[Load] null_condition_emb found (CFG available)
+[WeightCtx] Loaded 478 tensors, 1237.2 MB into backend
+[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128
+[Load] DiT weight load: 181.3 ms
+[GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864
+[Load] silence_latent: [15000, 64] from GGUF
+[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048
+[Load] VAE backend: Vulkan0 (CPU threads: 16)
+[VAE] Backend: Vulkan0, Weight buffer: 161.1 MB
+[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations
+[Load] VAE weights: 670.0 ms
+[Request 1/1] ggml-turbo/request0.json (batch=1)
+[Request] parsed ggml-turbo/request0.json (18 fields)
+[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0)
+[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s
+[Pipeline] 434 audio codes (86.8s @ 5Hz)
+[Pipeline] T=2170, S=1085
+[BPE] Loaded from GGUF: 151643 vocab, 151387 merges
+[Load] BPE tokenizer: 32.2 ms
+[Pipeline] caption: 70 tokens, lyrics: 167 tokens
+[Load] TextEncoder backend: Vulkan0 (CPU threads: 16)
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[Load] TextEncoder: 28L, H=1024, Nh=16/8
+[Qwen3] Attn: Q+K+V fused
+[Qwen3] MLP: gate+up fused
+[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend
+[Load] TextEncoder: 165.9 ms
+[Encode] TextEncoder (70 tokens): 17.6 ms
+[Debug] text_hidden: [70, 1024] first4: 3.705836 2.395382 0.221845 -13.145830
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[Encode] Lyric vocab lookup (167 tokens): 11.2 ms
+[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809
+[Load] CondEncoder backend: Vulkan0 (CPU threads: 16)
+[GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864
+[Load] LyricEncoder: 8L
+[Qwen3] Attn: Q+K+V fused
+[Qwen3] MLP: gate+up fused
+[Load] TimbreEncoder: 4L
+[Qwen3] Attn: Q+K+V fused
+[Qwen3] MLP: gate+up fused
+[WeightCtx] Loaded 140 tensors, 476.3 MB into backend
+[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond
+[Load] ConditionEncoder: 61.6 ms
+[CondEnc] Lyric sliding mask: 167x167, window=128
+[CondEnc] Timbre sliding mask: 750x750, window=128
+[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens
+[Encode] ConditionEncoder: 15.6 ms, enc_S=238
+[Debug] enc_hidden: [238, 2048] first4: 1.761356 -0.050570 -0.133026 0.058500
+[GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864
+[WeightCtx] Loaded 30 tensors, 82.2 MB into backend
+[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64)
+[Load] Detokenizer: 10.8 ms
+[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz)
+[Context] Detokenizer: 143.8 ms
+[Debug] detok_output: [2170, 64] first4: -0.141024 1.454365 0.315089 -0.623565
+[Context Batch0] Philox noise seed=42, [2170, 64]
+[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656
+[Debug] context: [2170, 128] first4: -0.141024 1.454365 0.315089 -0.623565
+[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1
+[DiT] Batch N=1, T=2170, S=1085, enc_S=238
+[DiT] Graph: 1841 nodes
+[Debug] tproj: [12288] first4: 0.261089 -0.161223 -0.098727 0.051901
+[Debug] temb: [2048] first4: 0.000236 -0.132397 -0.035347 0.064653
+[Debug] temb_t: [2048] first4: 0.001398 0.026957 -0.052741 0.063660
+[Debug] temb_r: [2048] first4: -0.001162 -0.159353 0.017394 0.000993
+[Debug] sinusoidal_t: [256] first4: 0.562486 0.789701 0.439822 -0.023583
+[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000
+[Debug] temb_lin1_t: [2048] first4: -0.049071 -0.051112 -0.017769 -0.037193
+[Debug] temb_lin1_r: [2048] first4: -0.014408 -0.020609 -0.015729 0.003875
+[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.037598 -0.956604 0.541748 0.451630
+[Debug] proj_in_input: [192, 2170] first4: -0.141024 1.454365 0.315089 -0.623565
+[Debug] enc_after_cond_emb: [2048, 238] first4: -0.170166 0.815842 0.310486 -0.571373
+[Debug] layer0_sa_input: [2048, 1085] first4: -0.716080 -0.755969 -0.048350 0.263422
+[Debug] layer0_q_after_rope: [128, 16] first4: -2.400391 -0.081909 -0.397461 1.011719
+[Debug] layer0_k_after_rope: [128, 8] first4: -12.581572 1.117675 1.774897 1.788774
+[Debug] layer0_sa_output: [2048, 1085] first4: -1.503906 0.211304 -0.366943 0.520996
+[Debug] layer0_attn_out: [2048, 1085] first4: -1.540494 -1.050420 0.183235 0.461747
+[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.540494 -1.050420 0.183235 0.461747
+[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.586454 -0.808233 -0.324089 0.502214
+[Debug] hidden_after_layer0: [2048, 1085] first4: -9.155503 0.531986 51.823910 -0.865276
+[Debug] hidden_after_layer6: [2048, 1085] first4: -20.861578 -0.240065 34.589954 -4.288221
+[Debug] hidden_after_layer12: [2048, 1085] first4: -14.692959 -16.975090 77.250595 30.676491
+[Debug] hidden_after_layer18: [2048, 1085] first4: -28.082283 13.370504 64.661263 19.941170
+[Debug] hidden_after_layer23: [2048, 1085] first4: -16.195175 45.294254 196.766129 138.065048
+[Debug] dit_step0_vt: [2170, 64] first4: 0.098133 1.125458 0.338135 2.349396
+[Debug] dit_step0_xt: [2170, 64] first4: 0.189875 2.105093 -0.187245 0.740865
+[DiT] step 1/8 t=1.000
+[Debug] dit_step1_vt: [2170, 64] first4: -0.018386 1.071533 -0.402077 1.814056
+[Debug] dit_step1_xt: [2170, 64] first4: 0.190878 2.046645 -0.165313 0.641917
+[DiT] step 2/8 t=0.955
+[Debug] dit_step2_vt: [2170, 64] first4: -0.052032 1.017303 -0.201233 2.115219
+[Debug] dit_step2_xt: [2170, 64] first4: 0.194347 1.978825 -0.151898 0.500902
+[DiT] step 3/8 t=0.900
+[Debug] dit_step3_vt: [2170, 64] first4: 0.052856 1.105988 0.072205 2.288803
+[Debug] dit_step3_xt: [2170, 64] first4: 0.189942 1.886660 -0.157915 0.310169
+[DiT] step 4/8 t=0.833
+[Debug] dit_step4_vt: [2170, 64] first4: 0.097982 1.134430 0.083038 2.362534
+[Debug] dit_step4_xt: [2170, 64] first4: 0.179444 1.765114 -0.166812 0.057040
+[DiT] step 5/8 t=0.750
+[Debug] dit_step5_vt: [2170, 64] first4: 0.122574 1.016464 0.173828 2.333248
+[Debug] dit_step5_xt: [2170, 64] first4: 0.161934 1.619904 -0.191644 -0.276281
+[DiT] step 6/8 t=0.643
+[Debug] dit_step6_vt: [2170, 64] first4: 0.070358 0.866913 -0.005890 2.297897
+[Debug] dit_step6_xt: [2170, 64] first4: 0.147862 1.446522 -0.190466 -0.735860
+[DiT] step 7/8 t=0.500
+[Debug] dit_step7_vt: [2170, 64] first4: -0.360962 0.376282 -0.314270 2.626526
+[Debug] dit_x0: [2170, 64] first4: 0.256151 1.333637 -0.096185 -1.523818
+[DiT] step 8/8 t=0.300
+[DiT] Total generation: 276.6 ms (276.6 ms/sample)
+[Debug] dit_output: [2170, 64] first4: 0.256151 1.333637 -0.096185 -1.523818
+[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128)
+[VAE] Graph: 417 nodes, T_latent=192
+[VAE] Upsample factor: 1920.00 (expected ~1920)
+[VAE] Graph: 417 nodes, T_latent=256
+[VAE] Graph: 417 nodes, T_latent=186
+[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz)
+[VAE Batch0] Decode: 9723.7 ms
+[Debug] vae_audio: [2, 4166400] first4: 0.000254 0.000880 0.000782 0.001025
+[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo)
+[Request 1/1] Done
+[Pipeline] All done
+2026-03-01 19:55:46.361 | WARNING  | acestep.training.lora_utils:<module>:29 - PEFT library not installed. LoRA training will not be available.
+2026-03-01 19:55:46.361 | WARNING  | acestep.training.lokr_utils:<module>:24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora
+2026-03-01 19:55:46.361 | WARNING  | acestep.training.data_module:<module>:25 - Lightning not installed. Training module will not be available.
+2026-03-01 19:55:46.362 | WARNING  | acestep.training.trainer:<module>:28 - Lightning Fabric not installed. Training will use basic training loop.
+2026-03-01 19:55:46.362 | WARNING  | acestep.training.trainer:<module>:36 - bitsandbytes not installed. Using standard AdamW.
+2026-03-01 19:55:47.150 | INFO     | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa
+`torch_dtype` is deprecated! Use `dtype` instead!
+2026-03-01 19:55:48.700 | INFO     | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation...
+2026-03-01 19:55:48.700 | INFO     | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs...
+2026-03-01 19:55:48.705 | INFO     | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0...
+2026-03-01 19:55:48.864 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0...
+2026-03-01 19:55:48.866 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - 
+======================================================================
+2026-03-01 19:55:48.866 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference)
+2026-03-01 19:55:48.866 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ======================================================================
+2026-03-01 19:55:48.866 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt:
+# Instruction
+Generate audio semantic tokens based on the given conditions:
+
+# Caption
+An upbeat and anthemic pop-rock track driven by bright, slightly overdriven
+
+# Metas
+- bpm: 83
+- timesignature: 4
+- keyscale: G major
+- duration: 88 seconds
+<|endoftext|>
+
+2026-03-01 19:55:48.866 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ======================================================================
+2026-03-01 19:55:48.866 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text:
+# Languages
+fr
+
+# Lyric
+# Lyric
+[Intro - Guitar Riff]
+[Verse 1]
+Dans le monde des tutos virtuels
+G ta toise en nouvelle passion
+Avec Ggendoline et Pumbé à midi
+La communauté, c'est l'unité
+Quel joie, une clé
+
+[Chorus]
+Dans le monde des tutos virtuels
+Gândoline et Pumbé à midi
+Une famille à connecter, c'est vrai
+D'un enfant qui voit toi fusionner
+
+[Guitar Solo]
+
+[Verse 2]
+Dans le monde des tutos virtuels
+Gândoline, Pumbé à midi
+Une famille à connecter, c'est vrai
+D'un enfant qui voit toi fusionner<|endoftext|>
+2026-03-01 19:55:48.866 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ======================================================================
+
+2026-03-01 19:55:48.872 | INFO     | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings...
+2026-03-01 19:55:48.885 | INFO     | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings...
+2026-03-01 19:55:48.885 | INFO     | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda))
+2026-03-01 19:55:48.917 | INFO     | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)...
+2026-03-01 19:55:49.229 | INFO     | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents...
+2026-03-01 19:55:49.230 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16
+2026-03-01 19:55:49.230 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006822347640991211, 'diffusion_time_cost': 0.3050048351287842, 'diffusion_per_step_time_cost': 0.03812560439109802, 'total_time_cost': 0.3118271827697754, 'offload_time_cost': 0.0}
+2026-03-01 19:55:49.244 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE...
+2026-03-01 19:55:49.267 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB
+2026-03-01 19:55:49.267 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB
+2026-03-01 19:55:49.267 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage...
+2026-03-01 19:55:49.267 | DEBUG    | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB
+2026-03-01 19:55:49.267 | DEBUG    | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB
+2026-03-01 19:55:49.267 | INFO     | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170])
+2026-03-01 19:55:49.543 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB
+2026-03-01 19:55:49.546 | INFO     | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors...
+2026-03-01 19:55:49.549 | INFO     | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors.
+[Request] Loaded request0.json
+[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q6_K.gguf
+[GGML] Running acestep-v15-turbo-Q6_K.gguf...
+[GGML] Done, 47 dump files
+[Python] Initializing acestep-v15-turbo...
+[Python] Generating (acestep-v15-turbo, 8 steps)...
+Using precomputed LM hints
+Using precomputed LM hints
+[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo)
+[Python] Done, 40 dump files
+[Turbo] Cosine similarities GGML vs Python
+  stage                          GGML vs Python
+  text_hidden                          0.999812
+  lyric_embed                          1.000000
+  enc_hidden                           0.999665
+  detok_output                         0.999972
+  context                              0.999982
+  noise                                1.000000
+  temb_t                               0.999990
+  hidden_after_proj_in                 0.999982
+  enc_after_cond_emb                   0.999691
+  layer0_sa_output                     0.999774
+  hidden_after_layer0                  0.999710
+  hidden_after_layer6                  0.999855
+  hidden_after_layer12                 0.998856
+  hidden_after_layer18                 0.995803
+  hidden_after_layer23                 0.992072
+  dit_step0_vt                         0.970064
+  dit_step0_xt                         0.999934
+  dit_step1_vt                         0.924403
+  dit_step1_xt                         0.999650
+  dit_step2_vt                         0.915580
+  dit_step2_xt                         0.998651
+  dit_step3_vt                         0.914431
+  dit_step3_xt                         0.996098
+  dit_step4_vt                         0.913750
+  dit_step4_xt                         0.990344
+  dit_step5_vt                         0.906205
+  dit_step5_xt                         0.976856
+  dit_step6_vt                         0.897054
+  dit_step6_xt                         0.950943
+  dit_step7_vt                         0.876737
+  dit_x0                               0.912738
+  vae_audio                            0.744947
+  vae_audio (STFT cosine)              0.875717
+[Turbo] Error growth GGML vs Python
+  stage                         cos    max_err   mean_err     mean_A      std_A     mean_B      std_B
+  dit_step0_xt             0.999934   0.147239   0.007394  -0.002260   0.973056  -0.002342   0.972003
+  dit_step1_xt             0.999650   0.408757   0.017759  -0.005276   0.943557  -0.005313   0.941730
+  dit_step2_xt             0.998651   0.803721   0.033644  -0.009510   0.911087  -0.009311   0.908527
+  dit_step3_xt             0.996098   1.476888   0.054660  -0.015226   0.876460  -0.014577   0.873624
+  dit_step4_xt             0.990344   2.294700   0.082632  -0.022702   0.844225  -0.021660   0.841995
+  dit_step5_xt             0.976856   3.284146   0.125042  -0.033545   0.825286  -0.032109   0.824593
+  dit_step6_xt             0.950943   4.445529   0.188707  -0.049081   0.851111  -0.046482   0.855546
diff --git a/tests/Vulkan-Q8_0.log b/tests/Vulkan-Q8_0.log
new file mode 100644
index 0000000..774bc8a
--- /dev/null
+++ b/tests/Vulkan-Q8_0.log
@@ -0,0 +1,259 @@
+ggml_vulkan: Found 1 Vulkan devices:
+ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | uma: 0 | fp16: 1 | bf16: 0 | warp size: 32 | shared memory: 49152 | int dot: 1 | matrix cores: NV_coopmat2
+[Load] DiT backend: Vulkan0 (CPU threads: 16)
+[Load] Backend init: 113.5 ms
+[GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864
+[DiT] Self-attn: Q+K+V fused
+[DiT] Cross-attn: Q+K+V fused
+[DiT] MLP: gate+up fused
+[Load] null_condition_emb found (CFG available)
+[WeightCtx] Loaded 478 tensors, 1600.7 MB into backend
+[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128
+[Load] DiT weight load: 214.1 ms
+[GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864
+[Load] silence_latent: [15000, 64] from GGUF
+[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048
+[Load] VAE backend: Vulkan0 (CPU threads: 16)
+[VAE] Backend: Vulkan0, Weight buffer: 161.1 MB
+[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations
+[Load] VAE weights: 671.7 ms
+[Request 1/1] ggml-turbo/request0.json (batch=1)
+[Request] parsed ggml-turbo/request0.json (18 fields)
+[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0)
+[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s
+[Pipeline] 434 audio codes (86.8s @ 5Hz)
+[Pipeline] T=2170, S=1085
+[BPE] Loaded from GGUF: 151643 vocab, 151387 merges
+[Load] BPE tokenizer: 31.9 ms
+[Pipeline] caption: 70 tokens, lyrics: 167 tokens
+[Load] TextEncoder backend: Vulkan0 (CPU threads: 16)
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[Load] TextEncoder: 28L, H=1024, Nh=16/8
+[Qwen3] Attn: Q+K+V fused
+[Qwen3] MLP: gate+up fused
+[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend
+[Load] TextEncoder: 176.0 ms
+[Encode] TextEncoder (70 tokens): 17.6 ms
+[Debug] text_hidden: [70, 1024] first4: 3.705836 2.395382 0.221845 -13.145830
+[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568
+[Encode] Lyric vocab lookup (167 tokens): 11.2 ms
+[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809
+[Load] CondEncoder backend: Vulkan0 (CPU threads: 16)
+[GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864
+[Load] LyricEncoder: 8L
+[Qwen3] Attn: Q+K+V fused
+[Qwen3] MLP: gate+up fused
+[Load] TimbreEncoder: 4L
+[Qwen3] Attn: Q+K+V fused
+[Qwen3] MLP: gate+up fused
+[WeightCtx] Loaded 140 tensors, 616.6 MB into backend
+[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond
+[Load] ConditionEncoder: 84.7 ms
+[CondEnc] Lyric sliding mask: 167x167, window=128
+[CondEnc] Timbre sliding mask: 750x750, window=128
+[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens
+[Encode] ConditionEncoder: 19.4 ms, enc_S=238
+[Debug] enc_hidden: [238, 2048] first4: 1.759194 -0.049729 -0.133332 0.058435
+[GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864
+[WeightCtx] Loaded 30 tensors, 106.5 MB into backend
+[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64)
+[Load] Detokenizer: 15.5 ms
+[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz)
+[Context] Detokenizer: 85.1 ms
+[Debug] detok_output: [2170, 64] first4: -0.121505 1.434749 0.303808 -0.627535
+[Context Batch0] Philox noise seed=42, [2170, 64]
+[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656
+[Debug] context: [2170, 128] first4: -0.121505 1.434749 0.303808 -0.627535
+[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1
+[DiT] Batch N=1, T=2170, S=1085, enc_S=238
+[DiT] Graph: 1841 nodes
+[Debug] tproj: [12288] first4: 0.260124 -0.161873 -0.097043 0.052039
+[Debug] temb: [2048] first4: 0.000130 -0.132501 -0.035452 0.064788
+[Debug] temb_t: [2048] first4: 0.001145 0.026826 -0.052770 0.063722
+[Debug] temb_r: [2048] first4: -0.001015 -0.159327 0.017318 0.001066
+[Debug] sinusoidal_t: [256] first4: 0.562486 0.789701 0.439822 -0.023583
+[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000
+[Debug] temb_lin1_t: [2048] first4: -0.048950 -0.051683 -0.015299 -0.038721
+[Debug] temb_lin1_r: [2048] first4: -0.013066 -0.018836 -0.015732 0.008463
+[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.038574 -0.957581 0.536377 0.445770
+[Debug] proj_in_input: [192, 2170] first4: -0.121505 1.434749 0.303808 -0.627535
+[Debug] enc_after_cond_emb: [2048, 238] first4: -0.169861 0.817307 0.328308 -0.558397
+[Debug] layer0_sa_input: [2048, 1085] first4: -0.718007 -0.757392 -0.047301 0.261071
+[Debug] layer0_q_after_rope: [128, 16] first4: -2.423828 -0.099304 -0.408203 1.004883
+[Debug] layer0_k_after_rope: [128, 8] first4: -12.718538 1.122484 1.774887 1.790079
+[Debug] layer0_sa_output: [2048, 1085] first4: -1.510742 0.165771 -0.347900 0.511230
+[Debug] layer0_attn_out: [2048, 1085] first4: -1.542524 -1.031132 0.196691 0.455273
+[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.542524 -1.031132 0.196691 0.455273
+[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.585310 -0.791508 -0.290125 0.495190
+[Debug] hidden_after_layer0: [2048, 1085] first4: -8.926053 0.558007 51.172398 -0.877717
+[Debug] hidden_after_layer6: [2048, 1085] first4: -20.768745 -0.272222 34.170349 -4.416629
+[Debug] hidden_after_layer12: [2048, 1085] first4: -14.358247 -18.625305 73.571915 30.079784
+[Debug] hidden_after_layer18: [2048, 1085] first4: -26.789474 14.346137 62.040115 19.708126
+[Debug] hidden_after_layer23: [2048, 1085] first4: -2.927731 38.887718 192.805542 144.255524
+[Debug] dit_step0_vt: [2170, 64] first4: 0.027340 1.115875 0.350609 2.345856
+[Debug] dit_step0_xt: [2170, 64] first4: 0.193093 2.105528 -0.187812 0.741026
+[DiT] step 1/8 t=1.000
+[Debug] dit_step1_vt: [2170, 64] first4: 0.002377 1.005737 -0.352661 1.768188
+[Debug] dit_step1_xt: [2170, 64] first4: 0.192964 2.050670 -0.168576 0.644580
+[DiT] step 2/8 t=0.955
+[Debug] dit_step2_vt: [2170, 64] first4: -0.063080 1.061218 -0.344177 1.926041
+[Debug] dit_step2_xt: [2170, 64] first4: 0.197169 1.979922 -0.145631 0.516177
+[DiT] step 3/8 t=0.900
+[Debug] dit_step3_vt: [2170, 64] first4: -0.072388 1.144592 -0.184326 2.069214
+[Debug] dit_step3_xt: [2170, 64] first4: 0.203201 1.884539 -0.130270 0.343743
+[DiT] step 4/8 t=0.833
+[Debug] dit_step4_vt: [2170, 64] first4: 0.004288 1.147110 0.001495 2.068916
+[Debug] dit_step4_xt: [2170, 64] first4: 0.202742 1.761635 -0.130430 0.122073
+[DiT] step 5/8 t=0.750
+[Debug] dit_step5_vt: [2170, 64] first4: 0.070211 1.173462 0.080673 2.086014
+[Debug] dit_step5_xt: [2170, 64] first4: 0.192712 1.593997 -0.141955 -0.175929
+[DiT] step 6/8 t=0.643
+[Debug] dit_step6_vt: [2170, 64] first4: -0.010117 1.145203 0.186996 2.198898
+[Debug] dit_step6_xt: [2170, 64] first4: 0.194735 1.364957 -0.179354 -0.615709
+[DiT] step 7/8 t=0.500
+[Debug] dit_step7_vt: [2170, 64] first4: -0.244629 0.644890 0.358635 2.446594
+[Debug] dit_x0: [2170, 64] first4: 0.268124 1.171490 -0.286945 -1.349687
+[DiT] step 8/8 t=0.300
+[DiT] Total generation: 252.0 ms (252.0 ms/sample)
+[Debug] dit_output: [2170, 64] first4: 0.268124 1.171490 -0.286945 -1.349687
+[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128)
+[VAE] Graph: 417 nodes, T_latent=192
+[VAE] Upsample factor: 1920.00 (expected ~1920)
+[VAE] Graph: 417 nodes, T_latent=256
+[VAE] Graph: 417 nodes, T_latent=186
+[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz)
+[VAE Batch0] Decode: 9843.4 ms
+[Debug] vae_audio: [2, 4166400] first4: 0.000170 0.000825 0.000784 0.001115
+[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo)
+[Request 1/1] Done
+[Pipeline] All done
+2026-03-01 19:55:29.948 | WARNING  | acestep.training.lora_utils:<module>:29 - PEFT library not installed. LoRA training will not be available.
+2026-03-01 19:55:29.948 | WARNING  | acestep.training.lokr_utils:<module>:24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora
+2026-03-01 19:55:29.948 | WARNING  | acestep.training.data_module:<module>:25 - Lightning not installed. Training module will not be available.
+2026-03-01 19:55:29.948 | WARNING  | acestep.training.trainer:<module>:28 - Lightning Fabric not installed. Training will use basic training loop.
+2026-03-01 19:55:29.948 | WARNING  | acestep.training.trainer:<module>:36 - bitsandbytes not installed. Using standard AdamW.
+2026-03-01 19:55:30.699 | INFO     | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa
+`torch_dtype` is deprecated! Use `dtype` instead!
+2026-03-01 19:55:32.273 | INFO     | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation...
+2026-03-01 19:55:32.274 | INFO     | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs...
+2026-03-01 19:55:32.279 | INFO     | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0...
+2026-03-01 19:55:32.442 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0...
+2026-03-01 19:55:32.443 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - 
+======================================================================
+2026-03-01 19:55:32.443 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference)
+2026-03-01 19:55:32.443 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ======================================================================
+2026-03-01 19:55:32.444 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt:
+# Instruction
+Generate audio semantic tokens based on the given conditions:
+
+# Caption
+An upbeat and anthemic pop-rock track driven by bright, slightly overdriven
+
+# Metas
+- bpm: 83
+- timesignature: 4
+- keyscale: G major
+- duration: 88 seconds
+<|endoftext|>
+
+2026-03-01 19:55:32.444 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ======================================================================
+2026-03-01 19:55:32.444 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text:
+# Languages
+fr
+
+# Lyric
+# Lyric
+[Intro - Guitar Riff]
+[Verse 1]
+Dans le monde des tutos virtuels
+G ta toise en nouvelle passion
+Avec Ggendoline et Pumbé à midi
+La communauté, c'est l'unité
+Quel joie, une clé
+
+[Chorus]
+Dans le monde des tutos virtuels
+Gândoline et Pumbé à midi
+Une famille à connecter, c'est vrai
+D'un enfant qui voit toi fusionner
+
+[Guitar Solo]
+
+[Verse 2]
+Dans le monde des tutos virtuels
+Gândoline, Pumbé à midi
+Une famille à connecter, c'est vrai
+D'un enfant qui voit toi fusionner<|endoftext|>
+2026-03-01 19:55:32.444 | INFO     | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ======================================================================
+
+2026-03-01 19:55:32.450 | INFO     | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings...
+2026-03-01 19:55:32.462 | INFO     | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings...
+2026-03-01 19:55:32.463 | INFO     | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda))
+2026-03-01 19:55:32.484 | INFO     | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)...
+2026-03-01 19:55:32.791 | INFO     | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents...
+2026-03-01 19:55:32.791 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16
+2026-03-01 19:55:32.791 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006818294525146484, 'diffusion_time_cost': 0.2995321750640869, 'diffusion_per_step_time_cost': 0.037441521883010864, 'total_time_cost': 0.3063504695892334, 'offload_time_cost': 0.0}
+2026-03-01 19:55:32.806 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE...
+2026-03-01 19:55:32.808 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB
+2026-03-01 19:55:32.808 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.40 GB
+2026-03-01 19:55:32.808 | INFO     | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage...
+2026-03-01 19:55:32.808 | DEBUG    | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.40 GB
+2026-03-01 19:55:32.808 | DEBUG    | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.40 GB
+2026-03-01 19:55:32.808 | INFO     | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170])
+2026-03-01 19:55:33.083 | DEBUG    | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB
+2026-03-01 19:55:33.084 | INFO     | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors...
+2026-03-01 19:55:33.088 | INFO     | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors.
+[Request] Loaded request0.json
+[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q8_0.gguf
+[GGML] Running acestep-v15-turbo-Q8_0.gguf...
+[GGML] Done, 47 dump files
+[Python] Initializing acestep-v15-turbo...
+[Python] Generating (acestep-v15-turbo, 8 steps)...
+Using precomputed LM hints
+Using precomputed LM hints
+[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo)
+[Python] Done, 40 dump files
+[Turbo] Cosine similarities GGML vs Python
+  stage                          GGML vs Python
+  text_hidden                          0.999812
+  lyric_embed                          1.000000
+  enc_hidden                           0.999824
+  detok_output                         0.999983
+  context                              0.999990
+  noise                                1.000000
+  temb_t                               0.999998
+  hidden_after_proj_in                 0.999985
+  enc_after_cond_emb                   0.999817
+  layer0_sa_output                     0.999939
+  hidden_after_layer0                  0.999858
+  hidden_after_layer6                  0.999893
+  hidden_after_layer12                 0.999124
+  hidden_after_layer18                 0.996403
+  hidden_after_layer23                 0.993183
+  dit_step0_vt                         0.973885
+  dit_step0_xt                         0.999943
+  dit_step1_vt                         0.915468
+  dit_step1_xt                         0.999633
+  dit_step2_vt                         0.912211
+  dit_step2_xt                         0.998544
+  dit_step3_vt                         0.912707
+  dit_step3_xt                         0.995860
+  dit_step4_vt                         0.906019
+  dit_step4_xt                         0.989505
+  dit_step5_vt                         0.896537
+  dit_step5_xt                         0.974659
+  dit_step6_vt                         0.886047
+  dit_step6_xt                         0.945866
+  dit_step7_vt                         0.869793
+  dit_x0                               0.905017
+  vae_audio                            0.746037
+  vae_audio (STFT cosine)              0.898352
+[Turbo] Error growth GGML vs Python
+  stage                         cos    max_err   mean_err     mean_A      std_A     mean_B      std_B
+  dit_step0_xt             0.999943   0.140034   0.006943  -0.002318   0.973036  -0.002342   0.972003
+  dit_step1_xt             0.999633   0.423125   0.018056  -0.005257   0.943026  -0.005313   0.941730
+  dit_step2_xt             0.998544   0.841908   0.034537  -0.009209   0.910286  -0.009311   0.908527
+  dit_step3_xt             0.995860   1.521911   0.055719  -0.014626   0.875169  -0.014577   0.873624
+  dit_step4_xt             0.989505   2.346452   0.085477  -0.021803   0.842334  -0.021660   0.841995
+  dit_step5_xt             0.974659   3.387389   0.130921  -0.032225   0.822365  -0.032109   0.824593
+  dit_step6_xt             0.945866   4.812943   0.199910  -0.047290   0.846751  -0.046482   0.855546
diff --git a/tests/debug-dit-cossim.sh b/tests/debug-dit-cossim.sh
index f5ad6ed..284f193 100755
--- a/tests/debug-dit-cossim.sh
+++ b/tests/debug-dit-cossim.sh
@@ -1,7 +1,25 @@
 #!/bin/bash
 
-./debug-dit-cossim.py --mode both --quant BF16 > BF16.log
-./debug-dit-cossim.py --mode both --quant Q8_0 > Q8_0.log
-./debug-dit-cossim.py --mode both --quant Q6_K > Q6_K.log
-./debug-dit-cossim.py --mode both --quant Q5_K_M > Q5_K_M.log
-./debug-dit-cossim.py --mode both --quant Q4_K_M > Q4_K_M.log
+# Rebuild each backend (CUDA, Vulkan, CPU) and run debug-dit-cossim.py in turbo
+# mode for every quantization, capturing stdout+stderr in <Backend>-<quant>.log.
+
+# Work from this script's directory so it can be launched from anywhere.
+cd "$(dirname "${BASH_SOURCE[0]}")" || exit 1
+
+QUANTS=(BF16 Q8_0 Q6_K Q5_K_M Q4_K_M)
+
+# run_backend <label> <build-script>: build from the parent dir, then test every quant.
+run_backend() {
+    local label=$1 build=$2 quant
+    # Abort on a failed build: otherwise the binary left over from the previous
+    # backend would be tested and its results logged under the wrong label.
+    (cd .. && "./$build") || { echo "$build failed, aborting" >&2; exit 1; }
+    for quant in "${QUANTS[@]}"; do
+        ./debug-dit-cossim.py --mode turbo --quant "$quant" 2>&1 | tee "$label-$quant.log"
+    done
+}
+
+run_backend CUDA   buildcuda.sh
+run_backend Vulkan buildvulkan.sh
+run_backend CPU    buildcpu.sh
diff --git a/tools/ace-qwen3.cpp b/tools/ace-qwen3.cpp
index 1094fc9..fbfd049 100644
--- a/tools/ace-qwen3.cpp
+++ b/tools/ace-qwen3.cpp
@@ -560,6 +560,7 @@ static void usage(const char * prog) {
         "Debug:\n"
         "  --max-seq <N>          KV cache size (default: 8192)\n"
         "  --no-fsm               Disable FSM constrained decoding\n"
+        "  --no-fa                Disable flash attention\n"
         "  --dump-logits <path>   Dump prefill logits (binary f32)\n"
         "  --dump-tokens <path>   Dump prompt token IDs (CSV)\n"
         , prog);
@@ -571,6 +572,7 @@ int main(int argc, char ** argv) {
     int max_seq     = 8192;
     int batch_size  = 1;
     bool use_fsm    = true;
+    bool use_fa     = true;
     const char * dump_logits  = nullptr;
     const char * dump_tokens  = nullptr;
 
@@ -590,6 +592,8 @@ int main(int argc, char ** argv) {
             batch_size = atoi(argv[++i]);
         else if (!strcmp(argv[i], "--no-fsm"))
             use_fsm = false;
+        else if (!strcmp(argv[i], "--no-fa"))
+            use_fa = false;
         else if (!strcmp(argv[i], "--dump-logits") && i + 1 < argc)
             dump_logits = argv[++i];
         else if (!strcmp(argv[i], "--dump-tokens") && i + 1 < argc)
@@ -651,6 +655,7 @@ int main(int argc, char ** argv) {
     Timer t_load;
     Qwen3LM model;
     if (!qw3lm_load(&model, model_path, max_seq, n_kv_sets)) return 1;
+    model.use_flash_attn = use_fa;
     double load_ms = t_load.ms();
 
     // FSM
diff --git a/tools/dit-vae.cpp b/tools/dit-vae.cpp
index bfe274d..1f4ffac 100644
--- a/tools/dit-vae.cpp
+++ b/tools/dit-vae.cpp
@@ -77,6 +77,7 @@ static void print_usage(const char * prog) {
         "  --vae-chunk <N>         Latent frames per tile (default: 256)\n"
         "  --vae-overlap <N>       Overlap frames per side (default: 64)\n\n"
         "Debug:\n"
+        "  --no-fa                 Disable flash attention\n"
         "  --dump <dir>            Dump intermediate tensors\n", prog);
 }
 
@@ -100,10 +101,11 @@ int main(int argc, char ** argv) {
     std::vector<const char *> request_paths;
     const char * text_enc_gguf = NULL;
     const char * dit_gguf      = NULL;
-    const char * vae_gguf       = NULL;
+    const char * vae_gguf      = NULL;
     const char * dump_dir      = NULL;
     const char * lora_path     = NULL;
     float lora_scale            = 1.0f;
+    bool use_fa                = true;
     int batch_n                 = 1;
     int vae_chunk               = 256;
     int vae_overlap             = 64;
@@ -118,6 +120,7 @@ int main(int argc, char ** argv) {
         else if (strcmp(argv[i], "--dit") == 0 && i+1 < argc) dit_gguf = argv[++i];
         else if (strcmp(argv[i], "--vae") == 0 && i+1 < argc) vae_gguf = argv[++i];
         else if (strcmp(argv[i], "--dump") == 0 && i+1 < argc) dump_dir = argv[++i];
+        else if (strcmp(argv[i], "--no-fa") == 0) use_fa = false;
         else if (strcmp(argv[i], "--batch") == 0 && i+1 < argc) batch_n = atoi(argv[++i]);
         else if (strcmp(argv[i], "--vae-chunk") == 0 && i+1 < argc) vae_chunk = atoi(argv[++i]);
         else if (strcmp(argv[i], "--vae-overlap") == 0 && i+1 < argc) vae_overlap = atoi(argv[++i]);
@@ -159,6 +162,7 @@ int main(int argc, char ** argv) {
 
     // Load DiT model (once for all requests)
     dit_ggml_init_backend(&model);
+    if (!use_fa) model.use_flash_attn = false;
     fprintf(stderr, "[Load] Backend init: %.1f ms\n", timer.ms());
 
     timer.reset();
@@ -375,6 +379,7 @@ int main(int argc, char ** argv) {
         timer.reset();
         Qwen3GGML text_enc = {};
         qwen3_init_backend(&text_enc);
+        if (!use_fa) text_enc.use_flash_attn = false;
         if (!qwen3_load_text_encoder(&text_enc, text_enc_gguf)) {
             fprintf(stderr, "FATAL: failed to load text encoder\n");
             dit_ggml_free(&model);
@@ -391,30 +396,10 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "[Encode] TextEncoder (%d tokens): %.1f ms\n", S_text, timer.ms());
         debug_dump_2d(&dbg, "text_hidden", text_hidden.data(), S_text, H_text);
 
-        // 5. Lyric embedding (CPU vocab lookup from text encoder embed table)
+        // 5. Lyric embedding (vocab lookup via text encoder)
         timer.reset();
         std::vector<float> lyric_embed(H_text * S_lyric);
-        {
-            GGUFModel gf_te = {};
-            if (!gf_load(&gf_te, text_enc_gguf)) {
-                fprintf(stderr, "FATAL: cannot reopen text encoder GGUF for lyric embed\n");
-                dit_ggml_free(&model);
-                if (have_vae) vae_ggml_free(&vae);
-                return 1;
-            }
-            const void * embed_data = gf_get_data(gf_te, "embed_tokens.weight");
-            if (!embed_data) {
-                fprintf(stderr, "FATAL: embed_tokens.weight not found\n");
-                gf_close(&gf_te);
-                dit_ggml_free(&model);
-                if (have_vae) vae_ggml_free(&vae);
-                return 1;
-            }
-            qwen3_cpu_embed_lookup(embed_data, H_text,
-                                    lyric_ids.data(), S_lyric,
-                                    lyric_embed.data());
-            gf_close(&gf_te);
-        }
+        qwen3_embed_lookup(&text_enc, lyric_ids.data(), S_lyric, lyric_embed.data());
         fprintf(stderr, "[Encode] Lyric vocab lookup (%d tokens): %.1f ms\n", S_lyric, timer.ms());
         debug_dump_2d(&dbg, "lyric_embed", lyric_embed.data(), S_lyric, H_text);
 
@@ -422,6 +407,7 @@ int main(int argc, char ** argv) {
         timer.reset();
         CondGGML cond = {};
         cond_ggml_init_backend(&cond);
+        if (!use_fa) cond.use_flash_attn = false;
         if (!cond_ggml_load(&cond, dit_gguf)) {
             fprintf(stderr, "FATAL: failed to load condition encoder\n");
             dit_ggml_free(&model);
@@ -494,6 +480,7 @@ int main(int argc, char ** argv) {
                 if (have_vae) vae_ggml_free(&vae);
                 return 1;
             }
+            if (!use_fa) detok.use_flash_attn = false;
             fprintf(stderr, "[Load] Detokenizer: %.1f ms\n", timer.ms());
 
             int T_5Hz = (int)codes_vec.size();