diff --git a/CMakeLists.txt b/CMakeLists.txt index afa9cd0..d7af387 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,7 +15,12 @@ add_compile_definitions(GGML_MAX_NAME=128) # CUDA architectures: cover Turing to Blackwell for distributed binaries. # Users can override with -DCMAKE_CUDA_ARCHITECTURES=native for local builds. if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) - set(CMAKE_CUDA_ARCHITECTURES "75-virtual;80-virtual;86-real;89-real;120a-real;121a-real") + find_package(CUDAToolkit QUIET) + if(CUDAToolkit_FOUND AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8") + set(CMAKE_CUDA_ARCHITECTURES "75-virtual;80-virtual;86-real;89-real;120a-real;121a-real") + else() + set(CMAKE_CUDA_ARCHITECTURES "75-virtual;80-virtual;86-real;89-real") + endif() endif() # ggml as subdirectory, inherits GGML_CUDA, GGML_METAL, etc. from cmake flags @@ -70,3 +75,7 @@ link_ggml_backends(ace-qwen3) # quantize: GGUF requantizer (BF16 -> K-quants) add_executable(quantize tools/quantize.cpp) link_ggml_backends(quantize) + +# neural-codec: Oobleck VAE neural audio codec (encode/decode WAV <-> latent) +add_executable(neural-codec tools/neural-codec.cpp) +link_ggml_backends(neural-codec) diff --git a/README.md b/README.md index 25449d0..84f39dd 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # acestep.cpp Portable C++17 implementation of ACE-Step 1.5 music generation using GGML. -Text + lyrics in, stereo 48kHz WAV out. Runs on CPU, CUDA, Metal, Vulkan. +Text + lyrics in, stereo 48kHz WAV out. Runs on CPU, CUDA, ROCm, Metal, Vulkan. ## Build @@ -16,6 +16,9 @@ cmake .. # Linux with NVIDIA GPU cmake .. -DGGML_CUDA=ON +# Linux with AMD GPU (ROCm) +cmake .. -DGGML_HIP=ON + # Linux with Vulkan cmake .. -DGGML_VULKAN=ON @@ -29,7 +32,7 @@ cmake .. -DGGML_CUDA=ON -DGGML_BLAS=ON cmake --build . --config Release -j$(nproc) ``` -Builds two binaries: `ace-qwen3` (LLM) and `dit-vae` (DiT + VAE). 
+Builds four binaries: `ace-qwen3` (LLM), `dit-vae` (DiT + VAE), `neural-codec` (VAE encode/decode) and `quantize` (GGUF requantizer).
Phase 1 uses the "Expand" +prompt to generate lyrics and metadata (bpm, keyscale, timesignature, +duration) via CoT. Phase 2 reinjects the CoT and generates audio codes using +the "Generate tokens" prompt. CFG is forced to 1.0 in phase 1 (free +sampling); `lm_cfg_scale` only applies in phase 2. With `--batch N`, each +element runs its own phase 1 from a different seed, producing N completely +different songs. See `examples/simple.json`. -**Caption + lyrics (+ optional metadata)**: the LLM fills missing -metadata via CoT, then generates audio codes. User provided fields -are preserved. See `examples/partial.json`. +**Caption + lyrics (+ optional metadata)**: single LLM pass. The "Generate +tokens" prompt is used directly. Missing metadata is filled via CoT, then +audio codes are generated. User-provided fields are never overwritten. +`lm_cfg_scale` applies to both CoT and code generation. See +`examples/partial.json`. **Everything provided** (caption, lyrics, bpm, duration, keyscale, timesignature): the LLM skips CoT and generates audio codes directly. With `--batch N`, all elements share the same prompt (single prefill, KV cache copied). See `examples/full.json`. +**Instrumental** (`lyrics="[Instrumental]"`): treated as "lyrics provided", +so the single-pass "Generate tokens" path is used. No lyrics generation. +The DiT was trained with this exact string as the no-vocal condition. + **Passthrough** (`audio_codes` present): LLM is skipped entirely. Run `dit-vae` to decode existing codes. See `examples/dit-only.json`. ## Request JSON reference -All fields with defaults. Only `caption` is required. +Only `caption` is required. All other fields default to "unset" which means +the LLM fills them, or a sensible runtime default is applied. ```json { "caption": "", "lyrics": "", - "instrumental": false, "bpm": 0, - "duration": -1, + "duration": 0, "keyscale": "", "timesignature": "", "vocal_language": "unknown", @@ -190,18 +202,98 @@ All fields with defaults. 
Only `caption` is required. "lm_negative_prompt": "", "audio_codes": "", "inference_steps": 8, - "guidance_scale": 7.0, + "guidance_scale": 0.0, "shift": 3.0 } ``` -Key fields: `seed` -1 means random (resolved once, then +1 per batch -element). `audio_codes` is generated by ace-qwen3 and consumed by -dit-vae (comma separated FSQ token IDs). When present, the LLM is -skipped entirely. +### Text conditioning (ace-qwen3 + dit-vae) + +**`caption`** (string, required) +Natural language description of the music style, mood, instruments, etc. +Fed to both the LLM and the DiT text encoder. + +**`lyrics`** (string, default `""`) +Controls vocal generation. Three valid states: +- `""`: LLM generates lyrics from the caption (phase 1 "Expand" prompt). +- `"[Instrumental]"`: no vocals. Passed directly to the DiT, LLM skips lyrics generation. +- Any other string: user-provided lyrics used as-is, LLM only fills missing metadata. + +There is no `instrumental` flag. This field is the single source of truth for +vocal content. + +### Metadata (LLM-filled if unset) + +**`bpm`** (int, default `0` = unset) +Beats per minute. LLM generates one if 0. + +**`duration`** (float seconds, default `0` = unset) +Target audio duration. `0` means the LLM picks it. Clamped to [1, 600]s after +generation. `1` means 1 second. + +**`keyscale`** (string, default `""` = unset) +Musical key and scale, e.g. `"C major"`, `"F# minor"`. LLM fills if empty. + +**`timesignature`** (string, default `""` = unset) +Time signature numerator as a string, e.g. `"4"` for 4/4, `"3"` for 3/4. +LLM fills if empty. + +**`vocal_language`** (string, default `"unknown"`) +BCP-47 language code for lyrics, e.g. `"en"`, `"fr"`, `"ja"`. When set and +lyrics are being generated, the FSM constrains the LLM output to that language. +`"unknown"` lets the LLM decide. + +### Generation control + +**`seed`** (int64, default `-1` = random) +RNG seed. Resolved once at startup to a random value if -1. 
Batch elements +use `seed+0`, `seed+1`, ... `seed+N-1`. + +**`audio_codes`** (string, default `""`) +Comma-separated FSQ token IDs produced by ace-qwen3. When non-empty, the +entire LLM pass is skipped and dit-vae decodes these codes directly +(passthrough / cover mode). + +### LM sampling (ace-qwen3) + +**`lm_temperature`** (float, default `0.85`) +Sampling temperature for both phase 1 (lyrics/metadata) and phase 2 (audio +codes). Lower = more deterministic. + +**`lm_cfg_scale`** (float, default `2.0`) +Classifier-Free Guidance scale for the LM. Only active in phase 2 (audio +code generation) and in phase 1 when lyrics are already provided. When +`lyrics` is empty, phase 1 always runs with `cfg=1.0` (free sampling). +`1.0` disables CFG. -Turbo preset: `inference_steps=8, shift=3.0` (no guidance_scale, turbo models don't use CFG). -SFT preset: `inference_steps=50, guidance_scale=4.0, shift=6.0`. +**`lm_top_p`** (float, default `0.9`) +Nucleus sampling cutoff. `1.0` disables. When `top_k=0`, an internal +pre-filter of 256 tokens is applied before top_p for performance. + +**`lm_top_k`** (int, default `0` = disabled) +Top-K sampling. `0` disables hard top-K (top_p still applies). + +**`lm_negative_prompt`** (string, default `""`) +Negative caption for CFG in phase 2. Empty string falls back to a +caption-less unconditional prompt. + +### DiT flow matching (dit-vae) + +**`inference_steps`** (int, default `8`) +Number of diffusion denoising steps. Turbo preset: `8`. SFT preset: `50`. + +**`guidance_scale`** (float, default `0.0` = auto) +CFG scale for the DiT. `0.0` is resolved at runtime: +- Turbo models: forced to `1.0` (CFG disabled, turbo was trained without it). +- SFT/base models: `7.0`. +Any value > 1.0 on a turbo model is overridden to 1.0 with a warning. + +**`shift`** (float, default `3.0`) +Flow-matching schedule shift. Controls the timestep distribution. +`shift = s*t / (1 + (s-1)*t)`. Turbo preset: `3.0`. SFT preset: `6.0`. 
+ +Turbo preset: `inference_steps=8, shift=3.0` (guidance_scale auto-resolved to 1.0). +SFT preset: `inference_steps=50, guidance_scale=7.0, shift=6.0`. ## ace-qwen3 reference @@ -258,6 +350,71 @@ Debug: Models are loaded once and reused across all requests. +## neural-codec + +GGML-native neural audio codec based on the Oobleck VAE encoder and decoder. +Serves two purposes: validating the precision of the full VAE chain (encode + +decode roundtrip), and compressing music at ~850 B/s with no perceptible +difference from the original. + +``` +Usage: neural-codec --vae --encode|--decode -i [-o ] [--q8|--q4] + +Required: + --vae VAE GGUF file + --encode | --decode Encode WAV to latent, or decode latent to WAV + -i Input (WAV for encode, latent for decode) + +Output: + -o Output file (auto-named if omitted) + --q8 Quantize latent to int8 (~13 kbit/s) + --q4 Quantize latent to int4 (~6.8 kbit/s) + +Output naming: song.wav -> song.latent (f32) or song.nac8 (Q8) or song.nac4 (Q4) + song.latent -> song.wav + +VAE tiling (memory control): + --vae-chunk Latent frames per tile (default: 256) + --vae-overlap Overlap frames per side (default: 64) + +Latent formats (decode auto-detects): + f32: flat [T, 64] f32, no header. ~51 kbit/s. + NAC8: header + per-frame Q8. ~13 kbit/s. + NAC4: header + per-frame Q4. ~6.8 kbit/s. +``` + +The encoder is the symmetric mirror of the decoder: same snake activations, +same residual units, strided conv1d for downsampling instead of transposed +conv1d for upsampling. No new GGML ops. Downsample 2x4x4x6x10 = 1920x. + +48kHz stereo audio is compressed to 64-dimensional latent frames at 25 Hz. 
+Three output formats, decode auto-detects from file content: + +| Format | Frame size | Bitrate | 3 min song | vs f32 (cossim) | +|--------|-----------|---------|------------|-----------------| +| f32 | 256B | 51 kbit/s | 1.1 MB | baseline | +| NAC8 | 66B | 13 kbit/s | 290 KB | 0.9999 | +| NAC4 | 34B | 6.8 kbit/s | 150 KB | 0.989 | + +NAC = Neural Audio Codec. The NAC8 and NAC4 file formats are headerless +except for a 4-byte magic (`NAC8` or `NAC4`) and a uint32 frame count. +Q8 quantization error is 39 dB below the VAE reconstruction error (free). +Q4 quantization error is 16 dB below the VAE reconstruction error (inaudible +on most material). + +```bash +# encode (Q4: 6.8 kbit/s, ~150 KB for 3 minutes) +neural-codec --vae models/vae-BF16.gguf --encode --q4 -i song.wav -o song.nac4 + +# encode (Q8: 13 kbit/s, ~290 KB for 3 minutes) +neural-codec --vae models/vae-BF16.gguf --encode --q8 -i song.wav -o song.nac8 + +# decode (auto-detects format) +neural-codec --vae models/vae-BF16.gguf --decode -i song.nac4 -o song_decoded.wav + +# roundtrip validation: compare song.wav and song_decoded.wav with your ears +``` + ## Architecture ``` @@ -278,6 +435,39 @@ dit-vae WAV stereo 48kHz ``` +## Roadmap + +This project started from a simple idea: a Telegram bot using llama.cpp to +prompt a music generator, and the desire to make GGML sing. No more, no less. +No cloud, no black box, scriptable and nothing between you and the model. + +### LLM modes +- [ ] Remaining modes: Understand, Rewrite (single-pass, no audio codes) +- [ ] Reference audio input: repaint and cover tasks (src_audio + cover_strength) + +### Audio I/O +Current: raw PCM f32 WAV via hand-rolled writer, no external deps. 
+Trade-off to document: +- **Keep as-is**: zero dependencies, clean licensing, works everywhere +- **ffmpeg pipe**: trivial bash wrapper handles any codec/format, no C++ codec hell + - pro: MP3/FLAC/OGG out of the box, input resampling for reference audio + - con: runtime dependency, not embedded +Conclusion pending. Likely ffmpeg as optional external pipe, documented in README. + +### API and interface +- [ ] JSON HTTP server (minimal, well-documented, stable contract) +- [ ] Web interface on top - vibecodeable by anyone, API stays simple +Goal: document the internals and how the model actually works, +not reproduce the Python spaghetti. Expert-first, no commercial fluff. + +### Documentation +Current README is technical study + API reference, intentional. +- [ ] Split when a user-facing interface exists: README (user) + ARCHITECTURE.md (internals) + +### Future models +- [ ] ACE-Step 2.0: evaluate architecture delta, add headers/weights as needed +No commitment, easy to adapt by adding headers or new compilation units as needed. + ## LM specifics ace-qwen3 is not a general-purpose chat engine. It is a two-phase autoregressive @@ -318,7 +508,7 @@ python3 debug-dit-cossim.py # DiT: per-layer cossim GGML vs Python (turbo/ ## Patched GGML fork Uses a patched GGML fork (submodule) with two new ops, a Metal im2col optimization, and -a CUDA bugfix for the Oobleck VAE decoder. All backends: CPU, CUDA, Metal, Vulkan. +a CUDA bugfix for the Oobleck VAE decoder. All backends: CPU, CUDA, ROCm, Metal, Vulkan. F32/F16/BF16 data types. The DiT uses only standard GGML ops and needs no patches. The VAE reconstructs audio from latent space through 5 upsampling blocks (total 1920x), @@ -373,6 +563,19 @@ Upstream `im2col_kernel` uses OW directly as grid dimension Y, which exceeds the times per tile at output widths up to 491520. Fixed with a grid-stride loop on OW and `MIN(OW, MAX_GRIDDIM_Z)` clamping. 
+### Upstream divergence + +The GGML submodule diverges from upstream only by the addition of +`GGML_OP_SNAKE` and `GGML_OP_COL2IM_1D`. No existing upstream kernel is +modified. These ops are required; the VAE does not work without them. + +An earlier approach patched the upstream naive ops instead of adding custom +ones. Those patches were dropped. They are documented here in case someone +wants to study the naive path: + +- `conv_transpose_1d`: bounded loop replacing O(T_in) brute-force, CUDA and Metal +- `im2col`: grid-stride loop on OW to fix gridDim.y overflow for large tensors + ## Acknowledgements Independent implementation based on ACE-Step 1.5 by ACE Studio and StepFun. @@ -387,3 +590,15 @@ All model weights are theirs, this is just a native backend. note={GitHub repository} } ``` + +## Samples + +https://github.com/user-attachments/assets/9a50c1f4-9ec0-474a-bd14-e8c6b00622a1 + +https://github.com/user-attachments/assets/fb606249-0269-4153-b651-bf78e05baf22 + +https://github.com/user-attachments/assets/e0580468-5e33-4a1f-a0f4-b914e4b9a8c2 + +https://github.com/user-attachments/assets/292a31f1-f97e-4060-9207-ed8364d9a794 + +https://github.com/user-attachments/assets/34b1b781-a5bc-46c4-90a6-615a10bc2c6a diff --git a/src/backend.h b/src/backend.h index 4b8566b..df33975 100644 --- a/src/backend.h +++ b/src/backend.h @@ -13,6 +13,7 @@ extern "C" int cudaDeviceGetAttribute(int *, int, int); #endif #include +#include #include #include @@ -41,6 +42,10 @@ static BackendPair backend_init(const char * label) { ggml_backend_load_all(); BackendPair bp = {}; bp.backend = ggml_backend_init_best(); + if (!bp.backend) { + fprintf(stderr, "[Load] FATAL: no backend available\n"); + exit(1); + } int n_threads = (int)std::thread::hardware_concurrency() / 2; if (n_threads < 1) n_threads = 1; // [GGML] If best backend is already CPU, reuse it (avoid 2 CPU instances @@ -51,6 +56,10 @@ static BackendPair backend_init(const char * label) { ggml_backend_cpu_set_n_threads(bp.backend, 
n_threads); } else { bp.cpu_backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL); + if (!bp.cpu_backend) { + fprintf(stderr, "[Load] FATAL: failed to init CPU backend\n"); + exit(1); + } ggml_backend_cpu_set_n_threads(bp.cpu_backend, n_threads); } fprintf(stderr, "[Load] %s backend: %s (CPU threads: %d)\n", @@ -87,5 +96,10 @@ static void backend_release(ggml_backend_t backend, ggml_backend_t cpu_backend) static ggml_backend_sched_t backend_sched_new(BackendPair bp, int max_nodes) { ggml_backend_t backends[2] = { bp.backend, bp.cpu_backend }; int n = (bp.backend == bp.cpu_backend) ? 1 : 2; - return ggml_backend_sched_new(backends, NULL, n, max_nodes, false, true); + ggml_backend_sched_t sched = ggml_backend_sched_new(backends, NULL, n, max_nodes, false, true); + if (!sched) { + fprintf(stderr, "[Load] FATAL: failed to create scheduler\n"); + exit(1); + } + return sched; } diff --git a/src/cond-enc.h b/src/cond-enc.h index e85b6fd..ba53163 100644 --- a/src/cond-enc.h +++ b/src/cond-enc.h @@ -270,7 +270,10 @@ static void cond_ggml_forward(CondGGML * m, if (timbre_out) ggml_build_forward_expand(gf, timbre_out); // Allocate and set inputs - ggml_backend_sched_alloc_graph(m->sched, gf); + if (!ggml_backend_sched_alloc_graph(m->sched, gf)) { + fprintf(stderr, "[CondEncoder] FATAL: failed to allocate graph\n"); + exit(1); + } ggml_backend_tensor_set(t_lyric_in, lyric_embed, 0, 1024 * S_lyric * sizeof(float)); ggml_backend_tensor_set(t_text_in, text_hidden, 0, 1024 * S_text * sizeof(float)); diff --git a/src/debug.h b/src/debug.h index dc7a626..a32cd11 100644 --- a/src/debug.h +++ b/src/debug.h @@ -6,8 +6,6 @@ #include #include #include -#include -#include #include struct DebugDumper { diff --git a/src/dit-graph.h b/src/dit-graph.h index 2a92324..abe64d0 100644 --- a/src/dit-graph.h +++ b/src/dit-graph.h @@ -10,10 +10,7 @@ #include "dit.h" -#include -#include #include -#include // Helper: ensure tensor is f32 (cast if bf16/f16) static struct ggml_tensor * 
dit_ggml_f32( diff --git a/src/dit-sampler.h b/src/dit-sampler.h index 92540a8..31d9817 100644 --- a/src/dit-sampler.h +++ b/src/dit-sampler.h @@ -8,12 +8,8 @@ #include "dit-graph.h" #include "debug.h" -#include "ggml.h" -#include "ggml-backend.h" -#include "ggml-alloc.h" #include -#include #include #include #include diff --git a/src/dit.h b/src/dit.h index 190b2f7..cd2936e 100644 --- a/src/dit.h +++ b/src/dit.h @@ -10,17 +10,13 @@ #include "ggml.h" #include "ggml-backend.h" -#include "ggml-alloc.h" #include "gguf-weights.h" #include "backend.h" -#include "debug.h" #include #include -#include #include -#include // Config (mirrors dit.cuh DiTConfig) struct DiTGGMLConfig { diff --git a/src/fsq-detok.h b/src/fsq-detok.h index c3a1e60..5cc3d7c 100644 --- a/src/fsq-detok.h +++ b/src/fsq-detok.h @@ -110,6 +110,10 @@ static bool detok_ggml_load(DetokGGML * m, const char * gguf_path, ggml_backend_t backends[2] = { backend, cpu_backend }; int n = (backend == cpu_backend) ? 1 : 2; m->sched = ggml_backend_sched_new(backends, NULL, n, 4096, false, true); + if (!m->sched) { + fprintf(stderr, "[FSQ] FATAL: failed to create scheduler\n"); + return false; + } fprintf(stderr, "[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64)\n"); return true; diff --git a/src/gguf-weights.h b/src/gguf-weights.h index ac5f22d..be5bede 100644 --- a/src/gguf-weights.h +++ b/src/gguf-weights.h @@ -18,7 +18,6 @@ #include #include -#include #include #ifdef _WIN32 diff --git a/src/metadata-fsm.h b/src/metadata-fsm.h index becbe1c..69ae125 100644 --- a/src/metadata-fsm.h +++ b/src/metadata-fsm.h @@ -11,10 +11,8 @@ #include #include #include -#include #include #include -#include // Prefix tree for FSM constrained decoding struct PrefixTree { diff --git a/src/prompt.h b/src/prompt.h index 99782d8..c568f5f 100644 --- a/src/prompt.h +++ b/src/prompt.h @@ -8,7 +8,6 @@ #include #include -#include #include #include diff --git a/src/qwen3-enc.h b/src/qwen3-enc.h index b8ea213..ee9b628 100644 --- 
a/src/qwen3-enc.h +++ b/src/qwen3-enc.h @@ -17,7 +17,6 @@ #include "gguf-weights.h" #include #include -#include #include #include @@ -404,7 +403,10 @@ static void qwen3_forward(Qwen3GGML * m, const int * token_ids, int S, float * o ggml_build_forward_expand(gf, out); // Allocate - ggml_backend_sched_alloc_graph(m->sched, gf); + if (!ggml_backend_sched_alloc_graph(m->sched, gf)) { + fprintf(stderr, "[TextEncoder] FATAL: failed to allocate graph (%d tokens)\n", S); + exit(1); + } // Set inputs ggml_backend_tensor_set(t_ids, token_ids, 0, S * sizeof(int)); @@ -455,7 +457,10 @@ static void qwen3_embed_lookup(Qwen3GGML * m, const int * token_ids, int S, floa ggml_set_output(out); ggml_build_forward_expand(gf, out); - ggml_backend_sched_alloc_graph(m->sched, gf); + if (!ggml_backend_sched_alloc_graph(m->sched, gf)) { + fprintf(stderr, "[TextEncoder] FATAL: failed to allocate graph (embed lookup, %d tokens)\n", S); + exit(1); + } ggml_backend_tensor_set(t_ids, token_ids, 0, S * sizeof(int)); ggml_backend_sched_graph_compute(m->sched, gf); ggml_backend_tensor_get(out, output, 0, (size_t)H * S * sizeof(float)); diff --git a/src/qwen3-lm.h b/src/qwen3-lm.h index 3bbd514..5f5e290 100644 --- a/src/qwen3-lm.h +++ b/src/qwen3-lm.h @@ -4,14 +4,11 @@ #pragma once #include "qwen3-enc.h" // Qwen3Layer, Qwen3Config, layer build helpers -#include "ggml-alloc.h" -#include "bpe.h" #include #include #include #include -#include #include // LM config (superset of encoder config) @@ -450,7 +447,10 @@ static void qw3lm_forward(Qwen3LM * m, const int * token_ids, int n_tokens, ggml_build_forward_expand(gf, lgt); // Schedule + allocate - ggml_backend_sched_alloc_graph(m->sched, gf); + if (!ggml_backend_sched_alloc_graph(m->sched, gf)) { + fprintf(stderr, "[LM] FATAL: failed to allocate graph (prefill, %d tokens)\n", n_tokens); + exit(1); + } // Set token IDs ggml_backend_tensor_set(token_ids_t, token_ids, 0, n_tokens * sizeof(int)); @@ -678,7 +678,10 @@ static void qw3lm_forward_batch(Qwen3LM 
* m, const int * token_ids, ggml_build_forward_expand(gf, lgt); // Allocate - ggml_backend_sched_alloc_graph(m->sched, gf); + if (!ggml_backend_sched_alloc_graph(m->sched, gf)) { + fprintf(stderr, "[LM] FATAL: failed to allocate graph (batch decode, N=%d)\n", N); + exit(1); + } // Set token IDs ggml_backend_tensor_set(token_ids_t, token_ids, 0, N * sizeof(int)); diff --git a/src/request.cpp b/src/request.cpp index 9b20423..c851eb3 100644 --- a/src/request.cpp +++ b/src/request.cpp @@ -6,7 +6,6 @@ #include "request.h" #include -#include #include #include @@ -14,9 +13,9 @@ void request_init(AceRequest * r) { r->caption = ""; r->lyrics = ""; - r->instrumental = false; + r->bpm = 0; - r->duration = -1.0f; + r->duration = 0.0f; r->keyscale = ""; r->timesignature = ""; r->vocal_language = "unknown"; @@ -28,7 +27,7 @@ void request_init(AceRequest * r) { r->lm_negative_prompt = ""; r->audio_codes = ""; r->inference_steps = 8; - r->guidance_scale = 1.0f; + r->guidance_scale = 0.0f; r->shift = 3.0f; } @@ -241,7 +240,6 @@ bool request_parse(AceRequest * r, const char * path) { else if (k == "shift") r->shift = (float)atof(v.c_str()); // bools - else if (k == "instrumental") r->instrumental = (v == "true"); // unknown keys: silently ignored (forward compat) } @@ -259,8 +257,6 @@ bool request_write(const AceRequest * r, const char * path) { fprintf(f, "{\n"); fprintf(f, " \"caption\": \"%s\",\n", json_escape(r->caption).c_str()); fprintf(f, " \"lyrics\": \"%s\",\n", json_escape(r->lyrics).c_str()); - if (r->instrumental) - fprintf(f, " \"instrumental\": true,\n"); fprintf(f, " \"bpm\": %d,\n", r->bpm); fprintf(f, " \"duration\": %.1f,\n", r->duration); fprintf(f, " \"keyscale\": \"%s\",\n", json_escape(r->keyscale).c_str()); diff --git a/src/request.h b/src/request.h index 1295b83..d1748b5 100644 --- a/src/request.h +++ b/src/request.h @@ -6,6 +6,7 @@ // Aligned with Python GenerationParams (inference.py:39) and API /release_task. 
// +#include #include #include @@ -13,11 +14,10 @@ struct AceRequest { // text content std::string caption; // "" std::string lyrics; // "" - bool instrumental; // false // metadata (user-provided or LLM-enriched) int bpm; // 0 = unset - float duration; // -1 = unset + float duration; // 0 = unset std::string keyscale; // "" = unset std::string timesignature; // "" = unset std::string vocal_language; // "unknown" diff --git a/src/vae-enc.h b/src/vae-enc.h new file mode 100644 index 0000000..f5c67f2 --- /dev/null +++ b/src/vae-enc.h @@ -0,0 +1,391 @@ +// vae-enc.h: AutoencoderOobleck encoder (audio -> latent) via ggml +// +// Mirror of vae.h decoder. Reuses VAEResUnit, load helpers, graph ops. +// Architecture: conv1(2->128,k=7) -> 5x(3xresunit+snake+strided_conv) -> snake+conv2(2048->128,k=3) +// Output 128ch = mean[64] + scale[64]. Deterministic encode returns mean. +// Downsample: 2x4x4x6x10 = 1920x (matches decoder upsample) + +#pragma once +#include "vae.h" + +// Encoder block: 3xResUnit(in_ch) -> snake(in_ch) -> strided Conv1d(in_ch -> out_ch) +// Decoder block is the mirror: snake(in_ch) -> ConvT(in_ch -> out_ch) -> 3xResUnit(out_ch) +struct VAEEncBlock { + VAEResUnit ru[3]; + struct ggml_tensor * sa, * sb; // snake [1, in_ch] + struct ggml_tensor * dw, * db; // strided conv [K, in_ch, out_ch], bias [out_ch] + int in_ch, out_ch, stride, kernel, padding; +}; + +struct VAEEncoder { + struct ggml_tensor * c1w, * c1b; // conv1 [7, 2, 128], bias [128] + VAEEncBlock blk[5]; + struct ggml_tensor * sa, * sb; // final snake [1, 2048] + struct ggml_tensor * c2w, * c2b; // conv2 [3, 2048, 128], bias [128] + + ggml_backend_t backend; + ggml_backend_t cpu_backend; + ggml_backend_sched_t sched; + ggml_backend_buffer_t buf; + struct ggml_context * weight_ctx; + + // graph cache (rebuilt when T_audio changes) + struct ggml_context * graph_ctx; + uint8_t * graph_buf; + struct ggml_cgraph * graph; + struct ggml_tensor * graph_input; // [T_audio, 2] + struct ggml_tensor * 
graph_output; // [T_latent, 128] + int graph_T; // cached T_audio (0 = no cache) + + std::vector scratch_in; // transposed input [2 * T_audio] +}; + +// Load encoder weights from the same VAE GGUF (encoder.* tensors) +static void vae_enc_load(VAEEncoder * m, const char * path) { + GGUFModel gf = {}; + if (!gf_load(&gf, path)) { + fprintf(stderr, "[VAE-Enc] FATAL: cannot load %s\n", path); + exit(1); + } + + // Encoder channel layout (mirror of decoder, bottom-up): + // conv1: 2 -> 128 + // block: [128->128, 128->256, 256->512, 512->1024, 1024->2048] + // conv2: 2048 -> 128 (split: mean[64] + scale[64]) + // ResUnits run at in_ch (before downsample), unlike decoder (at out_ch, after upsample). + static const int in_ch[] = {128, 128, 256, 512, 1024}; + static const int out_ch[] = {128, 256, 512, 1024, 2048}; + static const int strides[] = { 2, 4, 4, 6, 10}; + static const int dilations[] = {1, 3, 9}; + + // Phase 1: create weight tensors + size_t ctx_size = ggml_tensor_overhead() * 256; + struct ggml_init_params p = { ctx_size, NULL, true }; + m->weight_ctx = ggml_init(p); + struct ggml_context * ctx = m->weight_ctx; + + m->c1w = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 7, 2, 128); + m->c1b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 128); + + for (int i = 0; i < 5; i++) { + VAEEncBlock & b = m->blk[i]; + b.in_ch = in_ch[i]; + b.out_ch = out_ch[i]; + b.stride = strides[i]; + b.kernel = strides[i] * 2; + b.padding = (strides[i] + 1) / 2; // ceil(stride / 2) + int C = in_ch[i]; // res_units + snake at in_ch + + // 3 res units at in_ch + for (int r = 0; r < 3; r++) { + VAEResUnit & ru = b.ru[r]; + ru.dilation = dilations[r]; + ru.s1a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); + ru.s1b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); + ru.c1w = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 7, C, C); + ru.c1b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, C); + ru.s2a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); + ru.s2b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); + ru.c2w = 
ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 1, C, C); + ru.c2b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, C); + } + + // snake at in_ch (before downsample conv) + b.sa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); + b.sb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); + + // strided conv1d: [K, in_ch, out_ch] + b.dw = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, b.kernel, in_ch[i], out_ch[i]); + b.db = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_ch[i]); + } + + m->sa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, 2048); + m->sb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, 2048); + m->c2w = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 3, 2048, 128); + m->c2b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 128); + + // Phase 2: allocate backend buffer + BackendPair bp = backend_init("VAE-Enc"); + m->backend = bp.backend; + m->cpu_backend = bp.cpu_backend; + m->sched = backend_sched_new(bp, 8192); + m->buf = ggml_backend_alloc_ctx_tensors(ctx, m->backend); + if (!m->buf) { + fprintf(stderr, "[VAE-Enc] FATAL: failed to allocate weight buffer\n"); + exit(1); + } + fprintf(stderr, "[VAE-Enc] Backend: %s, Weight buffer: %.1f MB\n", + ggml_backend_name(m->backend), + (float)ggml_backend_buffer_get_size(m->buf) / (1024 * 1024)); + + // Phase 3: load and fuse weights + vae_fuse_wn(m->c1w, gf, "encoder.conv1"); + vae_load_bias(m->c1b, gf, "encoder.conv1.bias"); + + for (int i = 0; i < 5; i++) { + VAEEncBlock & b = m->blk[i]; + std::string blk_pfx = "encoder.block." 
+ std::to_string(i); + + // res_units first (same load pattern as decoder) + for (int r = 0; r < 3; r++) { + VAEResUnit & ru = b.ru[r]; + std::string rp = blk_pfx + ".res_unit" + std::to_string(r + 1); + vae_load_snake(ru.s1a, gf, rp + ".snake1.alpha"); + vae_load_snake_inv(ru.s1b, gf, rp + ".snake1.beta"); + vae_fuse_wn(ru.c1w, gf, rp + ".conv1"); + vae_load_bias(ru.c1b, gf, rp + ".conv1.bias"); + vae_load_snake(ru.s2a, gf, rp + ".snake2.alpha"); + vae_load_snake_inv(ru.s2b, gf, rp + ".snake2.beta"); + vae_fuse_wn(ru.c2w, gf, rp + ".conv2"); + vae_load_bias(ru.c2b, gf, rp + ".conv2.bias"); + } + + // snake + strided downsample conv (regular conv1d, NOT transposed) + vae_load_snake(b.sa, gf, blk_pfx + ".snake1.alpha"); + vae_load_snake_inv(b.sb, gf, blk_pfx + ".snake1.beta"); + vae_fuse_wn(b.dw, gf, blk_pfx + ".conv1"); + vae_load_bias(b.db, gf, blk_pfx + ".conv1.bias"); + } + + vae_load_snake(m->sa, gf, "encoder.snake1.alpha"); + vae_load_snake_inv(m->sb, gf, "encoder.snake1.beta"); + vae_fuse_wn(m->c2w, gf, "encoder.conv2"); + vae_load_bias(m->c2b, gf, "encoder.conv2.bias"); + + fprintf(stderr, "[VAE-Enc] Loaded: 5 blocks, downsample=1920x, F32 activations\n"); + gf_close(&gf); +} + +// Build encoder graph: audio [T_audio, 2] -> [T_latent, 128] +static struct ggml_tensor * vae_enc_build_graph( + struct ggml_context * ctx, + VAEEncoder * m, + struct ggml_tensor * audio) { // [T, 2] + + // conv1: [T, 2] -> [T, 128] + struct ggml_tensor * x = vae_conv1d(ctx, m->c1w, m->c1b, audio, 1, 3, 1); + + // 5 encoder blocks: resunits(in_ch) -> snake(in_ch) -> strided conv(in_ch -> out_ch) + for (int i = 0; i < 5; i++) { + VAEEncBlock & b = m->blk[i]; + for (int r = 0; r < 3; r++) + x = vae_res_unit(ctx, &b.ru[r], x); + x = vae_snake(ctx, x, b.sa, b.sb); + x = vae_conv1d(ctx, b.dw, b.db, x, b.stride, b.padding, 1); + } + + // Final: snake(2048) -> conv2(2048 -> 128, k=3, pad=1) + x = vae_snake(ctx, x, m->sa, m->sb); + x = vae_conv1d(ctx, m->c2w, m->c2b, x, 1, 1, 1); + + return 
x; // [T_latent, 128] +} + +// Core compute: build/cache graph, set input, run. Returns T_latent or -1. +// Output stays in m->graph_output for caller to read. +static int vae_enc_compute( + VAEEncoder * m, + const float * audio, // [T_audio, 2] time-major interleaved stereo + int T_audio) { + + // Rebuild graph when T_audio changes + if (m->graph_T != T_audio) { + if (m->graph_ctx) { + ggml_backend_sched_reset(m->sched); + ggml_free(m->graph_ctx); + free(m->graph_buf); + } + + size_t ctx_size = ggml_tensor_overhead() * 1024 + ggml_graph_overhead_custom(8192, false); + m->graph_buf = (uint8_t *)malloc(ctx_size); + struct ggml_init_params p = { ctx_size, m->graph_buf, true }; + struct ggml_context * ctx = ggml_init(p); + + m->graph_input = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, T_audio, 2); + ggml_set_name(m->graph_input, "enc_input"); + ggml_set_input(m->graph_input); + + m->graph_output = vae_enc_build_graph(ctx, m, m->graph_input); + ggml_set_name(m->graph_output, "enc_output"); + ggml_set_output(m->graph_output); + + m->graph = ggml_new_graph_custom(ctx, 8192, false); + ggml_build_forward_expand(m->graph, m->graph_output); + + if (!ggml_backend_sched_alloc_graph(m->sched, m->graph)) { + fprintf(stderr, "[VAE-Enc] FATAL: graph alloc failed for T=%d\n", T_audio); + ggml_free(ctx); + free(m->graph_buf); + m->graph_ctx = NULL; + m->graph_buf = NULL; + m->graph_T = 0; + return -1; + } + + m->graph_ctx = ctx; + m->graph_T = T_audio; + fprintf(stderr, "[VAE-Enc] Graph: %d nodes, T_audio=%d\n", + ggml_graph_n_nodes(m->graph), T_audio); + } + + // Transpose: [T, 2] time-major -> ggml [T, 2] channel-contiguous + // ggml ne[0]=T is the contiguous dim, so we write all T samples per channel + size_t in_size = (size_t)2 * T_audio; + if (m->scratch_in.size() < in_size) + m->scratch_in.resize(in_size); + for (int c = 0; c < 2; c++) + for (int t = 0; t < T_audio; t++) + m->scratch_in[c * T_audio + t] = audio[t * 2 + c]; + ggml_backend_tensor_set(m->graph_input, + 
m->scratch_in.data(), 0, in_size * sizeof(float)); + + ggml_backend_sched_graph_compute(m->sched, m->graph); + + return (int)m->graph_output->ne[0]; // T_latent +} + +// Encode API: audio [T_audio, 2] -> latent_out [T_latent, 64] (mean only, deterministic) +// Returns T_latent (or -1 on error). +// latent_out must hold at least (T_audio / 1920) * 64 floats. +static int vae_enc_encode( + VAEEncoder * m, + const float * audio, // [T_audio, 2] interleaved stereo + int T_audio, + float * latent_out, // [T_latent, 64] output, time-major + int max_T_latent) { + + int T_latent = vae_enc_compute(m, audio, T_audio); + if (T_latent < 0) return -1; + + if (T_latent > max_T_latent) { + fprintf(stderr, "[VAE-Enc] T_latent %d exceeds max %d\n", T_latent, max_T_latent); + return -1; + } + + // Graph output is [ne0=T_latent, ne1=128] in ggml, channel-contiguous. + // Channels 0..63 = mean, 64..127 = scale. We only read mean. + // ggml layout: data[c * T_latent + t] for channel c, time t. + // We write time-major: latent_out[t * 64 + c] = data[c * T_latent + t] + // + // Read the full 128ch output once, extract mean channels 0..63 + size_t out_bytes = (size_t)128 * T_latent * sizeof(float); + std::vector raw(128 * T_latent); + ggml_backend_tensor_get(m->graph_output, raw.data(), 0, out_bytes); + + for (int t = 0; t < T_latent; t++) + for (int c = 0; c < 64; c++) + latent_out[t * 64 + c] = raw[c * T_latent + t]; + + fprintf(stderr, "[VAE-Enc] Encode: T_audio=%d -> T_latent=%d (%.2fs @ 48kHz)\n", + T_audio, T_latent, (float)T_audio / 48000.0f); + + return T_latent; +} + +// Tiled encode for long audio (same chunking strategy as decoder) +// chunk_size: latent frames per tile, overlap: context frames on each side +static int vae_enc_encode_tiled( + VAEEncoder * m, + const float * audio, // [T_audio, 2] interleaved stereo + int T_audio, + float * latent_out, // [T_latent, 64] output, time-major + int max_T_latent, + int chunk_size = 256, + int overlap = 64) { + + // Work in 
audio-sample space. Each latent frame = 1920 audio samples. + int audio_chunk = chunk_size * 1920; + int audio_overlap = overlap * 1920; + + // Shrink overlap until stride is positive + while (audio_chunk - 2 * audio_overlap <= 0 && audio_overlap > 0) + audio_overlap /= 2; + + // Short audio: encode directly + if (T_audio <= audio_chunk) + return vae_enc_encode(m, audio, T_audio, latent_out, max_T_latent); + + int audio_stride = audio_chunk - 2 * audio_overlap; + int num_steps = (T_audio + audio_stride - 1) / audio_stride; + + fprintf(stderr, "[VAE-Enc] Tiled encode: %d tiles (chunk=%d, overlap=%d, stride=%d audio samples)\n", + num_steps, audio_chunk, audio_overlap, audio_stride); + + float downsample_factor = 0.0f; + int latent_write_pos = 0; + + for (int i = 0; i < num_steps; i++) { + // Core range in audio samples (the part we keep) + int core_start = i * audio_stride; + int core_end = core_start + audio_stride; + if (core_end > T_audio) core_end = T_audio; + + // Window with overlap context + int win_start = core_start - audio_overlap; + if (win_start < 0) win_start = 0; + int win_end = core_end + audio_overlap; + if (win_end > T_audio) win_end = T_audio; + int win_len = win_end - win_start; + + // Encode this window + int tile_T = vae_enc_compute(m, audio + win_start * 2, win_len); + if (tile_T < 0) { + fprintf(stderr, "[VAE-Enc] FATAL: tile %d encode failed\n", i); + return -1; + } + + // Determine downsample factor from first tile + if (i == 0) { + downsample_factor = (float)tile_T / (float)win_len; + fprintf(stderr, "[VAE-Enc] Downsample factor: %.6f (expected ~1/1920)\n", + downsample_factor); + } + + // Trim in latent frames (mirror of decoder trim logic) + int added_start = core_start - win_start; + int trim_start = (int)roundf((float)added_start * downsample_factor); + int added_end = win_end - core_end; + int trim_end = (int)roundf((float)added_end * downsample_factor); + + int end_idx = (trim_end > 0) ? 
(tile_T - trim_end) : tile_T; + int core_len = end_idx - trim_start; + if (core_len <= 0) continue; + + if (latent_write_pos + core_len > max_T_latent) { + fprintf(stderr, "[VAE-Enc] FATAL: tiled output exceeds max_T_latent\n"); + return -1; + } + + // Read tile output [ne0=tile_T, ne1=128], extract mean (ch 0..63), transpose + // Only read the first 64 channels (mean), skip scale channels 64..127 + size_t out_bytes = (size_t)128 * tile_T * sizeof(float); + std::vector raw(128 * tile_T); + ggml_backend_tensor_get(m->graph_output, raw.data(), 0, out_bytes); + + for (int t = 0; t < core_len; t++) + for (int c = 0; c < 64; c++) + latent_out[(latent_write_pos + t) * 64 + c] = + raw[c * tile_T + (trim_start + t)]; + + latent_write_pos += core_len; + } + + fprintf(stderr, "[VAE-Enc] Tiled encode done: %d tiles -> T_latent=%d (%.2fs @ 48kHz)\n", + num_steps, latent_write_pos, (float)T_audio / 48000.0f); + + return latent_write_pos; +} + +// Free all resources +static void vae_enc_free(VAEEncoder * m) { + if (m->graph_ctx) { + ggml_backend_sched_reset(m->sched); + ggml_free(m->graph_ctx); + free(m->graph_buf); + } + if (m->sched) ggml_backend_sched_free(m->sched); + if (m->buf) ggml_backend_buffer_free(m->buf); + if (m->weight_ctx) ggml_free(m->weight_ctx); + backend_release(m->backend, m->cpu_backend); + *m = {}; +} diff --git a/src/vae.h b/src/vae.h index 7c2a24e..fbf6d5f 100644 --- a/src/vae.h +++ b/src/vae.h @@ -14,7 +14,6 @@ #include "backend.h" #include #include -#include #include #include @@ -216,6 +215,10 @@ static void vae_ggml_load(VAEGGML * m, const char * path) { m->cpu_backend = bp.cpu_backend; m->sched = backend_sched_new(bp, 8192); m->buf = ggml_backend_alloc_ctx_tensors(ctx, m->backend); + if (!m->buf) { + fprintf(stderr, "[VAE] FATAL: failed to allocate weight buffer\n"); + exit(1); + } fprintf(stderr, "[VAE] Backend: %s, Weight buffer: %.1f MB\n", ggml_backend_name(m->backend), (float)ggml_backend_buffer_get_size(m->buf) / (1024 * 1024)); diff --git 
a/tests/CPU-BF16.log b/tests/CPU-BF16.log index b20ebae..74300ed 100644 --- a/tests/CPU-BF16.log +++ b/tests/CPU-BF16.log @@ -7,36 +7,34 @@ [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 3007.9 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 464.0 ms +[Load] DiT weight load: 301.5 ms [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: CPU (CPU threads: 16) +[Load] VAE backend: CPU (shared) [VAE] Backend: CPU, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 651.3 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 666.9 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 31.9 ms +[Load] BPE tokenizer: 30.9 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: CPU (CPU threads: 16) +[Load] TextEncoder backend: CPU (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 226.8 ms -[Encode] TextEncoder (70 tokens): 59.7 ms +[Load] TextEncoder: 121.5 ms +[Encode] TextEncoder (70 tokens): 58.0 ms [Debug] text_hidden: [70, 1024] first4: 3.704526 2.436253 0.222853 -13.131872 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 
12.7 ms +[Encode] Lyric vocab lookup (167 tokens): 0.1 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: CPU (CPU threads: 16) +[Load] CondEncoder backend: CPU (shared) [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K+V fused @@ -46,18 +44,18 @@ [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 1160.5 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 230.8 ms +[Load] ConditionEncoder: 111.5 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 274.9 ms, enc_S=238 +[Encode] ConditionEncoder: 268.3 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.758296 -0.049593 -0.132844 0.058496 [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [WeightCtx] Loaded 30 tensors, 200.3 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 34.6 ms +[Load] Detokenizer: 23.6 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 958.8 ms +[Context] Detokenizer: 889.4 ms [Debug] detok_output: [2170, 64] first4: -0.124160 1.435260 0.310138 -0.624584 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -112,35 +110,32 @@ [Debug] dit_step7_vt: [2170, 64] first4: 0.002176 0.183052 -1.467304 3.113325 [Debug] dit_x0: [2170, 64] first4: 0.083178 1.441022 0.423316 -1.927701 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 18721.5 ms (18721.5 ms/sample) +[DiT] Total generation: 17583.4 ms (17583.4 ms/sample) [Debug] dit_output: [2170, 64] first4: 0.083178 1.441022 0.423316 -1.927701 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 
+[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 51818.0 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000519 0.001024 0.000897 0.001200 +[VAE Batch0] Decode: 46859.3 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000480 0.000983 0.000816 0.001189 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:57:38.585 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:57:38.585 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:57:38.585 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 19:57:38.586 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:57:38.586 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:57:39.413 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:57:40.961 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:57:40.961 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... 
-2026-03-01 19:57:40.966 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:57:41.132 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:57:41.134 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:49:02.827 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:49:02.916 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:49:04.251 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:49:04.252 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:49:04.253 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:49:04.259 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:49:04.454 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... 
+2026-03-04 21:49:04.456 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:57:41.134 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:57:41.134 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:57:41.134 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:49:04.456 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:49:04.456 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:49:04.456 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -154,8 +149,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:57:41.134 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:57:41.134 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:49:04.456 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:49:04.456 | INFO | 
acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -182,25 +177,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:57:41.134 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:49:04.456 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:57:41.140 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:57:41.153 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:57:41.153 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 19:57:41.175 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:57:41.483 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... 
-2026-03-01 19:57:41.483 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:57:41.483 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.00688624382019043, 'diffusion_time_cost': 0.30014586448669434, 'diffusion_per_step_time_cost': 0.03751823306083679, 'total_time_cost': 0.30703210830688477, 'offload_time_cost': 0.0} -2026-03-01 19:57:41.498 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:57:41.500 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:57:41.500 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.44 GB -2026-03-01 19:57:41.500 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:57:41.500 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.44 GB -2026-03-01 19:57:41.500 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.44 GB -2026-03-01 19:57:41.500 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:57:41.775 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:57:41.777 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:57:41.780 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:49:04.463 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:49:04.478 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:49:04.478 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:49:04.514 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:49:04.845 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:49:04.846 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:49:04.846 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.007018327713012695, 'diffusion_time_cost': 0.32423973083496094, 'diffusion_per_step_time_cost': 0.04052996635437012, 'total_time_cost': 0.33125805854797363, 'offload_time_cost': 0.0} +2026-03-04 21:49:04.860 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:49:04.862 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:49:04.862 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.77 GB +2026-03-04 21:49:04.862 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:49:04.862 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.77 GB +2026-03-04 21:49:04.862 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.77 GB +2026-03-04 21:49:04.862 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:49:05.138 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:49:05.140 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:49:05.142 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-BF16.gguf [GGML] Running acestep-v15-turbo-BF16.gguf... 
@@ -244,8 +239,8 @@ Using precomputed LM hints dit_step6_xt 0.988142 dit_step7_vt 0.969102 dit_x0 0.979106 - vae_audio 0.901370 - vae_audio (STFT cosine) 0.975816 + vae_audio 0.901389 + vae_audio (STFT cosine) 0.975826 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999946 0.136541 0.006626 -0.002312 0.972951 -0.002342 0.972003 diff --git a/tests/CPU-Q4_K_M.log b/tests/CPU-Q4_K_M.log index 508a20c..540c4c4 100644 --- a/tests/CPU-Q4_K_M.log +++ b/tests/CPU-Q4_K_M.log @@ -1,5 +1,5 @@ [Load] DiT backend: CPU (CPU threads: 16) -[Load] Backend init: 6.3 ms +[Load] Backend init: 1.6 ms [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [DiT] Self-attn: Q+K fused, V separate [DiT] Cross-attn: all separate @@ -7,36 +7,34 @@ [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 895.6 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 118.4 ms +[Load] DiT weight load: 121.8 ms [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: CPU (CPU threads: 16) +[Load] VAE backend: CPU (shared) [VAE] Backend: CPU, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 696.2 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 699.6 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 33.0 ms +[Load] BPE tokenizer: 33.3 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] 
TextEncoder backend: CPU (CPU threads: 16) +[Load] TextEncoder backend: CPU (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 148.2 ms -[Encode] TextEncoder (70 tokens): 58.0 ms +[Load] TextEncoder: 122.9 ms +[Encode] TextEncoder (70 tokens): 60.4 ms [Debug] text_hidden: [70, 1024] first4: 3.704526 2.436253 0.222853 -13.131872 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 12.6 ms +[Encode] Lyric vocab lookup (167 tokens): 0.1 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: CPU (CPU threads: 16) +[Load] CondEncoder backend: CPU (shared) [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K fused, V separate @@ -46,18 +44,18 @@ [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 352.5 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 37.5 ms +[Load] ConditionEncoder: 34.8 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 294.2 ms, enc_S=238 +[Encode] ConditionEncoder: 300.3 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.759313 -0.049345 -0.129442 0.055759 [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 64.7 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 10.1 ms +[Load] Detokenizer: 9.1 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 354.8 ms +[Context] Detokenizer: 361.0 ms [Debug] 
detok_output: [2170, 64] first4: -0.106265 1.448869 0.309591 -0.650098 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -112,35 +110,32 @@ [Debug] dit_step7_vt: [2170, 64] first4: -0.463452 0.896626 -1.673395 3.222673 [Debug] dit_x0: [2170, 64] first4: 0.290887 1.122067 0.588729 -1.917174 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 21769.5 ms (21769.5 ms/sample) +[DiT] Total generation: 21823.6 ms (21823.6 ms/sample) [Debug] dit_output: [2170, 64] first4: 0.290887 1.122067 0.588729 -1.917174 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 52184.7 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000272 0.000786 0.000556 0.000990 +[VAE Batch0] Decode: 47904.5 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000330 0.000828 0.000665 0.001038 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 20:03:15.903 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 20:03:15.903 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 20:03:15.903 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 20:03:15.903 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. 
-2026-03-01 20:03:15.904 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 20:03:16.714 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 20:03:18.309 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 20:03:18.309 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 20:03:18.315 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 20:03:18.480 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 20:03:18.482 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:54:26.607 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:54:26.698 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:54:28.050 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:54:28.050 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... 
+2026-03-04 21:54:28.054 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:54:28.059 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:54:28.263 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-04 21:54:28.265 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 20:03:18.482 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 20:03:18.482 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 20:03:18.482 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:54:28.265 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:54:28.265 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:54:28.265 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -154,8 +149,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 20:03:18.482 | 
INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 20:03:18.482 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:54:28.265 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:54:28.265 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -182,25 +177,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 20:03:18.482 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:54:28.265 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 20:03:18.488 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 20:03:18.501 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 20:03:18.501 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 20:03:18.540 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... 
-2026-03-01 20:03:18.854 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... -2026-03-01 20:03:18.855 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 20:03:18.855 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006970643997192383, 'diffusion_time_cost': 0.3072662353515625, 'diffusion_per_step_time_cost': 0.03840827941894531, 'total_time_cost': 0.3142368793487549, 'offload_time_cost': 0.0} -2026-03-01 20:03:18.869 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 20:03:18.872 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 20:03:18.872 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB -2026-03-01 20:03:18.872 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 20:03:18.872 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB -2026-03-01 20:03:18.872 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB -2026-03-01 20:03:18.872 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 20:03:19.148 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 20:03:19.151 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 20:03:19.154 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:54:28.272 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:54:28.288 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:54:28.288 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:54:28.323 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:54:28.640 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:54:28.641 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:54:28.641 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.0070536136627197266, 'diffusion_time_cost': 0.30983686447143555, 'diffusion_per_step_time_cost': 0.03872960805892944, 'total_time_cost': 0.3168904781341553, 'offload_time_cost': 0.0} +2026-03-04 21:54:28.655 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:54:28.666 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:54:28.666 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.85 GB +2026-03-04 21:54:28.666 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:54:28.666 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.85 GB +2026-03-04 21:54:28.666 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.85 GB +2026-03-04 21:54:28.666 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:54:28.949 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:54:28.951 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:54:28.952 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q4_K_M.gguf [GGML] Running acestep-v15-turbo-Q4_K_M.gguf... 
@@ -244,8 +239,8 @@ Using precomputed LM hints dit_step6_xt 0.977196 dit_step7_vt 0.939970 dit_x0 0.959881 - vae_audio 0.834993 - vae_audio (STFT cosine) 0.955098 + vae_audio 0.834992 + vae_audio (STFT cosine) 0.955102 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999883 0.167680 0.010319 -0.002256 0.973185 -0.002342 0.972003 diff --git a/tests/CPU-Q5_K_M.log b/tests/CPU-Q5_K_M.log index e0d9936..6722100 100644 --- a/tests/CPU-Q5_K_M.log +++ b/tests/CPU-Q5_K_M.log @@ -7,36 +7,34 @@ [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 1061.2 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 140.3 ms +[Load] DiT weight load: 110.6 ms [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: CPU (CPU threads: 16) +[Load] VAE backend: CPU (shared) [VAE] Backend: CPU, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 699.1 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 698.0 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 33.4 ms +[Load] BPE tokenizer: 33.1 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: CPU (CPU threads: 16) +[Load] TextEncoder backend: CPU (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused 
[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 149.7 ms -[Encode] TextEncoder (70 tokens): 57.3 ms +[Load] TextEncoder: 123.1 ms +[Encode] TextEncoder (70 tokens): 57.9 ms [Debug] text_hidden: [70, 1024] first4: 3.704526 2.436253 0.222853 -13.131872 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 12.5 ms +[Encode] Lyric vocab lookup (167 tokens): 0.1 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: CPU (CPU threads: 16) +[Load] CondEncoder backend: CPU (shared) [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K fused, V separate @@ -46,18 +44,18 @@ [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 412.5 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 45.1 ms +[Load] ConditionEncoder: 41.0 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 387.5 ms, enc_S=238 +[Encode] ConditionEncoder: 388.3 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.760901 -0.053445 -0.132760 0.058505 [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 73.2 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 11.3 ms +[Load] Detokenizer: 10.3 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 447.0 ms +[Context] Detokenizer: 446.1 ms [Debug] detok_output: [2170, 64] first4: -0.129311 1.458194 0.298132 -0.651512 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -112,35 +110,32 @@ [Debug] dit_step7_vt: [2170, 64] first4: -0.003599 
0.325174 -1.377289 3.053612 [Debug] dit_x0: [2170, 64] first4: 0.058232 1.415164 0.443289 -1.901864 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 27970.1 ms (27970.1 ms/sample) +[DiT] Total generation: 28035.0 ms (28035.0 ms/sample) [Debug] dit_output: [2170, 64] first4: 0.058232 1.415164 0.443289 -1.901864 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 51966.1 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000740 0.001305 0.001083 0.001434 +[VAE Batch0] Decode: 47798.0 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000762 0.001320 0.001139 0.001557 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 20:01:55.226 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 20:01:55.226 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 20:01:55.226 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 20:01:55.226 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 20:01:55.226 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. 
-2026-03-01 20:01:56.032 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 20:01:57.576 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 20:01:57.577 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 20:01:57.581 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 20:01:57.747 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 20:01:57.749 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:53:09.193 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:53:09.323 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:53:10.674 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:53:10.674 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:53:10.676 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). 
+2026-03-04 21:53:10.682 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:53:10.881 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-04 21:53:10.884 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 20:01:57.749 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 20:01:57.749 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 20:01:57.749 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:53:10.884 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:53:10.884 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:53:10.884 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -154,8 +149,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 20:01:57.749 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 20:01:57.749 | INFO | 
acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:53:10.884 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:53:10.884 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -182,25 +177,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 20:01:57.749 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:53:10.884 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 20:01:57.755 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 20:01:57.768 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 20:01:57.768 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 20:01:57.801 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 20:01:58.109 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... 
-2026-03-01 20:01:58.109 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 20:01:58.109 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.007002353668212891, 'diffusion_time_cost': 0.30033254623413086, 'diffusion_per_step_time_cost': 0.03754156827926636, 'total_time_cost': 0.30733489990234375, 'offload_time_cost': 0.0} -2026-03-01 20:01:58.124 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 20:01:58.126 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 20:01:58.126 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB -2026-03-01 20:01:58.126 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 20:01:58.126 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB -2026-03-01 20:01:58.126 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB -2026-03-01 20:01:58.126 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 20:01:58.401 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 20:01:58.403 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 20:01:58.406 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:53:10.890 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:53:10.930 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:53:10.930 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:53:10.966 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:53:11.283 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:53:11.284 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:53:11.284 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006951332092285156, 'diffusion_time_cost': 0.3100306987762451, 'diffusion_per_step_time_cost': 0.03875383734703064, 'total_time_cost': 0.3169820308685303, 'offload_time_cost': 0.0} +2026-03-04 21:53:11.298 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:53:11.300 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:53:11.300 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.77 GB +2026-03-04 21:53:11.300 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:53:11.300 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.77 GB +2026-03-04 21:53:11.300 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.77 GB +2026-03-04 21:53:11.300 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:53:11.575 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:53:11.577 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:53:11.579 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q5_K_M.gguf [GGML] Running acestep-v15-turbo-Q5_K_M.gguf... 
@@ -244,8 +239,8 @@ Using precomputed LM hints dit_step6_xt 0.983513 dit_step7_vt 0.954349 dit_x0 0.970379 - vae_audio 0.874800 - vae_audio (STFT cosine) 0.967703 + vae_audio 0.874850 + vae_audio (STFT cosine) 0.967714 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999926 0.135378 0.008030 -0.002303 0.973012 -0.002342 0.972003 diff --git a/tests/CPU-Q6_K.log b/tests/CPU-Q6_K.log index 7d4c411..75b4fd2 100644 --- a/tests/CPU-Q6_K.log +++ b/tests/CPU-Q6_K.log @@ -7,36 +7,34 @@ [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 1237.2 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 169.4 ms +[Load] DiT weight load: 150.0 ms [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: CPU (CPU threads: 16) +[Load] VAE backend: CPU (shared) [VAE] Backend: CPU, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 699.2 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 689.7 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 32.5 ms +[Load] BPE tokenizer: 33.1 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: CPU (CPU threads: 16) +[Load] TextEncoder backend: CPU (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] 
Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 148.3 ms -[Encode] TextEncoder (70 tokens): 57.5 ms +[Load] TextEncoder: 124.2 ms +[Encode] TextEncoder (70 tokens): 58.0 ms [Debug] text_hidden: [70, 1024] first4: 3.704526 2.436253 0.222853 -13.131872 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 12.6 ms +[Encode] Lyric vocab lookup (167 tokens): 0.1 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: CPU (CPU threads: 16) +[Load] CondEncoder backend: CPU (shared) [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K+V fused @@ -46,18 +44,18 @@ [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 476.3 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 52.6 ms +[Load] ConditionEncoder: 47.5 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 348.9 ms, enc_S=238 +[Encode] ConditionEncoder: 349.5 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.761694 -0.052035 -0.131773 0.058231 [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 82.2 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 12.3 ms +[Load] Detokenizer: 11.0 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 414.3 ms +[Context] Detokenizer: 417.1 ms [Debug] detok_output: [2170, 64] first4: -0.151355 1.462444 0.326907 -0.627213 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -112,35 +110,32 @@ [Debug] dit_step7_vt: [2170, 64] first4: 0.118016 0.207620 -1.266971 
2.955565 [Debug] dit_x0: [2170, 64] first4: 0.004752 1.435176 0.398691 -1.887822 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 25398.3 ms (25398.3 ms/sample) +[DiT] Total generation: 25477.6 ms (25477.6 ms/sample) [Debug] dit_output: [2170, 64] first4: 0.004752 1.435176 0.398691 -1.887822 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 52074.7 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000467 0.001015 0.000873 0.001303 +[VAE Batch0] Decode: 47852.2 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000553 0.001102 0.000938 0.001323 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 20:00:28.298 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 20:00:28.298 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 20:00:28.298 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 20:00:28.298 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 20:00:28.298 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 20:00:29.103 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! 
Use `dtype` instead! -2026-03-01 20:00:30.690 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 20:00:30.690 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 20:00:30.695 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 20:00:30.860 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 20:00:30.862 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:51:45.520 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:51:45.634 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:51:46.994 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:51:46.994 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:51:46.995 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:51:47.001 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... 
+2026-03-04 21:51:47.198 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-04 21:51:47.201 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 20:00:30.862 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 20:00:30.862 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 20:00:30.862 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:51:47.201 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:51:47.201 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:51:47.201 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -154,8 +149,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 20:00:30.862 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 20:00:30.862 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:51:47.201 | INFO | 
acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:51:47.201 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -182,25 +177,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 20:00:30.862 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:51:47.201 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 20:00:30.869 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 20:00:30.881 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 20:00:30.882 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 20:00:30.914 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 20:00:31.231 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... 
-2026-03-01 20:00:31.232 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 20:00:31.232 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006938934326171875, 'diffusion_time_cost': 0.31071925163269043, 'diffusion_per_step_time_cost': 0.038839906454086304, 'total_time_cost': 0.3176581859588623, 'offload_time_cost': 0.0} -2026-03-01 20:00:31.246 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 20:00:31.249 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 20:00:31.249 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.40 GB -2026-03-01 20:00:31.249 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 20:00:31.249 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.40 GB -2026-03-01 20:00:31.249 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.40 GB -2026-03-01 20:00:31.249 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 20:00:31.524 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 20:00:31.527 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 20:00:31.531 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:51:47.208 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:51:47.224 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:51:47.224 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:51:47.259 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:51:47.579 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:51:47.579 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:51:47.579 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.007021188735961914, 'diffusion_time_cost': 0.31169986724853516, 'diffusion_per_step_time_cost': 0.038962483406066895, 'total_time_cost': 0.31872105598449707, 'offload_time_cost': 0.0} +2026-03-04 21:51:47.593 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:51:47.595 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:51:47.595 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.77 GB +2026-03-04 21:51:47.596 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:51:47.596 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.77 GB +2026-03-04 21:51:47.596 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.77 GB +2026-03-04 21:51:47.596 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:51:47.870 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:51:47.872 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:51:47.874 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q6_K.gguf [GGML] Running acestep-v15-turbo-Q6_K.gguf... 
@@ -244,8 +239,8 @@ Using precomputed LM hints dit_step6_xt 0.984569 dit_step7_vt 0.958147 dit_x0 0.972312 - vae_audio 0.891761 - vae_audio (STFT cosine) 0.969080 + vae_audio 0.891790 + vae_audio (STFT cosine) 0.969088 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999936 0.151952 0.007283 -0.002271 0.972870 -0.002342 0.972003 diff --git a/tests/CPU-Q8_0.log b/tests/CPU-Q8_0.log index 76183ea..3eb253c 100644 --- a/tests/CPU-Q8_0.log +++ b/tests/CPU-Q8_0.log @@ -7,36 +7,34 @@ [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 1600.7 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 188.0 ms +[Load] DiT weight load: 178.6 ms [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: CPU (CPU threads: 16) +[Load] VAE backend: CPU (shared) [VAE] Backend: CPU, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 690.8 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 692.2 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 32.8 ms +[Load] BPE tokenizer: 32.4 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: CPU (CPU threads: 16) +[Load] TextEncoder backend: CPU (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] 
Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 160.0 ms -[Encode] TextEncoder (70 tokens): 57.9 ms +[Load] TextEncoder: 123.5 ms +[Encode] TextEncoder (70 tokens): 58.2 ms [Debug] text_hidden: [70, 1024] first4: 3.704526 2.436253 0.222853 -13.131872 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 13.0 ms +[Encode] Lyric vocab lookup (167 tokens): 0.1 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: CPU (CPU threads: 16) +[Load] CondEncoder backend: CPU (shared) [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K+V fused @@ -46,18 +44,18 @@ [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 616.6 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 126.4 ms +[Load] ConditionEncoder: 65.1 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 390.3 ms, enc_S=238 +[Encode] ConditionEncoder: 373.4 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.758873 -0.049568 -0.132802 0.057792 [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 106.5 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 13.6 ms +[Load] Detokenizer: 14.3 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 447.8 ms +[Context] Detokenizer: 448.5 ms [Debug] detok_output: [2170, 64] first4: -0.126218 1.441045 0.305219 -0.629688 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -112,35 +110,32 @@ [Debug] dit_step7_vt: [2170, 64] first4: -0.037024 0.233524 -1.487499 
3.098410 [Debug] dit_x0: [2170, 64] first4: 0.094459 1.422387 0.433039 -1.914712 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 26043.3 ms (26043.3 ms/sample) +[DiT] Total generation: 26009.5 ms (26009.5 ms/sample) [Debug] dit_output: [2170, 64] first4: 0.094459 1.422387 0.433039 -1.914712 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 52114.7 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000455 0.000930 0.000816 0.001121 +[VAE Batch0] Decode: 47762.1 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000441 0.000946 0.000788 0.001168 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:59:03.882 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:59:03.882 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:59:03.882 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 19:59:03.883 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:59:03.883 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:59:04.691 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! 
Use `dtype` instead! -2026-03-01 19:59:06.262 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:59:06.262 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:59:06.268 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:59:06.433 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:59:06.436 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:50:24.424 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:50:24.514 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:50:25.858 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:50:25.858 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:50:25.860 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:50:25.865 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... 
+2026-03-04 21:50:26.063 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-04 21:50:26.065 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:59:06.436 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:59:06.436 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:59:06.436 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:50:26.065 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:50:26.065 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:50:26.065 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -154,8 +149,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:59:06.436 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:59:06.436 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:50:26.065 | INFO | 
acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:50:26.065 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -182,25 +177,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:59:06.436 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:50:26.065 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:59:06.443 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:59:06.457 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:59:06.457 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 19:59:06.478 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:59:06.802 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... 
-2026-03-01 19:59:06.803 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:59:06.803 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006929874420166016, 'diffusion_time_cost': 0.3164329528808594, 'diffusion_per_step_time_cost': 0.03955411911010742, 'total_time_cost': 0.3233628273010254, 'offload_time_cost': 0.0} -2026-03-01 19:59:06.817 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:59:06.819 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:59:06.819 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB -2026-03-01 19:59:06.819 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:59:06.819 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB -2026-03-01 19:59:06.819 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB -2026-03-01 19:59:06.819 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:59:07.095 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:59:07.098 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:59:07.101 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:50:26.073 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:50:26.088 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:50:26.088 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:50:26.120 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:50:26.438 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:50:26.438 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:50:26.438 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.007014036178588867, 'diffusion_time_cost': 0.30962181091308594, 'diffusion_per_step_time_cost': 0.03870272636413574, 'total_time_cost': 0.3166358470916748, 'offload_time_cost': 0.0} +2026-03-04 21:50:26.452 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:50:26.455 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:50:26.455 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.86 GB +2026-03-04 21:50:26.455 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:50:26.455 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.86 GB +2026-03-04 21:50:26.455 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.86 GB +2026-03-04 21:50:26.455 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:50:26.730 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:50:26.732 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:50:26.734 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q8_0.gguf [GGML] Running acestep-v15-turbo-Q8_0.gguf... 
@@ -244,8 +239,8 @@ Using precomputed LM hints dit_step6_xt 0.988647 dit_step7_vt 0.970238 dit_x0 0.980014 - vae_audio 0.903408 - vae_audio (STFT cosine) 0.976427 + vae_audio 0.903437 + vae_audio (STFT cosine) 0.976438 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999946 0.139652 0.006645 -0.002330 0.972930 -0.002342 0.972003 diff --git a/tests/CUDA-BF16.log b/tests/CUDA-BF16.log index d73a934..5ed30ff 100644 --- a/tests/CUDA-BF16.log +++ b/tests/CUDA-BF16.log @@ -1,7 +1,7 @@ ggml_cuda_init: found 1 CUDA devices: Device 0: NVIDIA RTX PRO 6000 Blackwell Workstation Edition, compute capability 12.0, VMM: yes [Load] DiT backend: CUDA0 (CPU threads: 16) -[Load] Backend init: 70.8 ms +[Load] Backend init: 32.6 ms [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [DiT] Self-attn: Q+K+V fused [DiT] Cross-attn: Q+K+V fused @@ -9,36 +9,34 @@ ggml_cuda_init: found 1 CUDA devices: [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 3007.9 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 375.6 ms +[Load] DiT weight load: 310.9 ms [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: CUDA0 (CPU threads: 16) +[Load] VAE backend: CUDA0 (shared) [VAE] Backend: CUDA0, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 661.0 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 653.9 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] 
Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 32.8 ms +[Load] BPE tokenizer: 30.9 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: CUDA0 (CPU threads: 16) +[Load] TextEncoder backend: CUDA0 (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 128.5 ms -[Encode] TextEncoder (70 tokens): 50.6 ms +[Load] TextEncoder: 102.3 ms +[Encode] TextEncoder (70 tokens): 50.4 ms [Debug] text_hidden: [70, 1024] first4: 3.652014 1.047935 0.228532 -12.907304 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 12.5 ms +[Encode] Lyric vocab lookup (167 tokens): 0.1 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: CUDA0 (CPU threads: 16) +[Load] CondEncoder backend: CUDA0 (shared) [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K+V fused @@ -48,18 +46,18 @@ ggml_cuda_init: found 1 CUDA devices: [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 1160.5 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 127.1 ms +[Load] ConditionEncoder: 90.6 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 7.9 ms, enc_S=238 +[Encode] ConditionEncoder: 8.2 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.758648 -0.049409 -0.132412 0.058372 [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [WeightCtx] Loaded 30 tensors, 200.3 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 
2048->64) -[Load] Detokenizer: 24.2 ms +[Load] Detokenizer: 17.6 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 141.9 ms +[Context] Detokenizer: 140.1 ms [Debug] detok_output: [2170, 64] first4: -0.124204 1.435425 0.309963 -0.624679 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -114,35 +112,32 @@ ggml_cuda_init: found 1 CUDA devices: [Debug] dit_step7_vt: [2170, 64] first4: -0.004009 0.190141 -1.466879 3.103273 [Debug] dit_x0: [2170, 64] first4: 0.085060 1.438241 0.424145 -1.920485 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 248.3 ms (248.3 ms/sample) +[DiT] Total generation: 243.9 ms (243.9 ms/sample) [Debug] dit_output: [2170, 64] first4: 0.085060 1.438241 0.424145 -1.920485 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 812.8 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000547 0.000898 0.000798 0.001064 +[VAE Batch0] Decode: 615.3 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000498 0.000900 0.000800 0.001124 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:54:08.539 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:54:08.540 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:54:08.540 | WARNING | acestep.training.data_module::25 - Lightning not installed. 
Training module will not be available. -2026-03-01 19:54:08.540 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:54:08.540 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:54:09.277 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:54:10.804 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:54:10.804 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:54:10.810 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:54:10.970 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:54:10.972 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:45:24.010 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:45:24.091 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:45:25.418 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... 
+2026-03-04 21:45:25.418 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:45:25.421 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:45:25.426 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:45:25.618 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-04 21:45:25.621 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:54:10.972 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:54:10.972 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:54:10.972 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:45:25.621 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:45:25.621 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:45:25.621 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -156,8 +151,8 @@ An 
upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:54:10.972 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:54:10.972 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:45:25.621 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:45:25.621 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -184,25 +179,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:54:10.972 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:45:25.621 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:54:10.978 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:54:10.991 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:54:10.991 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... 
(DiT backend: PyTorch (cuda)) -2026-03-01 19:54:11.023 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:54:11.329 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... -2026-03-01 19:54:11.330 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:54:11.330 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.0068187713623046875, 'diffusion_time_cost': 0.2986173629760742, 'diffusion_per_step_time_cost': 0.03732717037200928, 'total_time_cost': 0.3054361343383789, 'offload_time_cost': 0.0} -2026-03-01 19:54:11.344 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:54:11.349 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:54:11.349 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB -2026-03-01 19:54:11.349 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:54:11.349 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB -2026-03-01 19:54:11.349 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB -2026-03-01 19:54:11.349 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:54:11.625 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:54:11.628 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:54:11.632 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:45:25.628 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:45:25.643 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:45:25.643 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:45:25.674 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:45:25.993 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:45:25.994 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:45:25.994 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006845712661743164, 'diffusion_time_cost': 0.3112342357635498, 'diffusion_per_step_time_cost': 0.038904279470443726, 'total_time_cost': 0.31807994842529297, 'offload_time_cost': 0.0} +2026-03-04 21:45:26.008 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:45:26.010 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:45:26.010 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.85 GB +2026-03-04 21:45:26.010 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:45:26.010 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.85 GB +2026-03-04 21:45:26.010 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.85 GB +2026-03-04 21:45:26.010 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:45:26.284 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:45:26.286 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:45:26.288 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-BF16.gguf [GGML] Running acestep-v15-turbo-BF16.gguf... 
@@ -246,8 +241,8 @@ Using precomputed LM hints dit_step6_xt 0.988188 dit_step7_vt 0.969375 dit_x0 0.979213 - vae_audio 0.901377 - vae_audio (STFT cosine) 0.975525 + vae_audio 0.901411 + vae_audio (STFT cosine) 0.975533 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999945 0.135628 0.006709 -0.002312 0.972932 -0.002342 0.972003 diff --git a/tests/CUDA-Q4_K_M.log b/tests/CUDA-Q4_K_M.log index 189cb71..403d030 100644 --- a/tests/CUDA-Q4_K_M.log +++ b/tests/CUDA-Q4_K_M.log @@ -1,7 +1,7 @@ ggml_cuda_init: found 1 CUDA devices: Device 0: NVIDIA RTX PRO 6000 Blackwell Workstation Edition, compute capability 12.0, VMM: yes [Load] DiT backend: CUDA0 (CPU threads: 16) -[Load] Backend init: 11.2 ms +[Load] Backend init: 9.6 ms [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [DiT] Self-attn: Q+K fused, V separate [DiT] Cross-attn: all separate @@ -9,36 +9,34 @@ ggml_cuda_init: found 1 CUDA devices: [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 895.6 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 403.0 ms +[Load] DiT weight load: 141.8 ms [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: CUDA0 (CPU threads: 16) +[Load] VAE backend: CUDA0 (shared) [VAE] Backend: CUDA0, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 655.9 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 652.4 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] 
T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 31.4 ms +[Load] BPE tokenizer: 32.6 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: CUDA0 (CPU threads: 16) +[Load] TextEncoder backend: CUDA0 (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 126.3 ms -[Encode] TextEncoder (70 tokens): 52.7 ms +[Load] TextEncoder: 103.0 ms +[Encode] TextEncoder (70 tokens): 50.9 ms [Debug] text_hidden: [70, 1024] first4: 3.652014 1.047935 0.228532 -12.907304 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 12.1 ms +[Encode] Lyric vocab lookup (167 tokens): 0.1 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: CUDA0 (CPU threads: 16) +[Load] CondEncoder backend: CUDA0 (shared) [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K fused, V separate @@ -48,18 +46,18 @@ ggml_cuda_init: found 1 CUDA devices: [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 352.5 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 118.9 ms +[Load] ConditionEncoder: 29.8 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 12.7 ms, enc_S=238 +[Encode] ConditionEncoder: 13.3 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.759848 -0.046220 -0.129361 0.057668 [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 64.7 MB into backend [Load] Detokenizer: 
FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 22.1 ms +[Load] Detokenizer: 6.3 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 124.0 ms +[Context] Detokenizer: 124.2 ms [Debug] detok_output: [2170, 64] first4: -0.098446 1.438721 0.299255 -0.646500 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -114,35 +112,32 @@ ggml_cuda_init: found 1 CUDA devices: [Debug] dit_step7_vt: [2170, 64] first4: -0.488470 0.849564 -1.659694 3.185843 [Debug] dit_x0: [2170, 64] first4: 0.317955 1.165446 0.587176 -1.877443 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 249.1 ms (249.1 ms/sample) +[DiT] Total generation: 249.0 ms (249.0 ms/sample) [Debug] dit_output: [2170, 64] first4: 0.317955 1.165446 0.587176 -1.877443 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 820.0 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000325 0.000812 0.000671 0.000911 +[VAE Batch0] Decode: 616.0 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000379 0.000847 0.000704 0.001000 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:54:39.264 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:54:39.265 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:54:39.265 | WARNING | acestep.training.data_module::25 - Lightning not installed. 
Training module will not be available. -2026-03-01 19:54:39.265 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:54:39.265 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:54:40.025 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:54:41.587 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:54:41.587 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:54:41.592 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:54:41.751 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:54:41.753 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:45:55.364 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:45:55.452 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:45:56.779 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... 
+2026-03-04 21:45:56.779 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:45:56.781 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:45:56.786 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:45:56.978 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-04 21:45:56.980 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:54:41.753 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:54:41.753 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:54:41.753 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:45:56.980 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:45:56.980 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:45:56.980 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -156,8 +151,8 @@ An 
upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:54:41.753 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:54:41.753 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:45:56.981 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:45:56.981 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -184,25 +179,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:54:41.753 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:45:56.981 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:54:41.759 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:54:41.771 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:54:41.772 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... 
(DiT backend: PyTorch (cuda)) -2026-03-01 19:54:41.805 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:54:42.113 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... -2026-03-01 19:54:42.114 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:54:42.114 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006765604019165039, 'diffusion_time_cost': 0.3010725975036621, 'diffusion_per_step_time_cost': 0.037634074687957764, 'total_time_cost': 0.30783820152282715, 'offload_time_cost': 0.0} -2026-03-01 19:54:42.128 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:54:42.131 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:54:42.131 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB -2026-03-01 19:54:42.131 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:54:42.131 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB -2026-03-01 19:54:42.131 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB -2026-03-01 19:54:42.131 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:54:42.405 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:54:42.408 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:54:42.411 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:45:56.987 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:45:57.002 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:45:57.002 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:45:57.032 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:45:57.348 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:45:57.349 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:45:57.349 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006890535354614258, 'diffusion_time_cost': 0.30885934829711914, 'diffusion_per_step_time_cost': 0.03860741853713989, 'total_time_cost': 0.3157498836517334, 'offload_time_cost': 0.0} +2026-03-04 21:45:57.363 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:45:57.366 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:45:57.366 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.78 GB +2026-03-04 21:45:57.366 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:45:57.366 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.78 GB +2026-03-04 21:45:57.366 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.78 GB +2026-03-04 21:45:57.366 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:45:57.640 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:45:57.642 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:45:57.644 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q4_K_M.gguf [GGML] Running acestep-v15-turbo-Q4_K_M.gguf... 
@@ -246,8 +241,8 @@ Using precomputed LM hints dit_step6_xt 0.976494 dit_step7_vt 0.938658 dit_x0 0.958725 - vae_audio 0.837763 - vae_audio (STFT cosine) 0.954448 + vae_audio 0.837780 + vae_audio (STFT cosine) 0.954457 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999885 0.165835 0.010206 -0.002260 0.973133 -0.002342 0.972003 diff --git a/tests/CUDA-Q5_K_M.log b/tests/CUDA-Q5_K_M.log index 00b9652..4e72f4f 100644 --- a/tests/CUDA-Q5_K_M.log +++ b/tests/CUDA-Q5_K_M.log @@ -1,7 +1,7 @@ ggml_cuda_init: found 1 CUDA devices: Device 0: NVIDIA RTX PRO 6000 Blackwell Workstation Edition, compute capability 12.0, VMM: yes [Load] DiT backend: CUDA0 (CPU threads: 16) -[Load] Backend init: 25.7 ms +[Load] Backend init: 9.6 ms [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [DiT] Self-attn: Q+K fused, V separate [DiT] Cross-attn: all separate @@ -9,36 +9,34 @@ ggml_cuda_init: found 1 CUDA devices: [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 1061.2 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 465.4 ms +[Load] DiT weight load: 152.8 ms [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: CUDA0 (CPU threads: 16) +[Load] VAE backend: CUDA0 (shared) [VAE] Backend: CUDA0, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 656.4 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 655.0 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] 
T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges [Load] BPE tokenizer: 31.3 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: CUDA0 (CPU threads: 16) +[Load] TextEncoder backend: CUDA0 (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 127.3 ms -[Encode] TextEncoder (70 tokens): 49.5 ms +[Load] TextEncoder: 102.1 ms +[Encode] TextEncoder (70 tokens): 70.3 ms [Debug] text_hidden: [70, 1024] first4: 3.652014 1.047935 0.228532 -12.907304 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 12.4 ms +[Encode] Lyric vocab lookup (167 tokens): 0.1 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: CUDA0 (CPU threads: 16) +[Load] CondEncoder backend: CUDA0 (shared) [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K fused, V separate @@ -48,18 +46,18 @@ ggml_cuda_init: found 1 CUDA devices: [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 412.5 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 138.7 ms +[Load] ConditionEncoder: 34.4 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 13.1 ms, enc_S=238 +[Encode] ConditionEncoder: 13.5 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.760389 -0.050879 -0.130835 0.059141 [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 73.2 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 
2048->64) -[Load] Detokenizer: 24.2 ms +[Load] Detokenizer: 6.8 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 121.7 ms +[Context] Detokenizer: 124.1 ms [Debug] detok_output: [2170, 64] first4: -0.125017 1.460327 0.292545 -0.654237 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -114,35 +112,32 @@ ggml_cuda_init: found 1 CUDA devices: [Debug] dit_step7_vt: [2170, 64] first4: 0.031181 0.378487 -1.509792 3.095486 [Debug] dit_x0: [2170, 64] first4: 0.032336 1.392616 0.498835 -1.905283 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 251.1 ms (251.1 ms/sample) +[DiT] Total generation: 261.4 ms (261.4 ms/sample) [Debug] dit_output: [2170, 64] first4: 0.032336 1.392616 0.498835 -1.905283 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 804.2 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000692 0.001098 0.000938 0.001230 +[VAE Batch0] Decode: 614.5 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000681 0.001094 0.000878 0.001246 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:54:31.395 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:54:31.395 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:54:31.395 | WARNING | acestep.training.data_module::25 - Lightning not installed. 
Training module will not be available. -2026-03-01 19:54:31.395 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:54:31.395 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:54:32.168 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:54:33.881 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:54:33.882 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:54:33.887 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:54:34.060 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:54:34.062 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:45:47.565 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:45:47.662 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:45:48.979 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... 
+2026-03-04 21:45:48.979 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:45:48.981 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:45:48.987 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:45:49.182 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-04 21:45:49.184 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:54:34.062 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:54:34.062 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:54:34.062 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:45:49.184 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:45:49.184 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:45:49.184 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -156,8 +151,8 @@ An 
upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:54:34.062 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:54:34.062 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:45:49.184 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:45:49.184 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -184,25 +179,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:54:34.062 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:45:49.184 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:54:34.068 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:54:34.081 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:54:34.081 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... 
(DiT backend: PyTorch (cuda)) -2026-03-01 19:54:34.105 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:54:34.415 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... -2026-03-01 19:54:34.416 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:54:34.416 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006921052932739258, 'diffusion_time_cost': 0.3029003143310547, 'diffusion_per_step_time_cost': 0.037862539291381836, 'total_time_cost': 0.30982136726379395, 'offload_time_cost': 0.0} -2026-03-01 19:54:34.431 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:54:34.436 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:54:34.436 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB -2026-03-01 19:54:34.436 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:54:34.436 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB -2026-03-01 19:54:34.436 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB -2026-03-01 19:54:34.436 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:54:34.714 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:54:34.716 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:54:34.720 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:45:49.211 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:45:49.226 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:45:49.226 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:45:49.260 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:45:49.577 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:45:49.577 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:45:49.578 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.00789022445678711, 'diffusion_time_cost': 0.30838513374328613, 'diffusion_per_step_time_cost': 0.03854814171791077, 'total_time_cost': 0.31627535820007324, 'offload_time_cost': 0.0} +2026-03-04 21:45:49.591 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:45:49.594 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:45:49.594 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.85 GB +2026-03-04 21:45:49.594 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:45:49.594 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.85 GB +2026-03-04 21:45:49.594 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.85 GB +2026-03-04 21:45:49.594 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:45:49.873 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:45:49.875 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:45:49.877 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q5_K_M.gguf [GGML] Running acestep-v15-turbo-Q5_K_M.gguf... 
@@ -246,8 +241,8 @@ Using precomputed LM hints dit_step6_xt 0.983446 dit_step7_vt 0.953383 dit_x0 0.970119 - vae_audio 0.883226 - vae_audio (STFT cosine) 0.968463 + vae_audio 0.883245 + vae_audio (STFT cosine) 0.968470 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999930 0.139407 0.007818 -0.002306 0.973025 -0.002342 0.972003 diff --git a/tests/CUDA-Q6_K.log b/tests/CUDA-Q6_K.log index 10b9a7a..4950234 100644 --- a/tests/CUDA-Q6_K.log +++ b/tests/CUDA-Q6_K.log @@ -1,7 +1,7 @@ ggml_cuda_init: found 1 CUDA devices: Device 0: NVIDIA RTX PRO 6000 Blackwell Workstation Edition, compute capability 12.0, VMM: yes [Load] DiT backend: CUDA0 (CPU threads: 16) -[Load] Backend init: 9.5 ms +[Load] Backend init: 10.2 ms [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [DiT] Self-attn: Q+K+V fused [DiT] Cross-attn: Q+K+V fused @@ -9,36 +9,34 @@ ggml_cuda_init: found 1 CUDA devices: [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 1237.2 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 514.8 ms +[Load] DiT weight load: 176.0 ms [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: CUDA0 (CPU threads: 16) +[Load] VAE backend: CUDA0 (shared) [VAE] Backend: CUDA0, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 657.3 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 655.0 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] 
Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 30.7 ms +[Load] BPE tokenizer: 31.5 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: CUDA0 (CPU threads: 16) +[Load] TextEncoder backend: CUDA0 (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 125.7 ms -[Encode] TextEncoder (70 tokens): 49.2 ms +[Load] TextEncoder: 102.6 ms +[Encode] TextEncoder (70 tokens): 51.1 ms [Debug] text_hidden: [70, 1024] first4: 3.652014 1.047935 0.228532 -12.907304 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 12.3 ms +[Encode] Lyric vocab lookup (167 tokens): 0.1 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: CUDA0 (CPU threads: 16) +[Load] CondEncoder backend: CUDA0 (shared) [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K+V fused @@ -48,18 +46,18 @@ ggml_cuda_init: found 1 CUDA devices: [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 476.3 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 145.8 ms +[Load] ConditionEncoder: 40.6 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 11.0 ms, enc_S=238 +[Encode] ConditionEncoder: 10.8 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.760759 -0.050104 -0.133269 0.058044 [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 82.2 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 
2048->64) -[Load] Detokenizer: 26.4 ms +[Load] Detokenizer: 7.8 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 123.5 ms +[Context] Detokenizer: 123.6 ms [Debug] detok_output: [2170, 64] first4: -0.140341 1.456987 0.310602 -0.632665 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -114,35 +112,32 @@ ggml_cuda_init: found 1 CUDA devices: [Debug] dit_step7_vt: [2170, 64] first4: 0.081321 0.135461 -1.397063 2.986206 [Debug] dit_x0: [2170, 64] first4: 0.028793 1.462229 0.417478 -1.887184 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 273.2 ms (273.2 ms/sample) +[DiT] Total generation: 270.6 ms (270.6 ms/sample) [Debug] dit_output: [2170, 64] first4: 0.028793 1.462229 0.417478 -1.887184 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 804.3 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000481 0.000872 0.000838 0.001216 +[VAE Batch0] Decode: 616.4 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000531 0.001035 0.000900 0.001303 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:54:23.682 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:54:23.683 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:54:23.683 | WARNING | acestep.training.data_module::25 - Lightning not installed. 
Training module will not be available. -2026-03-01 19:54:23.683 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:54:23.683 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:54:24.419 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:54:25.992 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:54:25.992 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:54:25.998 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:54:26.157 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:54:26.159 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:45:39.727 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:45:39.815 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:45:41.135 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... 
+2026-03-04 21:45:41.135 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:45:41.137 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:45:41.142 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:45:41.335 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-04 21:45:41.337 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:54:26.159 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:54:26.159 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:54:26.159 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:45:41.337 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:45:41.337 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:45:41.337 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -156,8 +151,8 @@ An 
upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:54:26.159 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:54:26.159 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:45:41.337 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:45:41.337 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -184,25 +179,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:54:26.159 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:45:41.337 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:54:26.166 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:54:26.178 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:54:26.178 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... 
(DiT backend: PyTorch (cuda)) -2026-03-01 19:54:26.214 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:54:26.528 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... -2026-03-01 19:54:26.528 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:54:26.528 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.00680994987487793, 'diffusion_time_cost': 0.30716919898986816, 'diffusion_per_step_time_cost': 0.03839614987373352, 'total_time_cost': 0.3139791488647461, 'offload_time_cost': 0.0} -2026-03-01 19:54:26.543 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:54:26.545 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:54:26.545 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.44 GB -2026-03-01 19:54:26.545 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:54:26.545 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.44 GB -2026-03-01 19:54:26.545 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.44 GB -2026-03-01 19:54:26.545 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:54:26.821 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:54:26.824 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:54:26.828 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:45:41.345 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:45:41.359 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:45:41.359 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:45:41.390 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:45:41.705 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:45:41.706 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:45:41.706 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006890773773193359, 'diffusion_time_cost': 0.30776047706604004, 'diffusion_per_step_time_cost': 0.038470059633255005, 'total_time_cost': 0.3146512508392334, 'offload_time_cost': 0.0} +2026-03-04 21:45:41.720 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:45:41.722 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:45:41.723 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.85 GB +2026-03-04 21:45:41.723 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:45:41.723 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.85 GB +2026-03-04 21:45:41.723 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.85 GB +2026-03-04 21:45:41.723 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:45:41.997 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:45:41.999 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:45:42.001 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q6_K.gguf [GGML] Running acestep-v15-turbo-Q6_K.gguf... 
@@ -246,8 +241,8 @@ Using precomputed LM hints dit_step6_xt 0.985862 dit_step7_vt 0.962454 dit_x0 0.974866 - vae_audio 0.893678 - vae_audio (STFT cosine) 0.969663 + vae_audio 0.893720 + vae_audio (STFT cosine) 0.969672 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999937 0.147590 0.007252 -0.002265 0.972930 -0.002342 0.972003 diff --git a/tests/CUDA-Q8_0.log b/tests/CUDA-Q8_0.log index 3a84ce1..2744819 100644 --- a/tests/CUDA-Q8_0.log +++ b/tests/CUDA-Q8_0.log @@ -1,7 +1,7 @@ ggml_cuda_init: found 1 CUDA devices: Device 0: NVIDIA RTX PRO 6000 Blackwell Workstation Edition, compute capability 12.0, VMM: yes [Load] DiT backend: CUDA0 (CPU threads: 16) -[Load] Backend init: 9.5 ms +[Load] Backend init: 9.7 ms [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [DiT] Self-attn: Q+K+V fused [DiT] Cross-attn: Q+K+V fused @@ -9,36 +9,34 @@ ggml_cuda_init: found 1 CUDA devices: [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 1600.7 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 221.9 ms +[Load] DiT weight load: 201.4 ms [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: CUDA0 (CPU threads: 16) +[Load] VAE backend: CUDA0 (shared) [VAE] Backend: CUDA0, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 658.9 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 655.8 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] 
Loaded from GGUF: 151643 vocab, 151387 merges [Load] BPE tokenizer: 31.2 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: CUDA0 (CPU threads: 16) +[Load] TextEncoder backend: CUDA0 (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 127.0 ms -[Encode] TextEncoder (70 tokens): 68.2 ms +[Load] TextEncoder: 102.2 ms +[Encode] TextEncoder (70 tokens): 57.4 ms [Debug] text_hidden: [70, 1024] first4: 3.652014 1.047935 0.228532 -12.907304 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 12.3 ms +[Encode] Lyric vocab lookup (167 tokens): 0.1 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: CUDA0 (CPU threads: 16) +[Load] CondEncoder backend: CUDA0 (shared) [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K+V fused @@ -48,18 +46,18 @@ ggml_cuda_init: found 1 CUDA devices: [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 616.6 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 65.2 ms +[Load] ConditionEncoder: 52.3 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 8.9 ms, enc_S=238 +[Encode] ConditionEncoder: 9.0 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.759220 -0.049559 -0.133467 0.058389 [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 106.5 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 12.1 ms 
+[Load] Detokenizer: 9.2 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 104.8 ms +[Context] Detokenizer: 103.8 ms [Debug] detok_output: [2170, 64] first4: -0.120490 1.436288 0.301594 -0.632564 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -114,35 +112,32 @@ ggml_cuda_init: found 1 CUDA devices: [Debug] dit_step7_vt: [2170, 64] first4: -0.007394 0.229067 -1.488817 3.083439 [Debug] dit_x0: [2170, 64] first4: 0.087028 1.415554 0.432225 -1.919150 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 242.9 ms (242.9 ms/sample) +[DiT] Total generation: 236.6 ms (236.6 ms/sample) [Debug] dit_output: [2170, 64] first4: 0.087028 1.415554 0.432225 -1.919150 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 822.6 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000524 0.000859 0.000752 0.001056 +[VAE Batch0] Decode: 618.6 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000531 0.000916 0.000781 0.001161 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:54:15.905 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:54:15.906 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:54:15.906 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. 
-2026-03-01 19:54:15.906 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:54:15.906 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:54:16.672 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:54:18.198 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:54:18.198 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:54:18.207 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:54:18.371 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:54:18.373 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:45:31.851 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:45:31.953 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:45:33.265 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:45:33.265 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... 
+2026-03-04 21:45:33.269 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:45:33.275 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:45:33.468 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-04 21:45:33.470 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:54:18.373 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:54:18.373 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:54:18.373 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:45:33.470 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:45:33.470 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:45:33.470 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -156,8 +151,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:54:18.373 | 
INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:54:18.373 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:45:33.470 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:45:33.470 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -184,25 +179,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:54:18.373 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:45:33.470 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:54:18.380 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:54:18.392 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:54:18.392 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 19:54:18.418 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... 
-2026-03-01 19:54:18.724 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... -2026-03-01 19:54:18.724 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:54:18.724 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006882190704345703, 'diffusion_time_cost': 0.298403263092041, 'diffusion_per_step_time_cost': 0.03730040788650513, 'total_time_cost': 0.3052854537963867, 'offload_time_cost': 0.0} -2026-03-01 19:54:18.739 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:54:18.741 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:54:18.741 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.44 GB -2026-03-01 19:54:18.741 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:54:18.741 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.44 GB -2026-03-01 19:54:18.741 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.44 GB -2026-03-01 19:54:18.741 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:54:19.031 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:54:19.034 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:54:19.037 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:45:33.490 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:45:33.505 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:45:33.505 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:45:33.539 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:45:33.854 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:45:33.855 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:45:33.855 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.0069425106048583984, 'diffusion_time_cost': 0.30779337882995605, 'diffusion_per_step_time_cost': 0.03847417235374451, 'total_time_cost': 0.31473588943481445, 'offload_time_cost': 0.0} +2026-03-04 21:45:33.869 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:45:33.871 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:45:33.871 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.78 GB +2026-03-04 21:45:33.871 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:45:33.871 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.78 GB +2026-03-04 21:45:33.871 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.78 GB +2026-03-04 21:45:33.871 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:45:34.145 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:45:34.147 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:45:34.149 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q8_0.gguf [GGML] Running acestep-v15-turbo-Q8_0.gguf... 
@@ -246,8 +241,8 @@ Using precomputed LM hints dit_step6_xt 0.988641 dit_step7_vt 0.970144 dit_x0 0.979969 - vae_audio 0.905525 - vae_audio (STFT cosine) 0.976530 + vae_audio 0.905563 + vae_audio (STFT cosine) 0.976538 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999948 0.134961 0.006551 -0.002307 0.972901 -0.002342 0.972003 diff --git a/tests/Vulkan-BF16.log b/tests/Vulkan-BF16.log index 2d955d7..c063695 100644 --- a/tests/Vulkan-BF16.log +++ b/tests/Vulkan-BF16.log @@ -1,7 +1,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | uma: 0 | fp16: 1 | bf16: 0 | warp size: 32 | shared memory: 49152 | int dot: 1 | matrix cores: NV_coopmat2 [Load] DiT backend: Vulkan0 (CPU threads: 16) -[Load] Backend init: 260.3 ms +[Load] Backend init: 142.5 ms [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [DiT] Self-attn: Q+K+V fused [DiT] Cross-attn: Q+K+V fused @@ -9,36 +9,34 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 3007.9 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 397.7 ms +[Load] DiT weight load: 338.6 ms [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: Vulkan0 (CPU threads: 16) +[Load] VAE backend: Vulkan0 (shared) [VAE] Backend: Vulkan0, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 672.5 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 661.6 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 
7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 32.1 ms +[Load] BPE tokenizer: 31.2 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: Vulkan0 (CPU threads: 16) +[Load] TextEncoder backend: Vulkan0 (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 166.9 ms -[Encode] TextEncoder (70 tokens): 30.9 ms +[Load] TextEncoder: 141.4 ms +[Encode] TextEncoder (70 tokens): 1939.4 ms [Debug] text_hidden: [70, 1024] first4: 3.705836 2.395382 0.221845 -13.145830 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 11.2 ms +[Encode] Lyric vocab lookup (167 tokens): 0.3 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: Vulkan0 (CPU threads: 16) +[Load] CondEncoder backend: Vulkan0 (shared) [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K+V fused @@ -48,18 +46,18 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 1160.5 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 163.7 ms +[Load] ConditionEncoder: 130.2 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 22.5 ms, enc_S=238 +[Encode] ConditionEncoder: 2492.6 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.758148 -0.049593 
-0.132730 0.058488 [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [WeightCtx] Loaded 30 tensors, 200.3 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 28.1 ms +[Load] Detokenizer: 23.1 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 229.8 ms +[Context] Detokenizer: 2525.9 ms [Debug] detok_output: [2170, 64] first4: -0.125193 1.435010 0.308190 -0.624228 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -79,70 +77,67 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Debug] proj_in_input: [192, 2170] first4: -0.125193 1.435010 0.308190 -0.624228 [Debug] enc_after_cond_emb: [2048, 238] first4: -0.168464 0.814954 0.327714 -0.561971 [Debug] layer0_sa_input: [2048, 1085] first4: -0.719110 -0.764019 -0.047328 0.261808 -[Debug] layer0_q_after_rope: [128, 16] first4: -2.424376 -0.094810 -0.411903 1.007324 -[Debug] layer0_k_after_rope: [128, 8] first4: -12.712339 1.106410 1.775920 1.780798 -[Debug] layer0_sa_output: [2048, 1085] first4: -1.501171 0.169176 -0.355798 0.513027 -[Debug] layer0_attn_out: [2048, 1085] first4: -1.540742 -1.044333 0.188720 0.456093 -[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.540742 -1.044333 0.188720 0.456093 -[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.598325 -0.820241 -0.296337 0.493580 -[Debug] hidden_after_layer0: [2048, 1085] first4: -9.091503 0.566892 52.584164 -0.903901 -[Debug] hidden_after_layer6: [2048, 1085] first4: -21.192070 0.040278 33.599442 -4.442998 -[Debug] hidden_after_layer12: [2048, 1085] first4: -15.068191 -18.118078 71.999359 28.597229 -[Debug] hidden_after_layer18: [2048, 1085] first4: -27.132679 15.867422 60.847614 20.940519 -[Debug] hidden_after_layer23: [2048, 1085] first4: -12.584854 45.152912 198.753845 145.517029 -[Debug] dit_step0_vt: [2170, 64] first4: 0.014936 
1.119046 0.345802 2.379982 -[Debug] dit_step0_xt: [2170, 64] first4: 0.193657 2.105384 -0.187593 0.739475 +[Debug] layer0_q_after_rope: [128, 16] first4: -1.985390 -0.040374 -0.446411 0.887640 +[Debug] layer0_k_after_rope: [128, 8] first4: -12.133966 1.032982 1.765450 1.789189 +[Debug] layer0_sa_output: [2048, 1085] first4: -1.285921 -0.088167 -0.083954 0.187361 +[Debug] layer0_attn_out: [2048, 1085] first4: -1.468877 -0.930195 0.454157 0.450160 +[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.468877 -0.930195 0.454157 0.450160 +[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.465657 -0.778736 0.078704 0.498346 +[Debug] hidden_after_layer0: [2048, 1085] first4: -8.621284 0.720027 54.661194 -0.769228 +[Debug] hidden_after_layer6: [2048, 1085] first4: -12.726752 3.144506 -9.323353 -12.165966 +[Debug] hidden_after_layer12: [2048, 1085] first4: -16.201662 -10.547243 4.967308 15.566863 +[Debug] hidden_after_layer18: [2048, 1085] first4: -26.509827 14.787127 -25.476906 8.639433 +[Debug] hidden_after_layer23: [2048, 1085] first4: -16.044237 89.590195 45.410172 78.645676 +[Debug] dit_step0_vt: [2170, 64] first4: 0.347229 0.879013 0.198151 1.945618 +[Debug] dit_step0_xt: [2170, 64] first4: 0.178553 2.116295 -0.180882 0.759219 [DiT] step 1/8 t=1.000 -[Debug] dit_step1_vt: [2170, 64] first4: 0.086700 0.854980 -0.273651 1.728149 -[Debug] dit_step1_xt: [2170, 64] first4: 0.188928 2.058749 -0.172667 0.645212 +[Debug] dit_step1_vt: [2170, 64] first4: 0.068695 0.847748 -0.298004 1.750702 +[Debug] dit_step1_xt: [2170, 64] first4: 0.174806 2.070054 -0.164627 0.663726 [DiT] step 2/8 t=0.955 -[Debug] dit_step2_vt: [2170, 64] first4: 0.180420 0.837399 -0.150421 2.056976 -[Debug] dit_step2_xt: [2170, 64] first4: 0.176900 2.002922 -0.162639 0.508081 +[Debug] dit_step2_vt: [2170, 64] first4: 0.151260 0.875549 -0.207390 2.089754 +[Debug] dit_step2_xt: [2170, 64] first4: 0.164722 2.011684 -0.150801 0.524409 [DiT] step 3/8 t=0.900 -[Debug] dit_step3_vt: [2170, 64] first4: 
0.130821 0.833313 0.053528 2.193359 -[Debug] dit_step3_xt: [2170, 64] first4: 0.165998 1.933480 -0.167099 0.325301 +[Debug] dit_step3_vt: [2170, 64] first4: 0.077034 0.843689 -0.087112 2.299004 +[Debug] dit_step3_xt: [2170, 64] first4: 0.158302 1.941377 -0.143542 0.332826 [DiT] step 4/8 t=0.833 -[Debug] dit_step4_vt: [2170, 64] first4: 0.273712 0.866425 0.216686 2.274872 -[Debug] dit_step4_xt: [2170, 64] first4: 0.136672 1.840648 -0.190316 0.081565 +[Debug] dit_step4_vt: [2170, 64] first4: 0.173340 0.815531 0.275307 2.367218 +[Debug] dit_step4_xt: [2170, 64] first4: 0.139730 1.853999 -0.173039 0.079195 [DiT] step 5/8 t=0.750 -[Debug] dit_step5_vt: [2170, 64] first4: 0.347900 0.772171 0.542953 2.248352 -[Debug] dit_step5_xt: [2170, 64] first4: 0.086972 1.730338 -0.267881 -0.239629 +[Debug] dit_step5_vt: [2170, 64] first4: 0.210556 0.765915 0.470947 2.214279 +[Debug] dit_step5_xt: [2170, 64] first4: 0.109651 1.744582 -0.240317 -0.237130 [DiT] step 6/8 t=0.643 -[Debug] dit_step6_vt: [2170, 64] first4: 0.132820 0.664673 0.218246 2.387787 -[Debug] dit_step6_xt: [2170, 64] first4: 0.060408 1.597404 -0.311530 -0.717186 +[Debug] dit_step6_vt: [2170, 64] first4: -0.038303 0.546310 0.224964 2.284607 +[Debug] dit_step6_xt: [2170, 64] first4: 0.117311 1.635320 -0.285310 -0.694052 [DiT] step 7/8 t=0.500 -[Debug] dit_step7_vt: [2170, 64] first4: -0.335976 0.323303 0.198029 2.726624 -[Debug] dit_x0: [2170, 64] first4: 0.161200 1.500413 -0.370938 -1.535173 +[Debug] dit_step7_vt: [2170, 64] first4: -0.300537 0.235870 0.263802 2.617432 +[Debug] dit_x0: [2170, 64] first4: 0.207473 1.564559 -0.364450 -1.479281 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 740.5 ms (740.5 ms/sample) -[Debug] dit_output: [2170, 64] first4: 0.161200 1.500413 -0.370938 -1.535173 +[DiT] Total generation: 2630.4 ms (2630.4 ms/sample) +[Debug] dit_output: [2170, 64] first4: 0.207473 1.564559 -0.364450 -1.479281 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, 
T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 9812.1 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000591 0.001078 0.000929 0.001296 +[VAE Batch0] Decode: 2992.9 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000614 0.001141 0.000934 0.001396 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:55:13.398 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:55:13.398 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:55:13.399 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 19:55:13.399 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:55:13.399 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:55:14.155 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:55:15.664 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:55:15.664 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... 
-2026-03-01 19:55:15.669 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:55:15.830 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:55:15.831 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:46:56.541 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:46:56.622 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:46:57.937 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:46:57.937 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:46:57.939 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:46:57.945 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:46:58.137 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... 
+2026-03-04 21:46:58.139 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:55:15.831 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:55:15.831 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:55:15.831 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:46:58.139 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:46:58.139 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:46:58.139 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -156,8 +151,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:55:15.831 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:55:15.831 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:46:58.139 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:46:58.139 | INFO | 
acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -184,25 +179,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:55:15.831 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:46:58.139 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:55:15.838 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:55:15.850 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:55:15.851 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 19:55:15.885 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:55:16.193 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... 
-2026-03-01 19:55:16.193 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:55:16.193 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006814241409301758, 'diffusion_time_cost': 0.30007076263427734, 'diffusion_per_step_time_cost': 0.03750884532928467, 'total_time_cost': 0.3068850040435791, 'offload_time_cost': 0.0} -2026-03-01 19:55:16.208 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:55:16.210 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:55:16.210 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB -2026-03-01 19:55:16.210 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:55:16.210 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB -2026-03-01 19:55:16.210 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB -2026-03-01 19:55:16.210 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:55:16.485 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:55:16.488 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:55:16.491 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:46:58.146 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:46:58.161 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:46:58.161 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:46:58.195 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:46:58.511 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:46:58.512 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:46:58.512 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006871938705444336, 'diffusion_time_cost': 0.30806517601013184, 'diffusion_per_step_time_cost': 0.03850814700126648, 'total_time_cost': 0.31493711471557617, 'offload_time_cost': 0.0} +2026-03-04 21:46:58.526 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:46:58.528 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:46:58.528 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.85 GB +2026-03-04 21:46:58.528 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:46:58.528 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.85 GB +2026-03-04 21:46:58.528 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.85 GB +2026-03-04 21:46:58.528 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:46:58.802 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:46:58.804 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:46:58.806 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-BF16.gguf [GGML] Running acestep-v15-turbo-BF16.gguf... 
@@ -224,36 +219,36 @@ Using precomputed LM hints temb_t 0.999999 hidden_after_proj_in 0.999987 enc_after_cond_emb 0.999825 - layer0_sa_output 0.999959 - hidden_after_layer0 0.999982 - hidden_after_layer6 0.999916 - hidden_after_layer12 0.999276 - hidden_after_layer18 0.996645 - hidden_after_layer23 0.993735 - dit_step0_vt 0.975502 - dit_step0_xt 0.999946 - dit_step1_vt 0.898326 - dit_step1_xt 0.999578 - dit_step2_vt 0.893586 - dit_step2_xt 0.998276 - dit_step3_vt 0.881101 - dit_step3_xt 0.994720 - dit_step4_vt 0.869138 - dit_step4_xt 0.986137 - dit_step5_vt 0.854878 - dit_step5_xt 0.965846 - dit_step6_vt 0.840298 - dit_step6_xt 0.925771 - dit_step7_vt 0.818271 - dit_x0 0.867399 - vae_audio 0.680412 - vae_audio (STFT cosine) 0.855380 + layer0_sa_output 0.920858 + hidden_after_layer0 0.996092 + hidden_after_layer6 0.980248 + hidden_after_layer12 0.977161 + hidden_after_layer18 0.973382 + hidden_after_layer23 0.961755 + dit_step0_vt 0.843333 + dit_step0_xt 0.999656 + dit_step1_vt 0.875601 + dit_step1_xt 0.998907 + dit_step2_vt 0.860701 + dit_step2_xt 0.996792 + dit_step3_vt 0.838816 + dit_step3_xt 0.991464 + dit_step4_vt 0.827875 + dit_step4_xt 0.978766 + dit_step5_vt 0.812689 + dit_step5_xt 0.949636 + dit_step6_vt 0.795272 + dit_step6_xt 0.894491 + dit_step7_vt 0.769772 + dit_x0 0.818406 + vae_audio 0.571274 + vae_audio (STFT cosine) 0.788509 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999946 0.135811 0.006633 -0.002316 0.972919 -0.002342 0.972003 - dit_step1_xt 0.999578 0.413265 0.019706 -0.005121 0.942541 -0.005313 0.941730 - dit_step2_xt 0.998276 0.811472 0.038208 -0.008968 0.908957 -0.009311 0.908527 - dit_step3_xt 0.994720 1.481150 0.064047 -0.014385 0.872574 -0.014577 0.873624 - dit_step4_xt 0.986137 1.857148 0.100272 -0.021489 0.837038 -0.021660 0.841995 - dit_step5_xt 0.965846 1.439633 0.154129 -0.031859 0.812819 -0.032109 0.824593 - dit_step6_xt 0.925771 2.125688 0.235367 -0.046759 0.832442 
-0.046482 0.855546 + dit_step0_xt 0.999656 0.367652 0.018858 -0.002243 0.972108 -0.002342 0.972003 + dit_step1_xt 0.998907 0.763455 0.032624 -0.004985 0.941679 -0.005313 0.941730 + dit_step2_xt 0.996792 1.022189 0.053741 -0.008816 0.908019 -0.009311 0.908527 + dit_step3_xt 0.991464 1.657425 0.084380 -0.014275 0.871556 -0.014577 0.873624 + dit_step4_xt 0.978766 2.432666 0.128087 -0.021464 0.836876 -0.021660 0.841995 + dit_step5_xt 0.949636 3.423663 0.193034 -0.032107 0.813619 -0.032109 0.824593 + dit_step6_xt 0.894491 4.744513 0.289706 -0.047388 0.833987 -0.046482 0.855546 diff --git a/tests/Vulkan-Q4_K_M.log b/tests/Vulkan-Q4_K_M.log index 011c0c3..03f9985 100644 --- a/tests/Vulkan-Q4_K_M.log +++ b/tests/Vulkan-Q4_K_M.log @@ -1,7 +1,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | uma: 0 | fp16: 1 | bf16: 0 | warp size: 32 | shared memory: 49152 | int dot: 1 | matrix cores: NV_coopmat2 [Load] DiT backend: Vulkan0 (CPU threads: 16) -[Load] Backend init: 115.6 ms +[Load] Backend init: 146.5 ms [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [DiT] Self-attn: Q+K fused, V separate [DiT] Cross-attn: all separate @@ -9,36 +9,34 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 895.6 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 126.7 ms +[Load] DiT weight load: 110.3 ms [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: Vulkan0 (CPU threads: 16) +[Load] VAE backend: Vulkan0 (shared) [VAE] Backend: Vulkan0, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 667.9 ms +[VAE] Loaded: 5 blocks, upsample=1920x, 
F32 activations +[Load] VAE weights: 661.8 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 31.0 ms +[Load] BPE tokenizer: 31.2 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: Vulkan0 (CPU threads: 16) +[Load] TextEncoder backend: Vulkan0 (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 166.1 ms -[Encode] TextEncoder (70 tokens): 18.4 ms +[Load] TextEncoder: 143.1 ms +[Encode] TextEncoder (70 tokens): 18.1 ms [Debug] text_hidden: [70, 1024] first4: 3.705836 2.395382 0.221845 -13.145830 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 11.3 ms +[Encode] Lyric vocab lookup (167 tokens): 0.3 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: Vulkan0 (CPU threads: 16) +[Load] CondEncoder backend: Vulkan0 (shared) [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K fused, V separate @@ -48,18 +46,18 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 352.5 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 43.9 ms +[Load] ConditionEncoder: 40.5 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, 
window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 18.2 ms, enc_S=238 +[Encode] ConditionEncoder: 2552.5 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.760519 -0.046675 -0.129011 0.057651 [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 64.7 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 8.9 ms +[Load] Detokenizer: 8.5 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 152.2 ms +[Context] Detokenizer: 574.4 ms [Debug] detok_output: [2170, 64] first4: -0.107345 1.442038 0.300564 -0.641466 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -93,56 +91,53 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Debug] dit_step0_vt: [2170, 64] first4: 0.669312 0.442215 1.300629 2.101841 [Debug] dit_step0_xt: [2170, 64] first4: 0.163913 2.136149 -0.230995 0.752118 [DiT] step 1/8 t=1.000 -[Debug] dit_step1_vt: [2170, 64] first4: 1.120422 0.593113 1.031189 1.813599 -[Debug] dit_step1_xt: [2170, 64] first4: 0.102799 2.103798 -0.287241 0.653194 +[Debug] dit_step1_vt: [2170, 64] first4: 1.083954 0.575027 1.011414 1.785126 +[Debug] dit_step1_xt: [2170, 64] first4: 0.104788 2.104784 -0.286163 0.654747 [DiT] step 2/8 t=0.955 -[Debug] dit_step2_vt: [2170, 64] first4: 1.381363 0.295410 1.456146 1.949341 -[Debug] dit_step2_xt: [2170, 64] first4: 0.010708 2.084104 -0.384318 0.523238 +[Debug] dit_step2_vt: [2170, 64] first4: 1.406609 0.358032 1.442169 1.947861 +[Debug] dit_step2_xt: [2170, 64] first4: 0.011014 2.080915 -0.382307 0.524890 [DiT] step 3/8 t=0.900 -[Debug] dit_step3_vt: [2170, 64] first4: 1.440727 0.067017 1.481567 2.158554 -[Debug] dit_step3_xt: [2170, 64] first4: -0.109353 2.078519 -0.507782 0.343359 +[Debug] dit_step3_vt: [2170, 64] first4: 1.450653 0.080627 
1.479324 2.174759 +[Debug] dit_step3_xt: [2170, 64] first4: -0.109874 2.074197 -0.505584 0.343660 [DiT] step 4/8 t=0.833 -[Debug] dit_step4_vt: [2170, 64] first4: 1.377216 0.234177 1.413437 2.181564 -[Debug] dit_step4_xt: [2170, 64] first4: -0.256912 2.053428 -0.659221 0.109620 +[Debug] dit_step4_vt: [2170, 64] first4: 1.396931 0.250122 1.401264 2.164902 +[Debug] dit_step4_xt: [2170, 64] first4: -0.259545 2.047398 -0.655720 0.111706 [DiT] step 5/8 t=0.750 -[Debug] dit_step5_vt: [2170, 64] first4: 1.135239 0.376801 1.055233 2.272675 -[Debug] dit_step5_xt: [2170, 64] first4: -0.419089 1.999600 -0.809969 -0.215048 +[Debug] dit_step5_vt: [2170, 64] first4: 1.155813 0.405807 1.027550 2.260437 +[Debug] dit_step5_xt: [2170, 64] first4: -0.424661 1.989425 -0.802512 -0.211213 [DiT] step 6/8 t=0.643 -[Debug] dit_step6_vt: [2170, 64] first4: 0.948242 0.399368 0.426941 2.645081 -[Debug] dit_step6_xt: [2170, 64] first4: -0.608737 1.919726 -0.895357 -0.744064 +[Debug] dit_step6_vt: [2170, 64] first4: 0.916870 0.396088 0.350647 2.622253 +[Debug] dit_step6_xt: [2170, 64] first4: -0.608035 1.910208 -0.872642 -0.735664 [DiT] step 7/8 t=0.500 -[Debug] dit_step7_vt: [2170, 64] first4: 0.549133 -0.167076 0.379578 2.984619 -[Debug] dit_x0: [2170, 64] first4: -0.773477 1.969849 -1.009230 -1.639450 +[Debug] dit_step7_vt: [2170, 64] first4: 0.544876 -0.215309 0.434998 3.006592 +[Debug] dit_x0: [2170, 64] first4: -0.771498 1.974800 -1.003141 -1.637641 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 263.6 ms (263.6 ms/sample) -[Debug] dit_output: [2170, 64] first4: -0.773477 1.969849 -1.009230 -1.639450 +[DiT] Total generation: 342.3 ms (342.3 ms/sample) +[Debug] dit_output: [2170, 64] first4: -0.771498 1.974800 -1.003141 -1.637641 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 
+[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 9686.3 ms -[Debug] vae_audio: [2, 4166400] first4: 0.015021 0.018215 0.017495 0.016521 +[VAE Batch0] Decode: 1703.5 ms +[Debug] vae_audio: [2, 4166400] first4: 0.012597 0.015460 0.014870 0.014040 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:56:19.059 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:56:19.060 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:56:19.060 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 19:56:19.060 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:56:19.060 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:56:19.832 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:56:21.417 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:56:21.417 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:56:21.428 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... 
-2026-03-01 19:56:21.589 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:56:21.591 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:47:49.166 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:47:49.255 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:47:50.597 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:47:50.597 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:47:50.598 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:47:50.604 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:47:50.793 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... 
+2026-03-04 21:47:50.795 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:56:21.591 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:56:21.591 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:56:21.591 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:47:50.795 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:47:50.795 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:47:50.795 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -156,8 +151,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:56:21.591 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:56:21.591 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:47:50.795 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:47:50.795 | INFO | 
acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -184,25 +179,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:56:21.591 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:47:50.795 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:56:21.597 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:56:21.610 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:56:21.610 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 19:56:21.642 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:56:21.955 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... 
-2026-03-01 19:56:21.956 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:56:21.956 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006905794143676758, 'diffusion_time_cost': 0.3056776523590088, 'diffusion_per_step_time_cost': 0.0382097065448761, 'total_time_cost': 0.31258344650268555, 'offload_time_cost': 0.0} -2026-03-01 19:56:21.970 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:56:21.973 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:56:21.973 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB -2026-03-01 19:56:21.973 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:56:21.973 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB -2026-03-01 19:56:21.973 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB -2026-03-01 19:56:21.973 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:56:22.249 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:56:22.252 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:56:22.255 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:47:50.802 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:47:50.816 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:47:50.816 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:47:50.850 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:47:51.166 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:47:51.167 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:47:51.167 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006922483444213867, 'diffusion_time_cost': 0.3079640865325928, 'diffusion_per_step_time_cost': 0.0384955108165741, 'total_time_cost': 0.31488656997680664, 'offload_time_cost': 0.0} +2026-03-04 21:47:51.181 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:47:51.183 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:47:51.183 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.77 GB +2026-03-04 21:47:51.183 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:47:51.183 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.77 GB +2026-03-04 21:47:51.183 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.77 GB +2026-03-04 21:47:51.183 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:47:51.458 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:47:51.460 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:47:51.461 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q4_K_M.gguf [GGML] Running acestep-v15-turbo-Q4_K_M.gguf... 
@@ -232,28 +227,28 @@ Using precomputed LM hints hidden_after_layer23 0.947132 dit_step0_vt 0.790630 dit_step0_xt 0.999550 - dit_step1_vt 0.812267 - dit_step1_xt 0.998316 - dit_step2_vt 0.797855 - dit_step2_xt 0.994982 - dit_step3_vt 0.785550 - dit_step3_xt 0.987155 - dit_step4_vt 0.777677 - dit_step4_xt 0.969894 - dit_step5_vt 0.765554 - dit_step5_xt 0.933268 - dit_step6_vt 0.748164 - dit_step6_xt 0.865654 - dit_step7_vt 0.704997 - dit_x0 0.768990 - vae_audio 0.377954 - vae_audio (STFT cosine) 0.669489 + dit_step1_vt 0.756205 + dit_step1_xt 0.998148 + dit_step2_vt 0.797194 + dit_step2_xt 0.994834 + dit_step3_vt 0.784456 + dit_step3_xt 0.987026 + dit_step4_vt 0.776725 + dit_step4_xt 0.969792 + dit_step5_vt 0.765077 + dit_step5_xt 0.933184 + dit_step6_vt 0.747231 + dit_step6_xt 0.865289 + dit_step7_vt 0.704165 + dit_x0 0.767979 + vae_audio 0.376451 + vae_audio (STFT cosine) 0.668630 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999550 0.201120 0.022082 -0.002496 0.972768 -0.002342 0.972003 - dit_step1_xt 0.998316 0.415084 0.041258 -0.005641 0.942202 -0.005313 0.941730 - dit_step2_xt 0.994982 0.710340 0.068500 -0.010236 0.907728 -0.009311 0.908527 - dit_step3_xt 0.987155 1.070455 0.105302 -0.016404 0.870181 -0.014577 0.873624 - dit_step4_xt 0.969894 1.456633 0.155292 -0.024587 0.833834 -0.021660 0.841995 - dit_step5_xt 0.933268 1.997366 0.225911 -0.035903 0.808944 -0.032109 0.824593 - dit_step6_xt 0.865654 3.020976 0.331484 -0.051668 0.828925 -0.046482 0.855546 + dit_step1_xt 0.998148 0.415598 0.043234 -0.005810 0.944103 -0.005313 0.941730 + dit_step2_xt 0.994834 0.709830 0.069736 -0.010410 0.909328 -0.009311 0.908527 + dit_step3_xt 0.987026 1.071567 0.106058 -0.016584 0.871456 -0.014577 0.873624 + dit_step4_xt 0.969792 1.488428 0.155756 -0.024763 0.834729 -0.021660 0.841995 + dit_step5_xt 0.933184 1.958024 0.226224 -0.036147 0.809005 -0.032109 0.824593 + dit_step6_xt 0.865289 3.030077 0.331834 -0.051892 
0.828296 -0.046482 0.855546 diff --git a/tests/Vulkan-Q5_K_M.log b/tests/Vulkan-Q5_K_M.log index ec38ab3..aa0eb9c 100644 --- a/tests/Vulkan-Q5_K_M.log +++ b/tests/Vulkan-Q5_K_M.log @@ -1,7 +1,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | uma: 0 | fp16: 1 | bf16: 0 | warp size: 32 | shared memory: 49152 | int dot: 1 | matrix cores: NV_coopmat2 [Load] DiT backend: Vulkan0 (CPU threads: 16) -[Load] Backend init: 114.1 ms +[Load] Backend init: 114.4 ms [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [DiT] Self-attn: Q+K fused, V separate [DiT] Cross-attn: all separate @@ -9,36 +9,34 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 1061.2 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 151.9 ms +[Load] DiT weight load: 129.5 ms [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: Vulkan0 (CPU threads: 16) +[Load] VAE backend: Vulkan0 (shared) [VAE] Backend: Vulkan0, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 677.1 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 660.3 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 32.6 ms +[Load] BPE tokenizer: 30.7 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: 
Vulkan0 (CPU threads: 16) +[Load] TextEncoder backend: Vulkan0 (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 167.6 ms -[Encode] TextEncoder (70 tokens): 18.0 ms +[Load] TextEncoder: 142.0 ms +[Encode] TextEncoder (70 tokens): 17.4 ms [Debug] text_hidden: [70, 1024] first4: 3.705836 2.395382 0.221845 -13.145830 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 11.1 ms +[Encode] Lyric vocab lookup (167 tokens): 0.3 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: Vulkan0 (CPU threads: 16) +[Load] CondEncoder backend: Vulkan0 (shared) [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K fused, V separate @@ -48,18 +46,18 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 412.5 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 55.7 ms +[Load] ConditionEncoder: 50.1 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 17.4 ms, enc_S=238 +[Encode] ConditionEncoder: 3109.7 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.760480 -0.051691 -0.132144 0.058144 [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 73.2 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 14.2 ms +[Load] Detokenizer: 9.1 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) 
-[Context] Detokenizer: 176.8 ms +[Context] Detokenizer: 674.8 ms [Debug] detok_output: [2170, 64] first4: -0.125636 1.455599 0.291766 -0.651349 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -96,53 +94,50 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Debug] dit_step1_vt: [2170, 64] first4: -0.053368 1.748116 -0.894806 1.618408 [Debug] dit_step1_xt: [2170, 64] first4: 0.197534 2.006799 -0.135800 0.647723 [DiT] step 2/8 t=0.955 -[Debug] dit_step2_vt: [2170, 64] first4: -0.025024 1.326050 -0.792084 2.043884 -[Debug] dit_step2_xt: [2170, 64] first4: 0.199202 1.918396 -0.082994 0.511464 +[Debug] dit_step2_vt: [2170, 64] first4: -0.031860 1.378967 -0.801270 2.036987 +[Debug] dit_step2_xt: [2170, 64] first4: 0.199658 1.914868 -0.082382 0.511923 [DiT] step 3/8 t=0.900 -[Debug] dit_step3_vt: [2170, 64] first4: -0.000458 1.126770 -0.795593 2.254120 -[Debug] dit_step3_xt: [2170, 64] first4: 0.199240 1.824498 -0.016695 0.323620 +[Debug] dit_step3_vt: [2170, 64] first4: 0.009003 1.141663 -0.806183 2.229477 +[Debug] dit_step3_xt: [2170, 64] first4: 0.198908 1.819729 -0.015200 0.326134 [DiT] step 4/8 t=0.833 -[Debug] dit_step4_vt: [2170, 64] first4: 0.174652 1.253662 -1.125977 2.441956 -[Debug] dit_step4_xt: [2170, 64] first4: 0.180528 1.690177 0.103946 0.061982 +[Debug] dit_step4_vt: [2170, 64] first4: 0.174896 1.264160 -1.139648 2.439102 +[Debug] dit_step4_xt: [2170, 64] first4: 0.180169 1.684284 0.106905 0.064801 [DiT] step 5/8 t=0.750 -[Debug] dit_step5_vt: [2170, 64] first4: 0.205261 1.640076 -1.795410 2.452087 -[Debug] dit_step5_xt: [2170, 64] first4: 0.151205 1.455881 0.360433 -0.288316 +[Debug] dit_step5_vt: [2170, 64] first4: 0.201294 1.641151 -1.784760 2.454834 +[Debug] dit_step5_xt: [2170, 64] first4: 0.151413 1.449833 0.361871 -0.285889 [DiT] step 6/8 t=0.643 -[Debug] dit_step6_vt: [2170, 64] first4: -0.158905 1.750122 -2.412979 2.419128 -[Debug] 
dit_step6_xt: [2170, 64] first4: 0.182986 1.105856 0.843029 -0.772142 +[Debug] dit_step6_vt: [2170, 64] first4: -0.154907 1.748291 -2.434448 2.425964 +[Debug] dit_step6_xt: [2170, 64] first4: 0.182394 1.100175 0.848760 -0.771082 [DiT] step 7/8 t=0.500 -[Debug] dit_step7_vt: [2170, 64] first4: -0.636047 1.672760 -3.485062 2.600891 -[Debug] dit_x0: [2170, 64] first4: 0.373800 0.604028 1.888547 -1.552409 +[Debug] dit_step7_vt: [2170, 64] first4: -0.633545 1.687561 -3.500275 2.586243 +[Debug] dit_x0: [2170, 64] first4: 0.372458 0.593907 1.898843 -1.546955 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 269.9 ms (269.9 ms/sample) -[Debug] dit_output: [2170, 64] first4: 0.373800 0.604028 1.888547 -1.552409 +[DiT] Total generation: 354.9 ms (354.9 ms/sample) +[Debug] dit_output: [2170, 64] first4: 0.372458 0.593907 1.898843 -1.546955 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 9630.7 ms -[Debug] vae_audio: [2, 4166400] first4: 0.001367 0.001844 0.001533 0.001892 +[VAE Batch0] Decode: 1718.2 ms +[Debug] vae_audio: [2, 4166400] first4: 0.001432 0.001921 0.001585 0.001927 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:56:02.727 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:56:02.728 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. 
Install with: pip install lycoris-lora -2026-03-01 19:56:02.728 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 19:56:02.728 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:56:02.728 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:56:03.499 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:56:05.072 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:56:05.072 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:56:05.078 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:56:05.239 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:56:05.241 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:47:37.062 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:47:37.143 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. 
This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:47:38.480 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:47:38.481 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:47:38.482 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:47:38.488 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:47:38.703 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-04 21:47:38.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:56:05.241 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:56:05.241 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:56:05.241 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:47:38.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:47:38.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:47:38.705 | INFO 
| acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -156,8 +151,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:56:05.241 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:56:05.241 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:47:38.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:47:38.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -184,25 +179,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:56:05.241 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:47:38.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:56:05.247 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:56:05.260 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... 
-2026-03-01 19:56:05.260 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 19:56:05.285 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:56:05.592 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... -2026-03-01 19:56:05.593 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:56:05.593 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.00687718391418457, 'diffusion_time_cost': 0.3001282215118408, 'diffusion_per_step_time_cost': 0.0375160276889801, 'total_time_cost': 0.3070054054260254, 'offload_time_cost': 0.0} -2026-03-01 19:56:05.607 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:56:05.609 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:56:05.610 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.40 GB -2026-03-01 19:56:05.610 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:56:05.610 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.40 GB -2026-03-01 19:56:05.610 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.40 GB -2026-03-01 19:56:05.610 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:56:05.884 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:56:05.888 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:56:05.891 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:47:38.712 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:47:38.726 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:47:38.726 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:47:38.761 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:47:39.078 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:47:39.079 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:47:39.079 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006884098052978516, 'diffusion_time_cost': 0.3090353012084961, 'diffusion_per_step_time_cost': 0.03862941265106201, 'total_time_cost': 0.3159193992614746, 'offload_time_cost': 0.0} +2026-03-04 21:47:39.092 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:47:39.095 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:47:39.095 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.85 GB +2026-03-04 21:47:39.095 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:47:39.095 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.85 GB +2026-03-04 21:47:39.095 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.85 GB +2026-03-04 21:47:39.095 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:47:39.374 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:47:39.376 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:47:39.378 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q5_K_M.gguf [GGML] Running acestep-v15-turbo-Q5_K_M.gguf... 
@@ -234,26 +229,26 @@ Using precomputed LM hints dit_step0_xt 0.999650 dit_step1_vt 0.854589 dit_step1_xt 0.998725 - dit_step2_vt 0.841602 - dit_step2_xt 0.996217 - dit_step3_vt 0.832748 - dit_step3_xt 0.990342 - dit_step4_vt 0.826828 - dit_step4_xt 0.977304 - dit_step5_vt 0.815977 - dit_step5_xt 0.948497 - dit_step6_vt 0.803425 - dit_step6_xt 0.895308 - dit_step7_vt 0.770195 - dit_x0 0.820447 - vae_audio 0.478241 - vae_audio (STFT cosine) 0.753764 + dit_step2_vt 0.826891 + dit_step2_xt 0.996124 + dit_step3_vt 0.832715 + dit_step3_xt 0.990263 + dit_step4_vt 0.826558 + dit_step4_xt 0.977265 + dit_step5_vt 0.815705 + dit_step5_xt 0.948477 + dit_step6_vt 0.802898 + dit_step6_xt 0.895216 + dit_step7_vt 0.769793 + dit_x0 0.820156 + vae_audio 0.477357 + vae_audio (STFT cosine) 0.753154 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999650 0.235954 0.018872 -0.002255 0.973213 -0.002342 0.972003 dit_step1_xt 0.998725 0.437235 0.034677 -0.005176 0.942982 -0.005313 0.941730 - dit_step2_xt 0.996217 0.735376 0.057569 -0.009210 0.909169 -0.009311 0.908527 - dit_step3_xt 0.990342 1.115564 0.088544 -0.014811 0.872820 -0.014577 0.873624 - dit_step4_xt 0.977304 1.463506 0.131044 -0.022213 0.838526 -0.021660 0.841995 - dit_step5_xt 0.948497 2.208427 0.193557 -0.032833 0.817339 -0.032109 0.824593 - dit_step6_xt 0.895308 3.287671 0.286241 -0.047639 0.842369 -0.046482 0.855546 + dit_step2_xt 0.996124 0.735913 0.058267 -0.009379 0.909744 -0.009311 0.908527 + dit_step3_xt 0.990263 1.130236 0.088998 -0.014995 0.873310 -0.014577 0.873624 + dit_step4_xt 0.977265 1.457183 0.131253 -0.022419 0.838885 -0.021660 0.841995 + dit_step5_xt 0.948477 2.197404 0.193723 -0.033044 0.817537 -0.032109 0.824593 + dit_step6_xt 0.895216 3.271284 0.286472 -0.047848 0.842172 -0.046482 0.855546 diff --git a/tests/Vulkan-Q6_K.log b/tests/Vulkan-Q6_K.log index eff680f..a938da1 100644 --- a/tests/Vulkan-Q6_K.log +++ b/tests/Vulkan-Q6_K.log @@ -1,7 +1,7 @@ 
ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | uma: 0 | fp16: 1 | bf16: 0 | warp size: 32 | shared memory: 49152 | int dot: 1 | matrix cores: NV_coopmat2 [Load] DiT backend: Vulkan0 (CPU threads: 16) -[Load] Backend init: 114.2 ms +[Load] Backend init: 144.9 ms [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [DiT] Self-attn: Q+K+V fused [DiT] Cross-attn: Q+K+V fused @@ -9,36 +9,34 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 1237.2 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 181.3 ms +[Load] DiT weight load: 156.5 ms [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: Vulkan0 (CPU threads: 16) +[Load] VAE backend: Vulkan0 (shared) [VAE] Backend: Vulkan0, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 670.0 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 657.4 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 32.2 ms +[Load] BPE tokenizer: 32.6 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: Vulkan0 (CPU threads: 16) +[Load] TextEncoder backend: Vulkan0 (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: 
Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 165.9 ms -[Encode] TextEncoder (70 tokens): 17.6 ms +[Load] TextEncoder: 142.6 ms +[Encode] TextEncoder (70 tokens): 43.2 ms [Debug] text_hidden: [70, 1024] first4: 3.705836 2.395382 0.221845 -13.145830 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 11.2 ms +[Encode] Lyric vocab lookup (167 tokens): 0.3 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: Vulkan0 (CPU threads: 16) +[Load] CondEncoder backend: Vulkan0 (shared) [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K+V fused @@ -48,18 +46,18 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 476.3 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 61.6 ms +[Load] ConditionEncoder: 55.4 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 15.6 ms, enc_S=238 +[Encode] ConditionEncoder: 3621.4 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.761356 -0.050570 -0.133026 0.058500 [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 82.2 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 10.8 ms +[Load] Detokenizer: 10.5 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 143.8 ms +[Context] Detokenizer: 421.5 ms [Debug] detok_output: [2170, 64] first4: -0.141024 1.454365 0.315089 -0.623565 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] 
first4: 0.194336 2.156250 -0.171875 0.847656 @@ -79,70 +77,67 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Debug] proj_in_input: [192, 2170] first4: -0.141024 1.454365 0.315089 -0.623565 [Debug] enc_after_cond_emb: [2048, 238] first4: -0.170166 0.815842 0.310486 -0.571373 [Debug] layer0_sa_input: [2048, 1085] first4: -0.716080 -0.755969 -0.048350 0.263422 -[Debug] layer0_q_after_rope: [128, 16] first4: -2.400391 -0.081909 -0.397461 1.011719 -[Debug] layer0_k_after_rope: [128, 8] first4: -12.581572 1.117675 1.774897 1.788774 -[Debug] layer0_sa_output: [2048, 1085] first4: -1.503906 0.211304 -0.366943 0.520996 -[Debug] layer0_attn_out: [2048, 1085] first4: -1.540494 -1.050420 0.183235 0.461747 -[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.540494 -1.050420 0.183235 0.461747 -[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.586454 -0.808233 -0.324089 0.502214 -[Debug] hidden_after_layer0: [2048, 1085] first4: -9.155503 0.531986 51.823910 -0.865276 -[Debug] hidden_after_layer6: [2048, 1085] first4: -20.861578 -0.240065 34.589954 -4.288221 -[Debug] hidden_after_layer12: [2048, 1085] first4: -14.692959 -16.975090 77.250595 30.676491 -[Debug] hidden_after_layer18: [2048, 1085] first4: -28.082283 13.370504 64.661263 19.941170 -[Debug] hidden_after_layer23: [2048, 1085] first4: -16.195175 45.294254 196.766129 138.065048 -[Debug] dit_step0_vt: [2170, 64] first4: 0.098133 1.125458 0.338135 2.349396 -[Debug] dit_step0_xt: [2170, 64] first4: 0.189875 2.105093 -0.187245 0.740865 +[Debug] layer0_q_after_rope: [128, 16] first4: -1.967773 -0.181152 -0.292236 0.785156 +[Debug] layer0_k_after_rope: [128, 8] first4: -12.036863 1.158559 1.733423 1.789948 +[Debug] layer0_sa_output: [2048, 1085] first4: -1.344727 -0.353271 -0.171753 0.330078 +[Debug] layer0_attn_out: [2048, 1085] first4: -1.487266 -0.799756 0.373941 0.458040 +[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.487266 -0.799756 0.373941 0.458040 +[Debug] 
layer0_after_cross_attn: [2048, 1085] first4: -1.478606 -0.639722 0.069986 0.503358 +[Debug] hidden_after_layer0: [2048, 1085] first4: -6.179441 -0.194424 25.726625 -0.569950 +[Debug] hidden_after_layer6: [2048, 1085] first4: -12.978424 -2.696237 30.199980 -5.338717 +[Debug] hidden_after_layer12: [2048, 1085] first4: -13.710206 -8.286438 60.887405 36.884922 +[Debug] hidden_after_layer18: [2048, 1085] first4: -19.046274 10.102365 41.516960 14.606686 +[Debug] hidden_after_layer23: [2048, 1085] first4: 52.532547 37.219868 135.759094 151.323456 +[Debug] dit_step0_vt: [2170, 64] first4: 0.032410 0.877930 -0.200378 2.148727 +[Debug] dit_step0_xt: [2170, 64] first4: 0.192863 2.116344 -0.162767 0.749987 [DiT] step 1/8 t=1.000 -[Debug] dit_step1_vt: [2170, 64] first4: -0.018386 1.071533 -0.402077 1.814056 -[Debug] dit_step1_xt: [2170, 64] first4: 0.190878 2.046645 -0.165313 0.641917 +[Debug] dit_step1_vt: [2170, 64] first4: -0.018381 1.082458 -0.369057 1.835251 +[Debug] dit_step1_xt: [2170, 64] first4: 0.193865 2.057301 -0.142637 0.649882 [DiT] step 2/8 t=0.955 -[Debug] dit_step2_vt: [2170, 64] first4: -0.052032 1.017303 -0.201233 2.115219 -[Debug] dit_step2_xt: [2170, 64] first4: 0.194347 1.978825 -0.151898 0.500902 +[Debug] dit_step2_vt: [2170, 64] first4: -0.045654 1.004852 -0.202515 2.128693 +[Debug] dit_step2_xt: [2170, 64] first4: 0.196909 1.990311 -0.129136 0.507969 [DiT] step 3/8 t=0.900 -[Debug] dit_step3_vt: [2170, 64] first4: 0.052856 1.105988 0.072205 2.288803 -[Debug] dit_step3_xt: [2170, 64] first4: 0.189942 1.886660 -0.157915 0.310169 +[Debug] dit_step3_vt: [2170, 64] first4: 0.053986 1.098206 0.059753 2.273270 +[Debug] dit_step3_xt: [2170, 64] first4: 0.192410 1.898794 -0.134115 0.318530 [DiT] step 4/8 t=0.833 -[Debug] dit_step4_vt: [2170, 64] first4: 0.097982 1.134430 0.083038 2.362534 -[Debug] dit_step4_xt: [2170, 64] first4: 0.179444 1.765114 -0.166812 0.057040 +[Debug] dit_step4_vt: [2170, 64] first4: 0.059109 1.133232 0.098053 2.316540 +[Debug] 
dit_step4_xt: [2170, 64] first4: 0.186077 1.777376 -0.144621 0.070330 [DiT] step 5/8 t=0.750 -[Debug] dit_step5_vt: [2170, 64] first4: 0.122574 1.016464 0.173828 2.333248 -[Debug] dit_step5_xt: [2170, 64] first4: 0.161934 1.619904 -0.191644 -0.276281 +[Debug] dit_step5_vt: [2170, 64] first4: 0.060867 1.087685 0.153732 2.225224 +[Debug] dit_step5_xt: [2170, 64] first4: 0.177382 1.621992 -0.166582 -0.247560 [DiT] step 6/8 t=0.643 -[Debug] dit_step6_vt: [2170, 64] first4: 0.070358 0.866913 -0.005890 2.297897 -[Debug] dit_step6_xt: [2170, 64] first4: 0.147862 1.446522 -0.190466 -0.735860 +[Debug] dit_step6_vt: [2170, 64] first4: -0.040359 0.926651 0.010437 2.195786 +[Debug] dit_step6_xt: [2170, 64] first4: 0.185454 1.436662 -0.168670 -0.686717 [DiT] step 7/8 t=0.500 -[Debug] dit_step7_vt: [2170, 64] first4: -0.360962 0.376282 -0.314270 2.626526 -[Debug] dit_x0: [2170, 64] first4: 0.256151 1.333637 -0.096185 -1.523818 +[Debug] dit_step7_vt: [2170, 64] first4: -0.506134 0.486553 -0.233337 2.557739 +[Debug] dit_x0: [2170, 64] first4: 0.337294 1.290696 -0.098669 -1.454038 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 276.6 ms (276.6 ms/sample) -[Debug] dit_output: [2170, 64] first4: 0.256151 1.333637 -0.096185 -1.523818 +[DiT] Total generation: 336.6 ms (336.6 ms/sample) +[Debug] dit_output: [2170, 64] first4: 0.337294 1.290696 -0.098669 -1.454038 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 9723.7 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000254 0.000880 0.000782 0.001025 +[VAE Batch0] Decode: 1718.9 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000037 0.000692 0.000656 0.000941 [VAE 
Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:55:46.361 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:55:46.361 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:55:46.361 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 19:55:46.362 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:55:46.362 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:55:47.150 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:55:48.700 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:55:48.700 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:55:48.705 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:55:48.864 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:55:48.866 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:47:24.206 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. 
+2026-03-04 21:47:24.287 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:47:25.614 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:47:25.614 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:47:25.616 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:47:25.621 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:47:25.810 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... 
+2026-03-04 21:47:25.812 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:55:48.866 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:55:48.866 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:55:48.866 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:47:25.812 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:47:25.812 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:47:25.812 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -156,8 +151,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:55:48.866 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:55:48.866 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:47:25.812 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:47:25.812 | INFO | 
acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -184,25 +179,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:55:48.866 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:47:25.812 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:55:48.872 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:55:48.885 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:55:48.885 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 19:55:48.917 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:55:49.229 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... 
-2026-03-01 19:55:49.230 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:55:49.230 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006822347640991211, 'diffusion_time_cost': 0.3050048351287842, 'diffusion_per_step_time_cost': 0.03812560439109802, 'total_time_cost': 0.3118271827697754, 'offload_time_cost': 0.0} -2026-03-01 19:55:49.244 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:55:49.267 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:55:49.267 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB -2026-03-01 19:55:49.267 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:55:49.267 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB -2026-03-01 19:55:49.267 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB -2026-03-01 19:55:49.267 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:55:49.543 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:55:49.546 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:55:49.549 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:47:25.819 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:47:25.833 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:47:25.833 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:47:25.868 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:47:26.184 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:47:26.184 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:47:26.185 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006891727447509766, 'diffusion_time_cost': 0.3077425956726074, 'diffusion_per_step_time_cost': 0.03846782445907593, 'total_time_cost': 0.3146343231201172, 'offload_time_cost': 0.0} +2026-03-04 21:47:26.198 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:47:26.201 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:47:26.201 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.85 GB +2026-03-04 21:47:26.201 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:47:26.201 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.85 GB +2026-03-04 21:47:26.201 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.85 GB +2026-03-04 21:47:26.201 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:47:26.477 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:47:26.479 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:47:26.481 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q6_K.gguf [GGML] Running acestep-v15-turbo-Q6_K.gguf... 
@@ -224,36 +219,36 @@ Using precomputed LM hints temb_t 0.999990 hidden_after_proj_in 0.999982 enc_after_cond_emb 0.999691 - layer0_sa_output 0.999774 - hidden_after_layer0 0.999710 - hidden_after_layer6 0.999855 - hidden_after_layer12 0.998856 - hidden_after_layer18 0.995803 - hidden_after_layer23 0.992072 - dit_step0_vt 0.970064 - dit_step0_xt 0.999934 - dit_step1_vt 0.924403 - dit_step1_xt 0.999650 - dit_step2_vt 0.915580 - dit_step2_xt 0.998651 - dit_step3_vt 0.914431 - dit_step3_xt 0.996098 - dit_step4_vt 0.913750 - dit_step4_xt 0.990344 - dit_step5_vt 0.906205 - dit_step5_xt 0.976856 - dit_step6_vt 0.897054 - dit_step6_xt 0.950943 - dit_step7_vt 0.876737 - dit_x0 0.912738 - vae_audio 0.744947 - vae_audio (STFT cosine) 0.875717 + layer0_sa_output 0.916347 + hidden_after_layer0 0.997124 + hidden_after_layer6 0.993692 + hidden_after_layer12 0.992958 + hidden_after_layer18 0.988620 + hidden_after_layer23 0.980873 + dit_step0_vt 0.928387 + dit_step0_xt 0.999844 + dit_step1_vt 0.919122 + dit_step1_xt 0.999441 + dit_step2_vt 0.904200 + dit_step2_xt 0.998155 + dit_step3_vt 0.897635 + dit_step3_xt 0.994890 + dit_step4_vt 0.891638 + dit_step4_xt 0.987300 + dit_step5_vt 0.886907 + dit_step5_xt 0.970219 + dit_step6_vt 0.876538 + dit_step6_xt 0.938117 + dit_step7_vt 0.853291 + dit_x0 0.891872 + vae_audio 0.694699 + vae_audio (STFT cosine) 0.858167 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999934 0.147239 0.007394 -0.002260 0.973056 -0.002342 0.972003 - dit_step1_xt 0.999650 0.408757 0.017759 -0.005276 0.943557 -0.005313 0.941730 - dit_step2_xt 0.998651 0.803721 0.033644 -0.009510 0.911087 -0.009311 0.908527 - dit_step3_xt 0.996098 1.476888 0.054660 -0.015226 0.876460 -0.014577 0.873624 - dit_step4_xt 0.990344 2.294700 0.082632 -0.022702 0.844225 -0.021660 0.841995 - dit_step5_xt 0.976856 3.284146 0.125042 -0.033545 0.825286 -0.032109 0.824593 - dit_step6_xt 0.950943 4.445529 0.188707 -0.049081 0.851111 
-0.046482 0.855546 + dit_step0_xt 0.999844 0.420509 0.012210 -0.002227 0.973206 -0.002342 0.972003 + dit_step1_xt 0.999441 0.819075 0.022719 -0.005232 0.943799 -0.005313 0.941730 + dit_step2_xt 0.998155 1.085687 0.039812 -0.009404 0.911549 -0.009311 0.908527 + dit_step3_xt 0.994890 1.743559 0.063467 -0.015082 0.877147 -0.014577 0.873624 + dit_step4_xt 0.987300 2.546782 0.096584 -0.022664 0.845277 -0.021660 0.841995 + dit_step5_xt 0.970219 3.539635 0.144911 -0.033717 0.826728 -0.032109 0.824593 + dit_step6_xt 0.938117 4.795851 0.216607 -0.049484 0.852836 -0.046482 0.855546 diff --git a/tests/Vulkan-Q8_0.log b/tests/Vulkan-Q8_0.log index 774bc8a..ef53667 100644 --- a/tests/Vulkan-Q8_0.log +++ b/tests/Vulkan-Q8_0.log @@ -1,7 +1,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | uma: 0 | fp16: 1 | bf16: 0 | warp size: 32 | shared memory: 49152 | int dot: 1 | matrix cores: NV_coopmat2 [Load] DiT backend: Vulkan0 (CPU threads: 16) -[Load] Backend init: 113.5 ms +[Load] Backend init: 111.5 ms [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [DiT] Self-attn: Q+K+V fused [DiT] Cross-attn: Q+K+V fused @@ -9,36 +9,34 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 1600.7 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 214.1 ms +[Load] DiT weight load: 194.1 ms [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: Vulkan0 (CPU threads: 16) +[Load] VAE backend: Vulkan0 (shared) [VAE] Backend: Vulkan0, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 671.7 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations 
+[Load] VAE weights: 657.9 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 31.9 ms +[Load] BPE tokenizer: 31.4 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: Vulkan0 (CPU threads: 16) +[Load] TextEncoder backend: Vulkan0 (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 176.0 ms +[Load] TextEncoder: 145.4 ms [Encode] TextEncoder (70 tokens): 17.6 ms [Debug] text_hidden: [70, 1024] first4: 3.705836 2.395382 0.221845 -13.145830 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 11.2 ms +[Encode] Lyric vocab lookup (167 tokens): 0.3 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: Vulkan0 (CPU threads: 16) +[Load] CondEncoder backend: Vulkan0 (shared) [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K+V fused @@ -48,18 +46,18 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 616.6 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 84.7 ms +[Load] ConditionEncoder: 75.8 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens 
-[Encode] ConditionEncoder: 19.4 ms, enc_S=238 +[Encode] ConditionEncoder: 5074.3 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.759194 -0.049729 -0.133332 0.058435 [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 106.5 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 15.5 ms +[Load] Detokenizer: 13.7 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 85.1 ms +[Context] Detokenizer: 437.6 ms [Debug] detok_output: [2170, 64] first4: -0.121505 1.434749 0.303808 -0.627535 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -79,70 +77,67 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Debug] proj_in_input: [192, 2170] first4: -0.121505 1.434749 0.303808 -0.627535 [Debug] enc_after_cond_emb: [2048, 238] first4: -0.169861 0.817307 0.328308 -0.558397 [Debug] layer0_sa_input: [2048, 1085] first4: -0.718007 -0.757392 -0.047301 0.261071 -[Debug] layer0_q_after_rope: [128, 16] first4: -2.423828 -0.099304 -0.408203 1.004883 -[Debug] layer0_k_after_rope: [128, 8] first4: -12.718538 1.122484 1.774887 1.790079 -[Debug] layer0_sa_output: [2048, 1085] first4: -1.510742 0.165771 -0.347900 0.511230 -[Debug] layer0_attn_out: [2048, 1085] first4: -1.542524 -1.031132 0.196691 0.455273 -[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.542524 -1.031132 0.196691 0.455273 -[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.585310 -0.791508 -0.290125 0.495190 -[Debug] hidden_after_layer0: [2048, 1085] first4: -8.926053 0.558007 51.172398 -0.877717 -[Debug] hidden_after_layer6: [2048, 1085] first4: -20.768745 -0.272222 34.170349 -4.416629 -[Debug] hidden_after_layer12: [2048, 1085] first4: -14.358247 -18.625305 73.571915 30.079784 -[Debug] hidden_after_layer18: [2048, 1085] first4: -26.789474 14.346137 62.040115 
19.708126 -[Debug] hidden_after_layer23: [2048, 1085] first4: -2.927731 38.887718 192.805542 144.255524 -[Debug] dit_step0_vt: [2170, 64] first4: 0.027340 1.115875 0.350609 2.345856 -[Debug] dit_step0_xt: [2170, 64] first4: 0.193093 2.105528 -0.187812 0.741026 +[Debug] layer0_q_after_rope: [128, 16] first4: -2.134766 -0.300049 -0.404541 0.904297 +[Debug] layer0_k_after_rope: [128, 8] first4: -12.349133 1.249128 1.744302 1.794822 +[Debug] layer0_sa_output: [2048, 1085] first4: -0.636230 -0.405029 0.096436 0.194946 +[Debug] layer0_attn_out: [2048, 1085] first4: -1.250806 -0.777872 0.630535 0.449394 +[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.250806 -0.777872 0.630535 0.449394 +[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.239698 -0.683206 0.416180 0.509788 +[Debug] hidden_after_layer0: [2048, 1085] first4: -4.682029 -0.464333 15.184165 -0.212429 +[Debug] hidden_after_layer6: [2048, 1085] first4: -8.053159 0.591622 20.595821 -6.469027 +[Debug] hidden_after_layer12: [2048, 1085] first4: -11.836857 -8.197025 41.079239 30.392553 +[Debug] hidden_after_layer18: [2048, 1085] first4: -20.004263 1.558971 15.575721 16.331001 +[Debug] hidden_after_layer23: [2048, 1085] first4: 23.482555 18.593208 82.512901 173.016068 +[Debug] dit_step0_vt: [2170, 64] first4: 0.084528 0.834541 -0.408783 2.115417 +[Debug] dit_step0_xt: [2170, 64] first4: 0.190494 2.118316 -0.153294 0.751501 [DiT] step 1/8 t=1.000 -[Debug] dit_step1_vt: [2170, 64] first4: 0.002377 1.005737 -0.352661 1.768188 -[Debug] dit_step1_xt: [2170, 64] first4: 0.192964 2.050670 -0.168576 0.644580 +[Debug] dit_step1_vt: [2170, 64] first4: -0.071388 1.041626 -0.270477 1.704315 +[Debug] dit_step1_xt: [2170, 64] first4: 0.194388 2.061500 -0.138541 0.658538 [DiT] step 2/8 t=0.955 -[Debug] dit_step2_vt: [2170, 64] first4: -0.063080 1.061218 -0.344177 1.926041 -[Debug] dit_step2_xt: [2170, 64] first4: 0.197169 1.979922 -0.145631 0.516177 +[Debug] dit_step2_vt: [2170, 64] first4: -0.071960 1.095016 -0.333557 
1.988541 +[Debug] dit_step2_xt: [2170, 64] first4: 0.199185 1.988499 -0.116304 0.525969 [DiT] step 3/8 t=0.900 -[Debug] dit_step3_vt: [2170, 64] first4: -0.072388 1.144592 -0.184326 2.069214 -[Debug] dit_step3_xt: [2170, 64] first4: 0.203201 1.884539 -0.130270 0.343743 +[Debug] dit_step3_vt: [2170, 64] first4: -0.037468 1.148598 -0.165955 2.091240 +[Debug] dit_step3_xt: [2170, 64] first4: 0.202307 1.892783 -0.102474 0.351699 [DiT] step 4/8 t=0.833 -[Debug] dit_step4_vt: [2170, 64] first4: 0.004288 1.147110 0.001495 2.068916 -[Debug] dit_step4_xt: [2170, 64] first4: 0.202742 1.761635 -0.130430 0.122073 +[Debug] dit_step4_vt: [2170, 64] first4: 0.014343 1.134537 -0.033691 2.114731 +[Debug] dit_step4_xt: [2170, 64] first4: 0.200771 1.771225 -0.098864 0.125120 [DiT] step 5/8 t=0.750 -[Debug] dit_step5_vt: [2170, 64] first4: 0.070211 1.173462 0.080673 2.086014 -[Debug] dit_step5_xt: [2170, 64] first4: 0.192712 1.593997 -0.141955 -0.175929 +[Debug] dit_step5_vt: [2170, 64] first4: 0.064150 1.159027 0.062057 2.121386 +[Debug] dit_step5_xt: [2170, 64] first4: 0.191606 1.605650 -0.107730 -0.177935 [DiT] step 6/8 t=0.643 -[Debug] dit_step6_vt: [2170, 64] first4: -0.010117 1.145203 0.186996 2.198898 -[Debug] dit_step6_xt: [2170, 64] first4: 0.194735 1.364957 -0.179354 -0.615709 +[Debug] dit_step6_vt: [2170, 64] first4: -0.041473 1.200439 0.198494 2.240326 +[Debug] dit_step6_xt: [2170, 64] first4: 0.199901 1.365562 -0.147428 -0.626000 [DiT] step 7/8 t=0.500 -[Debug] dit_step7_vt: [2170, 64] first4: -0.244629 0.644890 0.358635 2.446594 -[Debug] dit_x0: [2170, 64] first4: 0.268124 1.171490 -0.286945 -1.349687 +[Debug] dit_step7_vt: [2170, 64] first4: -0.309998 0.692413 0.432823 2.469238 +[Debug] dit_x0: [2170, 64] first4: 0.292900 1.157838 -0.277275 -1.366771 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 252.0 ms (252.0 ms/sample) -[Debug] dit_output: [2170, 64] first4: 0.268124 1.171490 -0.286945 -1.349687 +[DiT] Total generation: 335.0 ms (335.0 ms/sample) +[Debug] 
dit_output: [2170, 64] first4: 0.292900 1.157838 -0.277275 -1.366771 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 9843.4 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000170 0.000825 0.000784 0.001115 +[VAE Batch0] Decode: 1706.4 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000160 0.000739 0.000691 0.001054 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:55:29.948 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:55:29.948 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:55:29.948 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 19:55:29.948 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:55:29.948 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:55:30.699 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:55:32.273 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... 
-2026-03-01 19:55:32.274 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:55:32.279 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:55:32.442 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:55:32.443 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:47:11.115 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:47:11.205 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:47:12.506 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:47:12.506 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:47:12.508 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:47:12.513 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:47:12.703 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... 
+2026-03-04 21:47:12.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:55:32.443 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:55:32.443 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:55:32.444 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:47:12.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:47:12.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:47:12.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -156,8 +151,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:55:32.444 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:55:32.444 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:47:12.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:47:12.705 | INFO | 
acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -184,25 +179,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:55:32.444 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:47:12.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:55:32.450 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:55:32.462 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:55:32.463 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 19:55:32.484 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:55:32.791 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... 
-2026-03-01 19:55:32.791 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:55:32.791 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006818294525146484, 'diffusion_time_cost': 0.2995321750640869, 'diffusion_per_step_time_cost': 0.037441521883010864, 'total_time_cost': 0.3063504695892334, 'offload_time_cost': 0.0} -2026-03-01 19:55:32.806 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:55:32.808 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:55:32.808 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.40 GB -2026-03-01 19:55:32.808 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:55:32.808 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.40 GB -2026-03-01 19:55:32.808 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.40 GB -2026-03-01 19:55:32.808 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:55:33.083 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:55:33.084 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:55:33.088 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:47:12.713 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:47:12.727 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:47:12.727 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:47:12.758 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:47:13.073 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:47:13.073 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:47:13.073 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006865262985229492, 'diffusion_time_cost': 0.30722999572753906, 'diffusion_per_step_time_cost': 0.03840374946594238, 'total_time_cost': 0.31409525871276855, 'offload_time_cost': 0.0} +2026-03-04 21:47:13.087 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:47:13.096 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:47:13.096 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.82 GB +2026-03-04 21:47:13.096 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:47:13.096 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.82 GB +2026-03-04 21:47:13.096 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.82 GB +2026-03-04 21:47:13.096 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:47:13.370 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:47:13.372 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:47:13.374 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q8_0.gguf [GGML] Running acestep-v15-turbo-Q8_0.gguf... 
@@ -224,36 +219,36 @@ Using precomputed LM hints temb_t 0.999998 hidden_after_proj_in 0.999985 enc_after_cond_emb 0.999817 - layer0_sa_output 0.999939 - hidden_after_layer0 0.999858 - hidden_after_layer6 0.999893 - hidden_after_layer12 0.999124 - hidden_after_layer18 0.996403 - hidden_after_layer23 0.993183 - dit_step0_vt 0.973885 - dit_step0_xt 0.999943 - dit_step1_vt 0.915468 - dit_step1_xt 0.999633 - dit_step2_vt 0.912211 - dit_step2_xt 0.998544 - dit_step3_vt 0.912707 - dit_step3_xt 0.995860 - dit_step4_vt 0.906019 - dit_step4_xt 0.989505 - dit_step5_vt 0.896537 - dit_step5_xt 0.974659 - dit_step6_vt 0.886047 - dit_step6_xt 0.945866 - dit_step7_vt 0.869793 - dit_x0 0.905017 - vae_audio 0.746037 - vae_audio (STFT cosine) 0.898352 + layer0_sa_output 0.896665 + hidden_after_layer0 0.996506 + hidden_after_layer6 0.988924 + hidden_after_layer12 0.986595 + hidden_after_layer18 0.980435 + hidden_after_layer23 0.969958 + dit_step0_vt 0.880150 + dit_step0_xt 0.999739 + dit_step1_vt 0.904993 + dit_step1_xt 0.999178 + dit_step2_vt 0.897232 + dit_step2_xt 0.997639 + dit_step3_vt 0.896753 + dit_step3_xt 0.994077 + dit_step4_vt 0.889861 + dit_step4_xt 0.986004 + dit_step5_vt 0.878511 + dit_step5_xt 0.967661 + dit_step6_vt 0.867605 + dit_step6_xt 0.933014 + dit_step7_vt 0.848412 + dit_x0 0.884572 + vae_audio 0.692036 + vae_audio (STFT cosine) 0.882942 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999943 0.140034 0.006943 -0.002318 0.973036 -0.002342 0.972003 - dit_step1_xt 0.999633 0.423125 0.018056 -0.005257 0.943026 -0.005313 0.941730 - dit_step2_xt 0.998544 0.841908 0.034537 -0.009209 0.910286 -0.009311 0.908527 - dit_step3_xt 0.995860 1.521911 0.055719 -0.014626 0.875169 -0.014577 0.873624 - dit_step4_xt 0.989505 2.346452 0.085477 -0.021803 0.842334 -0.021660 0.841995 - dit_step5_xt 0.974659 3.387389 0.130921 -0.032225 0.822365 -0.032109 0.824593 - dit_step6_xt 0.945866 4.812943 0.199910 -0.047290 0.846751 
-0.046482 0.855546 + dit_step0_xt 0.999739 0.400727 0.016274 -0.002102 0.972847 -0.002342 0.972003 + dit_step1_xt 0.999178 0.814308 0.027485 -0.004968 0.942952 -0.005313 0.941730 + dit_step2_xt 0.997639 1.101152 0.044575 -0.008840 0.910138 -0.009311 0.908527 + dit_step3_xt 0.994077 1.762341 0.067497 -0.014170 0.875003 -0.014577 0.873624 + dit_step4_xt 0.986004 2.565164 0.099802 -0.021228 0.842166 -0.021660 0.841995 + dit_step5_xt 0.967661 3.593323 0.149360 -0.031486 0.822043 -0.032109 0.824593 + dit_step6_xt 0.933014 4.978329 0.224230 -0.046337 0.845793 -0.046482 0.855546 diff --git a/tools/ace-qwen3.cpp b/tools/ace-qwen3.cpp index fbfd049..9605e2a 100644 --- a/tools/ace-qwen3.cpp +++ b/tools/ace-qwen3.cpp @@ -691,8 +691,7 @@ int main(int argc, char ** argv) { "# Instruction\n" "Expand the user's input into a more detailed" " and specific musical description:\n"; - std::string user_msg = ace.caption + "\n\ninstrumental: " - + std::string(req.instrumental ? "true" : "false"); + std::string user_msg = ace.caption; prompt = build_custom_prompt(bpe, sys, user_msg.c_str()); } else { prompt = build_lm_prompt(bpe, ace); diff --git a/tools/dit-vae.cpp b/tools/dit-vae.cpp index ac50e9f..8893f4b 100644 --- a/tools/dit-vae.cpp +++ b/tools/dit-vae.cpp @@ -6,13 +6,10 @@ #include #include #include -#include #include #include #include "philox.h" -#include "ggml.h" -#include "ggml-backend.h" #include "dit-sampler.h" #include "vae.h" #include "qwen3-enc.h" @@ -241,10 +238,12 @@ int main(int argc, char ** argv) { float duration = req.duration > 0 ? req.duration : 30.0f; long long seed = req.seed; int num_steps = req.inference_steps > 0 ? req.inference_steps : 8; - float guidance_scale = req.guidance_scale > 0 ? req.guidance_scale : 7.0f; + float guidance_scale = req.guidance_scale; float shift = req.shift > 0 ? req.shift : 1.0f; - if (is_turbo && guidance_scale > 1.0f) { + if (guidance_scale <= 0.0f) + guidance_scale = is_turbo ? 
1.0f : 7.0f; + else if (is_turbo && guidance_scale > 1.0f) { fprintf(stderr, "[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was %.1f)\n", guidance_scale); guidance_scale = 1.0f; diff --git a/tools/neural-codec.cpp b/tools/neural-codec.cpp new file mode 100644 index 0000000..e75f3c3 --- /dev/null +++ b/tools/neural-codec.cpp @@ -0,0 +1,522 @@ +// neural-codec.cpp: neural audio codec (Oobleck VAE) +// +// 2 modes: +// encode: WAV -> latent file (f32, Q8, or Q4) +// decode: latent file -> WAV (48kHz stereo) +// +// Three latent formats, decode auto-detects: +// +// f32 (default): flat [T, 64] f32, no header. +// T = file_size / 256. 25Hz, ~6.4 KB/s, ~51 kbit/s. +// +// Q8 (--q8): symmetric per-frame int8 quantization. +// header: "NAC8" magic (4B) + uint32 T_latent (4B) +// frame: f16 scale (2B) + int8[64] (64B) = 66B +// 25Hz, ~1.6 KB/s, ~13 kbit/s. +// +// Q4 (--q4): symmetric per-frame 4-bit quantization. +// header: "NAC4" magic (4B) + uint32 T_latent (4B) +// frame: f16 scale (2B) + nibbles[32] (32B) = 34B +// 25Hz, ~850 B/s, ~6.8 kbit/s. +// +// Usage: +// neural-codec --vae model.gguf --encode -i song.wav -o song.latent +// neural-codec --vae model.gguf --encode --q8 -i song.wav -o song.nac8 +// neural-codec --vae model.gguf --encode --q4 -i song.wav -o song.nac4 +// neural-codec --vae model.gguf --decode -i song.nac4 -o song.wav + +#include "vae.h" +#include "vae-enc.h" +#include +#include +#include +#include +#include + +// Minimal WAV reader: 16-bit PCM or 32-bit float, mono/stereo, any sample rate. +// Returns interleaved float [T, 2]. Sets *T_audio, *sr. Caller frees. 
+static float * read_wav(const char * path, int * T_audio, int * sr) { + FILE * f = fopen(path, "rb"); + if (!f) { fprintf(stderr, "[WAV] Cannot open %s\n", path); return NULL; } + + char riff[4]; fread(riff, 1, 4, f); + if (memcmp(riff, "RIFF", 4) != 0) { + fprintf(stderr, "[WAV] Not a RIFF file: %s\n", path); fclose(f); return NULL; + } + fseek(f, 4, SEEK_CUR); + char wave[4]; fread(wave, 1, 4, f); + if (memcmp(wave, "WAVE", 4) != 0) { + fprintf(stderr, "[WAV] Not a WAVE file: %s\n", path); fclose(f); return NULL; + } + + int n_channels = 0, sample_rate = 0, bits_per_sample = 0; + short audio_format = 0; + float * audio = NULL; + int n_samples = 0; + + while (!feof(f)) { + char chunk_id[4]; + int chunk_size; + if (fread(chunk_id, 1, 4, f) != 4) break; + if (fread(&chunk_size, 4, 1, f) != 1) break; + + if (memcmp(chunk_id, "fmt ", 4) == 0) { + fread(&audio_format, 2, 1, f); + short nc; fread(&nc, 2, 1, f); n_channels = nc; + fread(&sample_rate, 4, 1, f); + fseek(f, 4, SEEK_CUR); // byte_rate + fseek(f, 2, SEEK_CUR); // block_align + short bps; fread(&bps, 2, 1, f); bits_per_sample = bps; + int consumed = 16; + if (chunk_size > consumed) fseek(f, chunk_size - consumed, SEEK_CUR); + + } else if (memcmp(chunk_id, "data", 4) == 0 && n_channels > 0) { + if (audio_format == 1 && bits_per_sample == 16) { + n_samples = chunk_size / (n_channels * 2); + audio = (float *)malloc((size_t)n_samples * 2 * sizeof(float)); + std::vector buf((size_t)n_samples * n_channels); + fread(buf.data(), 2, (size_t)n_samples * n_channels, f); + for (int t = 0; t < n_samples; t++) { + if (n_channels == 1) { + float s = (float)buf[t] / 32768.0f; + audio[t * 2 + 0] = s; + audio[t * 2 + 1] = s; + } else { + audio[t * 2 + 0] = (float)buf[t * n_channels + 0] / 32768.0f; + audio[t * 2 + 1] = (float)buf[t * n_channels + 1] / 32768.0f; + } + } + } else if (audio_format == 3 && bits_per_sample == 32) { + n_samples = chunk_size / (n_channels * 4); + audio = (float *)malloc((size_t)n_samples * 2 * 
sizeof(float)); + std::vector buf((size_t)n_samples * n_channels); + fread(buf.data(), 4, (size_t)n_samples * n_channels, f); + for (int t = 0; t < n_samples; t++) { + if (n_channels == 1) { + audio[t * 2 + 0] = buf[t]; + audio[t * 2 + 1] = buf[t]; + } else { + audio[t * 2 + 0] = buf[t * n_channels + 0]; + audio[t * 2 + 1] = buf[t * n_channels + 1]; + } + } + } else { + fprintf(stderr, "[WAV] Unsupported: format=%d bits=%d (need PCM16 or float32)\n", + audio_format, bits_per_sample); + fclose(f); return NULL; + } + break; + } else { + fseek(f, chunk_size, SEEK_CUR); + } + } + fclose(f); + if (!audio) { fprintf(stderr, "[WAV] No audio data in %s\n", path); return NULL; } + + *T_audio = n_samples; + *sr = sample_rate; + fprintf(stderr, "[WAV] Read %s: %d samples, %d Hz, %d ch, %d bit\n", + path, n_samples, sample_rate, n_channels, bits_per_sample); + return audio; +} + +// WAV writer: planar [ch0: T, ch1: T] -> 16-bit PCM stereo +static bool write_wav(const char * path, const float * audio, int T_audio, int sr) { + FILE * f = fopen(path, "wb"); + if (!f) return false; + int n_channels = 2, bits = 16; + int byte_rate = sr * n_channels * (bits / 8); + int block_align = n_channels * (bits / 8); + int data_size = T_audio * n_channels * (bits / 8); + int file_size = 36 + data_size; + fwrite("RIFF", 1, 4, f); + fwrite(&file_size, 4, 1, f); + fwrite("WAVE", 1, 4, f); + fwrite("fmt ", 1, 4, f); + int fmt_size = 16; fwrite(&fmt_size, 4, 1, f); + short audio_fmt = 1; fwrite(&audio_fmt, 2, 1, f); + short nc = (short)n_channels; fwrite(&nc, 2, 1, f); + fwrite(&sr, 4, 1, f); + fwrite(&byte_rate, 4, 1, f); + short ba = (short)block_align; fwrite(&ba, 2, 1, f); + short bp = (short)bits; fwrite(&bp, 2, 1, f); + fwrite("data", 1, 4, f); + fwrite(&data_size, 4, 1, f); + for (int t = 0; t < T_audio; t++) { + for (int c = 0; c < 2; c++) { + float s = audio[c * T_audio + t]; + s = s < -1.0f ? -1.0f : (s > 1.0f ? 
1.0f : s); + short v = (short)(s * 32767.0f); + fwrite(&v, 2, 1, f); + } + } + fclose(f); + return true; +} + +// Q8 format constants +static const char NAC8_MAGIC[4] = {'N', 'A', 'C', '8'}; +static const int NAC8_HEADER = 8; // 4B magic + 4B T_latent +static const int NAC8_FRAME = 66; // 2B f16 scale + 64B int8 + +// Write Q8 quantized latent +static bool write_latent_q8(const char * path, const float * data, int T_latent) { + FILE * f = fopen(path, "wb"); + if (!f) return false; + + fwrite(NAC8_MAGIC, 1, 4, f); + uint32_t t = (uint32_t)T_latent; + fwrite(&t, 4, 1, f); + + for (int i = 0; i < T_latent; i++) { + const float * frame = data + i * 64; + + // find max abs for symmetric quant + float amax = 0.0f; + for (int j = 0; j < 64; j++) { + float a = fabsf(frame[j]); + if (a > amax) amax = a; + } + float scale = amax / 127.0f; + ggml_fp16_t scale_f16 = ggml_fp32_to_fp16(scale); + fwrite(&scale_f16, 2, 1, f); + + // quantize + int8_t q[64]; + float inv = (scale > 0.0f) ? 127.0f / amax : 0.0f; + for (int j = 0; j < 64; j++) { + int v = (int)roundf(frame[j] * inv); + q[j] = (int8_t)(v < -127 ? -127 : (v > 127 ? 
127 : v)); + } + fwrite(q, 1, 64, f); + } + fclose(f); + + size_t bytes = NAC8_HEADER + (size_t)T_latent * NAC8_FRAME; + float duration = (float)T_latent * 1920.0f / 48000.0f; + float kbps = (float)bytes * 8.0f / (duration * 1000.0f); + fprintf(stderr, "[Latent] Wrote %s: Q8, %d frames (%.2fs, %.1f KB, %.1f kbit/s)\n", + path, T_latent, duration, (float)bytes / 1024.0f, kbps); + return true; +} + +// Q4 format constants +static const char NAC4_MAGIC[4] = {'N', 'A', 'C', '4'}; +static const int NAC4_HEADER = 8; // 4B magic + 4B T_latent +static const int NAC4_FRAME = 34; // 2B f16 scale + 32B packed nibbles + +// Write Q4 quantized latent +// Symmetric 4-bit: range [-7, 7], scale = amax / 7.0 +// Packing: byte = (low & 0x0F) | (high << 4), two signed nibbles per byte +static bool write_latent_q4(const char * path, const float * data, int T_latent) { + FILE * f = fopen(path, "wb"); + if (!f) return false; + + fwrite(NAC4_MAGIC, 1, 4, f); + uint32_t t = (uint32_t)T_latent; + fwrite(&t, 4, 1, f); + + for (int i = 0; i < T_latent; i++) { + const float * frame = data + i * 64; + + // find max abs for symmetric quant + float amax = 0.0f; + for (int j = 0; j < 64; j++) { + float a = fabsf(frame[j]); + if (a > amax) amax = a; + } + float scale = amax / 7.0f; + ggml_fp16_t scale_f16 = ggml_fp32_to_fp16(scale); + fwrite(&scale_f16, 2, 1, f); + + // quantize and pack pairs into bytes + float inv = (scale > 0.0f) ? 7.0f / amax : 0.0f; + uint8_t packed[32]; + for (int j = 0; j < 32; j++) { + int lo = (int)roundf(frame[j * 2 + 0] * inv); + int hi = (int)roundf(frame[j * 2 + 1] * inv); + lo = lo < -7 ? -7 : (lo > 7 ? 7 : lo); + hi = hi < -7 ? -7 : (hi > 7 ? 
7 : hi); + packed[j] = (uint8_t)((lo & 0x0F) | (hi << 4)); + } + fwrite(packed, 1, 32, f); + } + fclose(f); + + size_t bytes = NAC4_HEADER + (size_t)T_latent * NAC4_FRAME; + float duration = (float)T_latent * 1920.0f / 48000.0f; + float kbps = (float)bytes * 8.0f / (duration * 1000.0f); + fprintf(stderr, "[Latent] Wrote %s: Q4, %d frames (%.2fs, %.1f KB, %.1f kbit/s)\n", + path, T_latent, duration, (float)bytes / 1024.0f, kbps); + return true; +} + +// Write f32 raw latent (no header) +static bool write_latent_f32(const char * path, const float * data, int T_latent) { + FILE * f = fopen(path, "wb"); + if (!f) return false; + size_t bytes = (size_t)T_latent * 64 * sizeof(float); + fwrite(data, 1, bytes, f); + fclose(f); + float duration = (float)T_latent * 1920.0f / 48000.0f; + fprintf(stderr, "[Latent] Wrote %s: f32, %d frames (%.2fs, %.1f KB, %.1f kbit/s)\n", + path, T_latent, duration, (float)bytes / 1024.0f, + (float)bytes * 8.0f / (duration * 1000.0f)); + return true; +} + +// Read latent, auto-detect format (NAC8 -> Q8, NAC4 -> Q4, else f32). +// Returns [T_latent, 64] f32 (dequantized if quantized). Caller frees. 
+static float * read_latent(const char * path, int * T_latent) { + FILE * f = fopen(path, "rb"); + if (!f) { fprintf(stderr, "[Latent] Cannot open %s\n", path); return NULL; } + fseek(f, 0, SEEK_END); + long fsize = ftell(f); + fseek(f, 0, SEEK_SET); + + // Check magic + char magic[4] = {}; + if (fsize >= 8) fread(magic, 1, 4, f); + + if (memcmp(magic, NAC8_MAGIC, 4) == 0) { + // Q8 format + uint32_t t; + fread(&t, 4, 1, f); + *T_latent = (int)t; + + long expected = NAC8_HEADER + (long)t * NAC8_FRAME; + if (fsize != expected) { + fprintf(stderr, "[Latent] Q8 size mismatch: expected %ld, got %ld\n", expected, fsize); + fclose(f); return NULL; + } + + float * data = (float *)malloc((size_t)t * 64 * sizeof(float)); + for (int i = 0; i < (int)t; i++) { + ggml_fp16_t scale_f16; + fread(&scale_f16, 2, 1, f); + float scale = ggml_fp16_to_fp32(scale_f16); + + int8_t q[64]; + fread(q, 1, 64, f); + + float * frame = data + i * 64; + for (int j = 0; j < 64; j++) + frame[j] = (float)q[j] * scale; + } + fclose(f); + + float duration = (float)(*T_latent) * 1920.0f / 48000.0f; + float kbps = (float)fsize * 8.0f / (duration * 1000.0f); + fprintf(stderr, "[Latent] Read %s: Q8, %d frames (%.2fs, %.1f KB, %.1f kbit/s)\n", + path, *T_latent, duration, (float)fsize / 1024.0f, kbps); + return data; + } + + if (memcmp(magic, NAC4_MAGIC, 4) == 0) { + // Q4 format + uint32_t t; + fread(&t, 4, 1, f); + *T_latent = (int)t; + + long expected = NAC4_HEADER + (long)t * NAC4_FRAME; + if (fsize != expected) { + fprintf(stderr, "[Latent] Q4 size mismatch: expected %ld, got %ld\n", expected, fsize); + fclose(f); return NULL; + } + + float * data = (float *)malloc((size_t)t * 64 * sizeof(float)); + for (int i = 0; i < (int)t; i++) { + ggml_fp16_t scale_f16; + fread(&scale_f16, 2, 1, f); + float scale = ggml_fp16_to_fp32(scale_f16); + + uint8_t packed[32]; + fread(packed, 1, 32, f); + + // unpack signed nibbles + float * frame = data + i * 64; + for (int j = 0; j < 32; j++) { + int lo = 
(int)(packed[j] & 0x0F); + int hi = (int)(packed[j] >> 4); + if (lo >= 8) lo -= 16; + if (hi >= 8) hi -= 16; + frame[j * 2 + 0] = (float)lo * scale; + frame[j * 2 + 1] = (float)hi * scale; + } + } + fclose(f); + + float duration = (float)(*T_latent) * 1920.0f / 48000.0f; + float kbps = (float)fsize * 8.0f / (duration * 1000.0f); + fprintf(stderr, "[Latent] Read %s: Q4, %d frames (%.2fs, %.1f KB, %.1f kbit/s)\n", + path, *T_latent, duration, (float)fsize / 1024.0f, kbps); + return data; + } + + // f32 format (no header, rewind) + fseek(f, 0, SEEK_SET); + if (fsize % (64 * (int)sizeof(float)) != 0) { + fprintf(stderr, "[Latent] File size %ld not a multiple of %d (64 * f32)\n", + fsize, (int)(64 * sizeof(float))); + fclose(f); return NULL; + } + + *T_latent = (int)(fsize / (64 * sizeof(float))); + float * data = (float *)malloc(fsize); + fread(data, 1, fsize, f); + fclose(f); + + float duration = (float)(*T_latent) * 1920.0f / 48000.0f; + fprintf(stderr, "[Latent] Read %s: f32, %d frames (%.2fs, %.1f KB, %.1f kbit/s)\n", + path, *T_latent, duration, (float)fsize / 1024.0f, + (float)fsize * 8.0f / (duration * 1000.0f)); + return data; +} + +static void print_usage(const char * prog) { + fprintf(stderr, + "Usage: %s --vae --encode|--decode -i [-o ] [--q8|--q4]\n\n" + "Required:\n" + " --vae VAE GGUF file\n" + " --encode | --decode Encode WAV to latent, or decode latent to WAV\n" + " -i Input (WAV for encode, latent for decode)\n\n" + "Output:\n" + " -o Output file (auto-named if omitted)\n" + " --q8 Quantize latent to int8 (~13 kbit/s)\n" + " --q4 Quantize latent to int4 (~6.8 kbit/s)\n\n" + "Output naming: song.wav -> song.latent (f32) or song.nac8 (Q8) or song.nac4 (Q4)\n" + " song.latent -> song.wav\n\n" + "VAE tiling (memory control):\n" + " --vae-chunk Latent frames per tile (default: 256)\n" + " --vae-overlap Overlap frames per side (default: 64)\n\n" + "Latent formats (decode auto-detects):\n" + " f32: flat [T, 64] f32, no header. 
~51 kbit/s.\n" + " NAC8: header + per-frame Q8. ~13 kbit/s.\n" + " NAC4: header + per-frame Q4. ~6.8 kbit/s.\n", + prog); +} + +static std::string auto_output(const char * input, const char * ext) { + std::string s = input; + size_t dot = s.rfind('.'); + if (dot != std::string::npos) + return s.substr(0, dot) + ext; + return s + ext; +} + +int main(int argc, char ** argv) { + const char * vae_path = NULL; + const char * input_path = NULL; + const char * output_path = NULL; + int chunk_size = 256; + int overlap = 64; + int mode = -1; // 0 = encode, 1 = decode + int quant = 0; // 0 = f32, 8 = q8, 4 = q4 + + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "--vae") == 0 && i + 1 < argc) vae_path = argv[++i]; + else if (strcmp(argv[i], "-i") == 0 && i + 1 < argc) input_path = argv[++i]; + else if (strcmp(argv[i], "--input") == 0 && i + 1 < argc) input_path = argv[++i]; + else if (strcmp(argv[i], "-o") == 0 && i + 1 < argc) output_path = argv[++i]; + else if (strcmp(argv[i], "--output") == 0 && i + 1 < argc) output_path = argv[++i]; + else if (strcmp(argv[i], "--vae-chunk") == 0 && i + 1 < argc) chunk_size = atoi(argv[++i]); + else if (strcmp(argv[i], "--vae-overlap") == 0 && i + 1 < argc) overlap = atoi(argv[++i]); + else if (strcmp(argv[i], "--encode") == 0) mode = 0; + else if (strcmp(argv[i], "--decode") == 0) mode = 1; + else if (strcmp(argv[i], "--q8") == 0) quant = 8; + else if (strcmp(argv[i], "--q4") == 0) quant = 4; + else if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) { + print_usage(argv[0]); return 0; + } else { + fprintf(stderr, "Unknown arg: %s\n", argv[i]); + print_usage(argv[0]); return 1; + } + } + + if (!vae_path || !input_path || mode < 0) { + print_usage(argv[0]); return 1; + } + + // Auto output names + std::string out_str; + if (!output_path) { + if (mode == 0) { + const char * ext = ".latent"; + if (quant == 8) ext = ".nac8"; + if (quant == 4) ext = ".nac4"; + out_str = auto_output(input_path, ext); + } else { + out_str = 
auto_output(input_path, ".wav"); + } + output_path = out_str.c_str(); + } + + const char * quant_str = ""; + if (mode == 0 && quant == 8) quant_str = " (Q8)"; + if (mode == 0 && quant == 4) quant_str = " (Q4)"; + fprintf(stderr, "\n[VAE] Mode: %s%s\n", mode == 0 ? "encode" : "decode", quant_str); + fprintf(stderr, "[VAE] Input: %s\n", input_path); + fprintf(stderr, "[VAE] Output: %s\n\n", output_path); + + // ENCODE + if (mode == 0) { + int T_audio = 0, sr = 0; + float * audio = read_wav(input_path, &T_audio, &sr); + if (!audio) return 1; + if (sr != 48000) + fprintf(stderr, "[WARN] Input is %d Hz, VAE expects 48000. Resample with ffmpeg first.\n", sr); + + VAEEncoder enc = {}; + vae_enc_load(&enc, vae_path); + + int max_T = (T_audio / 1920) + 64; + std::vector latent((size_t)max_T * 64); + + fprintf(stderr, "\n[VAE] Encoding %d samples (%.2fs)...\n", + T_audio, (float)T_audio / (float)(sr > 0 ? sr : 48000)); + int T_latent = vae_enc_encode_tiled(&enc, audio, T_audio, + latent.data(), max_T, chunk_size, overlap); + free(audio); + if (T_latent < 0) { vae_enc_free(&enc); return 1; } + + if (quant == 8) + write_latent_q8(output_path, latent.data(), T_latent); + else if (quant == 4) + write_latent_q4(output_path, latent.data(), T_latent); + else + write_latent_f32(output_path, latent.data(), T_latent); + + vae_enc_free(&enc); + fprintf(stderr, "[VAE] Done.\n"); + return 0; + } + + // DECODE (auto-detects f32 vs Q8 vs Q4 from file content) + { + int T_latent = 0; + float * latent = read_latent(input_path, &T_latent); + if (!latent) return 1; + + VAEGGML dec = {}; + vae_ggml_load(&dec, vae_path); + + int max_T = T_latent * 1920 + 4096; + std::vector audio((size_t)2 * max_T, 0.0f); + + fprintf(stderr, "\n[VAE] Decoding %d latent frames...\n", T_latent); + int T_audio = vae_ggml_decode_tiled(&dec, latent, T_latent, + audio.data(), max_T, chunk_size, overlap); + free(latent); + if (T_audio < 0) { vae_ggml_free(&dec); return 1; } + + if (write_wav(output_path, audio.data(), 
T_audio, 48000)) + fprintf(stderr, "\n[VAE] Output: %s (%d samples, %.2fs @ 48kHz)\n", + output_path, T_audio, (float)T_audio / 48000.0f); + else + fprintf(stderr, "[VAE] FATAL: failed to write %s\n", output_path); + + vae_ggml_free(&dec); + fprintf(stderr, "[VAE] Done.\n"); + return 0; + } +} diff --git a/tools/quantize.cpp b/tools/quantize.cpp index c778a47..84a3dd4 100644 --- a/tools/quantize.cpp +++ b/tools/quantize.cpp @@ -10,7 +10,6 @@ #include #include #include -#include #include #ifdef _WIN32