diff --git a/CMakeLists.txt b/CMakeLists.txt index afa9cd0..d7af387 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,7 +15,12 @@ add_compile_definitions(GGML_MAX_NAME=128) # CUDA architectures: cover Turing to Blackwell for distributed binaries. # Users can override with -DCMAKE_CUDA_ARCHITECTURES=native for local builds. if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) - set(CMAKE_CUDA_ARCHITECTURES "75-virtual;80-virtual;86-real;89-real;120a-real;121a-real") + find_package(CUDAToolkit QUIET) + if(CUDAToolkit_FOUND AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8") + set(CMAKE_CUDA_ARCHITECTURES "75-virtual;80-virtual;86-real;89-real;120a-real;121a-real") + else() + set(CMAKE_CUDA_ARCHITECTURES "75-virtual;80-virtual;86-real;89-real") + endif() endif() # ggml as subdirectory, inherits GGML_CUDA, GGML_METAL, etc. from cmake flags @@ -70,3 +75,7 @@ link_ggml_backends(ace-qwen3) # quantize: GGUF requantizer (BF16 -> K-quants) add_executable(quantize tools/quantize.cpp) link_ggml_backends(quantize) + +# neural-codec: Oobleck VAE neural audio codec (encode/decode WAV <-> latent) +add_executable(neural-codec tools/neural-codec.cpp) +link_ggml_backends(neural-codec) diff --git a/README.md b/README.md index 25449d0..84f39dd 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # acestep.cpp Portable C++17 implementation of ACE-Step 1.5 music generation using GGML. -Text + lyrics in, stereo 48kHz WAV out. Runs on CPU, CUDA, Metal, Vulkan. +Text + lyrics in, stereo 48kHz WAV out. Runs on CPU, CUDA, ROCm, Metal, Vulkan. ## Build @@ -16,6 +16,9 @@ cmake .. # Linux with NVIDIA GPU cmake .. -DGGML_CUDA=ON +# Linux with AMD GPU (ROCm) +cmake .. -DGGML_HIP=ON + # Linux with Vulkan cmake .. -DGGML_VULKAN=ON @@ -29,7 +32,7 @@ cmake .. -DGGML_CUDA=ON -DGGML_BLAS=ON cmake --build . --config Release -j$(nproc) ``` -Builds two binaries: `ace-qwen3` (LLM) and `dit-vae` (DiT + VAE). 
+Builds four binaries: `ace-qwen3` (LLM), `dit-vae` (DiT + VAE), `neural-codec` (VAE encode/decode) and `quantize` (GGUF requantizer).
Phase 1 uses the "Expand" +prompt to generate lyrics and metadata (bpm, keyscale, timesignature, +duration) via CoT. Phase 2 reinjects the CoT and generates audio codes using +the "Generate tokens" prompt. CFG is forced to 1.0 in phase 1 (free +sampling); `lm_cfg_scale` only applies in phase 2. With `--batch N`, each +element runs its own phase 1 from a different seed, producing N completely +different songs. See `examples/simple.json`. -**Caption + lyrics (+ optional metadata)**: the LLM fills missing -metadata via CoT, then generates audio codes. User provided fields -are preserved. See `examples/partial.json`. +**Caption + lyrics (+ optional metadata)**: single LLM pass. The "Generate +tokens" prompt is used directly. Missing metadata is filled via CoT, then +audio codes are generated. User-provided fields are never overwritten. +`lm_cfg_scale` applies to both CoT and code generation. See +`examples/partial.json`. **Everything provided** (caption, lyrics, bpm, duration, keyscale, timesignature): the LLM skips CoT and generates audio codes directly. With `--batch N`, all elements share the same prompt (single prefill, KV cache copied). See `examples/full.json`. +**Instrumental** (`lyrics="[Instrumental]"`): treated as "lyrics provided", +so the single-pass "Generate tokens" path is used. No lyrics generation. +The DiT was trained with this exact string as the no-vocal condition. + **Passthrough** (`audio_codes` present): LLM is skipped entirely. Run `dit-vae` to decode existing codes. See `examples/dit-only.json`. ## Request JSON reference -All fields with defaults. Only `caption` is required. +Only `caption` is required. All other fields default to "unset" which means +the LLM fills them, or a sensible runtime default is applied. ```json { "caption": "", "lyrics": "", - "instrumental": false, "bpm": 0, - "duration": -1, + "duration": 0, "keyscale": "", "timesignature": "", "vocal_language": "unknown", @@ -190,18 +202,98 @@ All fields with defaults. 
Only `caption` is required. "lm_negative_prompt": "", "audio_codes": "", "inference_steps": 8, - "guidance_scale": 7.0, + "guidance_scale": 0.0, "shift": 3.0 } ``` -Key fields: `seed` -1 means random (resolved once, then +1 per batch -element). `audio_codes` is generated by ace-qwen3 and consumed by -dit-vae (comma separated FSQ token IDs). When present, the LLM is -skipped entirely. +### Text conditioning (ace-qwen3 + dit-vae) + +**`caption`** (string, required) +Natural language description of the music style, mood, instruments, etc. +Fed to both the LLM and the DiT text encoder. + +**`lyrics`** (string, default `""`) +Controls vocal generation. Three valid states: +- `""`: LLM generates lyrics from the caption (phase 1 "Expand" prompt). +- `"[Instrumental]"`: no vocals. Passed directly to the DiT, LLM skips lyrics generation. +- Any other string: user-provided lyrics used as-is, LLM only fills missing metadata. + +There is no `instrumental` flag. This field is the single source of truth for +vocal content. + +### Metadata (LLM-filled if unset) + +**`bpm`** (int, default `0` = unset) +Beats per minute. LLM generates one if 0. + +**`duration`** (float seconds, default `0` = unset) +Target audio duration. `0` means the LLM picks it. Clamped to [1, 600]s after +generation. `1` means 1 second. + +**`keyscale`** (string, default `""` = unset) +Musical key and scale, e.g. `"C major"`, `"F# minor"`. LLM fills if empty. + +**`timesignature`** (string, default `""` = unset) +Time signature numerator as a string, e.g. `"4"` for 4/4, `"3"` for 3/4. +LLM fills if empty. + +**`vocal_language`** (string, default `"unknown"`) +BCP-47 language code for lyrics, e.g. `"en"`, `"fr"`, `"ja"`. When set and +lyrics are being generated, the FSM constrains the LLM output to that language. +`"unknown"` lets the LLM decide. + +### Generation control + +**`seed`** (int64, default `-1` = random) +RNG seed. Resolved once at startup to a random value if -1. 
Batch elements +use `seed+0`, `seed+1`, ... `seed+N-1`. + +**`audio_codes`** (string, default `""`) +Comma-separated FSQ token IDs produced by ace-qwen3. When non-empty, the +entire LLM pass is skipped and dit-vae decodes these codes directly +(passthrough / cover mode). + +### LM sampling (ace-qwen3) + +**`lm_temperature`** (float, default `0.85`) +Sampling temperature for both phase 1 (lyrics/metadata) and phase 2 (audio +codes). Lower = more deterministic. + +**`lm_cfg_scale`** (float, default `2.0`) +Classifier-Free Guidance scale for the LM. Only active in phase 2 (audio +code generation) and in phase 1 when lyrics are already provided. When +`lyrics` is empty, phase 1 always runs with `cfg=1.0` (free sampling). +`1.0` disables CFG. -Turbo preset: `inference_steps=8, shift=3.0` (no guidance_scale, turbo models don't use CFG). -SFT preset: `inference_steps=50, guidance_scale=4.0, shift=6.0`. +**`lm_top_p`** (float, default `0.9`) +Nucleus sampling cutoff. `1.0` disables. When `top_k=0`, an internal +pre-filter of 256 tokens is applied before top_p for performance. + +**`lm_top_k`** (int, default `0` = disabled) +Top-K sampling. `0` disables hard top-K (top_p still applies). + +**`lm_negative_prompt`** (string, default `""`) +Negative caption for CFG in phase 2. Empty string falls back to a +caption-less unconditional prompt. + +### DiT flow matching (dit-vae) + +**`inference_steps`** (int, default `8`) +Number of diffusion denoising steps. Turbo preset: `8`. SFT preset: `50`. + +**`guidance_scale`** (float, default `0.0` = auto) +CFG scale for the DiT. `0.0` is resolved at runtime: +- Turbo models: forced to `1.0` (CFG disabled, turbo was trained without it). +- SFT/base models: `7.0`. +Any value > 1.0 on a turbo model is overridden to 1.0 with a warning. + +**`shift`** (float, default `3.0`) +Flow-matching schedule shift. Controls the timestep distribution. +`shift = s*t / (1 + (s-1)*t)`. Turbo preset: `3.0`. SFT preset: `6.0`. 
+ +Turbo preset: `inference_steps=8, shift=3.0` (guidance_scale auto-resolved to 1.0). +SFT preset: `inference_steps=50, guidance_scale=7.0, shift=6.0`. ## ace-qwen3 reference @@ -258,6 +350,71 @@ Debug: Models are loaded once and reused across all requests. +## neural-codec + +GGML-native neural audio codec based on the Oobleck VAE encoder and decoder. +Serves two purposes: validating the precision of the full VAE chain (encode + +decode roundtrip), and compressing music at ~850 B/s with no perceptible +difference from the original. + +``` +Usage: neural-codec --vae --encode|--decode -i [-o ] [--q8|--q4] + +Required: + --vae VAE GGUF file + --encode | --decode Encode WAV to latent, or decode latent to WAV + -i Input (WAV for encode, latent for decode) + +Output: + -o Output file (auto-named if omitted) + --q8 Quantize latent to int8 (~13 kbit/s) + --q4 Quantize latent to int4 (~6.8 kbit/s) + +Output naming: song.wav -> song.latent (f32) or song.nac8 (Q8) or song.nac4 (Q4) + song.latent -> song.wav + +VAE tiling (memory control): + --vae-chunk Latent frames per tile (default: 256) + --vae-overlap Overlap frames per side (default: 64) + +Latent formats (decode auto-detects): + f32: flat [T, 64] f32, no header. ~51 kbit/s. + NAC8: header + per-frame Q8. ~13 kbit/s. + NAC4: header + per-frame Q4. ~6.8 kbit/s. +``` + +The encoder is the symmetric mirror of the decoder: same snake activations, +same residual units, strided conv1d for downsampling instead of transposed +conv1d for upsampling. No new GGML ops. Downsample 2x4x4x6x10 = 1920x. + +48kHz stereo audio is compressed to 64-dimensional latent frames at 25 Hz. 
+Three output formats, decode auto-detects from file content: + +| Format | Frame size | Bitrate | 3 min song | vs f32 (cossim) | +|--------|-----------|---------|------------|-----------------| +| f32 | 256B | 51 kbit/s | 1.1 MB | baseline | +| NAC8 | 66B | 13 kbit/s | 290 KB | 0.9999 | +| NAC4 | 34B | 6.8 kbit/s | 150 KB | 0.989 | + +NAC = Neural Audio Codec. The NAC8 and NAC4 file formats are headerless +except for a 4-byte magic (`NAC8` or `NAC4`) and a uint32 frame count. +Q8 quantization error is 39 dB below the VAE reconstruction error (free). +Q4 quantization error is 16 dB below the VAE reconstruction error (inaudible +on most material). + +```bash +# encode (Q4: 6.8 kbit/s, ~150 KB for 3 minutes) +neural-codec --vae models/vae-BF16.gguf --encode --q4 -i song.wav -o song.nac4 + +# encode (Q8: 13 kbit/s, ~290 KB for 3 minutes) +neural-codec --vae models/vae-BF16.gguf --encode --q8 -i song.wav -o song.nac8 + +# decode (auto-detects format) +neural-codec --vae models/vae-BF16.gguf --decode -i song.nac4 -o song_decoded.wav + +# roundtrip validation: compare song.wav and song_decoded.wav with your ears +``` + ## Architecture ``` @@ -278,6 +435,39 @@ dit-vae WAV stereo 48kHz ``` +## Roadmap + +This project started from a simple idea: a Telegram bot using llama.cpp to +prompt a music generator, and the desire to make GGML sing. No more, no less. +No cloud, no black box, scriptable and nothing between you and the model. + +### LLM modes +- [ ] Remaining modes: Understand, Rewrite (single-pass, no audio codes) +- [ ] Reference audio input: repaint and cover tasks (src_audio + cover_strength) + +### Audio I/O +Current: raw PCM f32 WAV via hand-rolled writer, no external deps. 
+Trade-off to document: +- **Keep as-is**: zero dependencies, clean licensing, works everywhere +- **ffmpeg pipe**: trivial bash wrapper handles any codec/format, no C++ codec hell + - pro: MP3/FLAC/OGG out of the box, input resampling for reference audio + - con: runtime dependency, not embedded +Conclusion pending. Likely ffmpeg as optional external pipe, documented in README. + +### API and interface +- [ ] JSON HTTP server (minimal, well-documented, stable contract) +- [ ] Web interface on top - vibecodeable by anyone, API stays simple +Goal: document the internals and how the model actually works, +not reproduce the Python spaghetti. Expert-first, no commercial fluff. + +### Documentation +Current README is technical study + API reference, intentional. +- [ ] Split when a user-facing interface exists: README (user) + ARCHITECTURE.md (internals) + +### Future models +- [ ] ACE-Step 2.0: evaluate architecture delta, add headers/weights as needed +No commitment, easy to adapt by adding headers or new compilation units as needed. + ## LM specifics ace-qwen3 is not a general-purpose chat engine. It is a two-phase autoregressive @@ -318,7 +508,7 @@ python3 debug-dit-cossim.py # DiT: per-layer cossim GGML vs Python (turbo/ ## Patched GGML fork Uses a patched GGML fork (submodule) with two new ops, a Metal im2col optimization, and -a CUDA bugfix for the Oobleck VAE decoder. All backends: CPU, CUDA, Metal, Vulkan. +a CUDA bugfix for the Oobleck VAE decoder. All backends: CPU, CUDA, ROCm, Metal, Vulkan. F32/F16/BF16 data types. The DiT uses only standard GGML ops and needs no patches. The VAE reconstructs audio from latent space through 5 upsampling blocks (total 1920x), @@ -373,6 +563,19 @@ Upstream `im2col_kernel` uses OW directly as grid dimension Y, which exceeds the times per tile at output widths up to 491520. Fixed with a grid-stride loop on OW and `MIN(OW, MAX_GRIDDIM_Z)` clamping. 
+### Upstream divergence + +The GGML submodule diverges from upstream only by the addition of +`GGML_OP_SNAKE` and `GGML_OP_COL2IM_1D`. No existing upstream kernel is +modified. These ops are required; the VAE does not work without them. + +An earlier approach patched the upstream naive ops instead of adding custom +ones. Those patches were dropped. They are documented here in case someone +wants to study the naive path: + +- `conv_transpose_1d`: bounded loop replacing O(T_in) brute-force, CUDA and Metal +- `im2col`: grid-stride loop on OW to fix gridDim.y overflow for large tensors + ## Acknowledgements Independent implementation based on ACE-Step 1.5 by ACE Studio and StepFun. @@ -387,3 +590,15 @@ All model weights are theirs, this is just a native backend. note={GitHub repository} } ``` + +## Samples + +https://github.com/user-attachments/assets/9a50c1f4-9ec0-474a-bd14-e8c6b00622a1 + +https://github.com/user-attachments/assets/fb606249-0269-4153-b651-bf78e05baf22 + +https://github.com/user-attachments/assets/e0580468-5e33-4a1f-a0f4-b914e4b9a8c2 + +https://github.com/user-attachments/assets/292a31f1-f97e-4060-9207-ed8364d9a794 + +https://github.com/user-attachments/assets/34b1b781-a5bc-46c4-90a6-615a10bc2c6a diff --git a/src/backend.h b/src/backend.h index 4b8566b..df33975 100644 --- a/src/backend.h +++ b/src/backend.h @@ -13,6 +13,7 @@ extern "C" int cudaDeviceGetAttribute(int *, int, int); #endif #include +#include #include #include @@ -41,6 +42,10 @@ static BackendPair backend_init(const char * label) { ggml_backend_load_all(); BackendPair bp = {}; bp.backend = ggml_backend_init_best(); + if (!bp.backend) { + fprintf(stderr, "[Load] FATAL: no backend available\n"); + exit(1); + } int n_threads = (int)std::thread::hardware_concurrency() / 2; if (n_threads < 1) n_threads = 1; // [GGML] If best backend is already CPU, reuse it (avoid 2 CPU instances @@ -51,6 +56,10 @@ static BackendPair backend_init(const char * label) { ggml_backend_cpu_set_n_threads(bp.backend, 
n_threads); } else { bp.cpu_backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL); + if (!bp.cpu_backend) { + fprintf(stderr, "[Load] FATAL: failed to init CPU backend\n"); + exit(1); + } ggml_backend_cpu_set_n_threads(bp.cpu_backend, n_threads); } fprintf(stderr, "[Load] %s backend: %s (CPU threads: %d)\n", @@ -87,5 +96,10 @@ static void backend_release(ggml_backend_t backend, ggml_backend_t cpu_backend) static ggml_backend_sched_t backend_sched_new(BackendPair bp, int max_nodes) { ggml_backend_t backends[2] = { bp.backend, bp.cpu_backend }; int n = (bp.backend == bp.cpu_backend) ? 1 : 2; - return ggml_backend_sched_new(backends, NULL, n, max_nodes, false, true); + ggml_backend_sched_t sched = ggml_backend_sched_new(backends, NULL, n, max_nodes, false, true); + if (!sched) { + fprintf(stderr, "[Load] FATAL: failed to create scheduler\n"); + exit(1); + } + return sched; } diff --git a/src/cond-enc.h b/src/cond-enc.h index e85b6fd..ba53163 100644 --- a/src/cond-enc.h +++ b/src/cond-enc.h @@ -270,7 +270,10 @@ static void cond_ggml_forward(CondGGML * m, if (timbre_out) ggml_build_forward_expand(gf, timbre_out); // Allocate and set inputs - ggml_backend_sched_alloc_graph(m->sched, gf); + if (!ggml_backend_sched_alloc_graph(m->sched, gf)) { + fprintf(stderr, "[CondEncoder] FATAL: failed to allocate graph\n"); + exit(1); + } ggml_backend_tensor_set(t_lyric_in, lyric_embed, 0, 1024 * S_lyric * sizeof(float)); ggml_backend_tensor_set(t_text_in, text_hidden, 0, 1024 * S_text * sizeof(float)); diff --git a/src/debug.h b/src/debug.h index dc7a626..a32cd11 100644 --- a/src/debug.h +++ b/src/debug.h @@ -6,8 +6,6 @@ #include #include #include -#include -#include #include struct DebugDumper { diff --git a/src/dit-graph.h b/src/dit-graph.h index 2a92324..abe64d0 100644 --- a/src/dit-graph.h +++ b/src/dit-graph.h @@ -10,10 +10,7 @@ #include "dit.h" -#include -#include #include -#include // Helper: ensure tensor is f32 (cast if bf16/f16) static struct ggml_tensor * 
dit_ggml_f32( diff --git a/src/dit-sampler.h b/src/dit-sampler.h index 92540a8..31d9817 100644 --- a/src/dit-sampler.h +++ b/src/dit-sampler.h @@ -8,12 +8,8 @@ #include "dit-graph.h" #include "debug.h" -#include "ggml.h" -#include "ggml-backend.h" -#include "ggml-alloc.h" #include -#include #include #include #include diff --git a/src/dit.h b/src/dit.h index 190b2f7..cd2936e 100644 --- a/src/dit.h +++ b/src/dit.h @@ -10,17 +10,13 @@ #include "ggml.h" #include "ggml-backend.h" -#include "ggml-alloc.h" #include "gguf-weights.h" #include "backend.h" -#include "debug.h" #include #include -#include #include -#include // Config (mirrors dit.cuh DiTConfig) struct DiTGGMLConfig { diff --git a/src/fsq-detok.h b/src/fsq-detok.h index c3a1e60..5cc3d7c 100644 --- a/src/fsq-detok.h +++ b/src/fsq-detok.h @@ -110,6 +110,10 @@ static bool detok_ggml_load(DetokGGML * m, const char * gguf_path, ggml_backend_t backends[2] = { backend, cpu_backend }; int n = (backend == cpu_backend) ? 1 : 2; m->sched = ggml_backend_sched_new(backends, NULL, n, 4096, false, true); + if (!m->sched) { + fprintf(stderr, "[FSQ] FATAL: failed to create scheduler\n"); + return false; + } fprintf(stderr, "[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64)\n"); return true; diff --git a/src/gguf-weights.h b/src/gguf-weights.h index ac5f22d..be5bede 100644 --- a/src/gguf-weights.h +++ b/src/gguf-weights.h @@ -18,7 +18,6 @@ #include #include -#include #include #ifdef _WIN32 diff --git a/src/metadata-fsm.h b/src/metadata-fsm.h index becbe1c..69ae125 100644 --- a/src/metadata-fsm.h +++ b/src/metadata-fsm.h @@ -11,10 +11,8 @@ #include #include #include -#include #include #include -#include // Prefix tree for FSM constrained decoding struct PrefixTree { diff --git a/src/prompt.h b/src/prompt.h index 99782d8..c568f5f 100644 --- a/src/prompt.h +++ b/src/prompt.h @@ -8,7 +8,6 @@ #include #include -#include #include #include diff --git a/src/qwen3-enc.h b/src/qwen3-enc.h index b8ea213..ee9b628 100644 --- 
a/src/qwen3-enc.h +++ b/src/qwen3-enc.h @@ -17,7 +17,6 @@ #include "gguf-weights.h" #include #include -#include #include #include @@ -404,7 +403,10 @@ static void qwen3_forward(Qwen3GGML * m, const int * token_ids, int S, float * o ggml_build_forward_expand(gf, out); // Allocate - ggml_backend_sched_alloc_graph(m->sched, gf); + if (!ggml_backend_sched_alloc_graph(m->sched, gf)) { + fprintf(stderr, "[TextEncoder] FATAL: failed to allocate graph (%d tokens)\n", S); + exit(1); + } // Set inputs ggml_backend_tensor_set(t_ids, token_ids, 0, S * sizeof(int)); @@ -455,7 +457,10 @@ static void qwen3_embed_lookup(Qwen3GGML * m, const int * token_ids, int S, floa ggml_set_output(out); ggml_build_forward_expand(gf, out); - ggml_backend_sched_alloc_graph(m->sched, gf); + if (!ggml_backend_sched_alloc_graph(m->sched, gf)) { + fprintf(stderr, "[TextEncoder] FATAL: failed to allocate graph (embed lookup, %d tokens)\n", S); + exit(1); + } ggml_backend_tensor_set(t_ids, token_ids, 0, S * sizeof(int)); ggml_backend_sched_graph_compute(m->sched, gf); ggml_backend_tensor_get(out, output, 0, (size_t)H * S * sizeof(float)); diff --git a/src/qwen3-lm.h b/src/qwen3-lm.h index 3bbd514..5f5e290 100644 --- a/src/qwen3-lm.h +++ b/src/qwen3-lm.h @@ -4,14 +4,11 @@ #pragma once #include "qwen3-enc.h" // Qwen3Layer, Qwen3Config, layer build helpers -#include "ggml-alloc.h" -#include "bpe.h" #include #include #include #include -#include #include // LM config (superset of encoder config) @@ -450,7 +447,10 @@ static void qw3lm_forward(Qwen3LM * m, const int * token_ids, int n_tokens, ggml_build_forward_expand(gf, lgt); // Schedule + allocate - ggml_backend_sched_alloc_graph(m->sched, gf); + if (!ggml_backend_sched_alloc_graph(m->sched, gf)) { + fprintf(stderr, "[LM] FATAL: failed to allocate graph (prefill, %d tokens)\n", n_tokens); + exit(1); + } // Set token IDs ggml_backend_tensor_set(token_ids_t, token_ids, 0, n_tokens * sizeof(int)); @@ -678,7 +678,10 @@ static void qw3lm_forward_batch(Qwen3LM 
* m, const int * token_ids, ggml_build_forward_expand(gf, lgt); // Allocate - ggml_backend_sched_alloc_graph(m->sched, gf); + if (!ggml_backend_sched_alloc_graph(m->sched, gf)) { + fprintf(stderr, "[LM] FATAL: failed to allocate graph (batch decode, N=%d)\n", N); + exit(1); + } // Set token IDs ggml_backend_tensor_set(token_ids_t, token_ids, 0, N * sizeof(int)); diff --git a/src/request.cpp b/src/request.cpp index 9b20423..c851eb3 100644 --- a/src/request.cpp +++ b/src/request.cpp @@ -6,7 +6,6 @@ #include "request.h" #include -#include #include #include @@ -14,9 +13,9 @@ void request_init(AceRequest * r) { r->caption = ""; r->lyrics = ""; - r->instrumental = false; + r->bpm = 0; - r->duration = -1.0f; + r->duration = 0.0f; r->keyscale = ""; r->timesignature = ""; r->vocal_language = "unknown"; @@ -28,7 +27,7 @@ void request_init(AceRequest * r) { r->lm_negative_prompt = ""; r->audio_codes = ""; r->inference_steps = 8; - r->guidance_scale = 1.0f; + r->guidance_scale = 0.0f; r->shift = 3.0f; } @@ -241,7 +240,6 @@ bool request_parse(AceRequest * r, const char * path) { else if (k == "shift") r->shift = (float)atof(v.c_str()); // bools - else if (k == "instrumental") r->instrumental = (v == "true"); // unknown keys: silently ignored (forward compat) } @@ -259,8 +257,6 @@ bool request_write(const AceRequest * r, const char * path) { fprintf(f, "{\n"); fprintf(f, " \"caption\": \"%s\",\n", json_escape(r->caption).c_str()); fprintf(f, " \"lyrics\": \"%s\",\n", json_escape(r->lyrics).c_str()); - if (r->instrumental) - fprintf(f, " \"instrumental\": true,\n"); fprintf(f, " \"bpm\": %d,\n", r->bpm); fprintf(f, " \"duration\": %.1f,\n", r->duration); fprintf(f, " \"keyscale\": \"%s\",\n", json_escape(r->keyscale).c_str()); diff --git a/src/request.h b/src/request.h index 1295b83..d1748b5 100644 --- a/src/request.h +++ b/src/request.h @@ -6,6 +6,7 @@ // Aligned with Python GenerationParams (inference.py:39) and API /release_task. 
// +#include #include #include @@ -13,11 +14,10 @@ struct AceRequest { // text content std::string caption; // "" std::string lyrics; // "" - bool instrumental; // false // metadata (user-provided or LLM-enriched) int bpm; // 0 = unset - float duration; // -1 = unset + float duration; // 0 = unset std::string keyscale; // "" = unset std::string timesignature; // "" = unset std::string vocal_language; // "unknown" diff --git a/src/vae-enc.h b/src/vae-enc.h new file mode 100644 index 0000000..f5c67f2 --- /dev/null +++ b/src/vae-enc.h @@ -0,0 +1,391 @@ +// vae-enc.h: AutoencoderOobleck encoder (audio -> latent) via ggml +// +// Mirror of vae.h decoder. Reuses VAEResUnit, load helpers, graph ops. +// Architecture: conv1(2->128,k=7) -> 5x(3xresunit+snake+strided_conv) -> snake+conv2(2048->128,k=3) +// Output 128ch = mean[64] + scale[64]. Deterministic encode returns mean. +// Downsample: 2x4x4x6x10 = 1920x (matches decoder upsample) + +#pragma once +#include "vae.h" + +// Encoder block: 3xResUnit(in_ch) -> snake(in_ch) -> strided Conv1d(in_ch -> out_ch) +// Decoder block is the mirror: snake(in_ch) -> ConvT(in_ch -> out_ch) -> 3xResUnit(out_ch) +struct VAEEncBlock { + VAEResUnit ru[3]; + struct ggml_tensor * sa, * sb; // snake [1, in_ch] + struct ggml_tensor * dw, * db; // strided conv [K, in_ch, out_ch], bias [out_ch] + int in_ch, out_ch, stride, kernel, padding; +}; + +struct VAEEncoder { + struct ggml_tensor * c1w, * c1b; // conv1 [7, 2, 128], bias [128] + VAEEncBlock blk[5]; + struct ggml_tensor * sa, * sb; // final snake [1, 2048] + struct ggml_tensor * c2w, * c2b; // conv2 [3, 2048, 128], bias [128] + + ggml_backend_t backend; + ggml_backend_t cpu_backend; + ggml_backend_sched_t sched; + ggml_backend_buffer_t buf; + struct ggml_context * weight_ctx; + + // graph cache (rebuilt when T_audio changes) + struct ggml_context * graph_ctx; + uint8_t * graph_buf; + struct ggml_cgraph * graph; + struct ggml_tensor * graph_input; // [T_audio, 2] + struct ggml_tensor * 
graph_output; // [T_latent, 128] + int graph_T; // cached T_audio (0 = no cache) + + std::vector scratch_in; // transposed input [2 * T_audio] +}; + +// Load encoder weights from the same VAE GGUF (encoder.* tensors) +static void vae_enc_load(VAEEncoder * m, const char * path) { + GGUFModel gf = {}; + if (!gf_load(&gf, path)) { + fprintf(stderr, "[VAE-Enc] FATAL: cannot load %s\n", path); + exit(1); + } + + // Encoder channel layout (mirror of decoder, bottom-up): + // conv1: 2 -> 128 + // block: [128->128, 128->256, 256->512, 512->1024, 1024->2048] + // conv2: 2048 -> 128 (split: mean[64] + scale[64]) + // ResUnits run at in_ch (before downsample), unlike decoder (at out_ch, after upsample). + static const int in_ch[] = {128, 128, 256, 512, 1024}; + static const int out_ch[] = {128, 256, 512, 1024, 2048}; + static const int strides[] = { 2, 4, 4, 6, 10}; + static const int dilations[] = {1, 3, 9}; + + // Phase 1: create weight tensors + size_t ctx_size = ggml_tensor_overhead() * 256; + struct ggml_init_params p = { ctx_size, NULL, true }; + m->weight_ctx = ggml_init(p); + struct ggml_context * ctx = m->weight_ctx; + + m->c1w = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 7, 2, 128); + m->c1b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 128); + + for (int i = 0; i < 5; i++) { + VAEEncBlock & b = m->blk[i]; + b.in_ch = in_ch[i]; + b.out_ch = out_ch[i]; + b.stride = strides[i]; + b.kernel = strides[i] * 2; + b.padding = (strides[i] + 1) / 2; // ceil(stride / 2) + int C = in_ch[i]; // res_units + snake at in_ch + + // 3 res units at in_ch + for (int r = 0; r < 3; r++) { + VAEResUnit & ru = b.ru[r]; + ru.dilation = dilations[r]; + ru.s1a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); + ru.s1b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); + ru.c1w = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 7, C, C); + ru.c1b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, C); + ru.s2a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); + ru.s2b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); + ru.c2w = 
ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 1, C, C); + ru.c2b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, C); + } + + // snake at in_ch (before downsample conv) + b.sa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); + b.sb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); + + // strided conv1d: [K, in_ch, out_ch] + b.dw = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, b.kernel, in_ch[i], out_ch[i]); + b.db = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_ch[i]); + } + + m->sa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, 2048); + m->sb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, 2048); + m->c2w = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 3, 2048, 128); + m->c2b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 128); + + // Phase 2: allocate backend buffer + BackendPair bp = backend_init("VAE-Enc"); + m->backend = bp.backend; + m->cpu_backend = bp.cpu_backend; + m->sched = backend_sched_new(bp, 8192); + m->buf = ggml_backend_alloc_ctx_tensors(ctx, m->backend); + if (!m->buf) { + fprintf(stderr, "[VAE-Enc] FATAL: failed to allocate weight buffer\n"); + exit(1); + } + fprintf(stderr, "[VAE-Enc] Backend: %s, Weight buffer: %.1f MB\n", + ggml_backend_name(m->backend), + (float)ggml_backend_buffer_get_size(m->buf) / (1024 * 1024)); + + // Phase 3: load and fuse weights + vae_fuse_wn(m->c1w, gf, "encoder.conv1"); + vae_load_bias(m->c1b, gf, "encoder.conv1.bias"); + + for (int i = 0; i < 5; i++) { + VAEEncBlock & b = m->blk[i]; + std::string blk_pfx = "encoder.block." 
+ std::to_string(i); + + // res_units first (same load pattern as decoder) + for (int r = 0; r < 3; r++) { + VAEResUnit & ru = b.ru[r]; + std::string rp = blk_pfx + ".res_unit" + std::to_string(r + 1); + vae_load_snake(ru.s1a, gf, rp + ".snake1.alpha"); + vae_load_snake_inv(ru.s1b, gf, rp + ".snake1.beta"); + vae_fuse_wn(ru.c1w, gf, rp + ".conv1"); + vae_load_bias(ru.c1b, gf, rp + ".conv1.bias"); + vae_load_snake(ru.s2a, gf, rp + ".snake2.alpha"); + vae_load_snake_inv(ru.s2b, gf, rp + ".snake2.beta"); + vae_fuse_wn(ru.c2w, gf, rp + ".conv2"); + vae_load_bias(ru.c2b, gf, rp + ".conv2.bias"); + } + + // snake + strided downsample conv (regular conv1d, NOT transposed) + vae_load_snake(b.sa, gf, blk_pfx + ".snake1.alpha"); + vae_load_snake_inv(b.sb, gf, blk_pfx + ".snake1.beta"); + vae_fuse_wn(b.dw, gf, blk_pfx + ".conv1"); + vae_load_bias(b.db, gf, blk_pfx + ".conv1.bias"); + } + + vae_load_snake(m->sa, gf, "encoder.snake1.alpha"); + vae_load_snake_inv(m->sb, gf, "encoder.snake1.beta"); + vae_fuse_wn(m->c2w, gf, "encoder.conv2"); + vae_load_bias(m->c2b, gf, "encoder.conv2.bias"); + + fprintf(stderr, "[VAE-Enc] Loaded: 5 blocks, downsample=1920x, F32 activations\n"); + gf_close(&gf); +} + +// Build encoder graph: audio [T_audio, 2] -> [T_latent, 128] +static struct ggml_tensor * vae_enc_build_graph( + struct ggml_context * ctx, + VAEEncoder * m, + struct ggml_tensor * audio) { // [T, 2] + + // conv1: [T, 2] -> [T, 128] + struct ggml_tensor * x = vae_conv1d(ctx, m->c1w, m->c1b, audio, 1, 3, 1); + + // 5 encoder blocks: resunits(in_ch) -> snake(in_ch) -> strided conv(in_ch -> out_ch) + for (int i = 0; i < 5; i++) { + VAEEncBlock & b = m->blk[i]; + for (int r = 0; r < 3; r++) + x = vae_res_unit(ctx, &b.ru[r], x); + x = vae_snake(ctx, x, b.sa, b.sb); + x = vae_conv1d(ctx, b.dw, b.db, x, b.stride, b.padding, 1); + } + + // Final: snake(2048) -> conv2(2048 -> 128, k=3, pad=1) + x = vae_snake(ctx, x, m->sa, m->sb); + x = vae_conv1d(ctx, m->c2w, m->c2b, x, 1, 1, 1); + + return 
x; // [T_latent, 128] +} + +// Core compute: build/cache graph, set input, run. Returns T_latent or -1. +// Output stays in m->graph_output for caller to read. +static int vae_enc_compute( + VAEEncoder * m, + const float * audio, // [T_audio, 2] time-major interleaved stereo + int T_audio) { + + // Rebuild graph when T_audio changes + if (m->graph_T != T_audio) { + if (m->graph_ctx) { + ggml_backend_sched_reset(m->sched); + ggml_free(m->graph_ctx); + free(m->graph_buf); + } + + size_t ctx_size = ggml_tensor_overhead() * 1024 + ggml_graph_overhead_custom(8192, false); + m->graph_buf = (uint8_t *)malloc(ctx_size); + struct ggml_init_params p = { ctx_size, m->graph_buf, true }; + struct ggml_context * ctx = ggml_init(p); + + m->graph_input = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, T_audio, 2); + ggml_set_name(m->graph_input, "enc_input"); + ggml_set_input(m->graph_input); + + m->graph_output = vae_enc_build_graph(ctx, m, m->graph_input); + ggml_set_name(m->graph_output, "enc_output"); + ggml_set_output(m->graph_output); + + m->graph = ggml_new_graph_custom(ctx, 8192, false); + ggml_build_forward_expand(m->graph, m->graph_output); + + if (!ggml_backend_sched_alloc_graph(m->sched, m->graph)) { + fprintf(stderr, "[VAE-Enc] FATAL: graph alloc failed for T=%d\n", T_audio); + ggml_free(ctx); + free(m->graph_buf); + m->graph_ctx = NULL; + m->graph_buf = NULL; + m->graph_T = 0; + return -1; + } + + m->graph_ctx = ctx; + m->graph_T = T_audio; + fprintf(stderr, "[VAE-Enc] Graph: %d nodes, T_audio=%d\n", + ggml_graph_n_nodes(m->graph), T_audio); + } + + // Transpose: [T, 2] time-major -> ggml [T, 2] channel-contiguous + // ggml ne[0]=T is the contiguous dim, so we write all T samples per channel + size_t in_size = (size_t)2 * T_audio; + if (m->scratch_in.size() < in_size) + m->scratch_in.resize(in_size); + for (int c = 0; c < 2; c++) + for (int t = 0; t < T_audio; t++) + m->scratch_in[c * T_audio + t] = audio[t * 2 + c]; + ggml_backend_tensor_set(m->graph_input, + 
m->scratch_in.data(), 0, in_size * sizeof(float)); + + ggml_backend_sched_graph_compute(m->sched, m->graph); + + return (int)m->graph_output->ne[0]; // T_latent +} + +// Encode API: audio [T_audio, 2] -> latent_out [T_latent, 64] (mean only, deterministic) +// Returns T_latent (or -1 on error). +// latent_out must hold at least (T_audio / 1920) * 64 floats. +static int vae_enc_encode( + VAEEncoder * m, + const float * audio, // [T_audio, 2] interleaved stereo + int T_audio, + float * latent_out, // [T_latent, 64] output, time-major + int max_T_latent) { + + int T_latent = vae_enc_compute(m, audio, T_audio); + if (T_latent < 0) return -1; + + if (T_latent > max_T_latent) { + fprintf(stderr, "[VAE-Enc] T_latent %d exceeds max %d\n", T_latent, max_T_latent); + return -1; + } + + // Graph output is [ne0=T_latent, ne1=128] in ggml, channel-contiguous. + // Channels 0..63 = mean, 64..127 = scale. We only read mean. + // ggml layout: data[c * T_latent + t] for channel c, time t. + // We write time-major: latent_out[t * 64 + c] = data[c * T_latent + t] + // + // Read the full 128ch output once, extract mean channels 0..63 + size_t out_bytes = (size_t)128 * T_latent * sizeof(float); + std::vector raw(128 * T_latent); + ggml_backend_tensor_get(m->graph_output, raw.data(), 0, out_bytes); + + for (int t = 0; t < T_latent; t++) + for (int c = 0; c < 64; c++) + latent_out[t * 64 + c] = raw[c * T_latent + t]; + + fprintf(stderr, "[VAE-Enc] Encode: T_audio=%d -> T_latent=%d (%.2fs @ 48kHz)\n", + T_audio, T_latent, (float)T_audio / 48000.0f); + + return T_latent; +} + +// Tiled encode for long audio (same chunking strategy as decoder) +// chunk_size: latent frames per tile, overlap: context frames on each side +static int vae_enc_encode_tiled( + VAEEncoder * m, + const float * audio, // [T_audio, 2] interleaved stereo + int T_audio, + float * latent_out, // [T_latent, 64] output, time-major + int max_T_latent, + int chunk_size = 256, + int overlap = 64) { + + // Work in 
audio-sample space. Each latent frame = 1920 audio samples. + int audio_chunk = chunk_size * 1920; + int audio_overlap = overlap * 1920; + + // Shrink overlap until stride is positive + while (audio_chunk - 2 * audio_overlap <= 0 && audio_overlap > 0) + audio_overlap /= 2; + + // Short audio: encode directly + if (T_audio <= audio_chunk) + return vae_enc_encode(m, audio, T_audio, latent_out, max_T_latent); + + int audio_stride = audio_chunk - 2 * audio_overlap; + int num_steps = (T_audio + audio_stride - 1) / audio_stride; + + fprintf(stderr, "[VAE-Enc] Tiled encode: %d tiles (chunk=%d, overlap=%d, stride=%d audio samples)\n", + num_steps, audio_chunk, audio_overlap, audio_stride); + + float downsample_factor = 0.0f; + int latent_write_pos = 0; + + for (int i = 0; i < num_steps; i++) { + // Core range in audio samples (the part we keep) + int core_start = i * audio_stride; + int core_end = core_start + audio_stride; + if (core_end > T_audio) core_end = T_audio; + + // Window with overlap context + int win_start = core_start - audio_overlap; + if (win_start < 0) win_start = 0; + int win_end = core_end + audio_overlap; + if (win_end > T_audio) win_end = T_audio; + int win_len = win_end - win_start; + + // Encode this window + int tile_T = vae_enc_compute(m, audio + win_start * 2, win_len); + if (tile_T < 0) { + fprintf(stderr, "[VAE-Enc] FATAL: tile %d encode failed\n", i); + return -1; + } + + // Determine downsample factor from first tile + if (i == 0) { + downsample_factor = (float)tile_T / (float)win_len; + fprintf(stderr, "[VAE-Enc] Downsample factor: %.6f (expected ~1/1920)\n", + downsample_factor); + } + + // Trim in latent frames (mirror of decoder trim logic) + int added_start = core_start - win_start; + int trim_start = (int)roundf((float)added_start * downsample_factor); + int added_end = win_end - core_end; + int trim_end = (int)roundf((float)added_end * downsample_factor); + + int end_idx = (trim_end > 0) ? 
(tile_T - trim_end) : tile_T; + int core_len = end_idx - trim_start; + if (core_len <= 0) continue; + + if (latent_write_pos + core_len > max_T_latent) { + fprintf(stderr, "[VAE-Enc] FATAL: tiled output exceeds max_T_latent\n"); + return -1; + } + + // Read tile output [ne0=tile_T, ne1=128], extract mean (ch 0..63), transpose + // Only read the first 64 channels (mean), skip scale channels 64..127 + size_t out_bytes = (size_t)128 * tile_T * sizeof(float); + std::vector raw(128 * tile_T); + ggml_backend_tensor_get(m->graph_output, raw.data(), 0, out_bytes); + + for (int t = 0; t < core_len; t++) + for (int c = 0; c < 64; c++) + latent_out[(latent_write_pos + t) * 64 + c] = + raw[c * tile_T + (trim_start + t)]; + + latent_write_pos += core_len; + } + + fprintf(stderr, "[VAE-Enc] Tiled encode done: %d tiles -> T_latent=%d (%.2fs @ 48kHz)\n", + num_steps, latent_write_pos, (float)T_audio / 48000.0f); + + return latent_write_pos; +} + +// Free all resources +static void vae_enc_free(VAEEncoder * m) { + if (m->graph_ctx) { + ggml_backend_sched_reset(m->sched); + ggml_free(m->graph_ctx); + free(m->graph_buf); + } + if (m->sched) ggml_backend_sched_free(m->sched); + if (m->buf) ggml_backend_buffer_free(m->buf); + if (m->weight_ctx) ggml_free(m->weight_ctx); + backend_release(m->backend, m->cpu_backend); + *m = {}; +} diff --git a/src/vae.h b/src/vae.h index 7c2a24e..fbf6d5f 100644 --- a/src/vae.h +++ b/src/vae.h @@ -14,7 +14,6 @@ #include "backend.h" #include #include -#include #include #include @@ -216,6 +215,10 @@ static void vae_ggml_load(VAEGGML * m, const char * path) { m->cpu_backend = bp.cpu_backend; m->sched = backend_sched_new(bp, 8192); m->buf = ggml_backend_alloc_ctx_tensors(ctx, m->backend); + if (!m->buf) { + fprintf(stderr, "[VAE] FATAL: failed to allocate weight buffer\n"); + exit(1); + } fprintf(stderr, "[VAE] Backend: %s, Weight buffer: %.1f MB\n", ggml_backend_name(m->backend), (float)ggml_backend_buffer_get_size(m->buf) / (1024 * 1024)); diff --git 
a/tests/CPU-BF16.log b/tests/CPU-BF16.log index b20ebae..74300ed 100644 --- a/tests/CPU-BF16.log +++ b/tests/CPU-BF16.log @@ -7,36 +7,34 @@ [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 3007.9 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 464.0 ms +[Load] DiT weight load: 301.5 ms [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: CPU (CPU threads: 16) +[Load] VAE backend: CPU (shared) [VAE] Backend: CPU, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 651.3 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 666.9 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 31.9 ms +[Load] BPE tokenizer: 30.9 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: CPU (CPU threads: 16) +[Load] TextEncoder backend: CPU (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 226.8 ms -[Encode] TextEncoder (70 tokens): 59.7 ms +[Load] TextEncoder: 121.5 ms +[Encode] TextEncoder (70 tokens): 58.0 ms [Debug] text_hidden: [70, 1024] first4: 3.704526 2.436253 0.222853 -13.131872 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 
12.7 ms +[Encode] Lyric vocab lookup (167 tokens): 0.1 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: CPU (CPU threads: 16) +[Load] CondEncoder backend: CPU (shared) [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K+V fused @@ -46,18 +44,18 @@ [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 1160.5 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 230.8 ms +[Load] ConditionEncoder: 111.5 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 274.9 ms, enc_S=238 +[Encode] ConditionEncoder: 268.3 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.758296 -0.049593 -0.132844 0.058496 [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [WeightCtx] Loaded 30 tensors, 200.3 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 34.6 ms +[Load] Detokenizer: 23.6 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 958.8 ms +[Context] Detokenizer: 889.4 ms [Debug] detok_output: [2170, 64] first4: -0.124160 1.435260 0.310138 -0.624584 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -112,35 +110,32 @@ [Debug] dit_step7_vt: [2170, 64] first4: 0.002176 0.183052 -1.467304 3.113325 [Debug] dit_x0: [2170, 64] first4: 0.083178 1.441022 0.423316 -1.927701 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 18721.5 ms (18721.5 ms/sample) +[DiT] Total generation: 17583.4 ms (17583.4 ms/sample) [Debug] dit_output: [2170, 64] first4: 0.083178 1.441022 0.423316 -1.927701 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 
+[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 51818.0 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000519 0.001024 0.000897 0.001200 +[VAE Batch0] Decode: 46859.3 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000480 0.000983 0.000816 0.001189 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:57:38.585 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:57:38.585 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:57:38.585 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 19:57:38.586 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:57:38.586 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:57:39.413 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:57:40.961 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:57:40.961 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... 
-2026-03-01 19:57:40.966 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:57:41.132 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:57:41.134 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:49:02.827 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:49:02.916 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:49:04.251 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:49:04.252 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:49:04.253 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:49:04.259 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:49:04.454 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... 
+2026-03-04 21:49:04.456 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:57:41.134 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:57:41.134 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:57:41.134 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:49:04.456 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:49:04.456 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:49:04.456 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -154,8 +149,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:57:41.134 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:57:41.134 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:49:04.456 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:49:04.456 | INFO | 
acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -182,25 +177,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:57:41.134 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:49:04.456 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:57:41.140 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:57:41.153 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:57:41.153 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 19:57:41.175 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:57:41.483 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... 
-2026-03-01 19:57:41.483 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:57:41.483 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.00688624382019043, 'diffusion_time_cost': 0.30014586448669434, 'diffusion_per_step_time_cost': 0.03751823306083679, 'total_time_cost': 0.30703210830688477, 'offload_time_cost': 0.0} -2026-03-01 19:57:41.498 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:57:41.500 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:57:41.500 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.44 GB -2026-03-01 19:57:41.500 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:57:41.500 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.44 GB -2026-03-01 19:57:41.500 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.44 GB -2026-03-01 19:57:41.500 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:57:41.775 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:57:41.777 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:57:41.780 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:49:04.463 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:49:04.478 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:49:04.478 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:49:04.514 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:49:04.845 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:49:04.846 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:49:04.846 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.007018327713012695, 'diffusion_time_cost': 0.32423973083496094, 'diffusion_per_step_time_cost': 0.04052996635437012, 'total_time_cost': 0.33125805854797363, 'offload_time_cost': 0.0} +2026-03-04 21:49:04.860 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:49:04.862 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:49:04.862 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.77 GB +2026-03-04 21:49:04.862 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:49:04.862 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.77 GB +2026-03-04 21:49:04.862 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.77 GB +2026-03-04 21:49:04.862 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:49:05.138 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:49:05.140 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:49:05.142 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-BF16.gguf [GGML] Running acestep-v15-turbo-BF16.gguf... 
@@ -244,8 +239,8 @@ Using precomputed LM hints dit_step6_xt 0.988142 dit_step7_vt 0.969102 dit_x0 0.979106 - vae_audio 0.901370 - vae_audio (STFT cosine) 0.975816 + vae_audio 0.901389 + vae_audio (STFT cosine) 0.975826 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999946 0.136541 0.006626 -0.002312 0.972951 -0.002342 0.972003 diff --git a/tests/CPU-Q4_K_M.log b/tests/CPU-Q4_K_M.log index 508a20c..540c4c4 100644 --- a/tests/CPU-Q4_K_M.log +++ b/tests/CPU-Q4_K_M.log @@ -1,5 +1,5 @@ [Load] DiT backend: CPU (CPU threads: 16) -[Load] Backend init: 6.3 ms +[Load] Backend init: 1.6 ms [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [DiT] Self-attn: Q+K fused, V separate [DiT] Cross-attn: all separate @@ -7,36 +7,34 @@ [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 895.6 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 118.4 ms +[Load] DiT weight load: 121.8 ms [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: CPU (CPU threads: 16) +[Load] VAE backend: CPU (shared) [VAE] Backend: CPU, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 696.2 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 699.6 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 33.0 ms +[Load] BPE tokenizer: 33.3 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] 
TextEncoder backend: CPU (CPU threads: 16) +[Load] TextEncoder backend: CPU (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 148.2 ms -[Encode] TextEncoder (70 tokens): 58.0 ms +[Load] TextEncoder: 122.9 ms +[Encode] TextEncoder (70 tokens): 60.4 ms [Debug] text_hidden: [70, 1024] first4: 3.704526 2.436253 0.222853 -13.131872 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 12.6 ms +[Encode] Lyric vocab lookup (167 tokens): 0.1 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: CPU (CPU threads: 16) +[Load] CondEncoder backend: CPU (shared) [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K fused, V separate @@ -46,18 +44,18 @@ [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 352.5 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 37.5 ms +[Load] ConditionEncoder: 34.8 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 294.2 ms, enc_S=238 +[Encode] ConditionEncoder: 300.3 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.759313 -0.049345 -0.129442 0.055759 [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 64.7 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 10.1 ms +[Load] Detokenizer: 9.1 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 354.8 ms +[Context] Detokenizer: 361.0 ms [Debug] 
detok_output: [2170, 64] first4: -0.106265 1.448869 0.309591 -0.650098 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -112,35 +110,32 @@ [Debug] dit_step7_vt: [2170, 64] first4: -0.463452 0.896626 -1.673395 3.222673 [Debug] dit_x0: [2170, 64] first4: 0.290887 1.122067 0.588729 -1.917174 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 21769.5 ms (21769.5 ms/sample) +[DiT] Total generation: 21823.6 ms (21823.6 ms/sample) [Debug] dit_output: [2170, 64] first4: 0.290887 1.122067 0.588729 -1.917174 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 52184.7 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000272 0.000786 0.000556 0.000990 +[VAE Batch0] Decode: 47904.5 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000330 0.000828 0.000665 0.001038 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 20:03:15.903 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 20:03:15.903 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 20:03:15.903 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 20:03:15.903 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. 
-2026-03-01 20:03:15.904 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 20:03:16.714 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 20:03:18.309 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 20:03:18.309 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 20:03:18.315 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 20:03:18.480 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 20:03:18.482 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:54:26.607 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:54:26.698 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:54:28.050 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:54:28.050 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... 
+2026-03-04 21:54:28.054 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:54:28.059 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:54:28.263 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-04 21:54:28.265 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 20:03:18.482 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 20:03:18.482 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 20:03:18.482 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:54:28.265 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:54:28.265 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:54:28.265 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -154,8 +149,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 20:03:18.482 | 
INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 20:03:18.482 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:54:28.265 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:54:28.265 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -182,25 +177,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 20:03:18.482 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:54:28.265 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 20:03:18.488 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 20:03:18.501 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 20:03:18.501 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 20:03:18.540 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... 
-2026-03-01 20:03:18.854 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... -2026-03-01 20:03:18.855 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 20:03:18.855 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006970643997192383, 'diffusion_time_cost': 0.3072662353515625, 'diffusion_per_step_time_cost': 0.03840827941894531, 'total_time_cost': 0.3142368793487549, 'offload_time_cost': 0.0} -2026-03-01 20:03:18.869 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 20:03:18.872 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 20:03:18.872 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB -2026-03-01 20:03:18.872 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 20:03:18.872 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB -2026-03-01 20:03:18.872 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB -2026-03-01 20:03:18.872 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 20:03:19.148 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 20:03:19.151 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 20:03:19.154 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:54:28.272 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:54:28.288 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:54:28.288 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:54:28.323 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:54:28.640 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:54:28.641 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:54:28.641 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.0070536136627197266, 'diffusion_time_cost': 0.30983686447143555, 'diffusion_per_step_time_cost': 0.03872960805892944, 'total_time_cost': 0.3168904781341553, 'offload_time_cost': 0.0} +2026-03-04 21:54:28.655 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:54:28.666 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:54:28.666 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.85 GB +2026-03-04 21:54:28.666 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:54:28.666 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.85 GB +2026-03-04 21:54:28.666 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.85 GB +2026-03-04 21:54:28.666 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:54:28.949 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:54:28.951 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:54:28.952 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q4_K_M.gguf [GGML] Running acestep-v15-turbo-Q4_K_M.gguf... 
@@ -244,8 +239,8 @@ Using precomputed LM hints dit_step6_xt 0.977196 dit_step7_vt 0.939970 dit_x0 0.959881 - vae_audio 0.834993 - vae_audio (STFT cosine) 0.955098 + vae_audio 0.834992 + vae_audio (STFT cosine) 0.955102 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999883 0.167680 0.010319 -0.002256 0.973185 -0.002342 0.972003 diff --git a/tests/CPU-Q5_K_M.log b/tests/CPU-Q5_K_M.log index e0d9936..6722100 100644 --- a/tests/CPU-Q5_K_M.log +++ b/tests/CPU-Q5_K_M.log @@ -7,36 +7,34 @@ [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 1061.2 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 140.3 ms +[Load] DiT weight load: 110.6 ms [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: CPU (CPU threads: 16) +[Load] VAE backend: CPU (shared) [VAE] Backend: CPU, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 699.1 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 698.0 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 33.4 ms +[Load] BPE tokenizer: 33.1 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: CPU (CPU threads: 16) +[Load] TextEncoder backend: CPU (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused 
[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 149.7 ms -[Encode] TextEncoder (70 tokens): 57.3 ms +[Load] TextEncoder: 123.1 ms +[Encode] TextEncoder (70 tokens): 57.9 ms [Debug] text_hidden: [70, 1024] first4: 3.704526 2.436253 0.222853 -13.131872 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 12.5 ms +[Encode] Lyric vocab lookup (167 tokens): 0.1 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: CPU (CPU threads: 16) +[Load] CondEncoder backend: CPU (shared) [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K fused, V separate @@ -46,18 +44,18 @@ [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 412.5 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 45.1 ms +[Load] ConditionEncoder: 41.0 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 387.5 ms, enc_S=238 +[Encode] ConditionEncoder: 388.3 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.760901 -0.053445 -0.132760 0.058505 [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 73.2 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 11.3 ms +[Load] Detokenizer: 10.3 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 447.0 ms +[Context] Detokenizer: 446.1 ms [Debug] detok_output: [2170, 64] first4: -0.129311 1.458194 0.298132 -0.651512 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -112,35 +110,32 @@ [Debug] dit_step7_vt: [2170, 64] first4: -0.003599 
0.325174 -1.377289 3.053612 [Debug] dit_x0: [2170, 64] first4: 0.058232 1.415164 0.443289 -1.901864 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 27970.1 ms (27970.1 ms/sample) +[DiT] Total generation: 28035.0 ms (28035.0 ms/sample) [Debug] dit_output: [2170, 64] first4: 0.058232 1.415164 0.443289 -1.901864 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 51966.1 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000740 0.001305 0.001083 0.001434 +[VAE Batch0] Decode: 47798.0 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000762 0.001320 0.001139 0.001557 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 20:01:55.226 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 20:01:55.226 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 20:01:55.226 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 20:01:55.226 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 20:01:55.226 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. 
-2026-03-01 20:01:56.032 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 20:01:57.576 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 20:01:57.577 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 20:01:57.581 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 20:01:57.747 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 20:01:57.749 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:53:09.193 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:53:09.323 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:53:10.674 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:53:10.674 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:53:10.676 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). 
+2026-03-04 21:53:10.682 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:53:10.881 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-04 21:53:10.884 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 20:01:57.749 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 20:01:57.749 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 20:01:57.749 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:53:10.884 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:53:10.884 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:53:10.884 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -154,8 +149,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 20:01:57.749 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 20:01:57.749 | INFO | 
acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:53:10.884 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:53:10.884 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -182,25 +177,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 20:01:57.749 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:53:10.884 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 20:01:57.755 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 20:01:57.768 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 20:01:57.768 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 20:01:57.801 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 20:01:58.109 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... 
-2026-03-01 20:01:58.109 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 20:01:58.109 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.007002353668212891, 'diffusion_time_cost': 0.30033254623413086, 'diffusion_per_step_time_cost': 0.03754156827926636, 'total_time_cost': 0.30733489990234375, 'offload_time_cost': 0.0} -2026-03-01 20:01:58.124 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 20:01:58.126 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 20:01:58.126 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB -2026-03-01 20:01:58.126 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 20:01:58.126 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB -2026-03-01 20:01:58.126 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB -2026-03-01 20:01:58.126 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 20:01:58.401 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 20:01:58.403 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 20:01:58.406 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:53:10.890 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:53:10.930 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:53:10.930 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:53:10.966 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:53:11.283 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:53:11.284 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:53:11.284 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006951332092285156, 'diffusion_time_cost': 0.3100306987762451, 'diffusion_per_step_time_cost': 0.03875383734703064, 'total_time_cost': 0.3169820308685303, 'offload_time_cost': 0.0} +2026-03-04 21:53:11.298 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:53:11.300 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:53:11.300 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.77 GB +2026-03-04 21:53:11.300 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:53:11.300 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.77 GB +2026-03-04 21:53:11.300 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.77 GB +2026-03-04 21:53:11.300 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:53:11.575 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:53:11.577 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:53:11.579 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q5_K_M.gguf [GGML] Running acestep-v15-turbo-Q5_K_M.gguf... 
@@ -244,8 +239,8 @@ Using precomputed LM hints dit_step6_xt 0.983513 dit_step7_vt 0.954349 dit_x0 0.970379 - vae_audio 0.874800 - vae_audio (STFT cosine) 0.967703 + vae_audio 0.874850 + vae_audio (STFT cosine) 0.967714 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999926 0.135378 0.008030 -0.002303 0.973012 -0.002342 0.972003 diff --git a/tests/CPU-Q6_K.log b/tests/CPU-Q6_K.log index 7d4c411..75b4fd2 100644 --- a/tests/CPU-Q6_K.log +++ b/tests/CPU-Q6_K.log @@ -7,36 +7,34 @@ [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 1237.2 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 169.4 ms +[Load] DiT weight load: 150.0 ms [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: CPU (CPU threads: 16) +[Load] VAE backend: CPU (shared) [VAE] Backend: CPU, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 699.2 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 689.7 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 32.5 ms +[Load] BPE tokenizer: 33.1 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: CPU (CPU threads: 16) +[Load] TextEncoder backend: CPU (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] 
Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 148.3 ms -[Encode] TextEncoder (70 tokens): 57.5 ms +[Load] TextEncoder: 124.2 ms +[Encode] TextEncoder (70 tokens): 58.0 ms [Debug] text_hidden: [70, 1024] first4: 3.704526 2.436253 0.222853 -13.131872 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 12.6 ms +[Encode] Lyric vocab lookup (167 tokens): 0.1 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: CPU (CPU threads: 16) +[Load] CondEncoder backend: CPU (shared) [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K+V fused @@ -46,18 +44,18 @@ [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 476.3 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 52.6 ms +[Load] ConditionEncoder: 47.5 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 348.9 ms, enc_S=238 +[Encode] ConditionEncoder: 349.5 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.761694 -0.052035 -0.131773 0.058231 [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 82.2 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 12.3 ms +[Load] Detokenizer: 11.0 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 414.3 ms +[Context] Detokenizer: 417.1 ms [Debug] detok_output: [2170, 64] first4: -0.151355 1.462444 0.326907 -0.627213 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -112,35 +110,32 @@ [Debug] dit_step7_vt: [2170, 64] first4: 0.118016 0.207620 -1.266971 
2.955565 [Debug] dit_x0: [2170, 64] first4: 0.004752 1.435176 0.398691 -1.887822 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 25398.3 ms (25398.3 ms/sample) +[DiT] Total generation: 25477.6 ms (25477.6 ms/sample) [Debug] dit_output: [2170, 64] first4: 0.004752 1.435176 0.398691 -1.887822 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 52074.7 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000467 0.001015 0.000873 0.001303 +[VAE Batch0] Decode: 47852.2 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000553 0.001102 0.000938 0.001323 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 20:00:28.298 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 20:00:28.298 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 20:00:28.298 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 20:00:28.298 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 20:00:28.298 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 20:00:29.103 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! 
Use `dtype` instead! -2026-03-01 20:00:30.690 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 20:00:30.690 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 20:00:30.695 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 20:00:30.860 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 20:00:30.862 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:51:45.520 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:51:45.634 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:51:46.994 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:51:46.994 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:51:46.995 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:51:47.001 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... 
+2026-03-04 21:51:47.198 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-04 21:51:47.201 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 20:00:30.862 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 20:00:30.862 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 20:00:30.862 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:51:47.201 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:51:47.201 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:51:47.201 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -154,8 +149,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 20:00:30.862 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 20:00:30.862 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:51:47.201 | INFO | 
acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:51:47.201 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -182,25 +177,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 20:00:30.862 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:51:47.201 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 20:00:30.869 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 20:00:30.881 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 20:00:30.882 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 20:00:30.914 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 20:00:31.231 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... 
-2026-03-01 20:00:31.232 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 20:00:31.232 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006938934326171875, 'diffusion_time_cost': 0.31071925163269043, 'diffusion_per_step_time_cost': 0.038839906454086304, 'total_time_cost': 0.3176581859588623, 'offload_time_cost': 0.0} -2026-03-01 20:00:31.246 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 20:00:31.249 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 20:00:31.249 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.40 GB -2026-03-01 20:00:31.249 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 20:00:31.249 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.40 GB -2026-03-01 20:00:31.249 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.40 GB -2026-03-01 20:00:31.249 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 20:00:31.524 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 20:00:31.527 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 20:00:31.531 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:51:47.208 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:51:47.224 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:51:47.224 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:51:47.259 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:51:47.579 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:51:47.579 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:51:47.579 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.007021188735961914, 'diffusion_time_cost': 0.31169986724853516, 'diffusion_per_step_time_cost': 0.038962483406066895, 'total_time_cost': 0.31872105598449707, 'offload_time_cost': 0.0} +2026-03-04 21:51:47.593 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:51:47.595 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:51:47.595 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.77 GB +2026-03-04 21:51:47.596 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:51:47.596 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.77 GB +2026-03-04 21:51:47.596 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.77 GB +2026-03-04 21:51:47.596 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:51:47.870 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:51:47.872 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:51:47.874 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q6_K.gguf [GGML] Running acestep-v15-turbo-Q6_K.gguf... 
@@ -244,8 +239,8 @@ Using precomputed LM hints dit_step6_xt 0.984569 dit_step7_vt 0.958147 dit_x0 0.972312 - vae_audio 0.891761 - vae_audio (STFT cosine) 0.969080 + vae_audio 0.891790 + vae_audio (STFT cosine) 0.969088 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999936 0.151952 0.007283 -0.002271 0.972870 -0.002342 0.972003 diff --git a/tests/CPU-Q8_0.log b/tests/CPU-Q8_0.log index 76183ea..3eb253c 100644 --- a/tests/CPU-Q8_0.log +++ b/tests/CPU-Q8_0.log @@ -7,36 +7,34 @@ [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 1600.7 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 188.0 ms +[Load] DiT weight load: 178.6 ms [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: CPU (CPU threads: 16) +[Load] VAE backend: CPU (shared) [VAE] Backend: CPU, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 690.8 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 692.2 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 32.8 ms +[Load] BPE tokenizer: 32.4 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: CPU (CPU threads: 16) +[Load] TextEncoder backend: CPU (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] 
Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 160.0 ms -[Encode] TextEncoder (70 tokens): 57.9 ms +[Load] TextEncoder: 123.5 ms +[Encode] TextEncoder (70 tokens): 58.2 ms [Debug] text_hidden: [70, 1024] first4: 3.704526 2.436253 0.222853 -13.131872 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 13.0 ms +[Encode] Lyric vocab lookup (167 tokens): 0.1 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: CPU (CPU threads: 16) +[Load] CondEncoder backend: CPU (shared) [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K+V fused @@ -46,18 +44,18 @@ [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 616.6 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 126.4 ms +[Load] ConditionEncoder: 65.1 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 390.3 ms, enc_S=238 +[Encode] ConditionEncoder: 373.4 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.758873 -0.049568 -0.132802 0.057792 [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 106.5 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 13.6 ms +[Load] Detokenizer: 14.3 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 447.8 ms +[Context] Detokenizer: 448.5 ms [Debug] detok_output: [2170, 64] first4: -0.126218 1.441045 0.305219 -0.629688 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -112,35 +110,32 @@ [Debug] dit_step7_vt: [2170, 64] first4: -0.037024 0.233524 -1.487499 
3.098410 [Debug] dit_x0: [2170, 64] first4: 0.094459 1.422387 0.433039 -1.914712 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 26043.3 ms (26043.3 ms/sample) +[DiT] Total generation: 26009.5 ms (26009.5 ms/sample) [Debug] dit_output: [2170, 64] first4: 0.094459 1.422387 0.433039 -1.914712 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 52114.7 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000455 0.000930 0.000816 0.001121 +[VAE Batch0] Decode: 47762.1 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000441 0.000946 0.000788 0.001168 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:59:03.882 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:59:03.882 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:59:03.882 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 19:59:03.883 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:59:03.883 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:59:04.691 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! 
Use `dtype` instead! -2026-03-01 19:59:06.262 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:59:06.262 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:59:06.268 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:59:06.433 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:59:06.436 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:50:24.424 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:50:24.514 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:50:25.858 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:50:25.858 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:50:25.860 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:50:25.865 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... 
+2026-03-04 21:50:26.063 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-04 21:50:26.065 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:59:06.436 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:59:06.436 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:59:06.436 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:50:26.065 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:50:26.065 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:50:26.065 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -154,8 +149,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:59:06.436 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:59:06.436 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:50:26.065 | INFO | 
acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:50:26.065 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -182,25 +177,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:59:06.436 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:50:26.065 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:59:06.443 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:59:06.457 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:59:06.457 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 19:59:06.478 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:59:06.802 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... 
-2026-03-01 19:59:06.803 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:59:06.803 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006929874420166016, 'diffusion_time_cost': 0.3164329528808594, 'diffusion_per_step_time_cost': 0.03955411911010742, 'total_time_cost': 0.3233628273010254, 'offload_time_cost': 0.0} -2026-03-01 19:59:06.817 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:59:06.819 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:59:06.819 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB -2026-03-01 19:59:06.819 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:59:06.819 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB -2026-03-01 19:59:06.819 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB -2026-03-01 19:59:06.819 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:59:07.095 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:59:07.098 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:59:07.101 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:50:26.073 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:50:26.088 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:50:26.088 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:50:26.120 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:50:26.438 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:50:26.438 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:50:26.438 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.007014036178588867, 'diffusion_time_cost': 0.30962181091308594, 'diffusion_per_step_time_cost': 0.03870272636413574, 'total_time_cost': 0.3166358470916748, 'offload_time_cost': 0.0} +2026-03-04 21:50:26.452 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:50:26.455 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:50:26.455 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.86 GB +2026-03-04 21:50:26.455 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:50:26.455 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.86 GB +2026-03-04 21:50:26.455 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.86 GB +2026-03-04 21:50:26.455 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:50:26.730 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:50:26.732 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:50:26.734 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q8_0.gguf [GGML] Running acestep-v15-turbo-Q8_0.gguf... 
@@ -244,8 +239,8 @@ Using precomputed LM hints dit_step6_xt 0.988647 dit_step7_vt 0.970238 dit_x0 0.980014 - vae_audio 0.903408 - vae_audio (STFT cosine) 0.976427 + vae_audio 0.903437 + vae_audio (STFT cosine) 0.976438 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999946 0.139652 0.006645 -0.002330 0.972930 -0.002342 0.972003 diff --git a/tests/CUDA-BF16.log b/tests/CUDA-BF16.log index d73a934..5ed30ff 100644 --- a/tests/CUDA-BF16.log +++ b/tests/CUDA-BF16.log @@ -1,7 +1,7 @@ ggml_cuda_init: found 1 CUDA devices: Device 0: NVIDIA RTX PRO 6000 Blackwell Workstation Edition, compute capability 12.0, VMM: yes [Load] DiT backend: CUDA0 (CPU threads: 16) -[Load] Backend init: 70.8 ms +[Load] Backend init: 32.6 ms [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [DiT] Self-attn: Q+K+V fused [DiT] Cross-attn: Q+K+V fused @@ -9,36 +9,34 @@ ggml_cuda_init: found 1 CUDA devices: [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 3007.9 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 375.6 ms +[Load] DiT weight load: 310.9 ms [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: CUDA0 (CPU threads: 16) +[Load] VAE backend: CUDA0 (shared) [VAE] Backend: CUDA0, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 661.0 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 653.9 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] 
Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 32.8 ms +[Load] BPE tokenizer: 30.9 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: CUDA0 (CPU threads: 16) +[Load] TextEncoder backend: CUDA0 (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 128.5 ms -[Encode] TextEncoder (70 tokens): 50.6 ms +[Load] TextEncoder: 102.3 ms +[Encode] TextEncoder (70 tokens): 50.4 ms [Debug] text_hidden: [70, 1024] first4: 3.652014 1.047935 0.228532 -12.907304 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 12.5 ms +[Encode] Lyric vocab lookup (167 tokens): 0.1 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: CUDA0 (CPU threads: 16) +[Load] CondEncoder backend: CUDA0 (shared) [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K+V fused @@ -48,18 +46,18 @@ ggml_cuda_init: found 1 CUDA devices: [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 1160.5 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 127.1 ms +[Load] ConditionEncoder: 90.6 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 7.9 ms, enc_S=238 +[Encode] ConditionEncoder: 8.2 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.758648 -0.049409 -0.132412 0.058372 [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [WeightCtx] Loaded 30 tensors, 200.3 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 
2048->64) -[Load] Detokenizer: 24.2 ms +[Load] Detokenizer: 17.6 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 141.9 ms +[Context] Detokenizer: 140.1 ms [Debug] detok_output: [2170, 64] first4: -0.124204 1.435425 0.309963 -0.624679 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -114,35 +112,32 @@ ggml_cuda_init: found 1 CUDA devices: [Debug] dit_step7_vt: [2170, 64] first4: -0.004009 0.190141 -1.466879 3.103273 [Debug] dit_x0: [2170, 64] first4: 0.085060 1.438241 0.424145 -1.920485 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 248.3 ms (248.3 ms/sample) +[DiT] Total generation: 243.9 ms (243.9 ms/sample) [Debug] dit_output: [2170, 64] first4: 0.085060 1.438241 0.424145 -1.920485 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 812.8 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000547 0.000898 0.000798 0.001064 +[VAE Batch0] Decode: 615.3 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000498 0.000900 0.000800 0.001124 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:54:08.539 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:54:08.540 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:54:08.540 | WARNING | acestep.training.data_module::25 - Lightning not installed. 
Training module will not be available. -2026-03-01 19:54:08.540 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:54:08.540 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:54:09.277 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:54:10.804 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:54:10.804 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:54:10.810 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:54:10.970 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:54:10.972 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:45:24.010 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:45:24.091 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:45:25.418 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... 
+2026-03-04 21:45:25.418 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:45:25.421 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:45:25.426 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:45:25.618 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-04 21:45:25.621 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:54:10.972 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:54:10.972 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:54:10.972 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:45:25.621 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:45:25.621 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:45:25.621 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -156,8 +151,8 @@ An 
upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:54:10.972 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:54:10.972 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:45:25.621 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:45:25.621 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -184,25 +179,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:54:10.972 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:45:25.621 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:54:10.978 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:54:10.991 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:54:10.991 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... 
(DiT backend: PyTorch (cuda)) -2026-03-01 19:54:11.023 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:54:11.329 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... -2026-03-01 19:54:11.330 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:54:11.330 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.0068187713623046875, 'diffusion_time_cost': 0.2986173629760742, 'diffusion_per_step_time_cost': 0.03732717037200928, 'total_time_cost': 0.3054361343383789, 'offload_time_cost': 0.0} -2026-03-01 19:54:11.344 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:54:11.349 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:54:11.349 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB -2026-03-01 19:54:11.349 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:54:11.349 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB -2026-03-01 19:54:11.349 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB -2026-03-01 19:54:11.349 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:54:11.625 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:54:11.628 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:54:11.632 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:45:25.628 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:45:25.643 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:45:25.643 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:45:25.674 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:45:25.993 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:45:25.994 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:45:25.994 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006845712661743164, 'diffusion_time_cost': 0.3112342357635498, 'diffusion_per_step_time_cost': 0.038904279470443726, 'total_time_cost': 0.31807994842529297, 'offload_time_cost': 0.0} +2026-03-04 21:45:26.008 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:45:26.010 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:45:26.010 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.85 GB +2026-03-04 21:45:26.010 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:45:26.010 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.85 GB +2026-03-04 21:45:26.010 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.85 GB +2026-03-04 21:45:26.010 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:45:26.284 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:45:26.286 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:45:26.288 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-BF16.gguf [GGML] Running acestep-v15-turbo-BF16.gguf... 
@@ -246,8 +241,8 @@ Using precomputed LM hints dit_step6_xt 0.988188 dit_step7_vt 0.969375 dit_x0 0.979213 - vae_audio 0.901377 - vae_audio (STFT cosine) 0.975525 + vae_audio 0.901411 + vae_audio (STFT cosine) 0.975533 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999945 0.135628 0.006709 -0.002312 0.972932 -0.002342 0.972003 diff --git a/tests/CUDA-Q4_K_M.log b/tests/CUDA-Q4_K_M.log index 189cb71..403d030 100644 --- a/tests/CUDA-Q4_K_M.log +++ b/tests/CUDA-Q4_K_M.log @@ -1,7 +1,7 @@ ggml_cuda_init: found 1 CUDA devices: Device 0: NVIDIA RTX PRO 6000 Blackwell Workstation Edition, compute capability 12.0, VMM: yes [Load] DiT backend: CUDA0 (CPU threads: 16) -[Load] Backend init: 11.2 ms +[Load] Backend init: 9.6 ms [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [DiT] Self-attn: Q+K fused, V separate [DiT] Cross-attn: all separate @@ -9,36 +9,34 @@ ggml_cuda_init: found 1 CUDA devices: [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 895.6 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 403.0 ms +[Load] DiT weight load: 141.8 ms [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: CUDA0 (CPU threads: 16) +[Load] VAE backend: CUDA0 (shared) [VAE] Backend: CUDA0, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 655.9 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 652.4 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] 
T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 31.4 ms +[Load] BPE tokenizer: 32.6 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: CUDA0 (CPU threads: 16) +[Load] TextEncoder backend: CUDA0 (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 126.3 ms -[Encode] TextEncoder (70 tokens): 52.7 ms +[Load] TextEncoder: 103.0 ms +[Encode] TextEncoder (70 tokens): 50.9 ms [Debug] text_hidden: [70, 1024] first4: 3.652014 1.047935 0.228532 -12.907304 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 12.1 ms +[Encode] Lyric vocab lookup (167 tokens): 0.1 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: CUDA0 (CPU threads: 16) +[Load] CondEncoder backend: CUDA0 (shared) [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K fused, V separate @@ -48,18 +46,18 @@ ggml_cuda_init: found 1 CUDA devices: [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 352.5 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 118.9 ms +[Load] ConditionEncoder: 29.8 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 12.7 ms, enc_S=238 +[Encode] ConditionEncoder: 13.3 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.759848 -0.046220 -0.129361 0.057668 [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 64.7 MB into backend [Load] Detokenizer: 
FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 22.1 ms +[Load] Detokenizer: 6.3 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 124.0 ms +[Context] Detokenizer: 124.2 ms [Debug] detok_output: [2170, 64] first4: -0.098446 1.438721 0.299255 -0.646500 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -114,35 +112,32 @@ ggml_cuda_init: found 1 CUDA devices: [Debug] dit_step7_vt: [2170, 64] first4: -0.488470 0.849564 -1.659694 3.185843 [Debug] dit_x0: [2170, 64] first4: 0.317955 1.165446 0.587176 -1.877443 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 249.1 ms (249.1 ms/sample) +[DiT] Total generation: 249.0 ms (249.0 ms/sample) [Debug] dit_output: [2170, 64] first4: 0.317955 1.165446 0.587176 -1.877443 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 820.0 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000325 0.000812 0.000671 0.000911 +[VAE Batch0] Decode: 616.0 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000379 0.000847 0.000704 0.001000 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:54:39.264 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:54:39.265 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:54:39.265 | WARNING | acestep.training.data_module::25 - Lightning not installed. 
Training module will not be available. -2026-03-01 19:54:39.265 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:54:39.265 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:54:40.025 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:54:41.587 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:54:41.587 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:54:41.592 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:54:41.751 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:54:41.753 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:45:55.364 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:45:55.452 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:45:56.779 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... 
+2026-03-04 21:45:56.779 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:45:56.781 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:45:56.786 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:45:56.978 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-04 21:45:56.980 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:54:41.753 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:54:41.753 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:54:41.753 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:45:56.980 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:45:56.980 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:45:56.980 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -156,8 +151,8 @@ An 
upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:54:41.753 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:54:41.753 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:45:56.981 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:45:56.981 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -184,25 +179,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:54:41.753 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:45:56.981 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:54:41.759 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:54:41.771 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:54:41.772 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... 
(DiT backend: PyTorch (cuda)) -2026-03-01 19:54:41.805 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:54:42.113 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... -2026-03-01 19:54:42.114 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:54:42.114 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006765604019165039, 'diffusion_time_cost': 0.3010725975036621, 'diffusion_per_step_time_cost': 0.037634074687957764, 'total_time_cost': 0.30783820152282715, 'offload_time_cost': 0.0} -2026-03-01 19:54:42.128 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:54:42.131 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:54:42.131 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB -2026-03-01 19:54:42.131 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:54:42.131 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB -2026-03-01 19:54:42.131 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB -2026-03-01 19:54:42.131 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:54:42.405 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:54:42.408 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:54:42.411 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:45:56.987 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:45:57.002 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:45:57.002 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:45:57.032 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:45:57.348 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:45:57.349 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:45:57.349 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006890535354614258, 'diffusion_time_cost': 0.30885934829711914, 'diffusion_per_step_time_cost': 0.03860741853713989, 'total_time_cost': 0.3157498836517334, 'offload_time_cost': 0.0} +2026-03-04 21:45:57.363 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:45:57.366 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:45:57.366 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.78 GB +2026-03-04 21:45:57.366 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:45:57.366 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.78 GB +2026-03-04 21:45:57.366 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.78 GB +2026-03-04 21:45:57.366 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:45:57.640 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:45:57.642 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:45:57.644 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q4_K_M.gguf [GGML] Running acestep-v15-turbo-Q4_K_M.gguf... 
@@ -246,8 +241,8 @@ Using precomputed LM hints dit_step6_xt 0.976494 dit_step7_vt 0.938658 dit_x0 0.958725 - vae_audio 0.837763 - vae_audio (STFT cosine) 0.954448 + vae_audio 0.837780 + vae_audio (STFT cosine) 0.954457 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999885 0.165835 0.010206 -0.002260 0.973133 -0.002342 0.972003 diff --git a/tests/CUDA-Q5_K_M.log b/tests/CUDA-Q5_K_M.log index 00b9652..4e72f4f 100644 --- a/tests/CUDA-Q5_K_M.log +++ b/tests/CUDA-Q5_K_M.log @@ -1,7 +1,7 @@ ggml_cuda_init: found 1 CUDA devices: Device 0: NVIDIA RTX PRO 6000 Blackwell Workstation Edition, compute capability 12.0, VMM: yes [Load] DiT backend: CUDA0 (CPU threads: 16) -[Load] Backend init: 25.7 ms +[Load] Backend init: 9.6 ms [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [DiT] Self-attn: Q+K fused, V separate [DiT] Cross-attn: all separate @@ -9,36 +9,34 @@ ggml_cuda_init: found 1 CUDA devices: [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 1061.2 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 465.4 ms +[Load] DiT weight load: 152.8 ms [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: CUDA0 (CPU threads: 16) +[Load] VAE backend: CUDA0 (shared) [VAE] Backend: CUDA0, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 656.4 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 655.0 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] 
T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges [Load] BPE tokenizer: 31.3 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: CUDA0 (CPU threads: 16) +[Load] TextEncoder backend: CUDA0 (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 127.3 ms -[Encode] TextEncoder (70 tokens): 49.5 ms +[Load] TextEncoder: 102.1 ms +[Encode] TextEncoder (70 tokens): 70.3 ms [Debug] text_hidden: [70, 1024] first4: 3.652014 1.047935 0.228532 -12.907304 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 12.4 ms +[Encode] Lyric vocab lookup (167 tokens): 0.1 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: CUDA0 (CPU threads: 16) +[Load] CondEncoder backend: CUDA0 (shared) [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K fused, V separate @@ -48,18 +46,18 @@ ggml_cuda_init: found 1 CUDA devices: [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 412.5 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 138.7 ms +[Load] ConditionEncoder: 34.4 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 13.1 ms, enc_S=238 +[Encode] ConditionEncoder: 13.5 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.760389 -0.050879 -0.130835 0.059141 [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 73.2 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 
2048->64) -[Load] Detokenizer: 24.2 ms +[Load] Detokenizer: 6.8 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 121.7 ms +[Context] Detokenizer: 124.1 ms [Debug] detok_output: [2170, 64] first4: -0.125017 1.460327 0.292545 -0.654237 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -114,35 +112,32 @@ ggml_cuda_init: found 1 CUDA devices: [Debug] dit_step7_vt: [2170, 64] first4: 0.031181 0.378487 -1.509792 3.095486 [Debug] dit_x0: [2170, 64] first4: 0.032336 1.392616 0.498835 -1.905283 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 251.1 ms (251.1 ms/sample) +[DiT] Total generation: 261.4 ms (261.4 ms/sample) [Debug] dit_output: [2170, 64] first4: 0.032336 1.392616 0.498835 -1.905283 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 804.2 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000692 0.001098 0.000938 0.001230 +[VAE Batch0] Decode: 614.5 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000681 0.001094 0.000878 0.001246 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:54:31.395 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:54:31.395 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:54:31.395 | WARNING | acestep.training.data_module::25 - Lightning not installed. 
Training module will not be available. -2026-03-01 19:54:31.395 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:54:31.395 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:54:32.168 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:54:33.881 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:54:33.882 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:54:33.887 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:54:34.060 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:54:34.062 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:45:47.565 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:45:47.662 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:45:48.979 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... 
+2026-03-04 21:45:48.979 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:45:48.981 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:45:48.987 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:45:49.182 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-04 21:45:49.184 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:54:34.062 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:54:34.062 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:54:34.062 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:45:49.184 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:45:49.184 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:45:49.184 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -156,8 +151,8 @@ An 
upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:54:34.062 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:54:34.062 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:45:49.184 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:45:49.184 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -184,25 +179,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:54:34.062 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:45:49.184 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:54:34.068 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:54:34.081 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:54:34.081 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... 
(DiT backend: PyTorch (cuda)) -2026-03-01 19:54:34.105 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:54:34.415 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... -2026-03-01 19:54:34.416 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:54:34.416 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006921052932739258, 'diffusion_time_cost': 0.3029003143310547, 'diffusion_per_step_time_cost': 0.037862539291381836, 'total_time_cost': 0.30982136726379395, 'offload_time_cost': 0.0} -2026-03-01 19:54:34.431 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:54:34.436 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:54:34.436 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB -2026-03-01 19:54:34.436 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:54:34.436 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB -2026-03-01 19:54:34.436 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB -2026-03-01 19:54:34.436 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:54:34.714 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:54:34.716 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:54:34.720 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:45:49.211 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:45:49.226 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:45:49.226 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:45:49.260 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:45:49.577 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:45:49.577 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:45:49.578 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.00789022445678711, 'diffusion_time_cost': 0.30838513374328613, 'diffusion_per_step_time_cost': 0.03854814171791077, 'total_time_cost': 0.31627535820007324, 'offload_time_cost': 0.0} +2026-03-04 21:45:49.591 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:45:49.594 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:45:49.594 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.85 GB +2026-03-04 21:45:49.594 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:45:49.594 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.85 GB +2026-03-04 21:45:49.594 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.85 GB +2026-03-04 21:45:49.594 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:45:49.873 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:45:49.875 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:45:49.877 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q5_K_M.gguf [GGML] Running acestep-v15-turbo-Q5_K_M.gguf... 
@@ -246,8 +241,8 @@ Using precomputed LM hints dit_step6_xt 0.983446 dit_step7_vt 0.953383 dit_x0 0.970119 - vae_audio 0.883226 - vae_audio (STFT cosine) 0.968463 + vae_audio 0.883245 + vae_audio (STFT cosine) 0.968470 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999930 0.139407 0.007818 -0.002306 0.973025 -0.002342 0.972003 diff --git a/tests/CUDA-Q6_K.log b/tests/CUDA-Q6_K.log index 10b9a7a..4950234 100644 --- a/tests/CUDA-Q6_K.log +++ b/tests/CUDA-Q6_K.log @@ -1,7 +1,7 @@ ggml_cuda_init: found 1 CUDA devices: Device 0: NVIDIA RTX PRO 6000 Blackwell Workstation Edition, compute capability 12.0, VMM: yes [Load] DiT backend: CUDA0 (CPU threads: 16) -[Load] Backend init: 9.5 ms +[Load] Backend init: 10.2 ms [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [DiT] Self-attn: Q+K+V fused [DiT] Cross-attn: Q+K+V fused @@ -9,36 +9,34 @@ ggml_cuda_init: found 1 CUDA devices: [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 1237.2 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 514.8 ms +[Load] DiT weight load: 176.0 ms [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: CUDA0 (CPU threads: 16) +[Load] VAE backend: CUDA0 (shared) [VAE] Backend: CUDA0, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 657.3 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 655.0 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] 
Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 30.7 ms +[Load] BPE tokenizer: 31.5 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: CUDA0 (CPU threads: 16) +[Load] TextEncoder backend: CUDA0 (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 125.7 ms -[Encode] TextEncoder (70 tokens): 49.2 ms +[Load] TextEncoder: 102.6 ms +[Encode] TextEncoder (70 tokens): 51.1 ms [Debug] text_hidden: [70, 1024] first4: 3.652014 1.047935 0.228532 -12.907304 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 12.3 ms +[Encode] Lyric vocab lookup (167 tokens): 0.1 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: CUDA0 (CPU threads: 16) +[Load] CondEncoder backend: CUDA0 (shared) [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K+V fused @@ -48,18 +46,18 @@ ggml_cuda_init: found 1 CUDA devices: [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 476.3 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 145.8 ms +[Load] ConditionEncoder: 40.6 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 11.0 ms, enc_S=238 +[Encode] ConditionEncoder: 10.8 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.760759 -0.050104 -0.133269 0.058044 [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 82.2 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 
2048->64) -[Load] Detokenizer: 26.4 ms +[Load] Detokenizer: 7.8 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 123.5 ms +[Context] Detokenizer: 123.6 ms [Debug] detok_output: [2170, 64] first4: -0.140341 1.456987 0.310602 -0.632665 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -114,35 +112,32 @@ ggml_cuda_init: found 1 CUDA devices: [Debug] dit_step7_vt: [2170, 64] first4: 0.081321 0.135461 -1.397063 2.986206 [Debug] dit_x0: [2170, 64] first4: 0.028793 1.462229 0.417478 -1.887184 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 273.2 ms (273.2 ms/sample) +[DiT] Total generation: 270.6 ms (270.6 ms/sample) [Debug] dit_output: [2170, 64] first4: 0.028793 1.462229 0.417478 -1.887184 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 804.3 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000481 0.000872 0.000838 0.001216 +[VAE Batch0] Decode: 616.4 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000531 0.001035 0.000900 0.001303 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:54:23.682 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:54:23.683 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:54:23.683 | WARNING | acestep.training.data_module::25 - Lightning not installed. 
Training module will not be available. -2026-03-01 19:54:23.683 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:54:23.683 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:54:24.419 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:54:25.992 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:54:25.992 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:54:25.998 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:54:26.157 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:54:26.159 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:45:39.727 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:45:39.815 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:45:41.135 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... 
+2026-03-04 21:45:41.135 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:45:41.137 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:45:41.142 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:45:41.335 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-04 21:45:41.337 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:54:26.159 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:54:26.159 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:54:26.159 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:45:41.337 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:45:41.337 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:45:41.337 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -156,8 +151,8 @@ An 
upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:54:26.159 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:54:26.159 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:45:41.337 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:45:41.337 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -184,25 +179,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:54:26.159 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:45:41.337 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:54:26.166 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:54:26.178 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:54:26.178 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... 
(DiT backend: PyTorch (cuda)) -2026-03-01 19:54:26.214 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:54:26.528 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... -2026-03-01 19:54:26.528 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:54:26.528 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.00680994987487793, 'diffusion_time_cost': 0.30716919898986816, 'diffusion_per_step_time_cost': 0.03839614987373352, 'total_time_cost': 0.3139791488647461, 'offload_time_cost': 0.0} -2026-03-01 19:54:26.543 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:54:26.545 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:54:26.545 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.44 GB -2026-03-01 19:54:26.545 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:54:26.545 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.44 GB -2026-03-01 19:54:26.545 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.44 GB -2026-03-01 19:54:26.545 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:54:26.821 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:54:26.824 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:54:26.828 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:45:41.345 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:45:41.359 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:45:41.359 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:45:41.390 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:45:41.705 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:45:41.706 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:45:41.706 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006890773773193359, 'diffusion_time_cost': 0.30776047706604004, 'diffusion_per_step_time_cost': 0.038470059633255005, 'total_time_cost': 0.3146512508392334, 'offload_time_cost': 0.0} +2026-03-04 21:45:41.720 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:45:41.722 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:45:41.723 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.85 GB +2026-03-04 21:45:41.723 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:45:41.723 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.85 GB +2026-03-04 21:45:41.723 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.85 GB +2026-03-04 21:45:41.723 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:45:41.997 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:45:41.999 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:45:42.001 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q6_K.gguf [GGML] Running acestep-v15-turbo-Q6_K.gguf... 
@@ -246,8 +241,8 @@ Using precomputed LM hints dit_step6_xt 0.985862 dit_step7_vt 0.962454 dit_x0 0.974866 - vae_audio 0.893678 - vae_audio (STFT cosine) 0.969663 + vae_audio 0.893720 + vae_audio (STFT cosine) 0.969672 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999937 0.147590 0.007252 -0.002265 0.972930 -0.002342 0.972003 diff --git a/tests/CUDA-Q8_0.log b/tests/CUDA-Q8_0.log index 3a84ce1..2744819 100644 --- a/tests/CUDA-Q8_0.log +++ b/tests/CUDA-Q8_0.log @@ -1,7 +1,7 @@ ggml_cuda_init: found 1 CUDA devices: Device 0: NVIDIA RTX PRO 6000 Blackwell Workstation Edition, compute capability 12.0, VMM: yes [Load] DiT backend: CUDA0 (CPU threads: 16) -[Load] Backend init: 9.5 ms +[Load] Backend init: 9.7 ms [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [DiT] Self-attn: Q+K+V fused [DiT] Cross-attn: Q+K+V fused @@ -9,36 +9,34 @@ ggml_cuda_init: found 1 CUDA devices: [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 1600.7 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 221.9 ms +[Load] DiT weight load: 201.4 ms [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: CUDA0 (CPU threads: 16) +[Load] VAE backend: CUDA0 (shared) [VAE] Backend: CUDA0, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 658.9 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 655.8 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] 
Loaded from GGUF: 151643 vocab, 151387 merges [Load] BPE tokenizer: 31.2 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: CUDA0 (CPU threads: 16) +[Load] TextEncoder backend: CUDA0 (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 127.0 ms -[Encode] TextEncoder (70 tokens): 68.2 ms +[Load] TextEncoder: 102.2 ms +[Encode] TextEncoder (70 tokens): 57.4 ms [Debug] text_hidden: [70, 1024] first4: 3.652014 1.047935 0.228532 -12.907304 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 12.3 ms +[Encode] Lyric vocab lookup (167 tokens): 0.1 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: CUDA0 (CPU threads: 16) +[Load] CondEncoder backend: CUDA0 (shared) [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K+V fused @@ -48,18 +46,18 @@ ggml_cuda_init: found 1 CUDA devices: [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 616.6 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 65.2 ms +[Load] ConditionEncoder: 52.3 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 8.9 ms, enc_S=238 +[Encode] ConditionEncoder: 9.0 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.759220 -0.049559 -0.133467 0.058389 [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 106.5 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 12.1 ms 
+[Load] Detokenizer: 9.2 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 104.8 ms +[Context] Detokenizer: 103.8 ms [Debug] detok_output: [2170, 64] first4: -0.120490 1.436288 0.301594 -0.632564 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -114,35 +112,32 @@ ggml_cuda_init: found 1 CUDA devices: [Debug] dit_step7_vt: [2170, 64] first4: -0.007394 0.229067 -1.488817 3.083439 [Debug] dit_x0: [2170, 64] first4: 0.087028 1.415554 0.432225 -1.919150 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 242.9 ms (242.9 ms/sample) +[DiT] Total generation: 236.6 ms (236.6 ms/sample) [Debug] dit_output: [2170, 64] first4: 0.087028 1.415554 0.432225 -1.919150 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 822.6 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000524 0.000859 0.000752 0.001056 +[VAE Batch0] Decode: 618.6 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000531 0.000916 0.000781 0.001161 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:54:15.905 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:54:15.906 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:54:15.906 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. 
-2026-03-01 19:54:15.906 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:54:15.906 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:54:16.672 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:54:18.198 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:54:18.198 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:54:18.207 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:54:18.371 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:54:18.373 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:45:31.851 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:45:31.953 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:45:33.265 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:45:33.265 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... 
+2026-03-04 21:45:33.269 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:45:33.275 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:45:33.468 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-04 21:45:33.470 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:54:18.373 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:54:18.373 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:54:18.373 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:45:33.470 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:45:33.470 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:45:33.470 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -156,8 +151,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:54:18.373 | 
INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:54:18.373 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:45:33.470 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:45:33.470 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -184,25 +179,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:54:18.373 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:45:33.470 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:54:18.380 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:54:18.392 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:54:18.392 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 19:54:18.418 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... 
-2026-03-01 19:54:18.724 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... -2026-03-01 19:54:18.724 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:54:18.724 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006882190704345703, 'diffusion_time_cost': 0.298403263092041, 'diffusion_per_step_time_cost': 0.03730040788650513, 'total_time_cost': 0.3052854537963867, 'offload_time_cost': 0.0} -2026-03-01 19:54:18.739 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:54:18.741 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:54:18.741 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.44 GB -2026-03-01 19:54:18.741 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:54:18.741 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.44 GB -2026-03-01 19:54:18.741 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.44 GB -2026-03-01 19:54:18.741 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:54:19.031 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:54:19.034 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:54:19.037 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:45:33.490 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:45:33.505 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:45:33.505 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:45:33.539 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:45:33.854 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:45:33.855 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:45:33.855 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.0069425106048583984, 'diffusion_time_cost': 0.30779337882995605, 'diffusion_per_step_time_cost': 0.03847417235374451, 'total_time_cost': 0.31473588943481445, 'offload_time_cost': 0.0} +2026-03-04 21:45:33.869 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:45:33.871 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:45:33.871 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.78 GB +2026-03-04 21:45:33.871 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:45:33.871 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.78 GB +2026-03-04 21:45:33.871 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.78 GB +2026-03-04 21:45:33.871 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:45:34.145 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:45:34.147 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:45:34.149 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q8_0.gguf [GGML] Running acestep-v15-turbo-Q8_0.gguf... 
@@ -246,8 +241,8 @@ Using precomputed LM hints dit_step6_xt 0.988641 dit_step7_vt 0.970144 dit_x0 0.979969 - vae_audio 0.905525 - vae_audio (STFT cosine) 0.976530 + vae_audio 0.905563 + vae_audio (STFT cosine) 0.976538 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999948 0.134961 0.006551 -0.002307 0.972901 -0.002342 0.972003 diff --git a/tests/Vulkan-BF16.log b/tests/Vulkan-BF16.log index 2d955d7..c063695 100644 --- a/tests/Vulkan-BF16.log +++ b/tests/Vulkan-BF16.log @@ -1,7 +1,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | uma: 0 | fp16: 1 | bf16: 0 | warp size: 32 | shared memory: 49152 | int dot: 1 | matrix cores: NV_coopmat2 [Load] DiT backend: Vulkan0 (CPU threads: 16) -[Load] Backend init: 260.3 ms +[Load] Backend init: 142.5 ms [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [DiT] Self-attn: Q+K+V fused [DiT] Cross-attn: Q+K+V fused @@ -9,36 +9,34 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 3007.9 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 397.7 ms +[Load] DiT weight load: 338.6 ms [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: Vulkan0 (CPU threads: 16) +[Load] VAE backend: Vulkan0 (shared) [VAE] Backend: Vulkan0, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 672.5 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 661.6 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 
7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 32.1 ms +[Load] BPE tokenizer: 31.2 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: Vulkan0 (CPU threads: 16) +[Load] TextEncoder backend: Vulkan0 (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 166.9 ms -[Encode] TextEncoder (70 tokens): 30.9 ms +[Load] TextEncoder: 141.4 ms +[Encode] TextEncoder (70 tokens): 1939.4 ms [Debug] text_hidden: [70, 1024] first4: 3.705836 2.395382 0.221845 -13.145830 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 11.2 ms +[Encode] Lyric vocab lookup (167 tokens): 0.3 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: Vulkan0 (CPU threads: 16) +[Load] CondEncoder backend: Vulkan0 (shared) [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K+V fused @@ -48,18 +46,18 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 1160.5 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 163.7 ms +[Load] ConditionEncoder: 130.2 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 22.5 ms, enc_S=238 +[Encode] ConditionEncoder: 2492.6 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.758148 -0.049593 
-0.132730 0.058488 [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [WeightCtx] Loaded 30 tensors, 200.3 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 28.1 ms +[Load] Detokenizer: 23.1 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 229.8 ms +[Context] Detokenizer: 2525.9 ms [Debug] detok_output: [2170, 64] first4: -0.125193 1.435010 0.308190 -0.624228 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -79,70 +77,67 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Debug] proj_in_input: [192, 2170] first4: -0.125193 1.435010 0.308190 -0.624228 [Debug] enc_after_cond_emb: [2048, 238] first4: -0.168464 0.814954 0.327714 -0.561971 [Debug] layer0_sa_input: [2048, 1085] first4: -0.719110 -0.764019 -0.047328 0.261808 -[Debug] layer0_q_after_rope: [128, 16] first4: -2.424376 -0.094810 -0.411903 1.007324 -[Debug] layer0_k_after_rope: [128, 8] first4: -12.712339 1.106410 1.775920 1.780798 -[Debug] layer0_sa_output: [2048, 1085] first4: -1.501171 0.169176 -0.355798 0.513027 -[Debug] layer0_attn_out: [2048, 1085] first4: -1.540742 -1.044333 0.188720 0.456093 -[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.540742 -1.044333 0.188720 0.456093 -[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.598325 -0.820241 -0.296337 0.493580 -[Debug] hidden_after_layer0: [2048, 1085] first4: -9.091503 0.566892 52.584164 -0.903901 -[Debug] hidden_after_layer6: [2048, 1085] first4: -21.192070 0.040278 33.599442 -4.442998 -[Debug] hidden_after_layer12: [2048, 1085] first4: -15.068191 -18.118078 71.999359 28.597229 -[Debug] hidden_after_layer18: [2048, 1085] first4: -27.132679 15.867422 60.847614 20.940519 -[Debug] hidden_after_layer23: [2048, 1085] first4: -12.584854 45.152912 198.753845 145.517029 -[Debug] dit_step0_vt: [2170, 64] first4: 0.014936 
1.119046 0.345802 2.379982 -[Debug] dit_step0_xt: [2170, 64] first4: 0.193657 2.105384 -0.187593 0.739475 +[Debug] layer0_q_after_rope: [128, 16] first4: -1.985390 -0.040374 -0.446411 0.887640 +[Debug] layer0_k_after_rope: [128, 8] first4: -12.133966 1.032982 1.765450 1.789189 +[Debug] layer0_sa_output: [2048, 1085] first4: -1.285921 -0.088167 -0.083954 0.187361 +[Debug] layer0_attn_out: [2048, 1085] first4: -1.468877 -0.930195 0.454157 0.450160 +[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.468877 -0.930195 0.454157 0.450160 +[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.465657 -0.778736 0.078704 0.498346 +[Debug] hidden_after_layer0: [2048, 1085] first4: -8.621284 0.720027 54.661194 -0.769228 +[Debug] hidden_after_layer6: [2048, 1085] first4: -12.726752 3.144506 -9.323353 -12.165966 +[Debug] hidden_after_layer12: [2048, 1085] first4: -16.201662 -10.547243 4.967308 15.566863 +[Debug] hidden_after_layer18: [2048, 1085] first4: -26.509827 14.787127 -25.476906 8.639433 +[Debug] hidden_after_layer23: [2048, 1085] first4: -16.044237 89.590195 45.410172 78.645676 +[Debug] dit_step0_vt: [2170, 64] first4: 0.347229 0.879013 0.198151 1.945618 +[Debug] dit_step0_xt: [2170, 64] first4: 0.178553 2.116295 -0.180882 0.759219 [DiT] step 1/8 t=1.000 -[Debug] dit_step1_vt: [2170, 64] first4: 0.086700 0.854980 -0.273651 1.728149 -[Debug] dit_step1_xt: [2170, 64] first4: 0.188928 2.058749 -0.172667 0.645212 +[Debug] dit_step1_vt: [2170, 64] first4: 0.068695 0.847748 -0.298004 1.750702 +[Debug] dit_step1_xt: [2170, 64] first4: 0.174806 2.070054 -0.164627 0.663726 [DiT] step 2/8 t=0.955 -[Debug] dit_step2_vt: [2170, 64] first4: 0.180420 0.837399 -0.150421 2.056976 -[Debug] dit_step2_xt: [2170, 64] first4: 0.176900 2.002922 -0.162639 0.508081 +[Debug] dit_step2_vt: [2170, 64] first4: 0.151260 0.875549 -0.207390 2.089754 +[Debug] dit_step2_xt: [2170, 64] first4: 0.164722 2.011684 -0.150801 0.524409 [DiT] step 3/8 t=0.900 -[Debug] dit_step3_vt: [2170, 64] first4: 
0.130821 0.833313 0.053528 2.193359 -[Debug] dit_step3_xt: [2170, 64] first4: 0.165998 1.933480 -0.167099 0.325301 +[Debug] dit_step3_vt: [2170, 64] first4: 0.077034 0.843689 -0.087112 2.299004 +[Debug] dit_step3_xt: [2170, 64] first4: 0.158302 1.941377 -0.143542 0.332826 [DiT] step 4/8 t=0.833 -[Debug] dit_step4_vt: [2170, 64] first4: 0.273712 0.866425 0.216686 2.274872 -[Debug] dit_step4_xt: [2170, 64] first4: 0.136672 1.840648 -0.190316 0.081565 +[Debug] dit_step4_vt: [2170, 64] first4: 0.173340 0.815531 0.275307 2.367218 +[Debug] dit_step4_xt: [2170, 64] first4: 0.139730 1.853999 -0.173039 0.079195 [DiT] step 5/8 t=0.750 -[Debug] dit_step5_vt: [2170, 64] first4: 0.347900 0.772171 0.542953 2.248352 -[Debug] dit_step5_xt: [2170, 64] first4: 0.086972 1.730338 -0.267881 -0.239629 +[Debug] dit_step5_vt: [2170, 64] first4: 0.210556 0.765915 0.470947 2.214279 +[Debug] dit_step5_xt: [2170, 64] first4: 0.109651 1.744582 -0.240317 -0.237130 [DiT] step 6/8 t=0.643 -[Debug] dit_step6_vt: [2170, 64] first4: 0.132820 0.664673 0.218246 2.387787 -[Debug] dit_step6_xt: [2170, 64] first4: 0.060408 1.597404 -0.311530 -0.717186 +[Debug] dit_step6_vt: [2170, 64] first4: -0.038303 0.546310 0.224964 2.284607 +[Debug] dit_step6_xt: [2170, 64] first4: 0.117311 1.635320 -0.285310 -0.694052 [DiT] step 7/8 t=0.500 -[Debug] dit_step7_vt: [2170, 64] first4: -0.335976 0.323303 0.198029 2.726624 -[Debug] dit_x0: [2170, 64] first4: 0.161200 1.500413 -0.370938 -1.535173 +[Debug] dit_step7_vt: [2170, 64] first4: -0.300537 0.235870 0.263802 2.617432 +[Debug] dit_x0: [2170, 64] first4: 0.207473 1.564559 -0.364450 -1.479281 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 740.5 ms (740.5 ms/sample) -[Debug] dit_output: [2170, 64] first4: 0.161200 1.500413 -0.370938 -1.535173 +[DiT] Total generation: 2630.4 ms (2630.4 ms/sample) +[Debug] dit_output: [2170, 64] first4: 0.207473 1.564559 -0.364450 -1.479281 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, 
T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 9812.1 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000591 0.001078 0.000929 0.001296 +[VAE Batch0] Decode: 2992.9 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000614 0.001141 0.000934 0.001396 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:55:13.398 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:55:13.398 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:55:13.399 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 19:55:13.399 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:55:13.399 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:55:14.155 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:55:15.664 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:55:15.664 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... 
-2026-03-01 19:55:15.669 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:55:15.830 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:55:15.831 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:46:56.541 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:46:56.622 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:46:57.937 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:46:57.937 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:46:57.939 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:46:57.945 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:46:58.137 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... 
+2026-03-04 21:46:58.139 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:55:15.831 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:55:15.831 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:55:15.831 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:46:58.139 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:46:58.139 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:46:58.139 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -156,8 +151,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:55:15.831 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:55:15.831 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:46:58.139 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:46:58.139 | INFO | 
acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -184,25 +179,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:55:15.831 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:46:58.139 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:55:15.838 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:55:15.850 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:55:15.851 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 19:55:15.885 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:55:16.193 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... 
-2026-03-01 19:55:16.193 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:55:16.193 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006814241409301758, 'diffusion_time_cost': 0.30007076263427734, 'diffusion_per_step_time_cost': 0.03750884532928467, 'total_time_cost': 0.3068850040435791, 'offload_time_cost': 0.0} -2026-03-01 19:55:16.208 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:55:16.210 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:55:16.210 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB -2026-03-01 19:55:16.210 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:55:16.210 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB -2026-03-01 19:55:16.210 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB -2026-03-01 19:55:16.210 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:55:16.485 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:55:16.488 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:55:16.491 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:46:58.146 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:46:58.161 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:46:58.161 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:46:58.195 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:46:58.511 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:46:58.512 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:46:58.512 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006871938705444336, 'diffusion_time_cost': 0.30806517601013184, 'diffusion_per_step_time_cost': 0.03850814700126648, 'total_time_cost': 0.31493711471557617, 'offload_time_cost': 0.0} +2026-03-04 21:46:58.526 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:46:58.528 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:46:58.528 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.85 GB +2026-03-04 21:46:58.528 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:46:58.528 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.85 GB +2026-03-04 21:46:58.528 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.85 GB +2026-03-04 21:46:58.528 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:46:58.802 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:46:58.804 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:46:58.806 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-BF16.gguf [GGML] Running acestep-v15-turbo-BF16.gguf... 
@@ -224,36 +219,36 @@ Using precomputed LM hints temb_t 0.999999 hidden_after_proj_in 0.999987 enc_after_cond_emb 0.999825 - layer0_sa_output 0.999959 - hidden_after_layer0 0.999982 - hidden_after_layer6 0.999916 - hidden_after_layer12 0.999276 - hidden_after_layer18 0.996645 - hidden_after_layer23 0.993735 - dit_step0_vt 0.975502 - dit_step0_xt 0.999946 - dit_step1_vt 0.898326 - dit_step1_xt 0.999578 - dit_step2_vt 0.893586 - dit_step2_xt 0.998276 - dit_step3_vt 0.881101 - dit_step3_xt 0.994720 - dit_step4_vt 0.869138 - dit_step4_xt 0.986137 - dit_step5_vt 0.854878 - dit_step5_xt 0.965846 - dit_step6_vt 0.840298 - dit_step6_xt 0.925771 - dit_step7_vt 0.818271 - dit_x0 0.867399 - vae_audio 0.680412 - vae_audio (STFT cosine) 0.855380 + layer0_sa_output 0.920858 + hidden_after_layer0 0.996092 + hidden_after_layer6 0.980248 + hidden_after_layer12 0.977161 + hidden_after_layer18 0.973382 + hidden_after_layer23 0.961755 + dit_step0_vt 0.843333 + dit_step0_xt 0.999656 + dit_step1_vt 0.875601 + dit_step1_xt 0.998907 + dit_step2_vt 0.860701 + dit_step2_xt 0.996792 + dit_step3_vt 0.838816 + dit_step3_xt 0.991464 + dit_step4_vt 0.827875 + dit_step4_xt 0.978766 + dit_step5_vt 0.812689 + dit_step5_xt 0.949636 + dit_step6_vt 0.795272 + dit_step6_xt 0.894491 + dit_step7_vt 0.769772 + dit_x0 0.818406 + vae_audio 0.571274 + vae_audio (STFT cosine) 0.788509 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999946 0.135811 0.006633 -0.002316 0.972919 -0.002342 0.972003 - dit_step1_xt 0.999578 0.413265 0.019706 -0.005121 0.942541 -0.005313 0.941730 - dit_step2_xt 0.998276 0.811472 0.038208 -0.008968 0.908957 -0.009311 0.908527 - dit_step3_xt 0.994720 1.481150 0.064047 -0.014385 0.872574 -0.014577 0.873624 - dit_step4_xt 0.986137 1.857148 0.100272 -0.021489 0.837038 -0.021660 0.841995 - dit_step5_xt 0.965846 1.439633 0.154129 -0.031859 0.812819 -0.032109 0.824593 - dit_step6_xt 0.925771 2.125688 0.235367 -0.046759 0.832442 
-0.046482 0.855546 + dit_step0_xt 0.999656 0.367652 0.018858 -0.002243 0.972108 -0.002342 0.972003 + dit_step1_xt 0.998907 0.763455 0.032624 -0.004985 0.941679 -0.005313 0.941730 + dit_step2_xt 0.996792 1.022189 0.053741 -0.008816 0.908019 -0.009311 0.908527 + dit_step3_xt 0.991464 1.657425 0.084380 -0.014275 0.871556 -0.014577 0.873624 + dit_step4_xt 0.978766 2.432666 0.128087 -0.021464 0.836876 -0.021660 0.841995 + dit_step5_xt 0.949636 3.423663 0.193034 -0.032107 0.813619 -0.032109 0.824593 + dit_step6_xt 0.894491 4.744513 0.289706 -0.047388 0.833987 -0.046482 0.855546 diff --git a/tests/Vulkan-Q4_K_M.log b/tests/Vulkan-Q4_K_M.log index 011c0c3..03f9985 100644 --- a/tests/Vulkan-Q4_K_M.log +++ b/tests/Vulkan-Q4_K_M.log @@ -1,7 +1,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | uma: 0 | fp16: 1 | bf16: 0 | warp size: 32 | shared memory: 49152 | int dot: 1 | matrix cores: NV_coopmat2 [Load] DiT backend: Vulkan0 (CPU threads: 16) -[Load] Backend init: 115.6 ms +[Load] Backend init: 146.5 ms [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [DiT] Self-attn: Q+K fused, V separate [DiT] Cross-attn: all separate @@ -9,36 +9,34 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 895.6 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 126.7 ms +[Load] DiT weight load: 110.3 ms [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: Vulkan0 (CPU threads: 16) +[Load] VAE backend: Vulkan0 (shared) [VAE] Backend: Vulkan0, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 667.9 ms +[VAE] Loaded: 5 blocks, upsample=1920x, 
F32 activations +[Load] VAE weights: 661.8 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 31.0 ms +[Load] BPE tokenizer: 31.2 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: Vulkan0 (CPU threads: 16) +[Load] TextEncoder backend: Vulkan0 (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 166.1 ms -[Encode] TextEncoder (70 tokens): 18.4 ms +[Load] TextEncoder: 143.1 ms +[Encode] TextEncoder (70 tokens): 18.1 ms [Debug] text_hidden: [70, 1024] first4: 3.705836 2.395382 0.221845 -13.145830 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 11.3 ms +[Encode] Lyric vocab lookup (167 tokens): 0.3 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: Vulkan0 (CPU threads: 16) +[Load] CondEncoder backend: Vulkan0 (shared) [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K fused, V separate @@ -48,18 +46,18 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 352.5 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 43.9 ms +[Load] ConditionEncoder: 40.5 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, 
window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 18.2 ms, enc_S=238 +[Encode] ConditionEncoder: 2552.5 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.760519 -0.046675 -0.129011 0.057651 [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 64.7 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 8.9 ms +[Load] Detokenizer: 8.5 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 152.2 ms +[Context] Detokenizer: 574.4 ms [Debug] detok_output: [2170, 64] first4: -0.107345 1.442038 0.300564 -0.641466 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -93,56 +91,53 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Debug] dit_step0_vt: [2170, 64] first4: 0.669312 0.442215 1.300629 2.101841 [Debug] dit_step0_xt: [2170, 64] first4: 0.163913 2.136149 -0.230995 0.752118 [DiT] step 1/8 t=1.000 -[Debug] dit_step1_vt: [2170, 64] first4: 1.120422 0.593113 1.031189 1.813599 -[Debug] dit_step1_xt: [2170, 64] first4: 0.102799 2.103798 -0.287241 0.653194 +[Debug] dit_step1_vt: [2170, 64] first4: 1.083954 0.575027 1.011414 1.785126 +[Debug] dit_step1_xt: [2170, 64] first4: 0.104788 2.104784 -0.286163 0.654747 [DiT] step 2/8 t=0.955 -[Debug] dit_step2_vt: [2170, 64] first4: 1.381363 0.295410 1.456146 1.949341 -[Debug] dit_step2_xt: [2170, 64] first4: 0.010708 2.084104 -0.384318 0.523238 +[Debug] dit_step2_vt: [2170, 64] first4: 1.406609 0.358032 1.442169 1.947861 +[Debug] dit_step2_xt: [2170, 64] first4: 0.011014 2.080915 -0.382307 0.524890 [DiT] step 3/8 t=0.900 -[Debug] dit_step3_vt: [2170, 64] first4: 1.440727 0.067017 1.481567 2.158554 -[Debug] dit_step3_xt: [2170, 64] first4: -0.109353 2.078519 -0.507782 0.343359 +[Debug] dit_step3_vt: [2170, 64] first4: 1.450653 0.080627 
1.479324 2.174759 +[Debug] dit_step3_xt: [2170, 64] first4: -0.109874 2.074197 -0.505584 0.343660 [DiT] step 4/8 t=0.833 -[Debug] dit_step4_vt: [2170, 64] first4: 1.377216 0.234177 1.413437 2.181564 -[Debug] dit_step4_xt: [2170, 64] first4: -0.256912 2.053428 -0.659221 0.109620 +[Debug] dit_step4_vt: [2170, 64] first4: 1.396931 0.250122 1.401264 2.164902 +[Debug] dit_step4_xt: [2170, 64] first4: -0.259545 2.047398 -0.655720 0.111706 [DiT] step 5/8 t=0.750 -[Debug] dit_step5_vt: [2170, 64] first4: 1.135239 0.376801 1.055233 2.272675 -[Debug] dit_step5_xt: [2170, 64] first4: -0.419089 1.999600 -0.809969 -0.215048 +[Debug] dit_step5_vt: [2170, 64] first4: 1.155813 0.405807 1.027550 2.260437 +[Debug] dit_step5_xt: [2170, 64] first4: -0.424661 1.989425 -0.802512 -0.211213 [DiT] step 6/8 t=0.643 -[Debug] dit_step6_vt: [2170, 64] first4: 0.948242 0.399368 0.426941 2.645081 -[Debug] dit_step6_xt: [2170, 64] first4: -0.608737 1.919726 -0.895357 -0.744064 +[Debug] dit_step6_vt: [2170, 64] first4: 0.916870 0.396088 0.350647 2.622253 +[Debug] dit_step6_xt: [2170, 64] first4: -0.608035 1.910208 -0.872642 -0.735664 [DiT] step 7/8 t=0.500 -[Debug] dit_step7_vt: [2170, 64] first4: 0.549133 -0.167076 0.379578 2.984619 -[Debug] dit_x0: [2170, 64] first4: -0.773477 1.969849 -1.009230 -1.639450 +[Debug] dit_step7_vt: [2170, 64] first4: 0.544876 -0.215309 0.434998 3.006592 +[Debug] dit_x0: [2170, 64] first4: -0.771498 1.974800 -1.003141 -1.637641 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 263.6 ms (263.6 ms/sample) -[Debug] dit_output: [2170, 64] first4: -0.773477 1.969849 -1.009230 -1.639450 +[DiT] Total generation: 342.3 ms (342.3 ms/sample) +[Debug] dit_output: [2170, 64] first4: -0.771498 1.974800 -1.003141 -1.637641 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 
+[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 9686.3 ms -[Debug] vae_audio: [2, 4166400] first4: 0.015021 0.018215 0.017495 0.016521 +[VAE Batch0] Decode: 1703.5 ms +[Debug] vae_audio: [2, 4166400] first4: 0.012597 0.015460 0.014870 0.014040 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:56:19.059 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:56:19.060 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:56:19.060 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 19:56:19.060 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:56:19.060 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:56:19.832 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:56:21.417 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:56:21.417 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:56:21.428 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... 
-2026-03-01 19:56:21.589 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:56:21.591 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:47:49.166 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:47:49.255 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:47:50.597 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:47:50.597 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:47:50.598 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:47:50.604 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:47:50.793 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... 
+2026-03-04 21:47:50.795 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:56:21.591 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:56:21.591 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:56:21.591 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:47:50.795 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:47:50.795 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:47:50.795 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -156,8 +151,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:56:21.591 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:56:21.591 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:47:50.795 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:47:50.795 | INFO | 
acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -184,25 +179,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:56:21.591 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:47:50.795 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:56:21.597 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:56:21.610 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:56:21.610 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 19:56:21.642 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:56:21.955 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... 
-2026-03-01 19:56:21.956 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:56:21.956 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006905794143676758, 'diffusion_time_cost': 0.3056776523590088, 'diffusion_per_step_time_cost': 0.0382097065448761, 'total_time_cost': 0.31258344650268555, 'offload_time_cost': 0.0} -2026-03-01 19:56:21.970 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:56:21.973 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:56:21.973 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB -2026-03-01 19:56:21.973 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:56:21.973 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB -2026-03-01 19:56:21.973 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB -2026-03-01 19:56:21.973 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:56:22.249 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:56:22.252 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:56:22.255 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:47:50.802 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:47:50.816 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:47:50.816 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:47:50.850 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:47:51.166 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:47:51.167 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:47:51.167 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006922483444213867, 'diffusion_time_cost': 0.3079640865325928, 'diffusion_per_step_time_cost': 0.0384955108165741, 'total_time_cost': 0.31488656997680664, 'offload_time_cost': 0.0} +2026-03-04 21:47:51.181 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:47:51.183 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:47:51.183 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.77 GB +2026-03-04 21:47:51.183 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:47:51.183 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.77 GB +2026-03-04 21:47:51.183 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.77 GB +2026-03-04 21:47:51.183 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:47:51.458 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:47:51.460 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:47:51.461 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q4_K_M.gguf [GGML] Running acestep-v15-turbo-Q4_K_M.gguf... 
@@ -232,28 +227,28 @@ Using precomputed LM hints hidden_after_layer23 0.947132 dit_step0_vt 0.790630 dit_step0_xt 0.999550 - dit_step1_vt 0.812267 - dit_step1_xt 0.998316 - dit_step2_vt 0.797855 - dit_step2_xt 0.994982 - dit_step3_vt 0.785550 - dit_step3_xt 0.987155 - dit_step4_vt 0.777677 - dit_step4_xt 0.969894 - dit_step5_vt 0.765554 - dit_step5_xt 0.933268 - dit_step6_vt 0.748164 - dit_step6_xt 0.865654 - dit_step7_vt 0.704997 - dit_x0 0.768990 - vae_audio 0.377954 - vae_audio (STFT cosine) 0.669489 + dit_step1_vt 0.756205 + dit_step1_xt 0.998148 + dit_step2_vt 0.797194 + dit_step2_xt 0.994834 + dit_step3_vt 0.784456 + dit_step3_xt 0.987026 + dit_step4_vt 0.776725 + dit_step4_xt 0.969792 + dit_step5_vt 0.765077 + dit_step5_xt 0.933184 + dit_step6_vt 0.747231 + dit_step6_xt 0.865289 + dit_step7_vt 0.704165 + dit_x0 0.767979 + vae_audio 0.376451 + vae_audio (STFT cosine) 0.668630 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999550 0.201120 0.022082 -0.002496 0.972768 -0.002342 0.972003 - dit_step1_xt 0.998316 0.415084 0.041258 -0.005641 0.942202 -0.005313 0.941730 - dit_step2_xt 0.994982 0.710340 0.068500 -0.010236 0.907728 -0.009311 0.908527 - dit_step3_xt 0.987155 1.070455 0.105302 -0.016404 0.870181 -0.014577 0.873624 - dit_step4_xt 0.969894 1.456633 0.155292 -0.024587 0.833834 -0.021660 0.841995 - dit_step5_xt 0.933268 1.997366 0.225911 -0.035903 0.808944 -0.032109 0.824593 - dit_step6_xt 0.865654 3.020976 0.331484 -0.051668 0.828925 -0.046482 0.855546 + dit_step1_xt 0.998148 0.415598 0.043234 -0.005810 0.944103 -0.005313 0.941730 + dit_step2_xt 0.994834 0.709830 0.069736 -0.010410 0.909328 -0.009311 0.908527 + dit_step3_xt 0.987026 1.071567 0.106058 -0.016584 0.871456 -0.014577 0.873624 + dit_step4_xt 0.969792 1.488428 0.155756 -0.024763 0.834729 -0.021660 0.841995 + dit_step5_xt 0.933184 1.958024 0.226224 -0.036147 0.809005 -0.032109 0.824593 + dit_step6_xt 0.865289 3.030077 0.331834 -0.051892 
0.828296 -0.046482 0.855546 diff --git a/tests/Vulkan-Q5_K_M.log b/tests/Vulkan-Q5_K_M.log index ec38ab3..aa0eb9c 100644 --- a/tests/Vulkan-Q5_K_M.log +++ b/tests/Vulkan-Q5_K_M.log @@ -1,7 +1,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | uma: 0 | fp16: 1 | bf16: 0 | warp size: 32 | shared memory: 49152 | int dot: 1 | matrix cores: NV_coopmat2 [Load] DiT backend: Vulkan0 (CPU threads: 16) -[Load] Backend init: 114.1 ms +[Load] Backend init: 114.4 ms [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [DiT] Self-attn: Q+K fused, V separate [DiT] Cross-attn: all separate @@ -9,36 +9,34 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 1061.2 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 151.9 ms +[Load] DiT weight load: 129.5 ms [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: Vulkan0 (CPU threads: 16) +[Load] VAE backend: Vulkan0 (shared) [VAE] Backend: Vulkan0, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 677.1 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 660.3 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 32.6 ms +[Load] BPE tokenizer: 30.7 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: 
Vulkan0 (CPU threads: 16) +[Load] TextEncoder backend: Vulkan0 (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 167.6 ms -[Encode] TextEncoder (70 tokens): 18.0 ms +[Load] TextEncoder: 142.0 ms +[Encode] TextEncoder (70 tokens): 17.4 ms [Debug] text_hidden: [70, 1024] first4: 3.705836 2.395382 0.221845 -13.145830 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 11.1 ms +[Encode] Lyric vocab lookup (167 tokens): 0.3 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: Vulkan0 (CPU threads: 16) +[Load] CondEncoder backend: Vulkan0 (shared) [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K fused, V separate @@ -48,18 +46,18 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 412.5 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 55.7 ms +[Load] ConditionEncoder: 50.1 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 17.4 ms, enc_S=238 +[Encode] ConditionEncoder: 3109.7 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.760480 -0.051691 -0.132144 0.058144 [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 73.2 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 14.2 ms +[Load] Detokenizer: 9.1 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) 
-[Context] Detokenizer: 176.8 ms +[Context] Detokenizer: 674.8 ms [Debug] detok_output: [2170, 64] first4: -0.125636 1.455599 0.291766 -0.651349 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -96,53 +94,50 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Debug] dit_step1_vt: [2170, 64] first4: -0.053368 1.748116 -0.894806 1.618408 [Debug] dit_step1_xt: [2170, 64] first4: 0.197534 2.006799 -0.135800 0.647723 [DiT] step 2/8 t=0.955 -[Debug] dit_step2_vt: [2170, 64] first4: -0.025024 1.326050 -0.792084 2.043884 -[Debug] dit_step2_xt: [2170, 64] first4: 0.199202 1.918396 -0.082994 0.511464 +[Debug] dit_step2_vt: [2170, 64] first4: -0.031860 1.378967 -0.801270 2.036987 +[Debug] dit_step2_xt: [2170, 64] first4: 0.199658 1.914868 -0.082382 0.511923 [DiT] step 3/8 t=0.900 -[Debug] dit_step3_vt: [2170, 64] first4: -0.000458 1.126770 -0.795593 2.254120 -[Debug] dit_step3_xt: [2170, 64] first4: 0.199240 1.824498 -0.016695 0.323620 +[Debug] dit_step3_vt: [2170, 64] first4: 0.009003 1.141663 -0.806183 2.229477 +[Debug] dit_step3_xt: [2170, 64] first4: 0.198908 1.819729 -0.015200 0.326134 [DiT] step 4/8 t=0.833 -[Debug] dit_step4_vt: [2170, 64] first4: 0.174652 1.253662 -1.125977 2.441956 -[Debug] dit_step4_xt: [2170, 64] first4: 0.180528 1.690177 0.103946 0.061982 +[Debug] dit_step4_vt: [2170, 64] first4: 0.174896 1.264160 -1.139648 2.439102 +[Debug] dit_step4_xt: [2170, 64] first4: 0.180169 1.684284 0.106905 0.064801 [DiT] step 5/8 t=0.750 -[Debug] dit_step5_vt: [2170, 64] first4: 0.205261 1.640076 -1.795410 2.452087 -[Debug] dit_step5_xt: [2170, 64] first4: 0.151205 1.455881 0.360433 -0.288316 +[Debug] dit_step5_vt: [2170, 64] first4: 0.201294 1.641151 -1.784760 2.454834 +[Debug] dit_step5_xt: [2170, 64] first4: 0.151413 1.449833 0.361871 -0.285889 [DiT] step 6/8 t=0.643 -[Debug] dit_step6_vt: [2170, 64] first4: -0.158905 1.750122 -2.412979 2.419128 -[Debug] 
dit_step6_xt: [2170, 64] first4: 0.182986 1.105856 0.843029 -0.772142 +[Debug] dit_step6_vt: [2170, 64] first4: -0.154907 1.748291 -2.434448 2.425964 +[Debug] dit_step6_xt: [2170, 64] first4: 0.182394 1.100175 0.848760 -0.771082 [DiT] step 7/8 t=0.500 -[Debug] dit_step7_vt: [2170, 64] first4: -0.636047 1.672760 -3.485062 2.600891 -[Debug] dit_x0: [2170, 64] first4: 0.373800 0.604028 1.888547 -1.552409 +[Debug] dit_step7_vt: [2170, 64] first4: -0.633545 1.687561 -3.500275 2.586243 +[Debug] dit_x0: [2170, 64] first4: 0.372458 0.593907 1.898843 -1.546955 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 269.9 ms (269.9 ms/sample) -[Debug] dit_output: [2170, 64] first4: 0.373800 0.604028 1.888547 -1.552409 +[DiT] Total generation: 354.9 ms (354.9 ms/sample) +[Debug] dit_output: [2170, 64] first4: 0.372458 0.593907 1.898843 -1.546955 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 9630.7 ms -[Debug] vae_audio: [2, 4166400] first4: 0.001367 0.001844 0.001533 0.001892 +[VAE Batch0] Decode: 1718.2 ms +[Debug] vae_audio: [2, 4166400] first4: 0.001432 0.001921 0.001585 0.001927 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:56:02.727 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:56:02.728 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. 
Install with: pip install lycoris-lora -2026-03-01 19:56:02.728 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 19:56:02.728 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:56:02.728 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:56:03.499 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:56:05.072 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:56:05.072 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:56:05.078 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:56:05.239 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:56:05.241 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:47:37.062 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:47:37.143 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. 
This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:47:38.480 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:47:38.481 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:47:38.482 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:47:38.488 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:47:38.703 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-04 21:47:38.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:56:05.241 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:56:05.241 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:56:05.241 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:47:38.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:47:38.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:47:38.705 | INFO 
| acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -156,8 +151,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:56:05.241 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:56:05.241 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:47:38.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:47:38.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -184,25 +179,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:56:05.241 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:47:38.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:56:05.247 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:56:05.260 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... 
-2026-03-01 19:56:05.260 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 19:56:05.285 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:56:05.592 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... -2026-03-01 19:56:05.593 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:56:05.593 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.00687718391418457, 'diffusion_time_cost': 0.3001282215118408, 'diffusion_per_step_time_cost': 0.0375160276889801, 'total_time_cost': 0.3070054054260254, 'offload_time_cost': 0.0} -2026-03-01 19:56:05.607 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:56:05.609 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:56:05.610 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.40 GB -2026-03-01 19:56:05.610 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:56:05.610 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.40 GB -2026-03-01 19:56:05.610 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.40 GB -2026-03-01 19:56:05.610 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:56:05.884 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:56:05.888 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:56:05.891 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:47:38.712 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:47:38.726 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:47:38.726 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:47:38.761 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:47:39.078 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:47:39.079 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:47:39.079 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006884098052978516, 'diffusion_time_cost': 0.3090353012084961, 'diffusion_per_step_time_cost': 0.03862941265106201, 'total_time_cost': 0.3159193992614746, 'offload_time_cost': 0.0} +2026-03-04 21:47:39.092 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:47:39.095 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:47:39.095 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.85 GB +2026-03-04 21:47:39.095 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:47:39.095 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.85 GB +2026-03-04 21:47:39.095 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.85 GB +2026-03-04 21:47:39.095 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:47:39.374 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:47:39.376 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:47:39.378 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q5_K_M.gguf [GGML] Running acestep-v15-turbo-Q5_K_M.gguf... 
@@ -234,26 +229,26 @@ Using precomputed LM hints dit_step0_xt 0.999650 dit_step1_vt 0.854589 dit_step1_xt 0.998725 - dit_step2_vt 0.841602 - dit_step2_xt 0.996217 - dit_step3_vt 0.832748 - dit_step3_xt 0.990342 - dit_step4_vt 0.826828 - dit_step4_xt 0.977304 - dit_step5_vt 0.815977 - dit_step5_xt 0.948497 - dit_step6_vt 0.803425 - dit_step6_xt 0.895308 - dit_step7_vt 0.770195 - dit_x0 0.820447 - vae_audio 0.478241 - vae_audio (STFT cosine) 0.753764 + dit_step2_vt 0.826891 + dit_step2_xt 0.996124 + dit_step3_vt 0.832715 + dit_step3_xt 0.990263 + dit_step4_vt 0.826558 + dit_step4_xt 0.977265 + dit_step5_vt 0.815705 + dit_step5_xt 0.948477 + dit_step6_vt 0.802898 + dit_step6_xt 0.895216 + dit_step7_vt 0.769793 + dit_x0 0.820156 + vae_audio 0.477357 + vae_audio (STFT cosine) 0.753154 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999650 0.235954 0.018872 -0.002255 0.973213 -0.002342 0.972003 dit_step1_xt 0.998725 0.437235 0.034677 -0.005176 0.942982 -0.005313 0.941730 - dit_step2_xt 0.996217 0.735376 0.057569 -0.009210 0.909169 -0.009311 0.908527 - dit_step3_xt 0.990342 1.115564 0.088544 -0.014811 0.872820 -0.014577 0.873624 - dit_step4_xt 0.977304 1.463506 0.131044 -0.022213 0.838526 -0.021660 0.841995 - dit_step5_xt 0.948497 2.208427 0.193557 -0.032833 0.817339 -0.032109 0.824593 - dit_step6_xt 0.895308 3.287671 0.286241 -0.047639 0.842369 -0.046482 0.855546 + dit_step2_xt 0.996124 0.735913 0.058267 -0.009379 0.909744 -0.009311 0.908527 + dit_step3_xt 0.990263 1.130236 0.088998 -0.014995 0.873310 -0.014577 0.873624 + dit_step4_xt 0.977265 1.457183 0.131253 -0.022419 0.838885 -0.021660 0.841995 + dit_step5_xt 0.948477 2.197404 0.193723 -0.033044 0.817537 -0.032109 0.824593 + dit_step6_xt 0.895216 3.271284 0.286472 -0.047848 0.842172 -0.046482 0.855546 diff --git a/tests/Vulkan-Q6_K.log b/tests/Vulkan-Q6_K.log index eff680f..a938da1 100644 --- a/tests/Vulkan-Q6_K.log +++ b/tests/Vulkan-Q6_K.log @@ -1,7 +1,7 @@ 
ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | uma: 0 | fp16: 1 | bf16: 0 | warp size: 32 | shared memory: 49152 | int dot: 1 | matrix cores: NV_coopmat2 [Load] DiT backend: Vulkan0 (CPU threads: 16) -[Load] Backend init: 114.2 ms +[Load] Backend init: 144.9 ms [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [DiT] Self-attn: Q+K+V fused [DiT] Cross-attn: Q+K+V fused @@ -9,36 +9,34 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 1237.2 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 181.3 ms +[Load] DiT weight load: 156.5 ms [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: Vulkan0 (CPU threads: 16) +[Load] VAE backend: Vulkan0 (shared) [VAE] Backend: Vulkan0, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 670.0 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 657.4 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 32.2 ms +[Load] BPE tokenizer: 32.6 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: Vulkan0 (CPU threads: 16) +[Load] TextEncoder backend: Vulkan0 (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: 
Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 165.9 ms -[Encode] TextEncoder (70 tokens): 17.6 ms +[Load] TextEncoder: 142.6 ms +[Encode] TextEncoder (70 tokens): 43.2 ms [Debug] text_hidden: [70, 1024] first4: 3.705836 2.395382 0.221845 -13.145830 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 11.2 ms +[Encode] Lyric vocab lookup (167 tokens): 0.3 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: Vulkan0 (CPU threads: 16) +[Load] CondEncoder backend: Vulkan0 (shared) [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K+V fused @@ -48,18 +46,18 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 476.3 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 61.6 ms +[Load] ConditionEncoder: 55.4 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 15.6 ms, enc_S=238 +[Encode] ConditionEncoder: 3621.4 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.761356 -0.050570 -0.133026 0.058500 [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 82.2 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 10.8 ms +[Load] Detokenizer: 10.5 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 143.8 ms +[Context] Detokenizer: 421.5 ms [Debug] detok_output: [2170, 64] first4: -0.141024 1.454365 0.315089 -0.623565 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] 
first4: 0.194336 2.156250 -0.171875 0.847656 @@ -79,70 +77,67 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Debug] proj_in_input: [192, 2170] first4: -0.141024 1.454365 0.315089 -0.623565 [Debug] enc_after_cond_emb: [2048, 238] first4: -0.170166 0.815842 0.310486 -0.571373 [Debug] layer0_sa_input: [2048, 1085] first4: -0.716080 -0.755969 -0.048350 0.263422 -[Debug] layer0_q_after_rope: [128, 16] first4: -2.400391 -0.081909 -0.397461 1.011719 -[Debug] layer0_k_after_rope: [128, 8] first4: -12.581572 1.117675 1.774897 1.788774 -[Debug] layer0_sa_output: [2048, 1085] first4: -1.503906 0.211304 -0.366943 0.520996 -[Debug] layer0_attn_out: [2048, 1085] first4: -1.540494 -1.050420 0.183235 0.461747 -[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.540494 -1.050420 0.183235 0.461747 -[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.586454 -0.808233 -0.324089 0.502214 -[Debug] hidden_after_layer0: [2048, 1085] first4: -9.155503 0.531986 51.823910 -0.865276 -[Debug] hidden_after_layer6: [2048, 1085] first4: -20.861578 -0.240065 34.589954 -4.288221 -[Debug] hidden_after_layer12: [2048, 1085] first4: -14.692959 -16.975090 77.250595 30.676491 -[Debug] hidden_after_layer18: [2048, 1085] first4: -28.082283 13.370504 64.661263 19.941170 -[Debug] hidden_after_layer23: [2048, 1085] first4: -16.195175 45.294254 196.766129 138.065048 -[Debug] dit_step0_vt: [2170, 64] first4: 0.098133 1.125458 0.338135 2.349396 -[Debug] dit_step0_xt: [2170, 64] first4: 0.189875 2.105093 -0.187245 0.740865 +[Debug] layer0_q_after_rope: [128, 16] first4: -1.967773 -0.181152 -0.292236 0.785156 +[Debug] layer0_k_after_rope: [128, 8] first4: -12.036863 1.158559 1.733423 1.789948 +[Debug] layer0_sa_output: [2048, 1085] first4: -1.344727 -0.353271 -0.171753 0.330078 +[Debug] layer0_attn_out: [2048, 1085] first4: -1.487266 -0.799756 0.373941 0.458040 +[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.487266 -0.799756 0.373941 0.458040 +[Debug] 
layer0_after_cross_attn: [2048, 1085] first4: -1.478606 -0.639722 0.069986 0.503358 +[Debug] hidden_after_layer0: [2048, 1085] first4: -6.179441 -0.194424 25.726625 -0.569950 +[Debug] hidden_after_layer6: [2048, 1085] first4: -12.978424 -2.696237 30.199980 -5.338717 +[Debug] hidden_after_layer12: [2048, 1085] first4: -13.710206 -8.286438 60.887405 36.884922 +[Debug] hidden_after_layer18: [2048, 1085] first4: -19.046274 10.102365 41.516960 14.606686 +[Debug] hidden_after_layer23: [2048, 1085] first4: 52.532547 37.219868 135.759094 151.323456 +[Debug] dit_step0_vt: [2170, 64] first4: 0.032410 0.877930 -0.200378 2.148727 +[Debug] dit_step0_xt: [2170, 64] first4: 0.192863 2.116344 -0.162767 0.749987 [DiT] step 1/8 t=1.000 -[Debug] dit_step1_vt: [2170, 64] first4: -0.018386 1.071533 -0.402077 1.814056 -[Debug] dit_step1_xt: [2170, 64] first4: 0.190878 2.046645 -0.165313 0.641917 +[Debug] dit_step1_vt: [2170, 64] first4: -0.018381 1.082458 -0.369057 1.835251 +[Debug] dit_step1_xt: [2170, 64] first4: 0.193865 2.057301 -0.142637 0.649882 [DiT] step 2/8 t=0.955 -[Debug] dit_step2_vt: [2170, 64] first4: -0.052032 1.017303 -0.201233 2.115219 -[Debug] dit_step2_xt: [2170, 64] first4: 0.194347 1.978825 -0.151898 0.500902 +[Debug] dit_step2_vt: [2170, 64] first4: -0.045654 1.004852 -0.202515 2.128693 +[Debug] dit_step2_xt: [2170, 64] first4: 0.196909 1.990311 -0.129136 0.507969 [DiT] step 3/8 t=0.900 -[Debug] dit_step3_vt: [2170, 64] first4: 0.052856 1.105988 0.072205 2.288803 -[Debug] dit_step3_xt: [2170, 64] first4: 0.189942 1.886660 -0.157915 0.310169 +[Debug] dit_step3_vt: [2170, 64] first4: 0.053986 1.098206 0.059753 2.273270 +[Debug] dit_step3_xt: [2170, 64] first4: 0.192410 1.898794 -0.134115 0.318530 [DiT] step 4/8 t=0.833 -[Debug] dit_step4_vt: [2170, 64] first4: 0.097982 1.134430 0.083038 2.362534 -[Debug] dit_step4_xt: [2170, 64] first4: 0.179444 1.765114 -0.166812 0.057040 +[Debug] dit_step4_vt: [2170, 64] first4: 0.059109 1.133232 0.098053 2.316540 +[Debug] 
dit_step4_xt: [2170, 64] first4: 0.186077 1.777376 -0.144621 0.070330 [DiT] step 5/8 t=0.750 -[Debug] dit_step5_vt: [2170, 64] first4: 0.122574 1.016464 0.173828 2.333248 -[Debug] dit_step5_xt: [2170, 64] first4: 0.161934 1.619904 -0.191644 -0.276281 +[Debug] dit_step5_vt: [2170, 64] first4: 0.060867 1.087685 0.153732 2.225224 +[Debug] dit_step5_xt: [2170, 64] first4: 0.177382 1.621992 -0.166582 -0.247560 [DiT] step 6/8 t=0.643 -[Debug] dit_step6_vt: [2170, 64] first4: 0.070358 0.866913 -0.005890 2.297897 -[Debug] dit_step6_xt: [2170, 64] first4: 0.147862 1.446522 -0.190466 -0.735860 +[Debug] dit_step6_vt: [2170, 64] first4: -0.040359 0.926651 0.010437 2.195786 +[Debug] dit_step6_xt: [2170, 64] first4: 0.185454 1.436662 -0.168670 -0.686717 [DiT] step 7/8 t=0.500 -[Debug] dit_step7_vt: [2170, 64] first4: -0.360962 0.376282 -0.314270 2.626526 -[Debug] dit_x0: [2170, 64] first4: 0.256151 1.333637 -0.096185 -1.523818 +[Debug] dit_step7_vt: [2170, 64] first4: -0.506134 0.486553 -0.233337 2.557739 +[Debug] dit_x0: [2170, 64] first4: 0.337294 1.290696 -0.098669 -1.454038 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 276.6 ms (276.6 ms/sample) -[Debug] dit_output: [2170, 64] first4: 0.256151 1.333637 -0.096185 -1.523818 +[DiT] Total generation: 336.6 ms (336.6 ms/sample) +[Debug] dit_output: [2170, 64] first4: 0.337294 1.290696 -0.098669 -1.454038 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 9723.7 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000254 0.000880 0.000782 0.001025 +[VAE Batch0] Decode: 1718.9 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000037 0.000692 0.000656 0.000941 [VAE 
Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:55:46.361 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:55:46.361 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:55:46.361 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 19:55:46.362 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:55:46.362 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:55:47.150 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:55:48.700 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:55:48.700 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:55:48.705 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:55:48.864 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:55:48.866 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:47:24.206 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. 
+2026-03-04 21:47:24.287 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:47:25.614 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:47:25.614 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:47:25.616 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:47:25.621 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:47:25.810 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... 
+2026-03-04 21:47:25.812 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:55:48.866 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:55:48.866 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:55:48.866 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:47:25.812 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:47:25.812 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:47:25.812 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -156,8 +151,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:55:48.866 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:55:48.866 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:47:25.812 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:47:25.812 | INFO | 
acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -184,25 +179,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:55:48.866 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:47:25.812 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:55:48.872 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:55:48.885 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:55:48.885 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 19:55:48.917 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:55:49.229 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... 
-2026-03-01 19:55:49.230 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:55:49.230 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006822347640991211, 'diffusion_time_cost': 0.3050048351287842, 'diffusion_per_step_time_cost': 0.03812560439109802, 'total_time_cost': 0.3118271827697754, 'offload_time_cost': 0.0} -2026-03-01 19:55:49.244 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:55:49.267 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:55:49.267 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB -2026-03-01 19:55:49.267 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:55:49.267 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB -2026-03-01 19:55:49.267 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB -2026-03-01 19:55:49.267 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:55:49.543 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:55:49.546 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:55:49.549 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:47:25.819 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:47:25.833 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:47:25.833 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:47:25.868 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:47:26.184 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:47:26.184 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:47:26.185 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006891727447509766, 'diffusion_time_cost': 0.3077425956726074, 'diffusion_per_step_time_cost': 0.03846782445907593, 'total_time_cost': 0.3146343231201172, 'offload_time_cost': 0.0} +2026-03-04 21:47:26.198 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:47:26.201 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:47:26.201 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.85 GB +2026-03-04 21:47:26.201 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:47:26.201 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.85 GB +2026-03-04 21:47:26.201 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.85 GB +2026-03-04 21:47:26.201 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:47:26.477 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:47:26.479 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:47:26.481 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q6_K.gguf [GGML] Running acestep-v15-turbo-Q6_K.gguf... 
@@ -224,36 +219,36 @@ Using precomputed LM hints temb_t 0.999990 hidden_after_proj_in 0.999982 enc_after_cond_emb 0.999691 - layer0_sa_output 0.999774 - hidden_after_layer0 0.999710 - hidden_after_layer6 0.999855 - hidden_after_layer12 0.998856 - hidden_after_layer18 0.995803 - hidden_after_layer23 0.992072 - dit_step0_vt 0.970064 - dit_step0_xt 0.999934 - dit_step1_vt 0.924403 - dit_step1_xt 0.999650 - dit_step2_vt 0.915580 - dit_step2_xt 0.998651 - dit_step3_vt 0.914431 - dit_step3_xt 0.996098 - dit_step4_vt 0.913750 - dit_step4_xt 0.990344 - dit_step5_vt 0.906205 - dit_step5_xt 0.976856 - dit_step6_vt 0.897054 - dit_step6_xt 0.950943 - dit_step7_vt 0.876737 - dit_x0 0.912738 - vae_audio 0.744947 - vae_audio (STFT cosine) 0.875717 + layer0_sa_output 0.916347 + hidden_after_layer0 0.997124 + hidden_after_layer6 0.993692 + hidden_after_layer12 0.992958 + hidden_after_layer18 0.988620 + hidden_after_layer23 0.980873 + dit_step0_vt 0.928387 + dit_step0_xt 0.999844 + dit_step1_vt 0.919122 + dit_step1_xt 0.999441 + dit_step2_vt 0.904200 + dit_step2_xt 0.998155 + dit_step3_vt 0.897635 + dit_step3_xt 0.994890 + dit_step4_vt 0.891638 + dit_step4_xt 0.987300 + dit_step5_vt 0.886907 + dit_step5_xt 0.970219 + dit_step6_vt 0.876538 + dit_step6_xt 0.938117 + dit_step7_vt 0.853291 + dit_x0 0.891872 + vae_audio 0.694699 + vae_audio (STFT cosine) 0.858167 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999934 0.147239 0.007394 -0.002260 0.973056 -0.002342 0.972003 - dit_step1_xt 0.999650 0.408757 0.017759 -0.005276 0.943557 -0.005313 0.941730 - dit_step2_xt 0.998651 0.803721 0.033644 -0.009510 0.911087 -0.009311 0.908527 - dit_step3_xt 0.996098 1.476888 0.054660 -0.015226 0.876460 -0.014577 0.873624 - dit_step4_xt 0.990344 2.294700 0.082632 -0.022702 0.844225 -0.021660 0.841995 - dit_step5_xt 0.976856 3.284146 0.125042 -0.033545 0.825286 -0.032109 0.824593 - dit_step6_xt 0.950943 4.445529 0.188707 -0.049081 0.851111 
-0.046482 0.855546 + dit_step0_xt 0.999844 0.420509 0.012210 -0.002227 0.973206 -0.002342 0.972003 + dit_step1_xt 0.999441 0.819075 0.022719 -0.005232 0.943799 -0.005313 0.941730 + dit_step2_xt 0.998155 1.085687 0.039812 -0.009404 0.911549 -0.009311 0.908527 + dit_step3_xt 0.994890 1.743559 0.063467 -0.015082 0.877147 -0.014577 0.873624 + dit_step4_xt 0.987300 2.546782 0.096584 -0.022664 0.845277 -0.021660 0.841995 + dit_step5_xt 0.970219 3.539635 0.144911 -0.033717 0.826728 -0.032109 0.824593 + dit_step6_xt 0.938117 4.795851 0.216607 -0.049484 0.852836 -0.046482 0.855546 diff --git a/tests/Vulkan-Q8_0.log b/tests/Vulkan-Q8_0.log index 774bc8a..ef53667 100644 --- a/tests/Vulkan-Q8_0.log +++ b/tests/Vulkan-Q8_0.log @@ -1,7 +1,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | uma: 0 | fp16: 1 | bf16: 0 | warp size: 32 | shared memory: 49152 | int dot: 1 | matrix cores: NV_coopmat2 [Load] DiT backend: Vulkan0 (CPU threads: 16) -[Load] Backend init: 113.5 ms +[Load] Backend init: 111.5 ms [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [DiT] Self-attn: Q+K+V fused [DiT] Cross-attn: Q+K+V fused @@ -9,36 +9,34 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 1600.7 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 214.1 ms +[Load] DiT weight load: 194.1 ms [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: Vulkan0 (CPU threads: 16) +[Load] VAE backend: Vulkan0 (shared) [VAE] Backend: Vulkan0, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 671.7 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations 
+[Load] VAE weights: 657.9 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 31.9 ms +[Load] BPE tokenizer: 31.4 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: Vulkan0 (CPU threads: 16) +[Load] TextEncoder backend: Vulkan0 (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 176.0 ms +[Load] TextEncoder: 145.4 ms [Encode] TextEncoder (70 tokens): 17.6 ms [Debug] text_hidden: [70, 1024] first4: 3.705836 2.395382 0.221845 -13.145830 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 11.2 ms +[Encode] Lyric vocab lookup (167 tokens): 0.3 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: Vulkan0 (CPU threads: 16) +[Load] CondEncoder backend: Vulkan0 (shared) [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K+V fused @@ -48,18 +46,18 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 616.6 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 84.7 ms +[Load] ConditionEncoder: 75.8 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens 
-[Encode] ConditionEncoder: 19.4 ms, enc_S=238 +[Encode] ConditionEncoder: 5074.3 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.759194 -0.049729 -0.133332 0.058435 [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 106.5 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 15.5 ms +[Load] Detokenizer: 13.7 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 85.1 ms +[Context] Detokenizer: 437.6 ms [Debug] detok_output: [2170, 64] first4: -0.121505 1.434749 0.303808 -0.627535 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -79,70 +77,67 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Debug] proj_in_input: [192, 2170] first4: -0.121505 1.434749 0.303808 -0.627535 [Debug] enc_after_cond_emb: [2048, 238] first4: -0.169861 0.817307 0.328308 -0.558397 [Debug] layer0_sa_input: [2048, 1085] first4: -0.718007 -0.757392 -0.047301 0.261071 -[Debug] layer0_q_after_rope: [128, 16] first4: -2.423828 -0.099304 -0.408203 1.004883 -[Debug] layer0_k_after_rope: [128, 8] first4: -12.718538 1.122484 1.774887 1.790079 -[Debug] layer0_sa_output: [2048, 1085] first4: -1.510742 0.165771 -0.347900 0.511230 -[Debug] layer0_attn_out: [2048, 1085] first4: -1.542524 -1.031132 0.196691 0.455273 -[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.542524 -1.031132 0.196691 0.455273 -[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.585310 -0.791508 -0.290125 0.495190 -[Debug] hidden_after_layer0: [2048, 1085] first4: -8.926053 0.558007 51.172398 -0.877717 -[Debug] hidden_after_layer6: [2048, 1085] first4: -20.768745 -0.272222 34.170349 -4.416629 -[Debug] hidden_after_layer12: [2048, 1085] first4: -14.358247 -18.625305 73.571915 30.079784 -[Debug] hidden_after_layer18: [2048, 1085] first4: -26.789474 14.346137 62.040115 
19.708126 -[Debug] hidden_after_layer23: [2048, 1085] first4: -2.927731 38.887718 192.805542 144.255524 -[Debug] dit_step0_vt: [2170, 64] first4: 0.027340 1.115875 0.350609 2.345856 -[Debug] dit_step0_xt: [2170, 64] first4: 0.193093 2.105528 -0.187812 0.741026 +[Debug] layer0_q_after_rope: [128, 16] first4: -2.134766 -0.300049 -0.404541 0.904297 +[Debug] layer0_k_after_rope: [128, 8] first4: -12.349133 1.249128 1.744302 1.794822 +[Debug] layer0_sa_output: [2048, 1085] first4: -0.636230 -0.405029 0.096436 0.194946 +[Debug] layer0_attn_out: [2048, 1085] first4: -1.250806 -0.777872 0.630535 0.449394 +[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.250806 -0.777872 0.630535 0.449394 +[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.239698 -0.683206 0.416180 0.509788 +[Debug] hidden_after_layer0: [2048, 1085] first4: -4.682029 -0.464333 15.184165 -0.212429 +[Debug] hidden_after_layer6: [2048, 1085] first4: -8.053159 0.591622 20.595821 -6.469027 +[Debug] hidden_after_layer12: [2048, 1085] first4: -11.836857 -8.197025 41.079239 30.392553 +[Debug] hidden_after_layer18: [2048, 1085] first4: -20.004263 1.558971 15.575721 16.331001 +[Debug] hidden_after_layer23: [2048, 1085] first4: 23.482555 18.593208 82.512901 173.016068 +[Debug] dit_step0_vt: [2170, 64] first4: 0.084528 0.834541 -0.408783 2.115417 +[Debug] dit_step0_xt: [2170, 64] first4: 0.190494 2.118316 -0.153294 0.751501 [DiT] step 1/8 t=1.000 -[Debug] dit_step1_vt: [2170, 64] first4: 0.002377 1.005737 -0.352661 1.768188 -[Debug] dit_step1_xt: [2170, 64] first4: 0.192964 2.050670 -0.168576 0.644580 +[Debug] dit_step1_vt: [2170, 64] first4: -0.071388 1.041626 -0.270477 1.704315 +[Debug] dit_step1_xt: [2170, 64] first4: 0.194388 2.061500 -0.138541 0.658538 [DiT] step 2/8 t=0.955 -[Debug] dit_step2_vt: [2170, 64] first4: -0.063080 1.061218 -0.344177 1.926041 -[Debug] dit_step2_xt: [2170, 64] first4: 0.197169 1.979922 -0.145631 0.516177 +[Debug] dit_step2_vt: [2170, 64] first4: -0.071960 1.095016 -0.333557 
1.988541 +[Debug] dit_step2_xt: [2170, 64] first4: 0.199185 1.988499 -0.116304 0.525969 [DiT] step 3/8 t=0.900 -[Debug] dit_step3_vt: [2170, 64] first4: -0.072388 1.144592 -0.184326 2.069214 -[Debug] dit_step3_xt: [2170, 64] first4: 0.203201 1.884539 -0.130270 0.343743 +[Debug] dit_step3_vt: [2170, 64] first4: -0.037468 1.148598 -0.165955 2.091240 +[Debug] dit_step3_xt: [2170, 64] first4: 0.202307 1.892783 -0.102474 0.351699 [DiT] step 4/8 t=0.833 -[Debug] dit_step4_vt: [2170, 64] first4: 0.004288 1.147110 0.001495 2.068916 -[Debug] dit_step4_xt: [2170, 64] first4: 0.202742 1.761635 -0.130430 0.122073 +[Debug] dit_step4_vt: [2170, 64] first4: 0.014343 1.134537 -0.033691 2.114731 +[Debug] dit_step4_xt: [2170, 64] first4: 0.200771 1.771225 -0.098864 0.125120 [DiT] step 5/8 t=0.750 -[Debug] dit_step5_vt: [2170, 64] first4: 0.070211 1.173462 0.080673 2.086014 -[Debug] dit_step5_xt: [2170, 64] first4: 0.192712 1.593997 -0.141955 -0.175929 +[Debug] dit_step5_vt: [2170, 64] first4: 0.064150 1.159027 0.062057 2.121386 +[Debug] dit_step5_xt: [2170, 64] first4: 0.191606 1.605650 -0.107730 -0.177935 [DiT] step 6/8 t=0.643 -[Debug] dit_step6_vt: [2170, 64] first4: -0.010117 1.145203 0.186996 2.198898 -[Debug] dit_step6_xt: [2170, 64] first4: 0.194735 1.364957 -0.179354 -0.615709 +[Debug] dit_step6_vt: [2170, 64] first4: -0.041473 1.200439 0.198494 2.240326 +[Debug] dit_step6_xt: [2170, 64] first4: 0.199901 1.365562 -0.147428 -0.626000 [DiT] step 7/8 t=0.500 -[Debug] dit_step7_vt: [2170, 64] first4: -0.244629 0.644890 0.358635 2.446594 -[Debug] dit_x0: [2170, 64] first4: 0.268124 1.171490 -0.286945 -1.349687 +[Debug] dit_step7_vt: [2170, 64] first4: -0.309998 0.692413 0.432823 2.469238 +[Debug] dit_x0: [2170, 64] first4: 0.292900 1.157838 -0.277275 -1.366771 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 252.0 ms (252.0 ms/sample) -[Debug] dit_output: [2170, 64] first4: 0.268124 1.171490 -0.286945 -1.349687 +[DiT] Total generation: 335.0 ms (335.0 ms/sample) +[Debug] 
dit_output: [2170, 64] first4: 0.292900 1.157838 -0.277275 -1.366771 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 9843.4 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000170 0.000825 0.000784 0.001115 +[VAE Batch0] Decode: 1706.4 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000160 0.000739 0.000691 0.001054 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:55:29.948 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:55:29.948 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:55:29.948 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 19:55:29.948 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:55:29.948 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:55:30.699 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:55:32.273 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... 
-2026-03-01 19:55:32.274 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:55:32.279 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:55:32.442 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:55:32.443 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:47:11.115 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:47:11.205 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:47:12.506 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:47:12.506 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:47:12.508 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:47:12.513 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:47:12.703 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... 
+2026-03-04 21:47:12.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:55:32.443 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:55:32.443 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:55:32.444 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:47:12.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:47:12.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:47:12.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -156,8 +151,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:55:32.444 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:55:32.444 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:47:12.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:47:12.705 | INFO | 
acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -184,25 +179,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:55:32.444 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:47:12.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:55:32.450 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:55:32.462 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:55:32.463 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 19:55:32.484 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:55:32.791 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... 
-2026-03-01 19:55:32.791 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:55:32.791 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006818294525146484, 'diffusion_time_cost': 0.2995321750640869, 'diffusion_per_step_time_cost': 0.037441521883010864, 'total_time_cost': 0.3063504695892334, 'offload_time_cost': 0.0} -2026-03-01 19:55:32.806 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:55:32.808 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:55:32.808 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.40 GB -2026-03-01 19:55:32.808 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:55:32.808 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.40 GB -2026-03-01 19:55:32.808 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.40 GB -2026-03-01 19:55:32.808 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:55:33.083 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:55:33.084 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:55:33.088 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:47:12.713 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:47:12.727 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:47:12.727 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:47:12.758 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:47:13.073 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:47:13.073 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:47:13.073 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006865262985229492, 'diffusion_time_cost': 0.30722999572753906, 'diffusion_per_step_time_cost': 0.03840374946594238, 'total_time_cost': 0.31409525871276855, 'offload_time_cost': 0.0} +2026-03-04 21:47:13.087 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:47:13.096 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:47:13.096 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.82 GB +2026-03-04 21:47:13.096 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:47:13.096 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.82 GB +2026-03-04 21:47:13.096 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.82 GB +2026-03-04 21:47:13.096 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:47:13.370 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:47:13.372 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:47:13.374 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q8_0.gguf [GGML] Running acestep-v15-turbo-Q8_0.gguf... 
@@ -224,36 +219,36 @@ Using precomputed LM hints temb_t 0.999998 hidden_after_proj_in 0.999985 enc_after_cond_emb 0.999817 - layer0_sa_output 0.999939 - hidden_after_layer0 0.999858 - hidden_after_layer6 0.999893 - hidden_after_layer12 0.999124 - hidden_after_layer18 0.996403 - hidden_after_layer23 0.993183 - dit_step0_vt 0.973885 - dit_step0_xt 0.999943 - dit_step1_vt 0.915468 - dit_step1_xt 0.999633 - dit_step2_vt 0.912211 - dit_step2_xt 0.998544 - dit_step3_vt 0.912707 - dit_step3_xt 0.995860 - dit_step4_vt 0.906019 - dit_step4_xt 0.989505 - dit_step5_vt 0.896537 - dit_step5_xt 0.974659 - dit_step6_vt 0.886047 - dit_step6_xt 0.945866 - dit_step7_vt 0.869793 - dit_x0 0.905017 - vae_audio 0.746037 - vae_audio (STFT cosine) 0.898352 + layer0_sa_output 0.896665 + hidden_after_layer0 0.996506 + hidden_after_layer6 0.988924 + hidden_after_layer12 0.986595 + hidden_after_layer18 0.980435 + hidden_after_layer23 0.969958 + dit_step0_vt 0.880150 + dit_step0_xt 0.999739 + dit_step1_vt 0.904993 + dit_step1_xt 0.999178 + dit_step2_vt 0.897232 + dit_step2_xt 0.997639 + dit_step3_vt 0.896753 + dit_step3_xt 0.994077 + dit_step4_vt 0.889861 + dit_step4_xt 0.986004 + dit_step5_vt 0.878511 + dit_step5_xt 0.967661 + dit_step6_vt 0.867605 + dit_step6_xt 0.933014 + dit_step7_vt 0.848412 + dit_x0 0.884572 + vae_audio 0.692036 + vae_audio (STFT cosine) 0.882942 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999943 0.140034 0.006943 -0.002318 0.973036 -0.002342 0.972003 - dit_step1_xt 0.999633 0.423125 0.018056 -0.005257 0.943026 -0.005313 0.941730 - dit_step2_xt 0.998544 0.841908 0.034537 -0.009209 0.910286 -0.009311 0.908527 - dit_step3_xt 0.995860 1.521911 0.055719 -0.014626 0.875169 -0.014577 0.873624 - dit_step4_xt 0.989505 2.346452 0.085477 -0.021803 0.842334 -0.021660 0.841995 - dit_step5_xt 0.974659 3.387389 0.130921 -0.032225 0.822365 -0.032109 0.824593 - dit_step6_xt 0.945866 4.812943 0.199910 -0.047290 0.846751 
-0.046482 0.855546 + dit_step0_xt 0.999739 0.400727 0.016274 -0.002102 0.972847 -0.002342 0.972003 + dit_step1_xt 0.999178 0.814308 0.027485 -0.004968 0.942952 -0.005313 0.941730 + dit_step2_xt 0.997639 1.101152 0.044575 -0.008840 0.910138 -0.009311 0.908527 + dit_step3_xt 0.994077 1.762341 0.067497 -0.014170 0.875003 -0.014577 0.873624 + dit_step4_xt 0.986004 2.565164 0.099802 -0.021228 0.842166 -0.021660 0.841995 + dit_step5_xt 0.967661 3.593323 0.149360 -0.031486 0.822043 -0.032109 0.824593 + dit_step6_xt 0.933014 4.978329 0.224230 -0.046337 0.845793 -0.046482 0.855546 diff --git a/tools/ace-qwen3.cpp b/tools/ace-qwen3.cpp index fbfd049..9605e2a 100644 --- a/tools/ace-qwen3.cpp +++ b/tools/ace-qwen3.cpp @@ -691,8 +691,7 @@ int main(int argc, char ** argv) { "# Instruction\n" "Expand the user's input into a more detailed" " and specific musical description:\n"; - std::string user_msg = ace.caption + "\n\ninstrumental: " - + std::string(req.instrumental ? "true" : "false"); + std::string user_msg = ace.caption; prompt = build_custom_prompt(bpe, sys, user_msg.c_str()); } else { prompt = build_lm_prompt(bpe, ace); diff --git a/tools/dit-vae.cpp b/tools/dit-vae.cpp index ac50e9f..8893f4b 100644 --- a/tools/dit-vae.cpp +++ b/tools/dit-vae.cpp @@ -6,13 +6,10 @@ #include #include #include -#include #include #include #include "philox.h" -#include "ggml.h" -#include "ggml-backend.h" #include "dit-sampler.h" #include "vae.h" #include "qwen3-enc.h" @@ -241,10 +238,12 @@ int main(int argc, char ** argv) { float duration = req.duration > 0 ? req.duration : 30.0f; long long seed = req.seed; int num_steps = req.inference_steps > 0 ? req.inference_steps : 8; - float guidance_scale = req.guidance_scale > 0 ? req.guidance_scale : 7.0f; + float guidance_scale = req.guidance_scale; float shift = req.shift > 0 ? req.shift : 1.0f; - if (is_turbo && guidance_scale > 1.0f) { + if (guidance_scale <= 0.0f) + guidance_scale = is_turbo ? 
1.0f : 7.0f; + else if (is_turbo && guidance_scale > 1.0f) { fprintf(stderr, "[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was %.1f)\n", guidance_scale); guidance_scale = 1.0f; diff --git a/tools/neural-codec.cpp b/tools/neural-codec.cpp new file mode 100644 index 0000000..e75f3c3 --- /dev/null +++ b/tools/neural-codec.cpp @@ -0,0 +1,522 @@ +// neural-codec.cpp: neural audio codec (Oobleck VAE) +// +// 2 modes: +// encode: WAV -> latent file (f32, Q8, or Q4) +// decode: latent file -> WAV (48kHz stereo) +// +// Three latent formats, decode auto-detects: +// +// f32 (default): flat [T, 64] f32, no header. +// T = file_size / 256. 25Hz, ~6.4 KB/s, ~51 kbit/s. +// +// Q8 (--q8): symmetric per-frame int8 quantization. +// header: "NAC8" magic (4B) + uint32 T_latent (4B) +// frame: f16 scale (2B) + int8[64] (64B) = 66B +// 25Hz, ~1.6 KB/s, ~13 kbit/s. +// +// Q4 (--q4): symmetric per-frame 4-bit quantization. +// header: "NAC4" magic (4B) + uint32 T_latent (4B) +// frame: f16 scale (2B) + nibbles[32] (32B) = 34B +// 25Hz, ~850 B/s, ~6.8 kbit/s. +// +// Usage: +// neural-codec --vae model.gguf --encode -i song.wav -o song.latent +// neural-codec --vae model.gguf --encode --q8 -i song.wav -o song.nac8 +// neural-codec --vae model.gguf --encode --q4 -i song.wav -o song.nac4 +// neural-codec --vae model.gguf --decode -i song.nac4 -o song.wav + +#include "vae.h" +#include "vae-enc.h" +#include +#include +#include +#include +#include + +// Minimal WAV reader: 16-bit PCM or 32-bit float, mono/stereo, any sample rate. +// Returns interleaved float [T, 2]. Sets *T_audio, *sr. Caller frees. 
+static float * read_wav(const char * path, int * T_audio, int * sr) { + FILE * f = fopen(path, "rb"); + if (!f) { fprintf(stderr, "[WAV] Cannot open %s\n", path); return NULL; } + + char riff[4]; fread(riff, 1, 4, f); + if (memcmp(riff, "RIFF", 4) != 0) { + fprintf(stderr, "[WAV] Not a RIFF file: %s\n", path); fclose(f); return NULL; + } + fseek(f, 4, SEEK_CUR); + char wave[4]; fread(wave, 1, 4, f); + if (memcmp(wave, "WAVE", 4) != 0) { + fprintf(stderr, "[WAV] Not a WAVE file: %s\n", path); fclose(f); return NULL; + } + + int n_channels = 0, sample_rate = 0, bits_per_sample = 0; + short audio_format = 0; + float * audio = NULL; + int n_samples = 0; + + while (!feof(f)) { + char chunk_id[4]; + int chunk_size; + if (fread(chunk_id, 1, 4, f) != 4) break; + if (fread(&chunk_size, 4, 1, f) != 1) break; + + if (memcmp(chunk_id, "fmt ", 4) == 0) { + fread(&audio_format, 2, 1, f); + short nc; fread(&nc, 2, 1, f); n_channels = nc; + fread(&sample_rate, 4, 1, f); + fseek(f, 4, SEEK_CUR); // byte_rate + fseek(f, 2, SEEK_CUR); // block_align + short bps; fread(&bps, 2, 1, f); bits_per_sample = bps; + int consumed = 16; + if (chunk_size > consumed) fseek(f, chunk_size - consumed, SEEK_CUR); + + } else if (memcmp(chunk_id, "data", 4) == 0 && n_channels > 0) { + if (audio_format == 1 && bits_per_sample == 16) { + n_samples = chunk_size / (n_channels * 2); + audio = (float *)malloc((size_t)n_samples * 2 * sizeof(float)); + std::vector buf((size_t)n_samples * n_channels); + fread(buf.data(), 2, (size_t)n_samples * n_channels, f); + for (int t = 0; t < n_samples; t++) { + if (n_channels == 1) { + float s = (float)buf[t] / 32768.0f; + audio[t * 2 + 0] = s; + audio[t * 2 + 1] = s; + } else { + audio[t * 2 + 0] = (float)buf[t * n_channels + 0] / 32768.0f; + audio[t * 2 + 1] = (float)buf[t * n_channels + 1] / 32768.0f; + } + } + } else if (audio_format == 3 && bits_per_sample == 32) { + n_samples = chunk_size / (n_channels * 4); + audio = (float *)malloc((size_t)n_samples * 2 * 
sizeof(float)); + std::vector buf((size_t)n_samples * n_channels); + fread(buf.data(), 4, (size_t)n_samples * n_channels, f); + for (int t = 0; t < n_samples; t++) { + if (n_channels == 1) { + audio[t * 2 + 0] = buf[t]; + audio[t * 2 + 1] = buf[t]; + } else { + audio[t * 2 + 0] = buf[t * n_channels + 0]; + audio[t * 2 + 1] = buf[t * n_channels + 1]; + } + } + } else { + fprintf(stderr, "[WAV] Unsupported: format=%d bits=%d (need PCM16 or float32)\n", + audio_format, bits_per_sample); + fclose(f); return NULL; + } + break; + } else { + fseek(f, chunk_size, SEEK_CUR); + } + } + fclose(f); + if (!audio) { fprintf(stderr, "[WAV] No audio data in %s\n", path); return NULL; } + + *T_audio = n_samples; + *sr = sample_rate; + fprintf(stderr, "[WAV] Read %s: %d samples, %d Hz, %d ch, %d bit\n", + path, n_samples, sample_rate, n_channels, bits_per_sample); + return audio; +} + +// WAV writer: planar [ch0: T, ch1: T] -> 16-bit PCM stereo +static bool write_wav(const char * path, const float * audio, int T_audio, int sr) { + FILE * f = fopen(path, "wb"); + if (!f) return false; + int n_channels = 2, bits = 16; + int byte_rate = sr * n_channels * (bits / 8); + int block_align = n_channels * (bits / 8); + int data_size = T_audio * n_channels * (bits / 8); + int file_size = 36 + data_size; + fwrite("RIFF", 1, 4, f); + fwrite(&file_size, 4, 1, f); + fwrite("WAVE", 1, 4, f); + fwrite("fmt ", 1, 4, f); + int fmt_size = 16; fwrite(&fmt_size, 4, 1, f); + short audio_fmt = 1; fwrite(&audio_fmt, 2, 1, f); + short nc = (short)n_channels; fwrite(&nc, 2, 1, f); + fwrite(&sr, 4, 1, f); + fwrite(&byte_rate, 4, 1, f); + short ba = (short)block_align; fwrite(&ba, 2, 1, f); + short bp = (short)bits; fwrite(&bp, 2, 1, f); + fwrite("data", 1, 4, f); + fwrite(&data_size, 4, 1, f); + for (int t = 0; t < T_audio; t++) { + for (int c = 0; c < 2; c++) { + float s = audio[c * T_audio + t]; + s = s < -1.0f ? -1.0f : (s > 1.0f ? 
1.0f : s); + short v = (short)(s * 32767.0f); + fwrite(&v, 2, 1, f); + } + } + fclose(f); + return true; +} + +// Q8 format constants +static const char NAC8_MAGIC[4] = {'N', 'A', 'C', '8'}; +static const int NAC8_HEADER = 8; // 4B magic + 4B T_latent +static const int NAC8_FRAME = 66; // 2B f16 scale + 64B int8 + +// Write Q8 quantized latent +static bool write_latent_q8(const char * path, const float * data, int T_latent) { + FILE * f = fopen(path, "wb"); + if (!f) return false; + + fwrite(NAC8_MAGIC, 1, 4, f); + uint32_t t = (uint32_t)T_latent; + fwrite(&t, 4, 1, f); + + for (int i = 0; i < T_latent; i++) { + const float * frame = data + i * 64; + + // find max abs for symmetric quant + float amax = 0.0f; + for (int j = 0; j < 64; j++) { + float a = fabsf(frame[j]); + if (a > amax) amax = a; + } + float scale = amax / 127.0f; + ggml_fp16_t scale_f16 = ggml_fp32_to_fp16(scale); + fwrite(&scale_f16, 2, 1, f); + + // quantize + int8_t q[64]; + float inv = (scale > 0.0f) ? 127.0f / amax : 0.0f; + for (int j = 0; j < 64; j++) { + int v = (int)roundf(frame[j] * inv); + q[j] = (int8_t)(v < -127 ? -127 : (v > 127 ? 
127 : v)); + } + fwrite(q, 1, 64, f); + } + fclose(f); + + size_t bytes = NAC8_HEADER + (size_t)T_latent * NAC8_FRAME; + float duration = (float)T_latent * 1920.0f / 48000.0f; + float kbps = (float)bytes * 8.0f / (duration * 1000.0f); + fprintf(stderr, "[Latent] Wrote %s: Q8, %d frames (%.2fs, %.1f KB, %.1f kbit/s)\n", + path, T_latent, duration, (float)bytes / 1024.0f, kbps); + return true; +} + +// Q4 format constants +static const char NAC4_MAGIC[4] = {'N', 'A', 'C', '4'}; +static const int NAC4_HEADER = 8; // 4B magic + 4B T_latent +static const int NAC4_FRAME = 34; // 2B f16 scale + 32B packed nibbles + +// Write Q4 quantized latent +// Symmetric 4-bit: range [-7, 7], scale = amax / 7.0 +// Packing: byte = (low & 0x0F) | (high << 4), two signed nibbles per byte +static bool write_latent_q4(const char * path, const float * data, int T_latent) { + FILE * f = fopen(path, "wb"); + if (!f) return false; + + fwrite(NAC4_MAGIC, 1, 4, f); + uint32_t t = (uint32_t)T_latent; + fwrite(&t, 4, 1, f); + + for (int i = 0; i < T_latent; i++) { + const float * frame = data + i * 64; + + // find max abs for symmetric quant + float amax = 0.0f; + for (int j = 0; j < 64; j++) { + float a = fabsf(frame[j]); + if (a > amax) amax = a; + } + float scale = amax / 7.0f; + ggml_fp16_t scale_f16 = ggml_fp32_to_fp16(scale); + fwrite(&scale_f16, 2, 1, f); + + // quantize and pack pairs into bytes + float inv = (scale > 0.0f) ? 7.0f / amax : 0.0f; + uint8_t packed[32]; + for (int j = 0; j < 32; j++) { + int lo = (int)roundf(frame[j * 2 + 0] * inv); + int hi = (int)roundf(frame[j * 2 + 1] * inv); + lo = lo < -7 ? -7 : (lo > 7 ? 7 : lo); + hi = hi < -7 ? -7 : (hi > 7 ? 
7 : hi); + packed[j] = (uint8_t)((lo & 0x0F) | (hi << 4)); + } + fwrite(packed, 1, 32, f); + } + fclose(f); + + size_t bytes = NAC4_HEADER + (size_t)T_latent * NAC4_FRAME; + float duration = (float)T_latent * 1920.0f / 48000.0f; + float kbps = (float)bytes * 8.0f / (duration * 1000.0f); + fprintf(stderr, "[Latent] Wrote %s: Q4, %d frames (%.2fs, %.1f KB, %.1f kbit/s)\n", + path, T_latent, duration, (float)bytes / 1024.0f, kbps); + return true; +} + +// Write f32 raw latent (no header) +static bool write_latent_f32(const char * path, const float * data, int T_latent) { + FILE * f = fopen(path, "wb"); + if (!f) return false; + size_t bytes = (size_t)T_latent * 64 * sizeof(float); + fwrite(data, 1, bytes, f); + fclose(f); + float duration = (float)T_latent * 1920.0f / 48000.0f; + fprintf(stderr, "[Latent] Wrote %s: f32, %d frames (%.2fs, %.1f KB, %.1f kbit/s)\n", + path, T_latent, duration, (float)bytes / 1024.0f, + (float)bytes * 8.0f / (duration * 1000.0f)); + return true; +} + +// Read latent, auto-detect format (NAC8 -> Q8, NAC4 -> Q4, else f32). +// Returns [T_latent, 64] f32 (dequantized if quantized). Caller frees. 
+static float * read_latent(const char * path, int * T_latent) { + FILE * f = fopen(path, "rb"); + if (!f) { fprintf(stderr, "[Latent] Cannot open %s\n", path); return NULL; } + fseek(f, 0, SEEK_END); + long fsize = ftell(f); + fseek(f, 0, SEEK_SET); + + // Check magic + char magic[4] = {}; + if (fsize >= 8) fread(magic, 1, 4, f); + + if (memcmp(magic, NAC8_MAGIC, 4) == 0) { + // Q8 format + uint32_t t; + fread(&t, 4, 1, f); + *T_latent = (int)t; + + long expected = NAC8_HEADER + (long)t * NAC8_FRAME; + if (fsize != expected) { + fprintf(stderr, "[Latent] Q8 size mismatch: expected %ld, got %ld\n", expected, fsize); + fclose(f); return NULL; + } + + float * data = (float *)malloc((size_t)t * 64 * sizeof(float)); + for (int i = 0; i < (int)t; i++) { + ggml_fp16_t scale_f16; + fread(&scale_f16, 2, 1, f); + float scale = ggml_fp16_to_fp32(scale_f16); + + int8_t q[64]; + fread(q, 1, 64, f); + + float * frame = data + i * 64; + for (int j = 0; j < 64; j++) + frame[j] = (float)q[j] * scale; + } + fclose(f); + + float duration = (float)(*T_latent) * 1920.0f / 48000.0f; + float kbps = (float)fsize * 8.0f / (duration * 1000.0f); + fprintf(stderr, "[Latent] Read %s: Q8, %d frames (%.2fs, %.1f KB, %.1f kbit/s)\n", + path, *T_latent, duration, (float)fsize / 1024.0f, kbps); + return data; + } + + if (memcmp(magic, NAC4_MAGIC, 4) == 0) { + // Q4 format + uint32_t t; + fread(&t, 4, 1, f); + *T_latent = (int)t; + + long expected = NAC4_HEADER + (long)t * NAC4_FRAME; + if (fsize != expected) { + fprintf(stderr, "[Latent] Q4 size mismatch: expected %ld, got %ld\n", expected, fsize); + fclose(f); return NULL; + } + + float * data = (float *)malloc((size_t)t * 64 * sizeof(float)); + for (int i = 0; i < (int)t; i++) { + ggml_fp16_t scale_f16; + fread(&scale_f16, 2, 1, f); + float scale = ggml_fp16_to_fp32(scale_f16); + + uint8_t packed[32]; + fread(packed, 1, 32, f); + + // unpack signed nibbles + float * frame = data + i * 64; + for (int j = 0; j < 32; j++) { + int lo = 
(int)(packed[j] & 0x0F); + int hi = (int)(packed[j] >> 4); + if (lo >= 8) lo -= 16; + if (hi >= 8) hi -= 16; + frame[j * 2 + 0] = (float)lo * scale; + frame[j * 2 + 1] = (float)hi * scale; + } + } + fclose(f); + + float duration = (float)(*T_latent) * 1920.0f / 48000.0f; + float kbps = (float)fsize * 8.0f / (duration * 1000.0f); + fprintf(stderr, "[Latent] Read %s: Q4, %d frames (%.2fs, %.1f KB, %.1f kbit/s)\n", + path, *T_latent, duration, (float)fsize / 1024.0f, kbps); + return data; + } + + // f32 format (no header, rewind) + fseek(f, 0, SEEK_SET); + if (fsize % (64 * (int)sizeof(float)) != 0) { + fprintf(stderr, "[Latent] File size %ld not a multiple of %d (64 * f32)\n", + fsize, (int)(64 * sizeof(float))); + fclose(f); return NULL; + } + + *T_latent = (int)(fsize / (64 * sizeof(float))); + float * data = (float *)malloc(fsize); + fread(data, 1, fsize, f); + fclose(f); + + float duration = (float)(*T_latent) * 1920.0f / 48000.0f; + fprintf(stderr, "[Latent] Read %s: f32, %d frames (%.2fs, %.1f KB, %.1f kbit/s)\n", + path, *T_latent, duration, (float)fsize / 1024.0f, + (float)fsize * 8.0f / (duration * 1000.0f)); + return data; +} + +static void print_usage(const char * prog) { + fprintf(stderr, + "Usage: %s --vae --encode|--decode -i [-o ] [--q8|--q4]\n\n" + "Required:\n" + " --vae VAE GGUF file\n" + " --encode | --decode Encode WAV to latent, or decode latent to WAV\n" + " -i Input (WAV for encode, latent for decode)\n\n" + "Output:\n" + " -o Output file (auto-named if omitted)\n" + " --q8 Quantize latent to int8 (~13 kbit/s)\n" + " --q4 Quantize latent to int4 (~6.8 kbit/s)\n\n" + "Output naming: song.wav -> song.latent (f32) or song.nac8 (Q8) or song.nac4 (Q4)\n" + " song.latent -> song.wav\n\n" + "VAE tiling (memory control):\n" + " --vae-chunk Latent frames per tile (default: 256)\n" + " --vae-overlap Overlap frames per side (default: 64)\n\n" + "Latent formats (decode auto-detects):\n" + " f32: flat [T, 64] f32, no header. 
~51 kbit/s.\n" + " NAC8: header + per-frame Q8. ~13 kbit/s.\n" + " NAC4: header + per-frame Q4. ~6.8 kbit/s.\n", + prog); +} + +static std::string auto_output(const char * input, const char * ext) { + std::string s = input; + size_t dot = s.rfind('.'); + if (dot != std::string::npos) + return s.substr(0, dot) + ext; + return s + ext; +} + +int main(int argc, char ** argv) { + const char * vae_path = NULL; + const char * input_path = NULL; + const char * output_path = NULL; + int chunk_size = 256; + int overlap = 64; + int mode = -1; // 0 = encode, 1 = decode + int quant = 0; // 0 = f32, 8 = q8, 4 = q4 + + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "--vae") == 0 && i + 1 < argc) vae_path = argv[++i]; + else if (strcmp(argv[i], "-i") == 0 && i + 1 < argc) input_path = argv[++i]; + else if (strcmp(argv[i], "--input") == 0 && i + 1 < argc) input_path = argv[++i]; + else if (strcmp(argv[i], "-o") == 0 && i + 1 < argc) output_path = argv[++i]; + else if (strcmp(argv[i], "--output") == 0 && i + 1 < argc) output_path = argv[++i]; + else if (strcmp(argv[i], "--vae-chunk") == 0 && i + 1 < argc) chunk_size = atoi(argv[++i]); + else if (strcmp(argv[i], "--vae-overlap") == 0 && i + 1 < argc) overlap = atoi(argv[++i]); + else if (strcmp(argv[i], "--encode") == 0) mode = 0; + else if (strcmp(argv[i], "--decode") == 0) mode = 1; + else if (strcmp(argv[i], "--q8") == 0) quant = 8; + else if (strcmp(argv[i], "--q4") == 0) quant = 4; + else if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) { + print_usage(argv[0]); return 0; + } else { + fprintf(stderr, "Unknown arg: %s\n", argv[i]); + print_usage(argv[0]); return 1; + } + } + + if (!vae_path || !input_path || mode < 0) { + print_usage(argv[0]); return 1; + } + + // Auto output names + std::string out_str; + if (!output_path) { + if (mode == 0) { + const char * ext = ".latent"; + if (quant == 8) ext = ".nac8"; + if (quant == 4) ext = ".nac4"; + out_str = auto_output(input_path, ext); + } else { + out_str = 
auto_output(input_path, ".wav"); + } + output_path = out_str.c_str(); + } + + const char * quant_str = ""; + if (mode == 0 && quant == 8) quant_str = " (Q8)"; + if (mode == 0 && quant == 4) quant_str = " (Q4)"; + fprintf(stderr, "\n[VAE] Mode: %s%s\n", mode == 0 ? "encode" : "decode", quant_str); + fprintf(stderr, "[VAE] Input: %s\n", input_path); + fprintf(stderr, "[VAE] Output: %s\n\n", output_path); + + // ENCODE + if (mode == 0) { + int T_audio = 0, sr = 0; + float * audio = read_wav(input_path, &T_audio, &sr); + if (!audio) return 1; + if (sr != 48000) + fprintf(stderr, "[WARN] Input is %d Hz, VAE expects 48000. Resample with ffmpeg first.\n", sr); + + VAEEncoder enc = {}; + vae_enc_load(&enc, vae_path); + + int max_T = (T_audio / 1920) + 64; + std::vector latent((size_t)max_T * 64); + + fprintf(stderr, "\n[VAE] Encoding %d samples (%.2fs)...\n", + T_audio, (float)T_audio / (float)(sr > 0 ? sr : 48000)); + int T_latent = vae_enc_encode_tiled(&enc, audio, T_audio, + latent.data(), max_T, chunk_size, overlap); + free(audio); + if (T_latent < 0) { vae_enc_free(&enc); return 1; } + + if (quant == 8) + write_latent_q8(output_path, latent.data(), T_latent); + else if (quant == 4) + write_latent_q4(output_path, latent.data(), T_latent); + else + write_latent_f32(output_path, latent.data(), T_latent); + + vae_enc_free(&enc); + fprintf(stderr, "[VAE] Done.\n"); + return 0; + } + + // DECODE (auto-detects f32 vs Q8 vs Q4 from file content) + { + int T_latent = 0; + float * latent = read_latent(input_path, &T_latent); + if (!latent) return 1; + + VAEGGML dec = {}; + vae_ggml_load(&dec, vae_path); + + int max_T = T_latent * 1920 + 4096; + std::vector audio((size_t)2 * max_T, 0.0f); + + fprintf(stderr, "\n[VAE] Decoding %d latent frames...\n", T_latent); + int T_audio = vae_ggml_decode_tiled(&dec, latent, T_latent, + audio.data(), max_T, chunk_size, overlap); + free(latent); + if (T_audio < 0) { vae_ggml_free(&dec); return 1; } + + if (write_wav(output_path, audio.data(), 
T_audio, 48000)) + fprintf(stderr, "\n[VAE] Output: %s (%d samples, %.2fs @ 48kHz)\n", + output_path, T_audio, (float)T_audio / 48000.0f); + else + fprintf(stderr, "[VAE] FATAL: failed to write %s\n", output_path); + + vae_ggml_free(&dec); + fprintf(stderr, "[VAE] Done.\n"); + return 0; + } +} diff --git a/tools/quantize.cpp b/tools/quantize.cpp index c778a47..84a3dd4 100644 --- a/tools/quantize.cpp +++ b/tools/quantize.cpp @@ -10,7 +10,6 @@ #include #include #include -#include #include #ifdef _WIN32