diff --git a/README.md b/README.md index 8ad6ebf..a9beca0 100644 --- a/README.md +++ b/README.md @@ -31,10 +31,6 @@ cmake --build . --config Release -j$(nproc) Builds two binaries: `ace-qwen3` (LLM) and `dit-vae` (DiT + VAE). -**CI (GitHub Actions)** -- **Build**: on every push/PR, builds on Ubuntu (BLAS) and macOS (Metal); smoke test runs each binary `--help`. -- **Test generation**: on release or manual trigger only; runs the same checks as **local** `tests/run-generation-tests.sh`. Validate locally first (build + `./models.sh`, then `tests/run-generation-tests.sh`), then use CI to confirm. See `.github/workflows/`. - ## Models Pre-quantized GGUFs on [Hugging Face](https://huggingface.co/Serveurperso/ACE-Step-1.5-GGUF). @@ -143,16 +139,10 @@ cd examples ./partial.sh # caption + lyrics + duration ./full.sh # all metadata provided ./dit-only.sh # skip LLM, DiT from noise -./cover.sh # cover mode: decode precomputed audio_codes (no LLM) -./cover-reference.sh # cover + reference_audio for timbre (WAV/MP3; needs reference.wav or .mp3) -./test-reference.sh # reference_audio (WAV or MP3) + audio_cover_strength -./lora.sh # DiT + LoRA adapter ``` Each example has a `-sft` variant (SFT model, 50 steps, CFG 7.0) -alongside the turbo default (8 steps, no CFG). For **reference timbre**, set `reference_audio` to a **WAV or MP3** path; dit-vae loads it (MP3 decoded in memory via header-only minimp3, no temp files), encodes with the VAE encoder (requires a full VAE GGUF that includes encoder weights). - -**LoRA adapters**: use `--lora ` and optional `--lora-scale ` with dit-vae to run the DiT with PEFT-style Ace-Step LoRAs. +alongside the turbo default (8 steps, no CFG). ## Generation modes @@ -180,11 +170,10 @@ Run `dit-vae` to decode existing codes. See `examples/dit-only.json`. ## Request JSON reference -All fields with defaults. Only `caption` is required. Built-in modes (text2music, cover, repaint) and audio inputs follow the [ACE-Step 1.5 Tutorial](https://github.com/ace-step/ACE-Step-1.5/blob/main/docs/en/Tutorial.md); see [docs/MODES.md](docs/MODES.md) for what is implemented. +All fields with defaults. Only `caption` is required. ```json { - "task_type": "text2music", "caption": "", "lyrics": "", "instrumental": false, @@ -199,12 +188,7 @@ All fields with defaults. Only `caption` is required. Built-in modes (text2music "lm_top_p": 0.9, "lm_top_k": 0, "lm_negative_prompt": "", - "reference_audio": "", - "src_audio": "", "audio_codes": "", - "audio_cover_strength": 1.0, - "repainting_start": 0.0, - "repainting_end": 0.0, "inference_steps": 8, "guidance_scale": 7.0, "shift": 3.0 @@ -214,12 +198,7 @@ All fields with defaults. Only `caption` is required. Built-in modes (text2music Key fields: `seed` -1 means random (resolved once, then +1 per batch element). `audio_codes` is generated by ace-qwen3 and consumed by dit-vae (comma separated FSQ token IDs). When present, the LLM is -skipped entirely (cover-style generation). `reference_audio`: path to a **WAV or MP3** file for global timbre/style (MP3 decoded in memory; encoded via built-in VAE encoder; requires VAE GGUF with encoder weights). `src_audio`: path to a **WAV or MP3** for cover source; dit-vae encodes it (VAE + FSQ nearest-codeword) to codes internally, no Python required (see docs/MODES.md). - -**Reference and cover strength (not the same as guidance_scale):** -- **`audio_cover_strength`** (0.0–1.0): Controls how strongly the **cover/source** (from `audio_codes` or `src_audio`) influences the DiT context. The context is blended with silence: `(1 - audio_cover_strength)*silence + audio_cover_strength*decoded`. Use 1.0 for full cover influence, lower values to soften it. Only applies when cover context is present. -- **`reference_audio`**: Timbre from the reference file is applied at full strength; there is no separate strength parameter for reference timbre. -- **`guidance_scale`**: This is **DiT classifier-free guidance** (conditioned vs unconditioned prediction), not reference or cover strength. Turbo models ignore it (forced to 1.0). +skipped entirely. Turbo preset: `inference_steps=8, shift=3.0` (no guidance_scale, turbo models don't use CFG). SFT preset: `inference_steps=50, guidance_scale=4.0, shift=6.0`. @@ -241,6 +220,7 @@ Output naming: input.json -> input0.json, input1.json, ... (last digit = batch i Debug: --max-seq KV cache size (default: 8192) --no-fsm Disable FSM constrained decoding + --no-fa Disable flash attention --dump-logits Dump prefill logits (binary f32) --dump-tokens Dump prompt token IDs (CSV) ``` @@ -262,10 +242,6 @@ Required: --dit DiT GGUF file --vae VAE GGUF file -LoRA: - --lora LoRA adapter (adapter_model.safetensors) - --lora-scale LoRA scale, e.g. alpha/rank (default: 1.0) - Batch: --batch DiT variations per request (default: 1, max 9) @@ -276,6 +252,7 @@ VAE tiling (memory control): --vae-overlap Overlap frames per side (default: 64) Debug: + --no-fa Disable flash attention --dump Dump intermediate tensors ``` @@ -320,10 +297,7 @@ conditional and N unconditional sequences are packed into a single forward pass `logits = uncond + scale * (cond - uncond)`. The KV cache is a single 4D tensor `[D, max_seq, Nkv, n_sets]` shared across all batch elements and CFG paths. Shared prompts are prefilled once and cloned to other KV sets via copy, avoiding redundant -prefills. Embedding lookup bypasses ggml_get_rows entirely: rows are read directly -from the mmap'd GGUF file on CPU, dequantized, and uploaded as F32 input tensors. -Decode uses a dedicated single-backend graph allocator (gallocr) with no scheduler -dispatch overhead, while prefill uses the multi-backend scheduler for flexibility. +prefills. ## Accuracy @@ -343,42 +317,42 @@ python3 debug-dit-cossim.py # DiT: per-layer cossim GGML vs Python (turbo/ ## Patched GGML fork -Uses a patched GGML fork (submodule) with ops added for the Oobleck VAE decoder. +Uses a patched GGML fork (submodule) with two new ops and a CUDA bugfix for the Oobleck +VAE decoder. All backends: CPU, CUDA, Metal, Vulkan. F32/F16/BF16 data types. +The DiT uses only standard GGML ops and needs no patches. The VAE reconstructs audio from latent space through 5 upsampling blocks (total 1920x), each running a transposed convolution followed by 3 WaveNet-style residual units with dilated convolutions and Snake activations. A single tile builds a graph of 36 snake activations, 5 transposed convolutions, and 32 regular convolutions. At the final blocks, -sequence lengths reach 491520 timesteps, which stresses GGML ops designed for short NLP sequences. -The DiT (flow matching diffusion transformer) uses only standard GGML ops and needs no patches. - -Patches on top of upstream GGML, oldest first: - -| Commit | Scope | Description | -|--------|-------|-------------| -| `8c70db84` | CUDA | `conv_transpose_1d`: replace O(T_in) brute-force loop with bounded range | -| `b65bf458` | CUDA | `im2col`: grid-stride loop on OW to fix gridDim.y overflow when T > 65535 | -| `e0e36f3c` | Metal | `conv_transpose_1d`: same bounded loop fix as CUDA | -| `2b9080bd` | CPU, CUDA, Metal | New `GGML_OP_COL2IM_1D`: scatter-add for GEMM-based conv_transpose_1d decomposition | -| `02c8041f` | CPU, CUDA, Metal | New `GGML_OP_SNAKE`: fused activation y = x + sin^2(a*x) / b (replaces 5 element-wise ops) | -| `3f60b19c` | Metal | Fix snake kernel to use current C wrapper API | -| `cb5d7067` | Vulkan | Guard `VK_EXT_layer_settings` for legacy Vulkan SDK (fixes MI50/gfx906) | -| `1f0f4214` | Vulkan | `col2im_1d`: add Vulkan backend | -| `efbf3df6` | Vulkan | `snake`: add Vulkan backend | -| `6608cd11` | Vulkan | Fix rvalue ref for `col2im_1d` and `snake` push constants | -| `06101d38` | Vulkan | Fix double-division dispatch for `col2im_1d` and `snake` | -| `91416cee` | CPU, CUDA, Metal, Vulkan | `col2im_1d`: fuse padding crop via p0 parameter (saves 5 allocs + 5 memcpy per VAE tile) | -| `20675b09` | Vulkan | `col2im_1d`, `snake`: 2D dispatch (fixes workgroup overflow on MI50) | - -**Why col2im_1d**: upstream `ggml_conv_transpose_1d` uses a naive CUDA kernel (one scalar -FMA loop per output element, no shared memory, no tensor cores). The VAE spends 40% of its -FLOP budget on transposed convolutions. We decompose it as `mul_mat + col2im_1d`, routing -the heavy GEMM through cuBLAS/BLAS/MPS tensor cores. The col2im_1d gather has a 2-iteration -inner loop and is pure bandwidth. - -**Why snake**: the Oobleck VAE uses Snake1d activation (x + sin^2(a*x) / b) 36 times per -tile. Without a fused op, each activation requires 5 separate GGML kernels (mul, sin, sqr, -mul, add), causing 5x the memory traffic. The fused kernel reads x once, writes y once. +sequence lengths reach 491520 timesteps, which stresses GGML ops designed for short NLP +sequences. + +### `GGML_OP_SNAKE` (fused Snake activation) + +Computes y = x + sin^2(a * x) * inv_b in a single kernel. +The Oobleck VAE calls this 36 times per tile. Without a fused op, each activation +requires 5 separate GGML kernels (mul, sin, sqr, mul, add), causing 5x the memory +traffic. The fused kernel reads x once and writes y once. BF16 cast nodes before/after +each snake call halve memory bandwidth at the cost of negligible precision loss +(cossim > 0.999 vs F32 baseline). + +### `GGML_OP_COL2IM_1D` (scatter-add for GEMM-based conv_transpose_1d) + +Gather-based reconstruction of a 1D signal from GEMM columns [K*OC, T_in] to +[T_out, OC], with fused padding crop via the p0 parameter. +Upstream `ggml_conv_transpose_1d` uses a naive kernel (one scalar FMA loop per output +element, no shared memory, no tensor cores). The VAE spends 40% of its FLOP budget on +transposed convolutions. We decompose each as `mul_mat + col2im_1d`, routing the heavy +GEMM through cuBLAS/BLAS/MPS tensor cores. The col2im_1d gather has a 2-iteration inner +loop and is pure bandwidth. BF16 cast nodes around col2im_1d halve the scatter bandwidth. + +### Bugfix: `im2col` gridDim.y overflow (CUDA) + +Upstream `im2col_kernel` uses OW directly as grid dimension Y, which exceeds the CUDA +65535 gridDim limit on long sequences. The VAE calls `ggml_conv_1d` (im2col path) 32 +times per tile at output widths up to 491520. Fixed with a grid-stride loop on OW and +`MIN(OW, MAX_GRIDDIM_Z)` clamping. ## Acknowledgements diff --git a/_codeql_detected_source_root b/_codeql_detected_source_root new file mode 120000 index 0000000..945c9b4 --- /dev/null +++ b/_codeql_detected_source_root @@ -0,0 +1 @@ +. \ No newline at end of file diff --git a/buildcuda.sh b/buildcuda.sh new file mode 100755 index 0000000..67f711f --- /dev/null +++ b/buildcuda.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +rm -rf build +mkdir build +cd build + +cmake .. -DGGML_CUDA=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc +cmake --build . --config Release -j "$(nproc)" diff --git a/ggml b/ggml index c04770a..55e062a 160000 --- a/ggml +++ b/ggml @@ -1 +1 @@ -Subproject commit c04770a7056267bf0264b7c96d34cd84b24b04e8 +Subproject commit 55e062ab597eccaa3e7ee7c7b230197d83d94bc8 diff --git a/src/cond-enc.h b/src/cond-enc.h index 7de70a8..880cbf7 100644 --- a/src/cond-enc.h +++ b/src/cond-enc.h @@ -69,6 +69,7 @@ struct CondGGML { ggml_backend_t backend; ggml_backend_t cpu_backend; ggml_backend_sched_t sched; + bool use_flash_attn; WeightCtx wctx; }; @@ -78,6 +79,7 @@ static void cond_ggml_init_backend(CondGGML * m) { m->backend = bp.backend; m->cpu_backend = bp.cpu_backend; m->sched = backend_sched_new(bp, 8192); + m->use_flash_attn = true; } // Load from ACEStep DiT GGUF @@ -191,7 +193,8 @@ static void cond_ggml_forward(CondGGML * m, for (int i = 0; i < m->lyric_cfg.n_layers; i++) { struct ggml_tensor * layer_mask = (i % 2 == 0) ? lyric_slide_mask : NULL; lyric_h = qwen3_build_layer(ctx, m->lyric_cfg, &m->lyric_layers[i], - lyric_h, lyric_pos, layer_mask, S_lyric); + lyric_h, lyric_pos, layer_mask, S_lyric, + m->use_flash_attn); } lyric_h = qwen3_rms_norm(ctx, lyric_h, m->lyric_norm, m->lyric_cfg.rms_norm_eps); @@ -236,7 +239,8 @@ static void cond_ggml_forward(CondGGML * m, for (int i = 0; i < m->timbre_cfg.n_layers; i++) { struct ggml_tensor * layer_mask = (i % 2 == 0) ? timbre_slide_mask : NULL; timbre_h = qwen3_build_layer(ctx, m->timbre_cfg, &m->timbre_layers[i], - timbre_h, timbre_pos, layer_mask, S_ref); + timbre_h, timbre_pos, layer_mask, S_ref, + m->use_flash_attn); } timbre_h = qwen3_rms_norm(ctx, timbre_h, m->timbre_norm, m->timbre_cfg.rms_norm_eps); diff --git a/src/fsq-detok.h b/src/fsq-detok.h index 29eef5f..7430db7 100644 --- a/src/fsq-detok.h +++ b/src/fsq-detok.h @@ -64,6 +64,7 @@ struct DetokGGML { ggml_backend_t backend; ggml_backend_t cpu_backend; ggml_backend_sched_t sched; + bool use_flash_attn; WeightCtx wctx; }; @@ -73,6 +74,7 @@ static bool detok_ggml_load(DetokGGML * m, const char * gguf_path, m->cfg = detok_config(); m->backend = backend; m->cpu_backend = cpu_backend; + m->use_flash_attn = true; GGUFModel gf; if (!gf_load(&gf, gguf_path)) { @@ -169,7 +171,8 @@ static int detok_ggml_decode(DetokGGML * m, const int * codes, int T_5Hz, // 2L encoder + norm (non-causal, no mask needed at S=5) hidden = qwen3_build_layers(ctx, m->cfg, m->layers, m->norm, - hidden, positions, NULL, P); + hidden, positions, NULL, P, + m->use_flash_attn); // proj_out: [2048, 5] -> [64, 5] struct ggml_tensor * output = ggml_mul_mat(ctx, m->proj_out_w, hidden); diff --git a/src/qwen3-enc.h b/src/qwen3-enc.h index 07dce03..02bf9c9 100644 --- a/src/qwen3-enc.h +++ b/src/qwen3-enc.h @@ -71,6 +71,7 @@ struct Qwen3GGML { ggml_backend_t backend; ggml_backend_t cpu_backend; ggml_backend_sched_t sched; + bool use_flash_attn; WeightCtx wctx; }; @@ -94,6 +95,23 @@ static struct ggml_tensor * qwen3_linear_bias(struct ggml_context * ctx, return ggml_add(ctx, out, qwen3_f32(ctx, b)); } +// F32 manual attention (fallback when flash_attn_ext is disabled). +// Works for 3D [D, S, X] and 4D [D, S, X, N] inputs. +// Returns same layout as flash_attn_ext: dims 1 and 2 swapped vs input. +static struct ggml_tensor * qwen3_attn_f32( + struct ggml_context * ctx, + struct ggml_tensor * q, + struct ggml_tensor * k, + struct ggml_tensor * v, + struct ggml_tensor * mask, + float scale) { + struct ggml_tensor * scores = ggml_mul_mat(ctx, k, q); + scores = ggml_soft_max_ext(ctx, scores, mask, scale, 0.0f); + struct ggml_tensor * vt = ggml_cont(ctx, ggml_transpose(ctx, v)); + struct ggml_tensor * out = ggml_mul_mat(ctx, vt, scores); + return ggml_cont(ctx, ggml_permute(ctx, out, 0, 2, 1, 3)); +} + static struct ggml_tensor * qwen3_rms_norm(struct ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * w, @@ -114,7 +132,8 @@ static struct ggml_tensor * qwen3_build_self_attn( struct ggml_tensor * x, // [H, S] struct ggml_tensor * positions, // [S] int32 struct ggml_tensor * mask, // [S, S] or NULL - int S) { + int S, + bool use_flash_attn = true) { int D = c.head_dim; int Nh = c.n_heads; @@ -164,10 +183,13 @@ static struct ggml_tensor * qwen3_build_self_attn( k = ggml_permute(ctx, k, 0, 2, 1, 3); v = ggml_permute(ctx, v, 0, 2, 1, 3); - // 6) Flash attention (handles GQA) + // 6) Attention (flash or F32 manual fallback) float scale = 1.0f / sqrtf((float)D); - struct ggml_tensor * attn = ggml_flash_attn_ext(ctx, q, k, v, mask, scale, 0.0f, 0.0f); - ggml_flash_attn_ext_set_prec(attn, GGML_PREC_F32); // F32 accumulation + struct ggml_tensor * attn = use_flash_attn + ? ggml_flash_attn_ext(ctx, q, k, v, mask, scale, 0.0f, 0.0f) + : qwen3_attn_f32(ctx, q, k, v, mask, scale); + if (use_flash_attn) + ggml_flash_attn_ext_set_prec(attn, GGML_PREC_F32); // 7) Reshape back: [D, Nh, S] -> [Nh*D, S] attn = ggml_reshape_2d(ctx, attn, Nh * D, S); @@ -203,11 +225,12 @@ static struct ggml_tensor * qwen3_build_layer( struct ggml_tensor * hidden, struct ggml_tensor * positions, struct ggml_tensor * mask, - int S) { + int S, + bool use_flash_attn = true) { // Self-attention block struct ggml_tensor * norm = qwen3_rms_norm(ctx, hidden, ly->input_layernorm, c.rms_norm_eps); - struct ggml_tensor * attn = qwen3_build_self_attn(ctx, c, ly, norm, positions, mask, S); + struct ggml_tensor * attn = qwen3_build_self_attn(ctx, c, ly, norm, positions, mask, S, use_flash_attn); hidden = ggml_add(ctx, hidden, attn); // MLP block @@ -227,10 +250,11 @@ static struct ggml_tensor * qwen3_build_layers( struct ggml_tensor * hidden, struct ggml_tensor * positions, struct ggml_tensor * mask, - int S) { + int S, + bool use_flash_attn = true) { for (int i = 0; i < c.n_layers; i++) { - hidden = qwen3_build_layer(ctx, c, &layers[i], hidden, positions, mask, S); + hidden = qwen3_build_layer(ctx, c, &layers[i], hidden, positions, mask, S, use_flash_attn); } return qwen3_rms_norm(ctx, hidden, final_norm_w, c.rms_norm_eps); } @@ -287,6 +311,7 @@ static void qwen3_init_backend(Qwen3GGML * m) { m->backend = bp.backend; m->cpu_backend = bp.cpu_backend; m->sched = backend_sched_new(bp, 4096); + m->use_flash_attn = true; } // Load standalone text encoder (Qwen3-Embedding) from GGUF @@ -372,7 +397,8 @@ static void qwen3_forward(Qwen3GGML * m, const int * token_ids, int S, float * o // N layers + final norm struct ggml_tensor * out = qwen3_build_layers(ctx, c, m->layers, m->final_norm, - hidden, positions, mask, S); + hidden, positions, mask, S, + m->use_flash_attn); ggml_set_name(out, "output"); ggml_set_output(out); ggml_build_forward_expand(gf, out); @@ -409,27 +435,33 @@ static void qwen3_forward(Qwen3GGML * m, const int * token_ids, int S, float * o ggml_free(ctx); } -// CPU vocab lookup utility -// For lyric embedding: look up token IDs in text encoder's embed table (bf16 -> f32) -// GGUF keeps mmapped data alive. Output: [H, S] float (H contiguous per token). -// -// embed_data: pointer to bf16 weight data [vocab, H] in PyTorch layout (H contiguous per row) +// Embedding lookup via ggml graph (reuses text encoder weights + scheduler) // token_ids: [S] int32 // output: [H * S] float (ggml layout: H contiguous, S tokens) -static void qwen3_cpu_embed_lookup(const void * embed_data, int H, - const int * token_ids, int S, - float * output) { - const uint16_t * bf16 = (const uint16_t *)embed_data; - for (int s = 0; s < S; s++) { - int tok = token_ids[s]; - const uint16_t * row = bf16 + (int64_t)tok * H; - float * dst = output + (int64_t)s * H; - for (int h = 0; h < H; h++) { - // bf16 to f32: shift left 16 bits - uint32_t bits = (uint32_t)row[h] << 16; - memcpy(&dst[h], &bits, 4); - } - } +static void qwen3_embed_lookup(Qwen3GGML * m, const int * token_ids, int S, float * output) { + int H = m->cfg.hidden_size; + + size_t ctx_size = 16 * ggml_tensor_overhead() + ggml_graph_overhead(); + struct ggml_init_params gp = { ctx_size, NULL, true }; + struct ggml_context * ctx = ggml_init(gp); + struct ggml_cgraph * gf = ggml_new_graph(ctx); + + struct ggml_tensor * t_ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, S); + ggml_set_name(t_ids, "token_ids"); + ggml_set_input(t_ids); + + struct ggml_tensor * out = ggml_get_rows(ctx, m->embed_tokens, t_ids); + ggml_set_name(out, "embed_out"); + ggml_set_output(out); + ggml_build_forward_expand(gf, out); + + ggml_backend_sched_alloc_graph(m->sched, gf); + ggml_backend_tensor_set(t_ids, token_ids, 0, S * sizeof(int)); + ggml_backend_sched_graph_compute(m->sched, gf); + ggml_backend_tensor_get(out, output, 0, (size_t)H * S * sizeof(float)); + + ggml_backend_sched_reset(m->sched); + ggml_free(ctx); } // Free diff --git a/src/qwen3-lm.h b/src/qwen3-lm.h index 5395b5a..29b254f 100644 --- a/src/qwen3-lm.h +++ b/src/qwen3-lm.h @@ -45,14 +45,8 @@ struct Qwen3LM { WeightCtx wctx; ggml_backend_t backend; ggml_backend_t cpu_backend; - ggml_backend_sched_t sched; // prefill (variable shapes, runs once) - ggml_gallocr_t galloc; // decode (single GPU, tight loop) - - // CPU-side embed lookup via mmap (avoids ggml_get_rows which lacks - // CUDA K-quant support, preventing costly cross-backend tensor copies) - GGUFModel gf_mmap; - const void * embed_mmap_data; - enum ggml_type embed_type; + ggml_backend_sched_t sched; + bool use_flash_attn; // KV cache: per-set, per-layer [D, max_seq, Nkv] f16 struct ggml_context * kv_ctx; @@ -150,7 +144,7 @@ static void qw3lm_init_backend(Qwen3LM * m) { m->backend = bp.backend; m->cpu_backend = bp.cpu_backend; m->sched = backend_sched_new(bp, 8192); - m->galloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(m->backend)); + m->use_flash_attn = true; } // Allocate KV cache @@ -253,19 +247,7 @@ static bool qw3lm_load(Qwen3LM * m, const char * gguf_path, int max_seq_len, int } wctx_alloc(&m->wctx, m->backend); - - // Keep mmap alive for CPU embed dequant lookup - m->embed_mmap_data = gf_get_data(gf, "model.embed_tokens.weight"); - m->embed_type = m->embed_tokens->type; - if (!m->embed_mmap_data) { - fprintf(stderr, "[LM-Load] FATAL: embed_tokens not found in mmap\n"); - gf_close(&gf); - return false; - } - m->gf_mmap = gf; // transfer ownership (no gf_close here) - fprintf(stderr, "[LM-Load] CPU embed lookup: type=%s, row=%zu bytes\n", - ggml_type_name(m->embed_type), - ggml_row_size(m->embed_type, c.hidden_size)); + gf_close(&gf); // KV cache qw3lm_alloc_kv_cache(m, n_kv_sets > 0 ? n_kv_sets : 1); @@ -287,7 +269,8 @@ static struct ggml_tensor * qw3lm_build_attn( struct ggml_tensor * cache_v, // [D, max_seq, Nkv] f16 int kv_pos, int kv_len, - int n_tokens) { + int n_tokens, + bool use_flash_attn = true) { int D = c.head_dim; int Nh = c.n_heads; @@ -356,10 +339,13 @@ static struct ggml_tensor * qw3lm_build_attn( struct ggml_tensor * k_full = ggml_view_3d(ctx, cache_k, D, kv_len, Nkv, nb1, nb2, 0); struct ggml_tensor * v_full = ggml_view_3d(ctx, cache_v, D, kv_len, Nkv, nb1, nb2, 0); - // Flash attention + // Attention (flash or F32 manual fallback) float scale = 1.0f / sqrtf((float)D); - struct ggml_tensor * attn = ggml_flash_attn_ext(ctx, q, k_full, v_full, mask, scale, 0.0f, 0.0f); - ggml_flash_attn_ext_set_prec(attn, GGML_PREC_F32); // F32 accumulation + struct ggml_tensor * attn = use_flash_attn + ? ggml_flash_attn_ext(ctx, q, k_full, v_full, mask, scale, 0.0f, 0.0f) + : qwen3_attn_f32(ctx, q, k_full, v_full, mask, scale); + if (use_flash_attn) + ggml_flash_attn_ext_set_prec(attn, GGML_PREC_F32); // Reshape: [D, Nh, S] -> [Nh*D, S] attn = ggml_reshape_2d(ctx, attn, Nh * D, S); @@ -401,14 +387,12 @@ static void qw3lm_forward(Qwen3LM * m, const int * token_ids, int n_tokens, ggml_set_input(mask); } - // Embedding: CPU dequant from mmap, fed as F32 input. - // This keeps embed_tokens out of get_rows (no CUDA K-quant support) - // and only in mul_mat (lm_head) which has full K-quant CUDA support. - struct ggml_tensor * embed_out = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, H, n_tokens); - ggml_set_name(embed_out, "embed_out"); - ggml_set_input(embed_out); + // Embedding via ggml_get_rows (scheduler handles backend fallback) + struct ggml_tensor * token_ids_t = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens); + ggml_set_name(token_ids_t, "token_ids"); + ggml_set_input(token_ids_t); - struct ggml_tensor * hidden = embed_out; + struct ggml_tensor * hidden = ggml_get_rows(ctx, m->embed_tokens, token_ids_t); // Transformer layers for (int l = 0; l < c.n_layers; l++) { @@ -421,7 +405,7 @@ static void qw3lm_forward(Qwen3LM * m, const int * token_ids, int n_tokens, struct ggml_tensor * attn = qw3lm_build_attn( ctx, gf, c, ly, norm, positions, mask, m->kv_k[kv_set][l], m->kv_v[kv_set][l], - kv_pos, kv_len, n_tokens); + kv_pos, kv_len, n_tokens, m->use_flash_attn); // Residual hidden = ggml_add(ctx, hidden, attn); @@ -450,18 +434,8 @@ static void qw3lm_forward(Qwen3LM * m, const int * token_ids, int n_tokens, // Schedule + allocate ggml_backend_sched_alloc_graph(m->sched, gf); - // CPU-side embedding dequantization from mmap - { - const int64_t row_size = (int64_t)ggml_row_size(m->embed_type, H); - const ggml_to_float_t to_float = ggml_get_type_traits(m->embed_type)->to_float; - std::vector embed_buf((size_t)H * n_tokens); - for (int i = 0; i < n_tokens; i++) { - const void * row = (const char *)m->embed_mmap_data + (int64_t)token_ids[i] * row_size; - to_float(row, embed_buf.data() + (int64_t)i * H, H); - } - ggml_backend_tensor_set(embed_out, embed_buf.data(), 0, - (size_t)H * n_tokens * sizeof(float)); - } + // Set token IDs + ggml_backend_tensor_set(token_ids_t, token_ids, 0, n_tokens * sizeof(int)); { std::vector pos_data(n_tokens); @@ -507,7 +481,6 @@ static void qw3lm_forward_batch(Qwen3LM * m, const int * token_ids, const int * kv_sets, int N, float * logits, int lm_offset = 0, int lm_count = 0) { const Qwen3LMConfig & c = m->cfg; - int H = c.hidden_size; int D = c.head_dim; int Nh = c.n_heads; int Nkv = c.n_kv_heads; @@ -530,10 +503,10 @@ static void qw3lm_forward_batch(Qwen3LM * m, const int * token_ids, struct ggml_context * ctx = ggml_init(gp); struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 16384, false); - // Embedding: [H, N] - struct ggml_tensor * embed_out = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, H, N); - ggml_set_name(embed_out, "embed_out"); - ggml_set_input(embed_out); + // Embedding via ggml_get_rows (scheduler handles backend fallback) + struct ggml_tensor * token_ids_t = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N); + ggml_set_name(token_ids_t, "token_ids"); + ggml_set_input(token_ids_t); // Positions: [N], per-element kv_pos struct ggml_tensor * positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N); @@ -546,7 +519,7 @@ static void qw3lm_forward_batch(Qwen3LM * m, const int * token_ids, ggml_set_name(attn_mask, "attn_mask"); ggml_set_input(attn_mask); - struct ggml_tensor * hidden = embed_out; + struct ggml_tensor * hidden = ggml_get_rows(ctx, m->embed_tokens, token_ids_t); for (int l = 0; l < c.n_layers; l++) { Qwen3Layer * ly = &m->layers[l]; @@ -639,10 +612,12 @@ static void qw3lm_forward_batch(Qwen3LM * m, const int * token_ids, m->kv_v4[l]->nb[1], m->kv_v4[l]->nb[2], m->kv_v4[l]->nb[3], (size_t)s0 * m->kv_v4[l]->nb[3]); - // Batched flash attention: 1 kernel per layer instead of N - struct ggml_tensor * attn_result = ggml_flash_attn_ext(ctx, - q4, k_batch, v_batch, attn_mask, scale, 0.0f, 0.0f); - ggml_flash_attn_ext_set_prec(attn_result, GGML_PREC_F32); + // Batched attention (flash or F32 manual fallback) + struct ggml_tensor * attn_result = m->use_flash_attn + ? ggml_flash_attn_ext(ctx, q4, k_batch, v_batch, attn_mask, scale, 0.0f, 0.0f) + : qwen3_attn_f32(ctx, q4, k_batch, v_batch, attn_mask, scale); + if (m->use_flash_attn) + ggml_flash_attn_ext_set_prec(attn_result, GGML_PREC_F32); // Output: [D, Nh, 1, N] -> [Nh*D, N] struct ggml_tensor * attn_cat = ggml_reshape_2d(ctx, attn_result, Nh * D, N); @@ -673,20 +648,11 @@ static void qw3lm_forward_batch(Qwen3LM * m, const int * token_ids, ggml_set_output(lgt); ggml_build_forward_expand(gf, lgt); - // Allocate (gallocr: single-backend, no scheduler overhead) - ggml_gallocr_alloc_graph(m->galloc, gf); + // Allocate + ggml_backend_sched_alloc_graph(m->sched, gf); - // CPU-side embedding dequant - { - const int64_t row_size = (int64_t)ggml_row_size(m->embed_type, H); - const ggml_to_float_t to_float = ggml_get_type_traits(m->embed_type)->to_float; - std::vector embed_buf((size_t)H * N); - for (int i = 0; i < N; i++) { - const void * row = (const char *)m->embed_mmap_data + (int64_t)token_ids[i] * row_size; - to_float(row, embed_buf.data() + (int64_t)i * H, H); - } - ggml_backend_tensor_set(embed_out, embed_buf.data(), 0, (size_t)H * N * sizeof(float)); - } + // Set token IDs + ggml_backend_tensor_set(token_ids_t, token_ids, 0, N * sizeof(int)); // Positions: per-element kv_pos { @@ -710,8 +676,8 @@ static void qw3lm_forward_batch(Qwen3LM * m, const int * token_ids, mask_data.size() * sizeof(uint16_t)); } - // Compute (direct backend, no scheduler dispatch) - ggml_backend_graph_compute(m->backend, gf); + // Compute + ggml_backend_sched_graph_compute(m->sched, gf); // Read logits [out_V, N] ggml_backend_tensor_get(lgt, logits, 0, (size_t)out_V * N * sizeof(float)); @@ -720,18 +686,17 @@ static void qw3lm_forward_batch(Qwen3LM * m, const int * token_ids, for (int i = 0; i < N; i++) m->kv_pos[kv_sets[i]]++; + ggml_backend_sched_reset(m->sched); ggml_free(ctx); } // Free all resources static void qw3lm_free(Qwen3LM * m) { - if (m->galloc) ggml_gallocr_free(m->galloc); if (m->sched) ggml_backend_sched_free(m->sched); if (m->kv_buf) ggml_backend_buffer_free(m->kv_buf); if (m->kv_ctx) ggml_free(m->kv_ctx); if (m->backend && m->backend != m->cpu_backend) ggml_backend_free(m->backend); if (m->cpu_backend) ggml_backend_free(m->cpu_backend); wctx_free(&m->wctx); - gf_close(&m->gf_mmap); *m = {}; } diff --git a/tests/CPU-BF16.log b/tests/CPU-BF16.log new file mode 100644 index 0000000..b20ebae --- /dev/null +++ b/tests/CPU-BF16.log @@ -0,0 +1,257 @@ +[Load] DiT backend: CPU (CPU threads: 16) +[Load] Backend init: 1.5 ms +[GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 +[DiT] Self-attn: Q+K+V fused +[DiT] Cross-attn: Q+K+V fused +[DiT] MLP: gate+up fused +[Load] null_condition_emb found (CFG available) +[WeightCtx] Loaded 478 tensors, 3007.9 MB into backend +[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 +[Load] DiT weight load: 464.0 ms +[GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 +[Load] silence_latent: [15000, 64] from GGUF +[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 +[Load] VAE backend: CPU (CPU threads: 16) +[VAE] Backend: CPU, Weight buffer: 161.1 MB +[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations +[Load] VAE weights: 651.3 ms +[Request 1/1] ggml-turbo/request0.json (batch=1) +[Request] parsed ggml-turbo/request0.json (18 fields) +[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) +[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s +[Pipeline] 434 audio codes (86.8s @ 5Hz) +[Pipeline] T=2170, S=1085 +[BPE] Loaded from GGUF: 151643 vocab, 151387 merges +[Load] BPE tokenizer: 31.9 ms +[Pipeline] caption: 70 tokens, lyrics: 167 tokens +[Load] TextEncoder backend: CPU (CPU threads: 16) +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[Load] TextEncoder: 28L, H=1024, Nh=16/8 +[Qwen3] Attn: Q+K+V fused +[Qwen3] MLP: gate+up fused +[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend +[Load] TextEncoder: 226.8 ms +[Encode] TextEncoder (70 tokens): 59.7 ms +[Debug] text_hidden: [70, 1024] first4: 3.704526 2.436253 0.222853 -13.131872 +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[Encode] Lyric vocab lookup (167 tokens): 12.7 ms +[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 +[Load] CondEncoder backend: CPU (CPU threads: 16) +[GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 +[Load] LyricEncoder: 8L +[Qwen3] Attn: Q+K+V fused +[Qwen3] MLP: gate+up fused +[Load] TimbreEncoder: 4L +[Qwen3] Attn: Q+K+V fused +[Qwen3] MLP: gate+up fused +[WeightCtx] Loaded 140 tensors, 1160.5 MB into backend +[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond +[Load] ConditionEncoder: 230.8 ms +[CondEnc] Lyric sliding mask: 167x167, window=128 +[CondEnc] Timbre sliding mask: 750x750, window=128 +[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens +[Encode] ConditionEncoder: 274.9 ms, enc_S=238 +[Debug] enc_hidden: [238, 2048] first4: 1.758296 -0.049593 -0.132844 0.058496 +[GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 +[WeightCtx] Loaded 30 tensors, 200.3 MB into backend +[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) +[Load] Detokenizer: 34.6 ms +[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) +[Context] Detokenizer: 958.8 ms +[Debug] detok_output: [2170, 64] first4: -0.124160 1.435260 0.310138 -0.624584 +[Context Batch0] Philox noise seed=42, [2170, 64] +[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 +[Debug] context: [2170, 128] first4: -0.124160 1.435260 0.310138 -0.624584 +[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1 +[DiT] Batch N=1, T=2170, S=1085, enc_S=238 +[DiT] Graph: 2129 nodes +[Debug] tproj: [12288] first4: 0.260222 -0.161617 -0.097078 0.052346 +[Debug] temb: [2048] first4: 0.000077 -0.132559 -0.035432 0.064735 +[Debug] temb_t: [2048] first4: 0.001069 0.026790 -0.052756 0.063697 +[Debug] temb_r: [2048] first4: -0.000991 -0.159349 0.017324 0.001038 +[Debug] sinusoidal_t: [256] first4: 0.562379 0.789627 0.439928 -0.023645 +[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000 +[Debug] temb_lin1_t: [2048] first4: -0.049513 -0.051899 -0.014138 -0.038434 +[Debug] temb_lin1_r: [2048] first4: -0.013266 -0.018319 -0.016375 0.008532 +[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.039830 -0.969685 0.533102 0.446442 +[Debug] proj_in_input: [192, 2170] first4: -0.124160 1.435260 0.310138 -0.624584 +[Debug] enc_after_cond_emb: [2048, 238] first4: -0.168787 0.814833 0.326668 -0.562433 +[Debug] layer0_sa_input: [2048, 1085] first4: -0.719501 -0.764459 -0.047725 0.261760 +[Debug] layer0_q_after_rope: [128, 16] first4: -1.541141 -1.045404 0.186748 0.455664 +[Debug] layer0_k_after_rope: [128, 8] first4: -0.168787 0.814833 0.326668 -0.562433 +[Debug] layer0_sa_output: [2048, 1085] first4: -1.500309 0.170627 -0.354600 0.512837 +[Debug] layer0_attn_out: [2048, 1085] first4: -1.541141 -1.045404 0.186748 0.455664 +[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.541141 -1.045404 0.186748 0.455664 +[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.599016 -0.822108 -0.298718 0.492092 +[Debug] hidden_after_layer0: [2048, 1085] first4: -9.098095 0.568142 52.394512 -0.905627 +[Debug] hidden_after_layer6: [2048, 1085] first4: -21.346304 0.043589 33.440353 -4.467471 +[Debug] hidden_after_layer12: [2048, 1085] first4: -14.856287 -18.096371 72.046799 28.866295 +[Debug] hidden_after_layer18: [2048, 1085] first4: -27.298880 15.859982 59.802349 20.914667 +[Debug] hidden_after_layer23: [2048, 1085] first4: -11.120972 45.536430 196.515015 145.620667 +[Debug] dit_step0_vt: [2170, 64] first4: 0.017592 1.109134 0.340961 2.380328 +[Debug] dit_step0_xt: [2170, 64] first4: 0.193536 2.105835 -0.187373 0.739460 +[DiT] step 1/8 t=1.000 +[Debug] dit_step1_vt: [2170, 64] first4: -0.231590 1.299610 -0.120825 1.895337 +[Debug] dit_step1_xt: [2170, 64] first4: 0.206168 2.034947 -0.180783 0.636078 +[DiT] step 2/8 t=0.955 +[Debug] dit_step2_vt: [2170, 64] first4: -0.025322 1.214425 0.100767 2.387164 +[Debug] dit_step2_xt: [2170, 64] first4: 0.207857 1.953985 -0.187501 0.476933 +[DiT] step 3/8 t=0.900 +[Debug] dit_step3_vt: [2170, 64] first4: 0.242072 1.092567 0.260294 2.643174 +[Debug] dit_step3_xt: [2170, 64] first4: 0.187684 1.862938 -0.209192 0.256669 +[DiT] step 4/8 t=0.833 +[Debug] dit_step4_vt: [2170, 64] first4: 0.292635 1.007325 0.109474 2.707222 +[Debug] dit_step4_xt: [2170, 64] first4: 0.156330 1.755010 -0.220921 -0.033391 +[DiT] step 5/8 t=0.750 +[Debug] dit_step5_vt: [2170, 64] first4: 0.268947 0.924783 -0.284788 2.767856 +[Debug] dit_step5_xt: [2170, 64] first4: 0.117909 1.622898 -0.180237 -0.428799 +[DiT] step 6/8 t=0.643 +[Debug] dit_step6_vt: [2170, 64] first4: 0.170391 0.634803 -0.816809 2.824526 +[Debug] dit_step6_xt: [2170, 64] first4: 0.083831 1.495938 -0.016875 -0.993704 +[DiT] step 7/8 t=0.500 +[Debug] dit_step7_vt: [2170, 64] first4: 0.002176 0.183052 -1.467304 3.113325 +[Debug] dit_x0: [2170, 64] first4: 0.083178 1.441022 0.423316 -1.927701 +[DiT] step 8/8 t=0.300 +[DiT] Total generation: 18721.5 ms (18721.5 ms/sample) +[Debug] dit_output: [2170, 64] first4: 0.083178 1.441022 0.423316 -1.927701 +[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) +[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Upsample factor: 1920.00 (expected ~1920) +[VAE] Graph: 417 nodes, T_latent=256 +[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) +[VAE Batch0] Decode: 51818.0 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000519 0.001024 0.000897 0.001200 +[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) +[Request 1/1] Done +[Pipeline] All done +2026-03-01 19:57:38.585 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. +2026-03-01 19:57:38.585 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora +2026-03-01 19:57:38.585 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. +2026-03-01 19:57:38.586 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. +2026-03-01 19:57:38.586 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. +2026-03-01 19:57:39.413 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +`torch_dtype` is deprecated! Use `dtype` instead! +2026-03-01 19:57:40.961 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... +2026-03-01 19:57:40.961 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... +2026-03-01 19:57:40.966 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-01 19:57:41.132 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-01 19:57:41.134 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +====================================================================== +2026-03-01 19:57:41.134 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-01 19:57:41.134 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-01 19:57:41.134 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +# Instruction +Generate audio semantic tokens based on the given conditions: + +# Caption +An upbeat and anthemic pop-rock track driven by bright, slightly overdriven + +# Metas +- bpm: 83 +- timesignature: 4 +- keyscale: G major +- duration: 88 seconds +<|endoftext|> + +2026-03-01 19:57:41.134 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-01 19:57:41.134 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +# Languages +fr + +# Lyric +# Lyric +[Intro - Guitar Riff] +[Verse 1] +Dans le monde des tutos virtuels +G ta toise en nouvelle passion +Avec Ggendoline et Pumbé à midi +La communauté, c'est l'unité +Quel joie, une clé + +[Chorus] +Dans le monde des tutos virtuels +Gândoline et Pumbé à midi +Une famille à connecter, c'est vrai +D'un enfant qui voit toi fusionner + +[Guitar Solo] + +[Verse 2] +Dans le monde des tutos virtuels +Gândoline, Pumbé à midi +Une famille à connecter, c'est vrai +D'un enfant qui voit toi fusionner<|endoftext|> +2026-03-01 19:57:41.134 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== + +2026-03-01 19:57:41.140 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-01 19:57:41.153 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-01 19:57:41.153 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-01 19:57:41.175 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-01 19:57:41.483 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... +2026-03-01 19:57:41.483 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-01 19:57:41.483 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.00688624382019043, 'diffusion_time_cost': 0.30014586448669434, 'diffusion_per_step_time_cost': 0.03751823306083679, 'total_time_cost': 0.30703210830688477, 'offload_time_cost': 0.0} +2026-03-01 19:57:41.498 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-01 19:57:41.500 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-01 19:57:41.500 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.44 GB +2026-03-01 19:57:41.500 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... +2026-03-01 19:57:41.500 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.44 GB +2026-03-01 19:57:41.500 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.44 GB +2026-03-01 19:57:41.500 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-01 19:57:41.775 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-01 19:57:41.777 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-01 19:57:41.780 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +[Request] Loaded request0.json +[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-BF16.gguf +[GGML] Running acestep-v15-turbo-BF16.gguf... +[GGML] Done, 47 dump files +[Python] Initializing acestep-v15-turbo... +[Python] Generating (acestep-v15-turbo, 8 steps)... +Using precomputed LM hints +Using precomputed LM hints +[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) +[Python] Done, 40 dump files +[Turbo] Cosine similarities GGML vs Python + stage GGML vs Python + text_hidden 0.999816 + lyric_embed 1.000000 + enc_hidden 0.999841 + detok_output 0.999995 + context 0.999997 + noise 1.000000 + temb_t 0.999999 + hidden_after_proj_in 0.999988 + enc_after_cond_emb 0.999832 + layer0_sa_output 0.999960 + hidden_after_layer0 0.999982 + hidden_after_layer6 0.999924 + hidden_after_layer12 0.999332 + hidden_after_layer18 0.996692 + hidden_after_layer23 0.993786 + dit_step0_vt 0.975712 + dit_step0_xt 0.999946 + dit_step1_vt 0.979525 + dit_step1_xt 0.999833 + dit_step2_vt 0.981808 + dit_step2_xt 0.999552 + dit_step3_vt 0.982382 + dit_step3_xt 0.998917 + dit_step4_vt 0.980777 + dit_step4_xt 0.997480 + dit_step5_vt 0.978078 + dit_step5_xt 0.994264 + dit_step6_vt 0.974849 + dit_step6_xt 0.988142 + dit_step7_vt 0.969102 + dit_x0 0.979106 + vae_audio 0.901370 + vae_audio (STFT cosine) 0.975816 +[Turbo] Error growth GGML vs Python + stage cos max_err mean_err mean_A std_A mean_B std_B + dit_step0_xt 0.999946 0.136541 0.006626 -0.002312 0.972951 -0.002342 0.972003 + dit_step1_xt 0.999833 0.265486 0.011288 -0.005309 0.942692 -0.005313 0.941730 + dit_step2_xt 0.999552 0.451896 0.017477 -0.009347 0.909217 -0.009311 0.908527 + dit_step3_xt 0.998917 0.642624 0.025957 -0.014710 0.873863 -0.014577 0.873624 + dit_step4_xt 0.997480 0.778374 0.037868 -0.021751 0.842047 -0.021660 0.841995 + dit_step5_xt 0.994264 1.244624 0.055630 -0.031814 0.825360 -0.032109 0.824593 + dit_step6_xt 0.988142 2.080976 0.082605 -0.046091 0.856212 -0.046482 0.855546 diff --git a/tests/CPU-Q4_K_M.log b/tests/CPU-Q4_K_M.log new file mode 100644 index 0000000..508a20c --- /dev/null +++ b/tests/CPU-Q4_K_M.log @@ -0,0 +1,257 @@ +[Load] DiT backend: CPU (CPU threads: 16) +[Load] Backend init: 6.3 ms +[GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 +[DiT] Self-attn: Q+K fused, V separate +[DiT] Cross-attn: all separate +[DiT] MLP: gate+up fused +[Load] null_condition_emb found (CFG available) +[WeightCtx] Loaded 478 tensors, 895.6 MB into backend +[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 +[Load] DiT weight load: 118.4 ms +[GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 +[Load] silence_latent: [15000, 64] from GGUF +[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 +[Load] VAE backend: CPU (CPU threads: 16) +[VAE] Backend: CPU, Weight buffer: 161.1 MB +[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations +[Load] VAE weights: 696.2 ms +[Request 1/1] ggml-turbo/request0.json (batch=1) +[Request] parsed ggml-turbo/request0.json (18 fields) +[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) +[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s +[Pipeline] 434 audio codes (86.8s @ 5Hz) +[Pipeline] T=2170, S=1085 +[BPE] Loaded from GGUF: 151643 vocab, 151387 merges +[Load] BPE tokenizer: 33.0 ms +[Pipeline] caption: 70 tokens, lyrics: 167 tokens +[Load] TextEncoder backend: CPU (CPU threads: 16) +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[Load] TextEncoder: 28L, H=1024, Nh=16/8 +[Qwen3] Attn: Q+K+V fused +[Qwen3] MLP: gate+up fused +[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend +[Load] TextEncoder: 148.2 ms +[Encode] TextEncoder (70 tokens): 58.0 ms +[Debug] text_hidden: [70, 1024] first4: 3.704526 2.436253 0.222853 -13.131872 +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[Encode] Lyric vocab lookup (167 tokens): 12.6 ms +[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 +[Load] CondEncoder backend: CPU (CPU threads: 16) +[GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 +[Load] LyricEncoder: 8L +[Qwen3] Attn: Q+K fused, V separate +[Qwen3] MLP: gate+up fused +[Load] TimbreEncoder: 4L +[Qwen3] Attn: Q+K fused, V separate +[Qwen3] MLP: gate+up fused +[WeightCtx] Loaded 140 tensors, 352.5 MB into backend +[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond +[Load] ConditionEncoder: 37.5 ms +[CondEnc] Lyric sliding mask: 167x167, window=128 +[CondEnc] Timbre sliding mask: 750x750, window=128 +[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens +[Encode] ConditionEncoder: 294.2 ms, enc_S=238 +[Debug] enc_hidden: [238, 2048] first4: 1.759313 -0.049345 -0.129442 0.055759 +[GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 +[WeightCtx] Loaded 30 tensors, 64.7 MB into backend +[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) +[Load] Detokenizer: 10.1 ms +[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) +[Context] Detokenizer: 354.8 ms +[Debug] detok_output: [2170, 64] first4: -0.106265 1.448869 0.309591 -0.650098 +[Context Batch0] Philox noise seed=42, [2170, 64] +[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 +[Debug] context: [2170, 128] first4: -0.106265 1.448869 0.309591 -0.650098 +[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1 +[DiT] Batch N=1, T=2170, S=1085, enc_S=238 +[DiT] Graph: 2063 nodes +[Debug] tproj: [12288] first4: 0.261574 -0.159668 -0.089874 0.048361 +[Debug] temb: [2048] first4: 0.000181 -0.133893 -0.034492 0.065095 +[Debug] temb_t: [2048] first4: 0.000984 0.025702 -0.052155 0.063359 +[Debug] temb_r: [2048] first4: -0.000803 -0.159595 0.017663 0.001736 +[Debug] sinusoidal_t: [256] first4: 0.562379 0.789627 0.439928 -0.023645 +[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000 +[Debug] temb_lin1_t: [2048] first4: -0.049462 -0.052971 -0.011985 -0.047441 +[Debug] temb_lin1_r: [2048] first4: -0.015463 -0.031534 -0.021259 0.006135 +[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.057382 -0.990466 0.522861 0.451163 +[Debug] proj_in_input: [192, 2170] first4: -0.106265 1.448869 0.309591 -0.650098 +[Debug] enc_after_cond_emb: [2048, 238] first4: -0.171472 0.759029 0.290676 -0.533397 +[Debug] layer0_sa_input: [2048, 1085] first4: -0.732369 -0.771010 -0.041992 0.259081 +[Debug] layer0_q_after_rope: [128, 16] first4: -0.171472 0.759029 0.290676 -0.533397 +[Debug] layer0_k_after_rope: [128, 8] first4: -1.587325 -1.063579 0.053489 0.460284 +[Debug] layer0_sa_output: [2048, 1085] first4: -1.605205 0.165836 -0.485558 0.452734 +[Debug] layer0_attn_out: [2048, 1085] first4: -1.587325 -1.063579 0.053489 0.460284 +[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.587325 -1.063579 0.053489 0.460284 +[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.703787 -0.846621 -0.436453 0.503148 +[Debug] hidden_after_layer0: [2048, 1085] first4: -8.930592 0.456150 48.587612 -0.801327 +[Debug] hidden_after_layer6: [2048, 1085] first4: -21.484295 -2.022109 30.954683 -3.475530 +[Debug] hidden_after_layer12: [2048, 1085] first4: -18.011547 -13.821573 70.228333 29.257874 +[Debug] hidden_after_layer18: [2048, 1085] first4: -17.142008 9.257736 59.313492 18.404408 +[Debug] hidden_after_layer23: [2048, 1085] first4: -20.417297 8.254404 182.146759 136.554886 +[Debug] dit_step0_vt: [2170, 64] first4: -0.054831 1.071052 0.246038 2.201593 +[Debug] dit_step0_xt: [2170, 64] first4: 0.196828 2.107566 -0.183059 0.747584 +[DiT] step 1/8 t=1.000 +[Debug] dit_step1_vt: [2170, 64] first4: -0.128807 1.226092 -0.249701 1.890724 +[Debug] dit_step1_xt: [2170, 64] first4: 0.203854 2.040688 -0.169438 0.644453 +[DiT] step 2/8 t=0.955 +[Debug] dit_step2_vt: [2170, 64] first4: 0.003495 1.153559 0.065743 2.214043 +[Debug] dit_step2_xt: [2170, 64] first4: 0.203621 1.963784 -0.173821 0.496851 +[DiT] step 3/8 t=0.900 +[Debug] dit_step3_vt: [2170, 64] first4: 0.260204 1.180074 0.269396 2.564617 +[Debug] dit_step3_xt: [2170, 64] first4: 0.181937 1.865445 -0.196271 0.283133 +[DiT] step 4/8 t=0.833 +[Debug] dit_step4_vt: [2170, 64] first4: 0.294849 1.093781 0.087178 2.615031 +[Debug] dit_step4_xt: [2170, 64] first4: 0.150346 1.748254 -0.205612 0.002951 +[DiT] step 5/8 t=0.750 +[Debug] dit_step5_vt: [2170, 64] first4: 0.142651 1.068169 -0.503217 2.724137 +[Debug] dit_step5_xt: [2170, 64] first4: 0.129968 1.595658 -0.133723 -0.386212 +[DiT] step 6/8 t=0.643 +[Debug] dit_step6_vt: [2170, 64] first4: -0.109419 1.023015 -1.102168 2.820799 +[Debug] dit_step6_xt: [2170, 64] first4: 0.151852 1.391055 0.086710 -0.950372 +[DiT] step 7/8 t=0.500 +[Debug] dit_step7_vt: [2170, 64] first4: -0.463452 0.896626 -1.673395 3.222673 +[Debug] dit_x0: [2170, 64] first4: 0.290887 1.122067 0.588729 -1.917174 +[DiT] step 8/8 t=0.300 +[DiT] Total generation: 21769.5 ms (21769.5 ms/sample) +[Debug] dit_output: [2170, 64] first4: 0.290887 1.122067 0.588729 -1.917174 +[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) +[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Upsample factor: 1920.00 (expected ~1920) +[VAE] Graph: 417 nodes, T_latent=256 +[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) +[VAE Batch0] Decode: 52184.7 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000272 0.000786 0.000556 0.000990 +[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) +[Request 1/1] Done +[Pipeline] All done +2026-03-01 20:03:15.903 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. +2026-03-01 20:03:15.903 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora +2026-03-01 20:03:15.903 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. +2026-03-01 20:03:15.903 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. +2026-03-01 20:03:15.904 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. +2026-03-01 20:03:16.714 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +`torch_dtype` is deprecated! Use `dtype` instead! +2026-03-01 20:03:18.309 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... +2026-03-01 20:03:18.309 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... +2026-03-01 20:03:18.315 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-01 20:03:18.480 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-01 20:03:18.482 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +====================================================================== +2026-03-01 20:03:18.482 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-01 20:03:18.482 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-01 20:03:18.482 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +# Instruction +Generate audio semantic tokens based on the given conditions: + +# Caption +An upbeat and anthemic pop-rock track driven by bright, slightly overdriven + +# Metas +- bpm: 83 +- timesignature: 4 +- keyscale: G major +- duration: 88 seconds +<|endoftext|> + +2026-03-01 20:03:18.482 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-01 20:03:18.482 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +# Languages +fr + +# Lyric +# Lyric +[Intro - Guitar Riff] +[Verse 1] +Dans le monde des tutos virtuels +G ta toise en nouvelle passion +Avec Ggendoline et Pumbé à midi +La communauté, c'est l'unité +Quel joie, une clé + +[Chorus] +Dans le monde des tutos virtuels +Gândoline et Pumbé à midi +Une famille à connecter, c'est vrai +D'un enfant qui voit toi fusionner + +[Guitar Solo] + +[Verse 2] +Dans le monde des tutos virtuels +Gândoline, Pumbé à midi +Une famille à connecter, c'est vrai +D'un enfant qui voit toi fusionner<|endoftext|> +2026-03-01 20:03:18.482 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== + +2026-03-01 20:03:18.488 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-01 20:03:18.501 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-01 20:03:18.501 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-01 20:03:18.540 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-01 20:03:18.854 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... +2026-03-01 20:03:18.855 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-01 20:03:18.855 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006970643997192383, 'diffusion_time_cost': 0.3072662353515625, 'diffusion_per_step_time_cost': 0.03840827941894531, 'total_time_cost': 0.3142368793487549, 'offload_time_cost': 0.0} +2026-03-01 20:03:18.869 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-01 20:03:18.872 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-01 20:03:18.872 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB +2026-03-01 20:03:18.872 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... +2026-03-01 20:03:18.872 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB +2026-03-01 20:03:18.872 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB +2026-03-01 20:03:18.872 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-01 20:03:19.148 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-01 20:03:19.151 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-01 20:03:19.154 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +[Request] Loaded request0.json +[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q4_K_M.gguf +[GGML] Running acestep-v15-turbo-Q4_K_M.gguf... +[GGML] Done, 47 dump files +[Python] Initializing acestep-v15-turbo... +[Python] Generating (acestep-v15-turbo, 8 steps)... +Using precomputed LM hints +Using precomputed LM hints +[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) +[Python] Done, 40 dump files +[Turbo] Cosine similarities GGML vs Python + stage GGML vs Python + text_hidden 0.999816 + lyric_embed 1.000000 + enc_hidden 0.997095 + detok_output 0.999577 + context 0.999730 + noise 1.000000 + temb_t 0.999896 + hidden_after_proj_in 0.999903 + enc_after_cond_emb 0.997571 + layer0_sa_output 0.998370 + hidden_after_layer0 0.999619 + hidden_after_layer6 0.999177 + hidden_after_layer12 0.995111 + hidden_after_layer18 0.991459 + hidden_after_layer23 0.985217 + dit_step0_vt 0.946613 + dit_step0_xt 0.999883 + dit_step1_vt 0.947613 + dit_step1_xt 0.999611 + dit_step2_vt 0.958491 + dit_step2_xt 0.999010 + dit_step3_vt 0.962965 + dit_step3_xt 0.997773 + dit_step4_vt 0.960997 + dit_step4_xt 0.994989 + dit_step5_vt 0.957636 + dit_step5_xt 0.988832 + dit_step6_vt 0.952016 + dit_step6_xt 0.977196 + dit_step7_vt 0.939970 + dit_x0 0.959881 + vae_audio 0.834993 + vae_audio (STFT cosine) 0.955098 +[Turbo] Error growth GGML vs Python + stage cos max_err mean_err mean_A std_A mean_B std_B + dit_step0_xt 0.999883 0.167680 0.010319 -0.002256 0.973185 -0.002342 0.972003 + dit_step1_xt 0.999611 0.268237 0.018204 -0.005104 0.943179 -0.005313 0.941730 + dit_step2_xt 0.999010 0.434671 0.027774 -0.009029 0.910147 -0.009311 0.908527 + dit_step3_xt 0.997773 0.601206 0.039926 -0.014325 0.875171 -0.014577 0.873624 + dit_step4_xt 0.994989 0.892883 0.057385 -0.021274 0.843615 -0.021660 0.841995 + dit_step5_xt 0.988832 1.381146 0.083605 -0.031218 0.827061 -0.032109 0.824593 + dit_step6_xt 0.977196 2.021005 0.123750 -0.045473 0.858175 -0.046482 0.855546 diff --git a/tests/CPU-Q5_K_M.log b/tests/CPU-Q5_K_M.log new file mode 100644 index 0000000..e0d9936 --- /dev/null +++ b/tests/CPU-Q5_K_M.log @@ -0,0 +1,257 @@ +[Load] DiT backend: CPU (CPU threads: 16) +[Load] Backend init: 1.6 ms +[GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 +[DiT] Self-attn: Q+K fused, V separate +[DiT] Cross-attn: all separate +[DiT] MLP: gate+up fused +[Load] null_condition_emb found (CFG available) +[WeightCtx] Loaded 478 tensors, 1061.2 MB into backend +[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 +[Load] DiT weight load: 140.3 ms +[GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 +[Load] silence_latent: [15000, 64] from GGUF +[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 +[Load] VAE backend: CPU (CPU threads: 16) +[VAE] Backend: CPU, Weight buffer: 161.1 MB +[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations +[Load] VAE weights: 699.1 ms +[Request 1/1] ggml-turbo/request0.json (batch=1) +[Request] parsed ggml-turbo/request0.json (18 fields) +[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) +[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s +[Pipeline] 434 audio codes (86.8s @ 5Hz) +[Pipeline] T=2170, S=1085 +[BPE] Loaded from GGUF: 151643 vocab, 151387 merges +[Load] BPE tokenizer: 33.4 ms +[Pipeline] caption: 70 tokens, lyrics: 167 tokens +[Load] TextEncoder backend: CPU (CPU threads: 16) +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[Load] TextEncoder: 28L, H=1024, Nh=16/8 +[Qwen3] Attn: Q+K+V fused +[Qwen3] MLP: gate+up fused +[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend +[Load] TextEncoder: 149.7 ms +[Encode] TextEncoder (70 tokens): 57.3 ms +[Debug] text_hidden: [70, 1024] first4: 3.704526 2.436253 0.222853 -13.131872 +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[Encode] Lyric vocab lookup (167 tokens): 12.5 ms +[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 +[Load] CondEncoder backend: CPU (CPU threads: 16) +[GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 +[Load] LyricEncoder: 8L +[Qwen3] Attn: Q+K fused, V separate +[Qwen3] MLP: gate+up fused +[Load] TimbreEncoder: 4L +[Qwen3] Attn: Q+K fused, V separate +[Qwen3] MLP: gate+up fused +[WeightCtx] Loaded 140 tensors, 412.5 MB into backend +[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond +[Load] ConditionEncoder: 45.1 ms +[CondEnc] Lyric sliding mask: 167x167, window=128 +[CondEnc] Timbre sliding mask: 750x750, window=128 +[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens +[Encode] ConditionEncoder: 387.5 ms, enc_S=238 +[Debug] enc_hidden: [238, 2048] first4: 1.760901 -0.053445 -0.132760 0.058505 +[GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 +[WeightCtx] Loaded 30 tensors, 73.2 MB into backend +[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) +[Load] Detokenizer: 11.3 ms +[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) +[Context] Detokenizer: 447.0 ms +[Debug] detok_output: [2170, 64] first4: -0.129311 1.458194 0.298132 -0.651512 +[Context Batch0] Philox noise seed=42, [2170, 64] +[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 +[Debug] context: [2170, 128] first4: -0.129311 1.458194 0.298132 -0.651512 +[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1 +[DiT] Batch N=1, T=2170, S=1085, enc_S=238 +[DiT] Graph: 2063 nodes +[Debug] tproj: [12288] first4: 0.261152 -0.161305 -0.103153 0.050892 +[Debug] temb: [2048] first4: -0.000119 -0.132132 -0.035650 0.065085 +[Debug] temb_t: [2048] first4: 0.000588 0.026848 -0.052924 0.063878 +[Debug] temb_r: [2048] first4: -0.000708 -0.158980 0.017274 0.001208 +[Debug] sinusoidal_t: [256] first4: 0.562379 0.789627 0.439928 -0.023645 +[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000 +[Debug] temb_lin1_t: [2048] first4: -0.051319 -0.053246 -0.011899 -0.038818 +[Debug] temb_lin1_r: [2048] first4: -0.016165 -0.021121 -0.015801 -0.000525 +[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.048950 -0.942691 0.537616 0.450821 +[Debug] proj_in_input: [192, 2170] first4: -0.129311 1.458194 0.298132 -0.651512 +[Debug] enc_after_cond_emb: [2048, 238] first4: -0.151010 0.749188 0.347886 -0.528254 +[Debug] layer0_sa_input: [2048, 1085] first4: -0.726623 -0.748099 -0.053174 0.262053 +[Debug] layer0_q_after_rope: [128, 16] first4: -0.151010 0.749188 0.347886 -0.528254 +[Debug] layer0_k_after_rope: [128, 8] first4: -1.551637 -1.002339 0.163270 0.462290 +[Debug] layer0_sa_output: [2048, 1085] first4: -1.510043 0.134910 -0.385166 0.487419 +[Debug] layer0_attn_out: [2048, 1085] first4: -1.551637 -1.002339 0.163270 0.462290 +[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.551637 -1.002339 0.163270 0.462290 +[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.601043 -0.768895 -0.323166 0.504161 +[Debug] hidden_after_layer0: [2048, 1085] first4: -9.313718 0.740223 52.142769 -0.880804 +[Debug] hidden_after_layer6: [2048, 1085] first4: -21.028343 0.455638 29.972351 -4.651019 +[Debug] hidden_after_layer12: [2048, 1085] first4: -17.875141 -17.099358 67.074074 24.887821 +[Debug] hidden_after_layer18: [2048, 1085] first4: -24.271315 11.994616 56.276474 19.815941 +[Debug] hidden_after_layer23: [2048, 1085] first4: -9.757540 40.914558 193.229523 152.458817 +[Debug] dit_step0_vt: [2170, 64] first4: -0.008601 1.160695 0.325083 2.395968 +[Debug] dit_step0_xt: [2170, 64] first4: 0.194727 2.103491 -0.186652 0.738749 +[DiT] step 1/8 t=1.000 +[Debug] dit_step1_vt: [2170, 64] first4: -0.246968 1.361296 -0.140900 1.930280 +[Debug] dit_step1_xt: [2170, 64] first4: 0.208198 2.029238 -0.178966 0.633461 +[DiT] step 2/8 t=0.955 +[Debug] dit_step2_vt: [2170, 64] first4: -0.093393 1.253966 0.122121 2.387282 +[Debug] dit_step2_xt: [2170, 64] first4: 0.214424 1.945641 -0.187107 0.474308 +[DiT] step 3/8 t=0.900 +[Debug] dit_step3_vt: [2170, 64] first4: 0.283676 1.140476 0.250461 2.641533 +[Debug] dit_step3_xt: [2170, 64] first4: 0.190784 1.850601 -0.207979 0.254181 +[DiT] step 4/8 t=0.833 +[Debug] dit_step4_vt: [2170, 64] first4: 0.314606 0.873225 0.069223 2.711446 +[Debug] dit_step4_xt: [2170, 64] first4: 0.157077 1.757041 -0.215396 -0.036331 +[DiT] step 5/8 t=0.750 +[Debug] dit_step5_vt: [2170, 64] first4: 0.377209 0.828215 -0.406894 2.727257 +[Debug] dit_step5_xt: [2170, 64] first4: 0.103190 1.638725 -0.157268 -0.425940 +[DiT] step 6/8 t=0.643 +[Debug] dit_step6_vt: [2170, 64] first4: 0.230187 0.630044 -0.936850 2.799204 +[Debug] dit_step6_xt: [2170, 64] first4: 0.057152 1.512716 0.030102 -0.985780 +[DiT] step 7/8 t=0.500 +[Debug] dit_step7_vt: [2170, 64] first4: -0.003599 0.325174 -1.377289 3.053612 +[Debug] dit_x0: [2170, 64] first4: 0.058232 1.415164 0.443289 -1.901864 +[DiT] step 8/8 t=0.300 +[DiT] Total generation: 27970.1 ms (27970.1 ms/sample) +[Debug] dit_output: [2170, 64] first4: 0.058232 1.415164 0.443289 -1.901864 +[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) +[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Upsample factor: 1920.00 (expected ~1920) +[VAE] Graph: 417 nodes, T_latent=256 +[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) +[VAE Batch0] Decode: 51966.1 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000740 0.001305 0.001083 0.001434 +[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) +[Request 1/1] Done +[Pipeline] All done +2026-03-01 20:01:55.226 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. +2026-03-01 20:01:55.226 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora +2026-03-01 20:01:55.226 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. +2026-03-01 20:01:55.226 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. +2026-03-01 20:01:55.226 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. +2026-03-01 20:01:56.032 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +`torch_dtype` is deprecated! Use `dtype` instead! +2026-03-01 20:01:57.576 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... +2026-03-01 20:01:57.577 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... +2026-03-01 20:01:57.581 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-01 20:01:57.747 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-01 20:01:57.749 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +====================================================================== +2026-03-01 20:01:57.749 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-01 20:01:57.749 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-01 20:01:57.749 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +# Instruction +Generate audio semantic tokens based on the given conditions: + +# Caption +An upbeat and anthemic pop-rock track driven by bright, slightly overdriven + +# Metas +- bpm: 83 +- timesignature: 4 +- keyscale: G major +- duration: 88 seconds +<|endoftext|> + +2026-03-01 20:01:57.749 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-01 20:01:57.749 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +# Languages +fr + +# Lyric +# Lyric +[Intro - Guitar Riff] +[Verse 1] +Dans le monde des tutos virtuels +G ta toise en nouvelle passion +Avec Ggendoline et Pumbé à midi +La communauté, c'est l'unité +Quel joie, une clé + +[Chorus] +Dans le monde des tutos virtuels +Gândoline et Pumbé à midi +Une famille à connecter, c'est vrai +D'un enfant qui voit toi fusionner + +[Guitar Solo] + +[Verse 2] +Dans le monde des tutos virtuels +Gândoline, Pumbé à midi +Une famille à connecter, c'est vrai +D'un enfant qui voit toi fusionner<|endoftext|> +2026-03-01 20:01:57.749 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== + +2026-03-01 20:01:57.755 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-01 20:01:57.768 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-01 20:01:57.768 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-01 20:01:57.801 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-01 20:01:58.109 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... +2026-03-01 20:01:58.109 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-01 20:01:58.109 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.007002353668212891, 'diffusion_time_cost': 0.30033254623413086, 'diffusion_per_step_time_cost': 0.03754156827926636, 'total_time_cost': 0.30733489990234375, 'offload_time_cost': 0.0} +2026-03-01 20:01:58.124 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-01 20:01:58.126 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-01 20:01:58.126 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB +2026-03-01 20:01:58.126 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... +2026-03-01 20:01:58.126 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB +2026-03-01 20:01:58.126 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB +2026-03-01 20:01:58.126 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-01 20:01:58.401 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-01 20:01:58.403 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-01 20:01:58.406 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +[Request] Loaded request0.json +[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q5_K_M.gguf +[GGML] Running acestep-v15-turbo-Q5_K_M.gguf... +[GGML] Done, 47 dump files +[Python] Initializing acestep-v15-turbo... +[Python] Generating (acestep-v15-turbo, 8 steps)... +Using precomputed LM hints +Using precomputed LM hints +[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) +[Python] Done, 40 dump files +[Turbo] Cosine similarities GGML vs Python + stage GGML vs Python + text_hidden 0.999816 + lyric_embed 1.000000 + enc_hidden 0.999099 + detok_output 0.999843 + context 0.999900 + noise 1.000000 + temb_t 0.999968 + hidden_after_proj_in 0.999954 + enc_after_cond_emb 0.999196 + layer0_sa_output 0.999388 + hidden_after_layer0 0.999773 + hidden_after_layer6 0.999687 + hidden_after_layer12 0.998560 + hidden_after_layer18 0.995178 + hidden_after_layer23 0.990907 + dit_step0_vt 0.966084 + dit_step0_xt 0.999926 + dit_step1_vt 0.972329 + dit_step1_xt 0.999780 + dit_step2_vt 0.971107 + dit_step2_xt 0.999383 + dit_step3_vt 0.973886 + dit_step3_xt 0.998543 + dit_step4_vt 0.971976 + dit_step4_xt 0.996642 + dit_step5_vt 0.967575 + dit_step5_xt 0.992211 + dit_step6_vt 0.962964 + dit_step6_xt 0.983513 + dit_step7_vt 0.954349 + dit_x0 0.970379 + vae_audio 0.874800 + vae_audio (STFT cosine) 0.967703 +[Turbo] Error growth GGML vs Python + stage cos max_err mean_err mean_A std_A mean_B std_B + dit_step0_xt 0.999926 0.135378 0.008030 -0.002303 0.973012 -0.002342 0.972003 + dit_step1_xt 0.999780 0.276712 0.013491 -0.005310 0.942849 -0.005313 0.941730 + dit_step2_xt 0.999383 0.460420 0.021261 -0.009337 0.909465 -0.009311 0.908527 + dit_step3_xt 0.998543 0.681684 0.031463 -0.014739 0.874175 -0.014577 0.873624 + dit_step4_xt 0.996642 0.853164 0.045737 -0.021967 0.842445 -0.021660 0.841995 + dit_step5_xt 0.992211 1.314129 0.067657 -0.032346 0.825989 -0.032109 0.824593 + dit_step6_xt 0.983513 2.191432 0.101363 -0.046949 0.857195 -0.046482 0.855546 diff --git a/tests/CPU-Q6_K.log b/tests/CPU-Q6_K.log new file mode 100644 index 0000000..7d4c411 --- /dev/null +++ b/tests/CPU-Q6_K.log @@ -0,0 +1,257 @@ +[Load] DiT backend: CPU (CPU threads: 16) +[Load] Backend init: 1.6 ms +[GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 +[DiT] Self-attn: Q+K+V fused +[DiT] Cross-attn: Q+K+V fused +[DiT] MLP: gate+up fused +[Load] null_condition_emb found (CFG available) +[WeightCtx] Loaded 478 tensors, 1237.2 MB into backend +[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 +[Load] DiT weight load: 169.4 ms +[GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 +[Load] silence_latent: [15000, 64] from GGUF +[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 +[Load] VAE backend: CPU (CPU threads: 16) +[VAE] Backend: CPU, Weight buffer: 161.1 MB +[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations +[Load] VAE weights: 699.2 ms +[Request 1/1] ggml-turbo/request0.json (batch=1) +[Request] parsed ggml-turbo/request0.json (18 fields) +[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) +[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s +[Pipeline] 434 audio codes (86.8s @ 5Hz) +[Pipeline] T=2170, S=1085 +[BPE] Loaded from GGUF: 151643 vocab, 151387 merges +[Load] BPE tokenizer: 32.5 ms +[Pipeline] caption: 70 tokens, lyrics: 167 tokens +[Load] TextEncoder backend: CPU (CPU threads: 16) +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[Load] TextEncoder: 28L, H=1024, Nh=16/8 +[Qwen3] Attn: Q+K+V fused +[Qwen3] MLP: gate+up fused +[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend +[Load] TextEncoder: 148.3 ms +[Encode] TextEncoder (70 tokens): 57.5 ms +[Debug] text_hidden: [70, 1024] first4: 3.704526 2.436253 0.222853 -13.131872 +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[Encode] Lyric vocab lookup (167 tokens): 12.6 ms +[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 +[Load] CondEncoder backend: CPU (CPU threads: 16) +[GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 +[Load] LyricEncoder: 8L +[Qwen3] Attn: Q+K+V fused +[Qwen3] MLP: gate+up fused +[Load] TimbreEncoder: 4L +[Qwen3] Attn: Q+K+V fused +[Qwen3] MLP: gate+up fused +[WeightCtx] Loaded 140 tensors, 476.3 MB into backend +[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond +[Load] ConditionEncoder: 52.6 ms +[CondEnc] Lyric sliding mask: 167x167, window=128 +[CondEnc] Timbre sliding mask: 750x750, window=128 +[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens +[Encode] ConditionEncoder: 348.9 ms, enc_S=238 +[Debug] enc_hidden: [238, 2048] first4: 1.761694 -0.052035 -0.131773 0.058231 +[GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 +[WeightCtx] Loaded 30 tensors, 82.2 MB into backend +[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) +[Load] Detokenizer: 12.3 ms +[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) +[Context] Detokenizer: 414.3 ms +[Debug] detok_output: [2170, 64] first4: -0.151355 1.462444 0.326907 -0.627213 +[Context Batch0] Philox noise seed=42, [2170, 64] +[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 +[Debug] context: [2170, 128] first4: -0.151355 1.462444 0.326907 -0.627213 +[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1 +[DiT] Batch N=1, T=2170, S=1085, enc_S=238 +[DiT] Graph: 2129 nodes +[Debug] tproj: [12288] first4: 0.261809 -0.161156 -0.099489 0.050901 +[Debug] temb: [2048] first4: 0.000441 -0.132284 -0.035603 0.064823 +[Debug] temb_t: [2048] first4: 0.001519 0.026983 -0.052936 0.063921 +[Debug] temb_r: [2048] first4: -0.001078 -0.159268 0.017333 0.000903 +[Debug] sinusoidal_t: [256] first4: 0.562379 0.789627 0.439928 -0.023645 +[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000 +[Debug] temb_lin1_t: [2048] first4: -0.049242 -0.050737 -0.017494 -0.036973 +[Debug] temb_lin1_r: [2048] first4: -0.014408 -0.020609 -0.015729 0.003875 +[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.041706 -0.935163 0.543316 0.447904 +[Debug] proj_in_input: [192, 2170] first4: -0.151355 1.462444 0.326907 -0.627213 +[Debug] enc_after_cond_emb: [2048, 238] first4: -0.170483 0.826965 0.338536 -0.581525 +[Debug] layer0_sa_input: [2048, 1085] first4: -0.719262 -0.743265 -0.048909 0.260726 +[Debug] layer0_q_after_rope: [128, 16] first4: -1.546578 -1.031349 0.213821 0.458892 +[Debug] layer0_k_after_rope: [128, 8] first4: -0.170483 0.826965 0.338536 -0.581525 +[Debug] layer0_sa_output: [2048, 1085] first4: -1.510827 0.216662 -0.337830 0.522569 +[Debug] layer0_attn_out: [2048, 1085] first4: -1.546578 -1.031349 0.213821 0.458892 +[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.546578 -1.031349 0.213821 0.458892 +[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.610117 -0.795587 -0.288174 0.502934 +[Debug] hidden_after_layer0: [2048, 1085] first4: -9.220036 0.587352 53.159882 -0.942435 +[Debug] hidden_after_layer6: [2048, 1085] first4: -21.447939 -0.975549 35.157303 -4.845882 +[Debug] hidden_after_layer12: [2048, 1085] first4: -16.561256 -16.121094 76.819672 30.808043 +[Debug] hidden_after_layer18: [2048, 1085] first4: -29.809811 13.925017 66.285889 19.847790 +[Debug] hidden_after_layer23: [2048, 1085] first4: -21.918661 46.159637 204.710663 138.480270 +[Debug] dit_step0_vt: [2170, 64] first4: 0.100316 1.102248 0.318693 2.394090 +[Debug] dit_step0_xt: [2170, 64] first4: 0.189776 2.106148 -0.186361 0.738834 +[DiT] step 1/8 t=1.000 +[Debug] dit_step1_vt: [2170, 64] first4: -0.077579 1.336049 -0.205877 1.979667 +[Debug] dit_step1_xt: [2170, 64] first4: 0.194008 2.033272 -0.175131 0.630852 +[DiT] step 2/8 t=0.955 +[Debug] dit_step2_vt: [2170, 64] first4: 0.089277 1.192314 0.088705 2.392204 +[Debug] dit_step2_xt: [2170, 64] first4: 0.188056 1.953785 -0.181045 0.471372 +[DiT] step 3/8 t=0.900 +[Debug] dit_step3_vt: [2170, 64] first4: 0.306248 1.088670 0.212184 2.674479 +[Debug] dit_step3_xt: [2170, 64] first4: 0.162535 1.863062 -0.198727 0.248499 +[DiT] step 4/8 t=0.833 +[Debug] dit_step4_vt: [2170, 64] first4: 0.330824 1.012156 0.074096 2.759729 +[Debug] dit_step4_xt: [2170, 64] first4: 0.127090 1.754617 -0.206666 -0.047187 +[DiT] step 5/8 t=0.750 +[Debug] dit_step5_vt: [2170, 64] first4: 0.330529 0.879730 -0.335447 2.785841 +[Debug] dit_step5_xt: [2170, 64] first4: 0.079871 1.628941 -0.158745 -0.445164 +[DiT] step 6/8 t=0.643 +[Debug] dit_step6_vt: [2170, 64] first4: 0.198573 0.657394 -0.886720 2.779941 +[Debug] dit_step6_xt: [2170, 64] first4: 0.040157 1.497462 0.018599 -1.001152 +[DiT] step 7/8 t=0.500 +[Debug] dit_step7_vt: [2170, 64] first4: 0.118016 0.207620 -1.266971 2.955565 +[Debug] dit_x0: [2170, 64] first4: 0.004752 1.435176 0.398691 -1.887822 +[DiT] step 8/8 t=0.300 +[DiT] Total generation: 25398.3 ms (25398.3 ms/sample) +[Debug] dit_output: [2170, 64] first4: 0.004752 1.435176 0.398691 -1.887822 +[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) +[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Upsample factor: 1920.00 (expected ~1920) +[VAE] Graph: 417 nodes, T_latent=256 +[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) +[VAE Batch0] Decode: 52074.7 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000467 0.001015 0.000873 0.001303 +[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) +[Request 1/1] Done +[Pipeline] All done +2026-03-01 20:00:28.298 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. +2026-03-01 20:00:28.298 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora +2026-03-01 20:00:28.298 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. +2026-03-01 20:00:28.298 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. +2026-03-01 20:00:28.298 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. +2026-03-01 20:00:29.103 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +`torch_dtype` is deprecated! Use `dtype` instead! +2026-03-01 20:00:30.690 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... +2026-03-01 20:00:30.690 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... +2026-03-01 20:00:30.695 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-01 20:00:30.860 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-01 20:00:30.862 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +====================================================================== +2026-03-01 20:00:30.862 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-01 20:00:30.862 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-01 20:00:30.862 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +# Instruction +Generate audio semantic tokens based on the given conditions: + +# Caption +An upbeat and anthemic pop-rock track driven by bright, slightly overdriven + +# Metas +- bpm: 83 +- timesignature: 4 +- keyscale: G major +- duration: 88 seconds +<|endoftext|> + +2026-03-01 20:00:30.862 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-01 20:00:30.862 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +# Languages +fr + +# Lyric +# Lyric +[Intro - Guitar Riff] +[Verse 1] +Dans le monde des tutos virtuels +G ta toise en nouvelle passion +Avec Ggendoline et Pumbé à midi +La communauté, c'est l'unité +Quel joie, une clé + +[Chorus] +Dans le monde des tutos virtuels +Gândoline et Pumbé à midi +Une famille à connecter, c'est vrai +D'un enfant qui voit toi fusionner + +[Guitar Solo] + +[Verse 2] +Dans le monde des tutos virtuels +Gândoline, Pumbé à midi +Une famille à connecter, c'est vrai +D'un enfant qui voit toi fusionner<|endoftext|> +2026-03-01 20:00:30.862 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== + +2026-03-01 20:00:30.869 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-01 20:00:30.881 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-01 20:00:30.882 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-01 20:00:30.914 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-01 20:00:31.231 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... +2026-03-01 20:00:31.232 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-01 20:00:31.232 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006938934326171875, 'diffusion_time_cost': 0.31071925163269043, 'diffusion_per_step_time_cost': 0.038839906454086304, 'total_time_cost': 0.3176581859588623, 'offload_time_cost': 0.0} +2026-03-01 20:00:31.246 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-01 20:00:31.249 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-01 20:00:31.249 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.40 GB +2026-03-01 20:00:31.249 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... +2026-03-01 20:00:31.249 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.40 GB +2026-03-01 20:00:31.249 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.40 GB +2026-03-01 20:00:31.249 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-01 20:00:31.524 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-01 20:00:31.527 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-01 20:00:31.531 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +[Request] Loaded request0.json +[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q6_K.gguf +[GGML] Running acestep-v15-turbo-Q6_K.gguf... +[GGML] Done, 47 dump files +[Python] Initializing acestep-v15-turbo... +[Python] Generating (acestep-v15-turbo, 8 steps)... +Using precomputed LM hints +Using precomputed LM hints +[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) +[Python] Done, 40 dump files +[Turbo] Cosine similarities GGML vs Python + stage GGML vs Python + text_hidden 0.999816 + lyric_embed 1.000000 + enc_hidden 0.999634 + detok_output 0.999927 + context 0.999954 + noise 1.000000 + temb_t 0.999986 + hidden_after_proj_in 0.999975 + enc_after_cond_emb 0.999619 + layer0_sa_output 0.999718 + hidden_after_layer0 0.999827 + hidden_after_layer6 0.999788 + hidden_after_layer12 0.998843 + hidden_after_layer18 0.995848 + hidden_after_layer23 0.992196 + dit_step0_vt 0.971124 + dit_step0_xt 0.999936 + dit_step1_vt 0.975111 + dit_step1_xt 0.999802 + dit_step2_vt 0.978218 + dit_step2_xt 0.999477 + dit_step3_vt 0.977576 + dit_step3_xt 0.998723 + dit_step4_vt 0.973938 + dit_step4_xt 0.996945 + dit_step5_vt 0.969356 + dit_step5_xt 0.992753 + dit_step6_vt 0.965671 + dit_step6_xt 0.984569 + dit_step7_vt 0.958147 + dit_x0 0.972312 + vae_audio 0.891761 + vae_audio (STFT cosine) 0.969080 +[Turbo] Error growth GGML vs Python + stage cos max_err mean_err mean_A std_A mean_B std_B + dit_step0_xt 0.999936 0.151952 0.007283 -0.002271 0.972870 -0.002342 0.972003 + dit_step1_xt 0.999802 0.296519 0.012516 -0.005212 0.942575 -0.005313 0.941730 + dit_step2_xt 0.999477 0.478400 0.019283 -0.009184 0.908992 -0.009311 0.908527 + dit_step3_xt 0.998723 0.734609 0.028810 -0.014535 0.873457 -0.014577 0.873624 + dit_step4_xt 0.996945 1.045720 0.042804 -0.021712 0.841447 -0.021660 0.841995 + dit_step5_xt 0.992753 1.512605 0.064324 -0.032020 0.824620 -0.032109 0.824593 + dit_step6_xt 0.984569 2.166596 0.096699 -0.046604 0.855715 -0.046482 0.855546 diff --git a/tests/CPU-Q8_0.log b/tests/CPU-Q8_0.log new file mode 100644 index 0000000..76183ea --- /dev/null +++ b/tests/CPU-Q8_0.log @@ -0,0 +1,257 @@ +[Load] DiT backend: CPU (CPU threads: 16) +[Load] Backend init: 1.6 ms +[GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 +[DiT] Self-attn: Q+K+V fused +[DiT] Cross-attn: Q+K+V fused +[DiT] MLP: gate+up fused +[Load] null_condition_emb found (CFG available) +[WeightCtx] Loaded 478 tensors, 1600.7 MB into backend +[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 +[Load] DiT weight load: 188.0 ms +[GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 +[Load] silence_latent: [15000, 64] from GGUF +[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 +[Load] VAE backend: CPU (CPU threads: 16) +[VAE] Backend: CPU, Weight buffer: 161.1 MB +[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations +[Load] VAE weights: 690.8 ms +[Request 1/1] ggml-turbo/request0.json (batch=1) +[Request] parsed ggml-turbo/request0.json (18 fields) +[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) +[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s +[Pipeline] 434 audio codes (86.8s @ 5Hz) +[Pipeline] T=2170, S=1085 +[BPE] Loaded from GGUF: 151643 vocab, 151387 merges +[Load] BPE tokenizer: 32.8 ms +[Pipeline] caption: 70 tokens, lyrics: 167 tokens +[Load] TextEncoder backend: CPU (CPU threads: 16) +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[Load] TextEncoder: 28L, H=1024, Nh=16/8 +[Qwen3] Attn: Q+K+V fused +[Qwen3] MLP: gate+up fused +[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend +[Load] TextEncoder: 160.0 ms +[Encode] TextEncoder (70 tokens): 57.9 ms +[Debug] text_hidden: [70, 1024] first4: 3.704526 2.436253 0.222853 -13.131872 +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[Encode] Lyric vocab lookup (167 tokens): 13.0 ms +[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 +[Load] CondEncoder backend: CPU (CPU threads: 16) +[GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 +[Load] LyricEncoder: 8L +[Qwen3] Attn: Q+K+V fused +[Qwen3] MLP: gate+up fused +[Load] TimbreEncoder: 4L +[Qwen3] Attn: Q+K+V fused +[Qwen3] MLP: gate+up fused +[WeightCtx] Loaded 140 tensors, 616.6 MB into backend +[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond +[Load] ConditionEncoder: 126.4 ms +[CondEnc] Lyric sliding mask: 167x167, window=128 +[CondEnc] Timbre sliding mask: 750x750, window=128 +[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens +[Encode] ConditionEncoder: 390.3 ms, enc_S=238 +[Debug] enc_hidden: [238, 2048] first4: 1.758873 -0.049568 -0.132802 0.057792 +[GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 +[WeightCtx] Loaded 30 tensors, 106.5 MB into backend +[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) +[Load] Detokenizer: 13.6 ms +[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) +[Context] Detokenizer: 447.8 ms +[Debug] detok_output: [2170, 64] first4: -0.126218 1.441045 0.305219 -0.629688 +[Context Batch0] Philox noise seed=42, [2170, 64] +[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 +[Debug] context: [2170, 128] first4: -0.126218 1.441045 0.305219 -0.629688 +[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1 +[DiT] Batch N=1, T=2170, S=1085, enc_S=238 +[DiT] Graph: 2129 nodes +[Debug] tproj: [12288] first4: 0.259485 -0.161550 -0.096885 0.051766 +[Debug] temb: [2048] first4: 0.000214 -0.132557 -0.035428 0.064847 +[Debug] temb_t: [2048] first4: 0.001194 0.026823 -0.052744 0.063762 +[Debug] temb_r: [2048] first4: -0.000980 -0.159380 0.017316 0.001084 +[Debug] sinusoidal_t: [256] first4: 0.562379 0.789627 0.439928 -0.023645 +[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000 +[Debug] temb_lin1_t: [2048] first4: -0.049228 -0.051913 -0.015026 -0.038076 +[Debug] temb_lin1_r: [2048] first4: -0.013066 -0.018835 -0.015731 0.008462 +[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.038249 -0.957445 0.537078 0.447006 +[Debug] proj_in_input: [192, 2170] first4: -0.126218 1.441045 0.305219 -0.629688 +[Debug] enc_after_cond_emb: [2048, 238] first4: -0.176016 0.814970 0.334600 -0.563971 +[Debug] layer0_sa_input: [2048, 1085] first4: -0.718529 -0.757126 -0.047071 0.261381 +[Debug] layer0_q_after_rope: [128, 16] first4: -1.545586 -1.032032 0.192079 0.456504 +[Debug] layer0_k_after_rope: [128, 8] first4: -0.176016 0.814970 0.334600 -0.563971 +[Debug] layer0_sa_output: [2048, 1085] first4: -1.519029 0.168016 -0.353233 0.508560 +[Debug] layer0_attn_out: [2048, 1085] first4: -1.545586 -1.032032 0.192079 0.456504 +[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.545586 -1.032032 0.192079 0.456504 +[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.604149 -0.815843 -0.286884 0.491781 +[Debug] hidden_after_layer0: [2048, 1085] first4: -9.102718 0.576853 52.433601 -0.866220 +[Debug] hidden_after_layer6: [2048, 1085] first4: -21.554432 0.201925 34.636509 -4.160976 +[Debug] hidden_after_layer12: [2048, 1085] first4: -15.075979 -18.545254 72.497665 28.997612 +[Debug] hidden_after_layer18: [2048, 1085] first4: -26.391603 14.396175 61.327370 20.126297 +[Debug] hidden_after_layer23: [2048, 1085] first4: -4.878841 39.642975 194.063141 143.022125 +[Debug] dit_step0_vt: [2170, 64] first4: 0.030129 1.134737 0.345365 2.365999 +[Debug] dit_step0_xt: [2170, 64] first4: 0.192966 2.104671 -0.187573 0.740111 +[DiT] step 1/8 t=1.000 +[Debug] dit_step1_vt: [2170, 64] first4: -0.191913 1.346320 -0.134135 1.880714 +[Debug] dit_step1_xt: [2170, 64] first4: 0.203434 2.031235 -0.180257 0.637526 +[DiT] step 2/8 t=0.955 +[Debug] dit_step2_vt: [2170, 64] first4: -0.032953 1.239032 0.099210 2.371356 +[Debug] dit_step2_xt: [2170, 64] first4: 0.205631 1.948633 -0.186871 0.479436 +[DiT] step 3/8 t=0.900 +[Debug] dit_step3_vt: [2170, 64] first4: 0.254387 1.085867 0.272314 2.643562 +[Debug] dit_step3_xt: [2170, 64] first4: 0.184432 1.858144 -0.209564 0.259139 +[DiT] step 4/8 t=0.833 +[Debug] dit_step4_vt: [2170, 64] first4: 0.255440 1.003537 0.102939 2.722830 +[Debug] dit_step4_xt: [2170, 64] first4: 0.157064 1.750623 -0.220593 -0.032593 +[DiT] step 5/8 t=0.750 +[Debug] dit_step5_vt: [2170, 64] first4: 0.281173 0.936761 -0.295195 2.736938 +[Debug] dit_step5_xt: [2170, 64] first4: 0.116896 1.616800 -0.178422 -0.423584 +[DiT] step 6/8 t=0.643 +[Debug] dit_step6_vt: [2170, 64] first4: 0.167723 0.621779 -0.826056 2.808025 +[Debug] dit_step6_xt: [2170, 64] first4: 0.083352 1.492444 -0.013211 -0.985189 +[DiT] step 7/8 t=0.500 +[Debug] dit_step7_vt: [2170, 64] first4: -0.037024 0.233524 -1.487499 3.098410 +[Debug] dit_x0: [2170, 64] first4: 0.094459 1.422387 0.433039 -1.914712 +[DiT] step 8/8 t=0.300 +[DiT] Total generation: 26043.3 ms (26043.3 ms/sample) +[Debug] dit_output: [2170, 64] first4: 0.094459 1.422387 0.433039 -1.914712 +[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) +[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Upsample factor: 1920.00 (expected ~1920) +[VAE] Graph: 417 nodes, T_latent=256 +[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) +[VAE Batch0] Decode: 52114.7 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000455 0.000930 0.000816 0.001121 +[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) +[Request 1/1] Done +[Pipeline] All done +2026-03-01 19:59:03.882 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. +2026-03-01 19:59:03.882 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora +2026-03-01 19:59:03.882 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. +2026-03-01 19:59:03.883 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. +2026-03-01 19:59:03.883 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. +2026-03-01 19:59:04.691 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +`torch_dtype` is deprecated! Use `dtype` instead! +2026-03-01 19:59:06.262 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... +2026-03-01 19:59:06.262 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... +2026-03-01 19:59:06.268 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-01 19:59:06.433 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-01 19:59:06.436 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +====================================================================== +2026-03-01 19:59:06.436 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-01 19:59:06.436 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-01 19:59:06.436 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +# Instruction +Generate audio semantic tokens based on the given conditions: + +# Caption +An upbeat and anthemic pop-rock track driven by bright, slightly overdriven + +# Metas +- bpm: 83 +- timesignature: 4 +- keyscale: G major +- duration: 88 seconds +<|endoftext|> + +2026-03-01 19:59:06.436 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-01 19:59:06.436 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +# Languages +fr + +# Lyric +# Lyric +[Intro - Guitar Riff] +[Verse 1] +Dans le monde des tutos virtuels +G ta toise en nouvelle passion +Avec Ggendoline et Pumbé à midi +La communauté, c'est l'unité +Quel joie, une clé + +[Chorus] +Dans le monde des tutos virtuels +Gândoline et Pumbé à midi +Une famille à connecter, c'est vrai +D'un enfant qui voit toi fusionner + +[Guitar Solo] + +[Verse 2] +Dans le monde des tutos virtuels +Gândoline, Pumbé à midi +Une famille à connecter, c'est vrai +D'un enfant qui voit toi fusionner<|endoftext|> +2026-03-01 19:59:06.436 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== + +2026-03-01 19:59:06.443 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-01 19:59:06.457 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-01 19:59:06.457 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-01 19:59:06.478 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-01 19:59:06.802 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... +2026-03-01 19:59:06.803 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-01 19:59:06.803 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006929874420166016, 'diffusion_time_cost': 0.3164329528808594, 'diffusion_per_step_time_cost': 0.03955411911010742, 'total_time_cost': 0.3233628273010254, 'offload_time_cost': 0.0} +2026-03-01 19:59:06.817 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-01 19:59:06.819 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-01 19:59:06.819 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB +2026-03-01 19:59:06.819 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... +2026-03-01 19:59:06.819 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB +2026-03-01 19:59:06.819 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB +2026-03-01 19:59:06.819 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-01 19:59:07.095 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-01 19:59:07.098 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-01 19:59:07.101 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +[Request] Loaded request0.json +[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q8_0.gguf +[GGML] Running acestep-v15-turbo-Q8_0.gguf... +[GGML] Done, 47 dump files +[Python] Initializing acestep-v15-turbo... +[Python] Generating (acestep-v15-turbo, 8 steps)... +Using precomputed LM hints +Using precomputed LM hints +[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) +[Python] Done, 40 dump files +[Turbo] Cosine similarities GGML vs Python + stage GGML vs Python + text_hidden 0.999816 + lyric_embed 1.000000 + enc_hidden 0.999814 + detok_output 0.999983 + context 0.999990 + noise 1.000000 + temb_t 0.999997 + hidden_after_proj_in 0.999985 + enc_after_cond_emb 0.999791 + layer0_sa_output 0.999925 + hidden_after_layer0 0.999955 + hidden_after_layer6 0.999892 + hidden_after_layer12 0.999219 + hidden_after_layer18 0.996644 + hidden_after_layer23 0.993707 + dit_step0_vt 0.975605 + dit_step0_xt 0.999946 + dit_step1_vt 0.978928 + dit_step1_xt 0.999831 + dit_step2_vt 0.981129 + dit_step2_xt 0.999551 + dit_step3_vt 0.982813 + dit_step3_xt 0.998932 + dit_step4_vt 0.981292 + dit_step4_xt 0.997544 + dit_step5_vt 0.979091 + dit_step5_xt 0.994467 + dit_step6_vt 0.976152 + dit_step6_xt 0.988647 + dit_step7_vt 0.970238 + dit_x0 0.980014 + vae_audio 0.903408 + vae_audio (STFT cosine) 0.976427 +[Turbo] Error growth GGML vs Python + stage cos max_err mean_err mean_A std_A mean_B std_B + dit_step0_xt 0.999946 0.139652 0.006645 -0.002330 0.972930 -0.002342 0.972003 + dit_step1_xt 0.999831 0.267117 0.011368 -0.005325 0.942659 -0.005313 0.941730 + dit_step2_xt 0.999551 0.452101 0.017578 -0.009369 0.909163 -0.009311 0.908527 + dit_step3_xt 0.998932 0.629880 0.025911 -0.014735 0.873792 -0.014577 0.873624 + dit_step4_xt 0.997544 0.759572 0.037583 -0.021796 0.841987 -0.021660 0.841995 + dit_step5_xt 0.994467 1.235701 0.054893 -0.031886 0.825306 -0.032109 0.824593 + dit_step6_xt 0.988647 2.096131 0.081207 -0.046181 0.856264 -0.046482 0.855546 diff --git a/tests/CUDA-BF16.log b/tests/CUDA-BF16.log new file mode 100644 index 0000000..d73a934 --- /dev/null +++ b/tests/CUDA-BF16.log @@ -0,0 +1,259 @@ +ggml_cuda_init: found 1 CUDA devices: + Device 0: NVIDIA RTX PRO 6000 Blackwell Workstation Edition, compute capability 12.0, VMM: yes +[Load] DiT backend: CUDA0 (CPU threads: 16) +[Load] Backend init: 70.8 ms +[GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 +[DiT] Self-attn: Q+K+V fused +[DiT] Cross-attn: Q+K+V fused +[DiT] MLP: gate+up fused +[Load] null_condition_emb found (CFG available) +[WeightCtx] Loaded 478 tensors, 3007.9 MB into backend +[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 +[Load] DiT weight load: 375.6 ms +[GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 +[Load] silence_latent: [15000, 64] from GGUF +[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 +[Load] VAE backend: CUDA0 (CPU threads: 16) +[VAE] Backend: CUDA0, Weight buffer: 161.1 MB +[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations +[Load] VAE weights: 661.0 ms +[Request 1/1] ggml-turbo/request0.json (batch=1) +[Request] parsed ggml-turbo/request0.json (18 fields) +[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) +[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s +[Pipeline] 434 audio codes (86.8s @ 5Hz) +[Pipeline] T=2170, S=1085 +[BPE] Loaded from GGUF: 151643 vocab, 151387 merges +[Load] BPE tokenizer: 32.8 ms +[Pipeline] caption: 70 tokens, lyrics: 167 tokens +[Load] TextEncoder backend: CUDA0 (CPU threads: 16) +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[Load] TextEncoder: 28L, H=1024, Nh=16/8 +[Qwen3] Attn: Q+K+V fused +[Qwen3] MLP: gate+up fused +[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend +[Load] TextEncoder: 128.5 ms +[Encode] TextEncoder (70 tokens): 50.6 ms +[Debug] text_hidden: [70, 1024] first4: 3.652014 1.047935 0.228532 -12.907304 +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[Encode] Lyric vocab lookup (167 tokens): 12.5 ms +[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 +[Load] CondEncoder backend: CUDA0 (CPU threads: 16) +[GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 +[Load] LyricEncoder: 8L +[Qwen3] Attn: Q+K+V fused +[Qwen3] MLP: gate+up fused +[Load] TimbreEncoder: 4L +[Qwen3] Attn: Q+K+V fused +[Qwen3] MLP: gate+up fused +[WeightCtx] Loaded 140 tensors, 1160.5 MB into backend +[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond +[Load] ConditionEncoder: 127.1 ms +[CondEnc] Lyric sliding mask: 167x167, window=128 +[CondEnc] Timbre sliding mask: 750x750, window=128 +[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens +[Encode] ConditionEncoder: 7.9 ms, enc_S=238 +[Debug] enc_hidden: [238, 2048] first4: 1.758648 -0.049409 -0.132412 0.058372 +[GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 +[WeightCtx] Loaded 30 tensors, 200.3 MB into backend +[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) +[Load] Detokenizer: 24.2 ms +[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) +[Context] Detokenizer: 141.9 ms +[Debug] detok_output: [2170, 64] first4: -0.124204 1.435425 0.309963 -0.624679 +[Context Batch0] Philox noise seed=42, [2170, 64] +[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 +[Debug] context: [2170, 128] first4: -0.124204 1.435425 0.309963 -0.624679 +[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1 +[DiT] Batch N=1, T=2170, S=1085, enc_S=238 +[DiT] Graph: 1841 nodes +[Debug] tproj: [12288] first4: 0.260062 -0.161562 -0.097030 0.052313 +[Debug] temb: [2048] first4: 0.000069 -0.132499 -0.035430 0.064753 +[Debug] temb_t: [2048] first4: 0.001065 0.026818 -0.052754 0.063717 +[Debug] temb_r: [2048] first4: -0.000996 -0.159317 0.017323 0.001036 +[Debug] sinusoidal_t: [256] first4: 0.562486 0.789701 0.439822 -0.023583 +[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000 +[Debug] temb_lin1_t: [2048] first4: -0.049318 -0.051829 -0.014251 -0.038444 +[Debug] temb_lin1_r: [2048] first4: -0.013266 -0.018319 -0.016375 0.008532 +[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.039547 -0.969737 0.533554 0.446556 +[Debug] proj_in_input: [192, 2170] first4: -0.124204 1.435425 0.309963 -0.624679 +[Debug] enc_after_cond_emb: [2048, 238] first4: -0.166382 0.814621 0.325745 -0.561218 +[Debug] layer0_sa_input: [2048, 1085] first4: -0.719041 -0.764240 -0.047643 0.261711 +[Debug] layer0_q_after_rope: [128, 16] first4: -1.600161 -0.822879 -0.294099 0.491351 +[Debug] layer0_k_after_rope: [128, 8] first4: -0.166382 0.814621 0.325745 -0.561218 +[Debug] layer0_sa_output: [2048, 1085] first4: -1.500000 0.170898 -0.351562 0.515625 +[Debug] layer0_attn_out: [2048, 1085] first4: -1.540346 -1.045535 0.190276 0.455950 +[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.540346 -1.045535 0.190276 0.455950 +[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.600161 -0.822879 -0.294099 0.491351 +[Debug] hidden_after_layer0: [2048, 1085] first4: -9.073158 0.560212 52.141960 -0.912522 +[Debug] hidden_after_layer6: [2048, 1085] first4: -21.385975 0.074876 33.328918 -4.446253 +[Debug] hidden_after_layer12: [2048, 1085] first4: -15.000174 -17.960159 71.364281 28.422548 +[Debug] hidden_after_layer18: [2048, 1085] first4: -27.019310 15.715343 59.139381 20.656757 +[Debug] hidden_after_layer23: [2048, 1085] first4: -9.519342 45.743378 195.522568 144.389435 +[Debug] dit_step0_vt: [2170, 64] first4: 0.016157 1.119429 0.348312 2.379197 +[Debug] dit_step0_xt: [2170, 64] first4: 0.193602 2.105367 -0.187707 0.739511 +[DiT] step 1/8 t=1.000 +[Debug] dit_step1_vt: [2170, 64] first4: -0.224607 1.308204 -0.126253 1.900889 +[Debug] dit_step1_xt: [2170, 64] first4: 0.205853 2.034010 -0.180821 0.635826 +[DiT] step 2/8 t=0.955 +[Debug] dit_step2_vt: [2170, 64] first4: -0.011260 1.217733 0.098172 2.384965 +[Debug] dit_step2_xt: [2170, 64] first4: 0.206604 1.952828 -0.187366 0.476828 +[DiT] step 3/8 t=0.900 +[Debug] dit_step3_vt: [2170, 64] first4: 0.242402 1.085806 0.261774 2.646892 +[Debug] dit_step3_xt: [2170, 64] first4: 0.186403 1.862344 -0.209180 0.256254 +[DiT] step 4/8 t=0.833 +[Debug] dit_step4_vt: [2170, 64] first4: 0.281105 1.015777 0.102466 2.709046 +[Debug] dit_step4_xt: [2170, 64] first4: 0.156285 1.753511 -0.220159 -0.034001 +[DiT] step 5/8 t=0.750 +[Debug] dit_step5_vt: [2170, 64] first4: 0.265994 0.916073 -0.297680 2.755516 +[Debug] dit_step5_xt: [2170, 64] first4: 0.118286 1.622644 -0.177633 -0.427646 +[DiT] step 6/8 t=0.643 +[Debug] dit_step6_vt: [2170, 64] first4: 0.172145 0.636800 -0.808572 2.809288 +[Debug] dit_step6_xt: [2170, 64] first4: 0.083857 1.495284 -0.015919 -0.989503 +[DiT] step 7/8 t=0.500 +[Debug] dit_step7_vt: [2170, 64] first4: -0.004009 0.190141 -1.466879 3.103273 +[Debug] dit_x0: [2170, 64] first4: 0.085060 1.438241 0.424145 -1.920485 +[DiT] step 8/8 t=0.300 +[DiT] Total generation: 248.3 ms (248.3 ms/sample) +[Debug] dit_output: [2170, 64] first4: 0.085060 1.438241 0.424145 -1.920485 +[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) +[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Upsample factor: 1920.00 (expected ~1920) +[VAE] Graph: 417 nodes, T_latent=256 +[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) +[VAE Batch0] Decode: 812.8 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000547 0.000898 0.000798 0.001064 +[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) +[Request 1/1] Done +[Pipeline] All done +2026-03-01 19:54:08.539 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. +2026-03-01 19:54:08.540 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora +2026-03-01 19:54:08.540 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. +2026-03-01 19:54:08.540 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. +2026-03-01 19:54:08.540 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. +2026-03-01 19:54:09.277 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +`torch_dtype` is deprecated! Use `dtype` instead! +2026-03-01 19:54:10.804 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... +2026-03-01 19:54:10.804 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... +2026-03-01 19:54:10.810 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-01 19:54:10.970 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-01 19:54:10.972 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +====================================================================== +2026-03-01 19:54:10.972 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-01 19:54:10.972 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-01 19:54:10.972 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +# Instruction +Generate audio semantic tokens based on the given conditions: + +# Caption +An upbeat and anthemic pop-rock track driven by bright, slightly overdriven + +# Metas +- bpm: 83 +- timesignature: 4 +- keyscale: G major +- duration: 88 seconds +<|endoftext|> + +2026-03-01 19:54:10.972 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-01 19:54:10.972 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +# Languages +fr + +# Lyric +# Lyric +[Intro - Guitar Riff] +[Verse 1] +Dans le monde des tutos virtuels +G ta toise en nouvelle passion +Avec Ggendoline et Pumbé à midi +La communauté, c'est l'unité +Quel joie, une clé + +[Chorus] +Dans le monde des tutos virtuels +Gândoline et Pumbé à midi +Une famille à connecter, c'est vrai +D'un enfant qui voit toi fusionner + +[Guitar Solo] + +[Verse 2] +Dans le monde des tutos virtuels +Gândoline, Pumbé à midi +Une famille à connecter, c'est vrai +D'un enfant qui voit toi fusionner<|endoftext|> +2026-03-01 19:54:10.972 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== + +2026-03-01 19:54:10.978 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-01 19:54:10.991 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-01 19:54:10.991 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-01 19:54:11.023 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-01 19:54:11.329 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... +2026-03-01 19:54:11.330 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-01 19:54:11.330 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.0068187713623046875, 'diffusion_time_cost': 0.2986173629760742, 'diffusion_per_step_time_cost': 0.03732717037200928, 'total_time_cost': 0.3054361343383789, 'offload_time_cost': 0.0} +2026-03-01 19:54:11.344 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-01 19:54:11.349 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-01 19:54:11.349 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB +2026-03-01 19:54:11.349 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... +2026-03-01 19:54:11.349 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB +2026-03-01 19:54:11.349 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB +2026-03-01 19:54:11.349 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-01 19:54:11.625 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-01 19:54:11.628 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-01 19:54:11.632 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +[Request] Loaded request0.json +[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-BF16.gguf +[GGML] Running acestep-v15-turbo-BF16.gguf... +[GGML] Done, 47 dump files +[Python] Initializing acestep-v15-turbo... +[Python] Generating (acestep-v15-turbo, 8 steps)... +Using precomputed LM hints +Using precomputed LM hints +[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) +[Python] Done, 40 dump files +[Turbo] Cosine similarities GGML vs Python + stage GGML vs Python + text_hidden 0.999805 + lyric_embed 1.000000 + enc_hidden 0.999830 + detok_output 0.999996 + context 0.999998 + noise 1.000000 + temb_t 0.999999 + hidden_after_proj_in 0.999988 + enc_after_cond_emb 0.999818 + layer0_sa_output 0.999951 + hidden_after_layer0 0.999978 + hidden_after_layer6 0.999916 + hidden_after_layer12 0.999234 + hidden_after_layer18 0.996570 + hidden_after_layer23 0.993528 + dit_step0_vt 0.974876 + dit_step0_xt 0.999945 + dit_step1_vt 0.980053 + dit_step1_xt 0.999834 + dit_step2_vt 0.981541 + dit_step2_xt 0.999553 + dit_step3_vt 0.982418 + dit_step3_xt 0.998924 + dit_step4_vt 0.980811 + dit_step4_xt 0.997503 + dit_step5_vt 0.977877 + dit_step5_xt 0.994298 + dit_step6_vt 0.974930 + dit_step6_xt 0.988188 + dit_step7_vt 0.969375 + dit_x0 0.979213 + vae_audio 0.901377 + vae_audio (STFT cosine) 0.975525 +[Turbo] Error growth GGML vs Python + stage cos max_err mean_err mean_A std_A mean_B std_B + dit_step0_xt 0.999945 0.135628 0.006709 -0.002312 0.972932 -0.002342 0.972003 + dit_step1_xt 0.999834 0.266762 0.011267 -0.005306 0.942657 -0.005313 0.941730 + dit_step2_xt 0.999553 0.453190 0.017486 -0.009350 0.909152 -0.009311 0.908527 + dit_step3_xt 0.998924 0.643865 0.025962 -0.014715 0.873769 -0.014577 0.873624 + dit_step4_xt 0.997503 0.790038 0.037807 -0.021768 0.841938 -0.021660 0.841995 + dit_step5_xt 0.994298 1.239881 0.055598 -0.031834 0.825214 -0.032109 0.824593 + dit_step6_xt 0.988188 2.076383 0.082565 -0.046121 0.856115 -0.046482 0.855546 diff --git a/tests/CUDA-Q4_K_M.log b/tests/CUDA-Q4_K_M.log new file mode 100644 index 0000000..189cb71 --- /dev/null +++ b/tests/CUDA-Q4_K_M.log @@ -0,0 +1,259 @@ +ggml_cuda_init: found 1 CUDA devices: + Device 0: NVIDIA RTX PRO 6000 Blackwell Workstation Edition, compute capability 12.0, VMM: yes +[Load] DiT backend: CUDA0 (CPU threads: 16) +[Load] Backend init: 11.2 ms +[GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 +[DiT] Self-attn: Q+K fused, V separate +[DiT] Cross-attn: all separate +[DiT] MLP: gate+up fused +[Load] null_condition_emb found (CFG available) +[WeightCtx] Loaded 478 tensors, 895.6 MB into backend +[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 +[Load] DiT weight load: 403.0 ms +[GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 +[Load] silence_latent: [15000, 64] from GGUF +[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 +[Load] VAE backend: CUDA0 (CPU threads: 16) +[VAE] Backend: CUDA0, Weight buffer: 161.1 MB +[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations +[Load] VAE weights: 655.9 ms +[Request 1/1] ggml-turbo/request0.json (batch=1) +[Request] parsed ggml-turbo/request0.json (18 fields) +[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) +[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s +[Pipeline] 434 audio codes (86.8s @ 5Hz) +[Pipeline] T=2170, S=1085 +[BPE] Loaded from GGUF: 151643 vocab, 151387 merges +[Load] BPE tokenizer: 31.4 ms +[Pipeline] caption: 70 tokens, lyrics: 167 tokens +[Load] TextEncoder backend: CUDA0 (CPU threads: 16) +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[Load] TextEncoder: 28L, H=1024, Nh=16/8 +[Qwen3] Attn: Q+K+V fused +[Qwen3] MLP: gate+up fused +[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend +[Load] TextEncoder: 126.3 ms +[Encode] TextEncoder (70 tokens): 52.7 ms +[Debug] text_hidden: [70, 1024] first4: 3.652014 1.047935 0.228532 -12.907304 +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[Encode] Lyric vocab lookup (167 tokens): 12.1 ms +[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 +[Load] CondEncoder backend: CUDA0 (CPU threads: 16) +[GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 +[Load] LyricEncoder: 8L +[Qwen3] Attn: Q+K fused, V separate +[Qwen3] MLP: gate+up fused +[Load] TimbreEncoder: 4L +[Qwen3] Attn: Q+K fused, V separate +[Qwen3] MLP: gate+up fused +[WeightCtx] Loaded 140 tensors, 352.5 MB into backend +[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond +[Load] ConditionEncoder: 118.9 ms +[CondEnc] Lyric sliding mask: 167x167, window=128 +[CondEnc] Timbre sliding mask: 750x750, window=128 +[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens +[Encode] ConditionEncoder: 12.7 ms, enc_S=238 +[Debug] enc_hidden: [238, 2048] first4: 1.759848 -0.046220 -0.129361 0.057668 +[GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 +[WeightCtx] Loaded 30 tensors, 64.7 MB into backend +[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) +[Load] Detokenizer: 22.1 ms +[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) +[Context] Detokenizer: 124.0 ms +[Debug] detok_output: [2170, 64] first4: -0.098446 1.438721 0.299255 -0.646500 +[Context Batch0] Philox noise seed=42, [2170, 64] +[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 +[Debug] context: [2170, 128] first4: -0.098446 1.438721 0.299255 -0.646500 +[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1 +[DiT] Batch N=1, T=2170, S=1085, enc_S=238 +[DiT] Graph: 1775 nodes +[Debug] tproj: [12288] first4: 0.260848 -0.159996 -0.090771 0.048441 +[Debug] temb: [2048] first4: 0.000246 -0.134045 -0.034408 0.064910 +[Debug] temb_t: [2048] first4: 0.001029 0.025591 -0.052085 0.063187 +[Debug] temb_r: [2048] first4: -0.000783 -0.159636 0.017677 0.001723 +[Debug] sinusoidal_t: [256] first4: 0.562486 0.789701 0.439822 -0.023583 +[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000 +[Debug] temb_lin1_t: [2048] first4: -0.049559 -0.053563 -0.011978 -0.047026 +[Debug] temb_lin1_r: [2048] first4: -0.015462 -0.031532 -0.021258 0.006134 +[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.048604 -0.990237 0.529252 0.453491 +[Debug] proj_in_input: [192, 2170] first4: -0.098446 1.438721 0.299255 -0.646500 +[Debug] enc_after_cond_emb: [2048, 238] first4: -0.164939 0.740011 0.286775 -0.551167 +[Debug] layer0_sa_input: [2048, 1085] first4: -0.724411 -0.771269 -0.042124 0.260209 +[Debug] layer0_q_after_rope: [128, 16] first4: -26.611641 -0.173146 0.216591 0.344494 +[Debug] layer0_k_after_rope: [128, 8] first4: -3.965077 0.386751 0.211083 0.672416 +[Debug] layer0_sa_output: [2048, 1085] first4: -1.608527 0.164282 -0.474735 0.450532 +[Debug] layer0_attn_out: [2048, 1085] first4: -26.943256 -0.119716 0.379954 0.343082 +[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.581287 -1.062661 0.069874 0.462384 +[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.708075 -0.853060 -0.446424 0.497258 +[Debug] hidden_after_layer0: [2048, 1085] first4: -8.841661 0.391934 47.472157 -0.764472 +[Debug] hidden_after_layer6: [2048, 1085] first4: -21.532463 -0.603226 30.787485 -3.431937 +[Debug] hidden_after_layer12: [2048, 1085] first4: -17.481373 -13.959963 61.344299 28.807806 +[Debug] hidden_after_layer18: [2048, 1085] first4: -15.247349 10.312581 47.860855 16.436914 +[Debug] hidden_after_layer23: [2048, 1085] first4: -13.968861 1.714361 170.159424 132.288422 +[Debug] dit_step0_vt: [2170, 64] first4: -0.165321 1.077570 0.220752 2.218085 +[Debug] dit_step0_xt: [2170, 64] first4: 0.201851 2.107270 -0.181909 0.746834 +[DiT] step 1/8 t=1.000 +[Debug] dit_step1_vt: [2170, 64] first4: -0.110858 1.235523 -0.287918 1.796672 +[Debug] dit_step1_xt: [2170, 64] first4: 0.207897 2.039877 -0.166205 0.648834 +[DiT] step 2/8 t=0.955 +[Debug] dit_step2_vt: [2170, 64] first4: -0.030571 1.208156 0.092450 2.195761 +[Debug] dit_step2_xt: [2170, 64] first4: 0.209935 1.959334 -0.172368 0.502450 +[DiT] step 3/8 t=0.900 +[Debug] dit_step3_vt: [2170, 64] first4: 0.247537 1.164770 0.276511 2.503829 +[Debug] dit_step3_xt: [2170, 64] first4: 0.189307 1.862270 -0.195410 0.293797 +[DiT] step 4/8 t=0.833 +[Debug] dit_step4_vt: [2170, 64] first4: 0.384617 1.107927 0.073075 2.612695 +[Debug] dit_step4_xt: [2170, 64] first4: 0.148098 1.743563 -0.203240 0.013866 +[DiT] step 5/8 t=0.750 +[Debug] dit_step5_vt: [2170, 64] first4: 0.180515 0.944257 -0.458470 2.697840 +[Debug] dit_step5_xt: [2170, 64] first4: 0.122310 1.608669 -0.137744 -0.371540 +[DiT] step 6/8 t=0.643 +[Debug] dit_step6_vt: [2170, 64] first4: -0.245520 0.941769 -1.135058 2.750750 +[Debug] dit_step6_xt: [2170, 64] first4: 0.171414 1.420316 0.089267 -0.921690 +[DiT] step 7/8 t=0.500 +[Debug] dit_step7_vt: [2170, 64] first4: -0.488470 0.849564 -1.659694 3.185843 +[Debug] dit_x0: [2170, 64] first4: 0.317955 1.165446 0.587176 -1.877443 +[DiT] step 8/8 t=0.300 +[DiT] Total generation: 249.1 ms (249.1 ms/sample) +[Debug] dit_output: [2170, 64] first4: 0.317955 1.165446 0.587176 -1.877443 +[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) +[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Upsample factor: 1920.00 (expected ~1920) +[VAE] Graph: 417 nodes, T_latent=256 +[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) +[VAE Batch0] Decode: 820.0 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000325 0.000812 0.000671 0.000911 +[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) +[Request 1/1] Done +[Pipeline] All done +2026-03-01 19:54:39.264 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. +2026-03-01 19:54:39.265 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora +2026-03-01 19:54:39.265 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. +2026-03-01 19:54:39.265 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. +2026-03-01 19:54:39.265 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. +2026-03-01 19:54:40.025 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +`torch_dtype` is deprecated! Use `dtype` instead! +2026-03-01 19:54:41.587 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... +2026-03-01 19:54:41.587 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... +2026-03-01 19:54:41.592 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-01 19:54:41.751 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-01 19:54:41.753 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +====================================================================== +2026-03-01 19:54:41.753 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-01 19:54:41.753 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-01 19:54:41.753 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +# Instruction +Generate audio semantic tokens based on the given conditions: + +# Caption +An upbeat and anthemic pop-rock track driven by bright, slightly overdriven + +# Metas +- bpm: 83 +- timesignature: 4 +- keyscale: G major +- duration: 88 seconds +<|endoftext|> + +2026-03-01 19:54:41.753 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-01 19:54:41.753 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +# Languages +fr + +# Lyric +# Lyric +[Intro - Guitar Riff] +[Verse 1] +Dans le monde des tutos virtuels +G ta toise en nouvelle passion +Avec Ggendoline et Pumbé à midi +La communauté, c'est l'unité +Quel joie, une clé + +[Chorus] +Dans le monde des tutos virtuels +Gândoline et Pumbé à midi +Une famille à connecter, c'est vrai +D'un enfant qui voit toi fusionner + +[Guitar Solo] + +[Verse 2] +Dans le monde des tutos virtuels +Gândoline, Pumbé à midi +Une famille à connecter, c'est vrai +D'un enfant qui voit toi fusionner<|endoftext|> +2026-03-01 19:54:41.753 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== + +2026-03-01 19:54:41.759 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-01 19:54:41.771 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-01 19:54:41.772 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-01 19:54:41.805 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-01 19:54:42.113 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... +2026-03-01 19:54:42.114 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-01 19:54:42.114 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006765604019165039, 'diffusion_time_cost': 0.3010725975036621, 'diffusion_per_step_time_cost': 0.037634074687957764, 'total_time_cost': 0.30783820152282715, 'offload_time_cost': 0.0} +2026-03-01 19:54:42.128 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-01 19:54:42.131 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-01 19:54:42.131 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB +2026-03-01 19:54:42.131 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... +2026-03-01 19:54:42.131 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB +2026-03-01 19:54:42.131 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB +2026-03-01 19:54:42.131 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-01 19:54:42.405 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-01 19:54:42.408 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-01 19:54:42.411 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +[Request] Loaded request0.json +[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q4_K_M.gguf +[GGML] Running acestep-v15-turbo-Q4_K_M.gguf... +[GGML] Done, 47 dump files +[Python] Initializing acestep-v15-turbo... +[Python] Generating (acestep-v15-turbo, 8 steps)... +Using precomputed LM hints +Using precomputed LM hints +[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) +[Python] Done, 40 dump files +[Turbo] Cosine similarities GGML vs Python + stage GGML vs Python + text_hidden 0.999805 + lyric_embed 1.000000 + enc_hidden 0.997032 + detok_output 0.999610 + context 0.999750 + noise 1.000000 + temb_t 0.999902 + hidden_after_proj_in 0.999908 + enc_after_cond_emb 0.997517 + layer0_sa_output 0.998371 + hidden_after_layer0 0.999675 + hidden_after_layer6 0.999257 + hidden_after_layer12 0.995500 + hidden_after_layer18 0.991597 + hidden_after_layer23 0.985460 + dit_step0_vt 0.947383 + dit_step0_xt 0.999885 + dit_step1_vt 0.947784 + dit_step1_xt 0.999617 + dit_step2_vt 0.957305 + dit_step2_xt 0.999014 + dit_step3_vt 0.961931 + dit_step3_xt 0.997757 + dit_step4_vt 0.959773 + dit_step4_xt 0.994900 + dit_step5_vt 0.956611 + dit_step5_xt 0.988539 + dit_step6_vt 0.950669 + dit_step6_xt 0.976494 + dit_step7_vt 0.938658 + dit_x0 0.958725 + vae_audio 0.837763 + vae_audio (STFT cosine) 0.954448 +[Turbo] Error growth GGML vs Python + stage cos max_err mean_err mean_A std_A mean_B std_B + dit_step0_xt 0.999885 0.165835 0.010206 -0.002260 0.973133 -0.002342 0.972003 + dit_step1_xt 0.999617 0.269038 0.018058 -0.005119 0.943095 -0.005313 0.941730 + dit_step2_xt 0.999014 0.433553 0.027847 -0.009033 0.910111 -0.009311 0.908527 + dit_step3_xt 0.997757 0.593449 0.040253 -0.014301 0.875156 -0.014577 0.873624 + dit_step4_xt 0.994900 0.889597 0.058068 -0.021205 0.843622 -0.021660 0.841995 + dit_step5_xt 0.988539 1.371047 0.084767 -0.031100 0.827136 -0.032109 0.824593 + dit_step6_xt 0.976494 1.997185 0.125556 -0.045244 0.858177 -0.046482 0.855546 diff --git a/tests/CUDA-Q5_K_M.log b/tests/CUDA-Q5_K_M.log new file mode 100644 index 0000000..00b9652 --- /dev/null +++ b/tests/CUDA-Q5_K_M.log @@ -0,0 +1,259 @@ +ggml_cuda_init: found 1 CUDA devices: + Device 0: NVIDIA RTX PRO 6000 Blackwell Workstation Edition, compute capability 12.0, VMM: yes +[Load] DiT backend: CUDA0 (CPU threads: 16) +[Load] Backend init: 25.7 ms +[GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 +[DiT] Self-attn: Q+K fused, V separate +[DiT] Cross-attn: all separate +[DiT] MLP: gate+up fused +[Load] null_condition_emb found (CFG available) +[WeightCtx] Loaded 478 tensors, 1061.2 MB into backend +[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 +[Load] DiT weight load: 465.4 ms +[GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 +[Load] silence_latent: [15000, 64] from GGUF +[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 +[Load] VAE backend: CUDA0 (CPU threads: 16) +[VAE] Backend: CUDA0, Weight buffer: 161.1 MB +[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations +[Load] VAE weights: 656.4 ms +[Request 1/1] ggml-turbo/request0.json (batch=1) +[Request] parsed ggml-turbo/request0.json (18 fields) +[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) +[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s +[Pipeline] 434 audio codes (86.8s @ 5Hz) +[Pipeline] T=2170, S=1085 +[BPE] Loaded from GGUF: 151643 vocab, 151387 merges +[Load] BPE tokenizer: 31.3 ms +[Pipeline] caption: 70 tokens, lyrics: 167 tokens +[Load] TextEncoder backend: CUDA0 (CPU threads: 16) +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[Load] TextEncoder: 28L, H=1024, Nh=16/8 +[Qwen3] Attn: Q+K+V fused +[Qwen3] MLP: gate+up fused +[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend +[Load] TextEncoder: 127.3 ms +[Encode] TextEncoder (70 tokens): 49.5 ms +[Debug] text_hidden: [70, 1024] first4: 3.652014 1.047935 0.228532 -12.907304 +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[Encode] Lyric vocab lookup (167 tokens): 12.4 ms +[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 +[Load] CondEncoder backend: CUDA0 (CPU threads: 16) +[GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 +[Load] LyricEncoder: 8L +[Qwen3] Attn: Q+K fused, V separate +[Qwen3] MLP: gate+up fused +[Load] TimbreEncoder: 4L +[Qwen3] Attn: Q+K fused, V separate +[Qwen3] MLP: gate+up fused +[WeightCtx] Loaded 140 tensors, 412.5 MB into backend +[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond +[Load] ConditionEncoder: 138.7 ms +[CondEnc] Lyric sliding mask: 167x167, window=128 +[CondEnc] Timbre sliding mask: 750x750, window=128 +[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens +[Encode] ConditionEncoder: 13.1 ms, enc_S=238 +[Debug] enc_hidden: [238, 2048] first4: 1.760389 -0.050879 -0.130835 0.059141 +[GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 +[WeightCtx] Loaded 30 tensors, 73.2 MB into backend +[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) +[Load] Detokenizer: 24.2 ms +[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) +[Context] Detokenizer: 121.7 ms +[Debug] detok_output: [2170, 64] first4: -0.125017 1.460327 0.292545 -0.654237 +[Context Batch0] Philox noise seed=42, [2170, 64] +[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 +[Debug] context: [2170, 128] first4: -0.125017 1.460327 0.292545 -0.654237 +[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1 +[DiT] Batch N=1, T=2170, S=1085, enc_S=238 +[DiT] Graph: 1775 nodes +[Debug] tproj: [12288] first4: 0.260130 -0.161413 -0.102271 0.051211 +[Debug] temb: [2048] first4: -0.000033 -0.132307 -0.035515 0.064775 +[Debug] temb_t: [2048] first4: 0.000653 0.026699 -0.052806 0.063542 +[Debug] temb_r: [2048] first4: -0.000685 -0.159005 0.017290 0.001234 +[Debug] sinusoidal_t: [256] first4: 0.562486 0.789701 0.439822 -0.023583 +[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000 +[Debug] temb_lin1_t: [2048] first4: -0.051436 -0.053873 -0.011918 -0.038393 +[Debug] temb_lin1_r: [2048] first4: -0.016164 -0.021120 -0.015800 -0.000525 +[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.043269 -0.943395 0.541080 0.455623 +[Debug] proj_in_input: [192, 2170] first4: -0.125017 1.460327 0.292545 -0.654237 +[Debug] enc_after_cond_emb: [2048, 238] first4: -0.158078 0.738352 0.324930 -0.519564 +[Debug] layer0_sa_input: [2048, 1085] first4: -0.721699 -0.748479 -0.051910 0.264453 +[Debug] layer0_q_after_rope: [128, 16] first4: -26.700098 -0.191763 0.241664 0.327243 +[Debug] layer0_k_after_rope: [128, 8] first4: -3.876794 0.412444 0.096899 0.724944 +[Debug] layer0_sa_output: [2048, 1085] first4: -1.497476 0.145466 -0.380354 0.485316 +[Debug] layer0_attn_out: [2048, 1085] first4: -27.034651 -0.125372 0.405539 0.333085 +[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.540176 -1.007621 0.171218 0.466798 +[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.603106 -0.810148 -0.307159 0.493001 +[Debug] hidden_after_layer0: [2048, 1085] first4: -9.100931 0.548624 50.178547 -0.840484 +[Debug] hidden_after_layer6: [2048, 1085] first4: -20.448851 0.734318 29.757233 -4.634385 +[Debug] hidden_after_layer12: [2048, 1085] first4: -18.620174 -17.772619 67.315002 24.878105 +[Debug] hidden_after_layer18: [2048, 1085] first4: -25.252079 10.759434 60.574448 19.297585 +[Debug] hidden_after_layer23: [2048, 1085] first4: -3.474268 32.243759 194.636520 160.608047 +[Debug] dit_step0_vt: [2170, 64] first4: 0.008642 1.131305 0.289193 2.355634 +[Debug] dit_step0_xt: [2170, 64] first4: 0.193943 2.104827 -0.185020 0.740582 +[DiT] step 1/8 t=1.000 +[Debug] dit_step1_vt: [2170, 64] first4: -0.205228 1.406502 -0.196234 1.800572 +[Debug] dit_step1_xt: [2170, 64] first4: 0.205137 2.028109 -0.174316 0.642369 +[DiT] step 2/8 t=0.955 +[Debug] dit_step2_vt: [2170, 64] first4: -0.122410 1.295395 0.084284 2.386701 +[Debug] dit_step2_xt: [2170, 64] first4: 0.213298 1.941749 -0.179935 0.483256 +[DiT] step 3/8 t=0.900 +[Debug] dit_step3_vt: [2170, 64] first4: 0.323829 1.081727 0.260844 2.578709 +[Debug] dit_step3_xt: [2170, 64] first4: 0.186312 1.851605 -0.201672 0.268363 +[DiT] step 4/8 t=0.833 +[Debug] dit_step4_vt: [2170, 64] first4: 0.355370 0.943008 0.097293 2.745308 +[Debug] dit_step4_xt: [2170, 64] first4: 0.148237 1.750569 -0.212097 -0.025777 +[DiT] step 5/8 t=0.750 +[Debug] dit_step5_vt: [2170, 64] first4: 0.370461 0.859429 -0.430240 2.691899 +[Debug] dit_step5_xt: [2170, 64] first4: 0.095314 1.627793 -0.150634 -0.410334 +[DiT] step 6/8 t=0.643 +[Debug] dit_step6_vt: [2170, 64] first4: 0.268117 0.608156 -0.982653 2.831516 +[Debug] dit_step6_xt: [2170, 64] first4: 0.041691 1.506162 0.045897 -0.976637 +[DiT] step 7/8 t=0.500 +[Debug] dit_step7_vt: [2170, 64] first4: 0.031181 0.378487 -1.509792 3.095486 +[Debug] dit_x0: [2170, 64] first4: 0.032336 1.392616 0.498835 -1.905283 +[DiT] step 8/8 t=0.300 +[DiT] Total generation: 251.1 ms (251.1 ms/sample) +[Debug] dit_output: [2170, 64] first4: 0.032336 1.392616 0.498835 -1.905283 +[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) +[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Upsample factor: 1920.00 (expected ~1920) +[VAE] Graph: 417 nodes, T_latent=256 +[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) +[VAE Batch0] Decode: 804.2 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000692 0.001098 0.000938 0.001230 +[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) +[Request 1/1] Done +[Pipeline] All done +2026-03-01 19:54:31.395 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. +2026-03-01 19:54:31.395 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora +2026-03-01 19:54:31.395 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. +2026-03-01 19:54:31.395 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. +2026-03-01 19:54:31.395 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. +2026-03-01 19:54:32.168 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +`torch_dtype` is deprecated! Use `dtype` instead! +2026-03-01 19:54:33.881 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... +2026-03-01 19:54:33.882 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... +2026-03-01 19:54:33.887 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-01 19:54:34.060 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-01 19:54:34.062 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +====================================================================== +2026-03-01 19:54:34.062 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-01 19:54:34.062 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-01 19:54:34.062 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +# Instruction +Generate audio semantic tokens based on the given conditions: + +# Caption +An upbeat and anthemic pop-rock track driven by bright, slightly overdriven + +# Metas +- bpm: 83 +- timesignature: 4 +- keyscale: G major +- duration: 88 seconds +<|endoftext|> + +2026-03-01 19:54:34.062 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-01 19:54:34.062 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +# Languages +fr + +# Lyric +# Lyric +[Intro - Guitar Riff] +[Verse 1] +Dans le monde des tutos virtuels +G ta toise en nouvelle passion +Avec Ggendoline et Pumbé à midi +La communauté, c'est l'unité +Quel joie, une clé + +[Chorus] +Dans le monde des tutos virtuels +Gândoline et Pumbé à midi +Une famille à connecter, c'est vrai +D'un enfant qui voit toi fusionner + +[Guitar Solo] + +[Verse 2] +Dans le monde des tutos virtuels +Gândoline, Pumbé à midi +Une famille à connecter, c'est vrai +D'un enfant qui voit toi fusionner<|endoftext|> +2026-03-01 19:54:34.062 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== + +2026-03-01 19:54:34.068 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-01 19:54:34.081 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-01 19:54:34.081 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-01 19:54:34.105 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-01 19:54:34.415 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... +2026-03-01 19:54:34.416 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-01 19:54:34.416 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006921052932739258, 'diffusion_time_cost': 0.3029003143310547, 'diffusion_per_step_time_cost': 0.037862539291381836, 'total_time_cost': 0.30982136726379395, 'offload_time_cost': 0.0} +2026-03-01 19:54:34.431 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-01 19:54:34.436 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-01 19:54:34.436 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB +2026-03-01 19:54:34.436 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... +2026-03-01 19:54:34.436 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB +2026-03-01 19:54:34.436 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB +2026-03-01 19:54:34.436 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-01 19:54:34.714 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-01 19:54:34.716 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-01 19:54:34.720 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +[Request] Loaded request0.json +[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q5_K_M.gguf +[GGML] Running acestep-v15-turbo-Q5_K_M.gguf... +[GGML] Done, 47 dump files +[Python] Initializing acestep-v15-turbo... +[Python] Generating (acestep-v15-turbo, 8 steps)... +Using precomputed LM hints +Using precomputed LM hints +[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) +[Python] Done, 40 dump files +[Turbo] Cosine similarities GGML vs Python + stage GGML vs Python + text_hidden 0.999805 + lyric_embed 1.000000 + enc_hidden 0.999038 + detok_output 0.999875 + context 0.999920 + noise 1.000000 + temb_t 0.999972 + hidden_after_proj_in 0.999960 + enc_after_cond_emb 0.999148 + layer0_sa_output 0.999386 + hidden_after_layer0 0.999829 + hidden_after_layer6 0.999741 + hidden_after_layer12 0.998654 + hidden_after_layer18 0.995432 + hidden_after_layer23 0.991374 + dit_step0_vt 0.968035 + dit_step0_xt 0.999930 + dit_step1_vt 0.971217 + dit_step1_xt 0.999785 + dit_step2_vt 0.970740 + dit_step2_xt 0.999391 + dit_step3_vt 0.973678 + dit_step3_xt 0.998557 + dit_step4_vt 0.972169 + dit_step4_xt 0.996665 + dit_step5_vt 0.967356 + dit_step5_xt 0.992218 + dit_step6_vt 0.962469 + dit_step6_xt 0.983446 + dit_step7_vt 0.953383 + dit_x0 0.970119 + vae_audio 0.883226 + vae_audio (STFT cosine) 0.968463 +[Turbo] Error growth GGML vs Python + stage cos max_err mean_err mean_A std_A mean_B std_B + dit_step0_xt 0.999930 0.139407 0.007818 -0.002306 0.973025 -0.002342 0.972003 + dit_step1_xt 0.999785 0.264377 0.013418 -0.005299 0.942885 -0.005313 0.941730 + dit_step2_xt 0.999391 0.455966 0.021259 -0.009285 0.909477 -0.009311 0.908527 + dit_step3_xt 0.998557 0.657160 0.031461 -0.014661 0.874187 -0.014577 0.873624 + dit_step4_xt 0.996665 0.973354 0.045708 -0.021890 0.842366 -0.021660 0.841995 + dit_step5_xt 0.992218 1.446589 0.067697 -0.032248 0.825911 -0.032109 0.824593 + dit_step6_xt 0.983446 2.092730 0.101558 -0.046788 0.857148 -0.046482 0.855546 diff --git a/tests/CUDA-Q6_K.log b/tests/CUDA-Q6_K.log new file mode 100644 index 0000000..10b9a7a --- /dev/null +++ b/tests/CUDA-Q6_K.log @@ -0,0 +1,259 @@ +ggml_cuda_init: found 1 CUDA devices: + Device 0: NVIDIA RTX PRO 6000 Blackwell Workstation Edition, compute capability 12.0, VMM: yes +[Load] DiT backend: CUDA0 (CPU threads: 16) +[Load] Backend init: 9.5 ms +[GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 +[DiT] Self-attn: Q+K+V fused +[DiT] Cross-attn: Q+K+V fused +[DiT] MLP: gate+up fused +[Load] null_condition_emb found (CFG available) +[WeightCtx] Loaded 478 tensors, 1237.2 MB into backend +[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 +[Load] DiT weight load: 514.8 ms +[GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 +[Load] silence_latent: [15000, 64] from GGUF +[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 +[Load] VAE backend: CUDA0 (CPU threads: 16) +[VAE] Backend: CUDA0, Weight buffer: 161.1 MB +[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations +[Load] VAE weights: 657.3 ms +[Request 1/1] ggml-turbo/request0.json (batch=1) +[Request] parsed ggml-turbo/request0.json (18 fields) +[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) +[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s +[Pipeline] 434 audio codes (86.8s @ 5Hz) +[Pipeline] T=2170, S=1085 +[BPE] Loaded from GGUF: 151643 vocab, 151387 merges +[Load] BPE tokenizer: 30.7 ms +[Pipeline] caption: 70 tokens, lyrics: 167 tokens +[Load] TextEncoder backend: CUDA0 (CPU threads: 16) +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[Load] TextEncoder: 28L, H=1024, Nh=16/8 +[Qwen3] Attn: Q+K+V fused +[Qwen3] MLP: gate+up fused +[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend +[Load] TextEncoder: 125.7 ms +[Encode] TextEncoder (70 tokens): 49.2 ms +[Debug] text_hidden: [70, 1024] first4: 3.652014 1.047935 0.228532 -12.907304 +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[Encode] Lyric vocab lookup (167 tokens): 12.3 ms +[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 +[Load] CondEncoder backend: CUDA0 (CPU threads: 16) +[GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 +[Load] LyricEncoder: 8L +[Qwen3] Attn: Q+K+V fused +[Qwen3] MLP: gate+up fused +[Load] TimbreEncoder: 4L +[Qwen3] Attn: Q+K+V fused +[Qwen3] MLP: gate+up fused +[WeightCtx] Loaded 140 tensors, 476.3 MB into backend +[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond +[Load] ConditionEncoder: 145.8 ms +[CondEnc] Lyric sliding mask: 167x167, window=128 +[CondEnc] Timbre sliding mask: 750x750, window=128 +[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens +[Encode] ConditionEncoder: 11.0 ms, enc_S=238 +[Debug] enc_hidden: [238, 2048] first4: 1.760759 -0.050104 -0.133269 0.058044 +[GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 +[WeightCtx] Loaded 30 tensors, 82.2 MB into backend +[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) +[Load] Detokenizer: 26.4 ms +[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) +[Context] Detokenizer: 123.5 ms +[Debug] detok_output: [2170, 64] first4: -0.140341 1.456987 0.310602 -0.632665 +[Context Batch0] Philox noise seed=42, [2170, 64] +[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 +[Debug] context: [2170, 128] first4: -0.140341 1.456987 0.310602 -0.632665 +[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1 +[DiT] Batch N=1, T=2170, S=1085, enc_S=238 +[DiT] Graph: 1841 nodes +[Debug] tproj: [12288] first4: 0.259936 -0.161027 -0.098424 0.051532 +[Debug] temb: [2048] first4: 0.000362 -0.132329 -0.035400 0.064685 +[Debug] temb_t: [2048] first4: 0.001493 0.026964 -0.052786 0.063738 +[Debug] temb_r: [2048] first4: -0.001131 -0.159293 0.017385 0.000947 +[Debug] sinusoidal_t: [256] first4: 0.562486 0.789701 0.439822 -0.023583 +[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000 +[Debug] temb_lin1_t: [2048] first4: -0.049350 -0.051345 -0.017496 -0.036550 +[Debug] temb_lin1_r: [2048] first4: -0.014407 -0.020607 -0.015728 0.003874 +[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.035398 -0.945894 0.539823 0.447660 +[Debug] proj_in_input: [192, 2170] first4: -0.140341 1.456987 0.310602 -0.632665 +[Debug] enc_after_cond_emb: [2048, 238] first4: -0.173062 0.808074 0.315076 -0.565566 +[Debug] layer0_sa_input: [2048, 1085] first4: -0.714711 -0.749357 -0.048320 0.261221 +[Debug] layer0_q_after_rope: [128, 16] first4: -1.602913 -0.815329 -0.317055 0.489857 +[Debug] layer0_k_after_rope: [128, 8] first4: -0.173062 0.808074 0.315076 -0.565566 +[Debug] layer0_sa_output: [2048, 1085] first4: -1.503780 0.189824 -0.364929 0.517029 +[Debug] layer0_attn_out: [2048, 1085] first4: -1.537518 -1.029960 0.183371 0.458036 +[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.537518 -1.029960 0.183371 0.458036 +[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.602913 -0.815329 -0.317055 0.489857 +[Debug] hidden_after_layer0: [2048, 1085] first4: -9.163809 0.540625 51.895596 -0.846802 +[Debug] hidden_after_layer6: [2048, 1085] first4: -21.398865 0.172627 33.376564 -4.390195 +[Debug] hidden_after_layer12: [2048, 1085] first4: -14.881160 -16.518404 74.148743 29.243643 +[Debug] hidden_after_layer18: [2048, 1085] first4: -27.662983 14.134428 61.787987 20.210526 +[Debug] hidden_after_layer23: [2048, 1085] first4: -15.642601 51.246216 194.762726 138.743362 +[Debug] dit_step0_vt: [2170, 64] first4: 0.094566 1.115330 0.308673 2.389967 +[Debug] dit_step0_xt: [2170, 64] first4: 0.190037 2.105553 -0.185906 0.739021 +[DiT] step 1/8 t=1.000 +[Debug] dit_step1_vt: [2170, 64] first4: -0.145169 1.334249 -0.184111 1.908013 +[Debug] dit_step1_xt: [2170, 64] first4: 0.197956 2.032776 -0.175863 0.634948 +[DiT] step 2/8 t=0.955 +[Debug] dit_step2_vt: [2170, 64] first4: 0.039341 1.248196 0.097777 2.389248 +[Debug] dit_step2_xt: [2170, 64] first4: 0.195333 1.949563 -0.182382 0.475665 +[DiT] step 3/8 t=0.900 +[Debug] dit_step3_vt: [2170, 64] first4: 0.285024 1.101088 0.266534 2.655225 +[Debug] dit_step3_xt: [2170, 64] first4: 0.171581 1.857805 -0.204593 0.254396 +[DiT] step 4/8 t=0.833 +[Debug] dit_step4_vt: [2170, 64] first4: 0.327536 1.017564 0.096598 2.731005 +[Debug] dit_step4_xt: [2170, 64] first4: 0.136488 1.748781 -0.214943 -0.038212 +[DiT] step 5/8 t=0.750 +[Debug] dit_step5_vt: [2170, 64] first4: 0.307848 0.903341 -0.319663 2.789687 +[Debug] dit_step5_xt: [2170, 64] first4: 0.092510 1.619732 -0.169276 -0.436738 +[DiT] step 6/8 t=0.643 +[Debug] dit_step6_vt: [2170, 64] first4: 0.196603 0.584326 -0.838176 2.772917 +[Debug] dit_step6_xt: [2170, 64] first4: 0.053189 1.502867 -0.001641 -0.991322 +[DiT] step 7/8 t=0.500 +[Debug] dit_step7_vt: [2170, 64] first4: 0.081321 0.135461 -1.397063 2.986206 +[Debug] dit_x0: [2170, 64] first4: 0.028793 1.462229 0.417478 -1.887184 +[DiT] step 8/8 t=0.300 +[DiT] Total generation: 273.2 ms (273.2 ms/sample) +[Debug] dit_output: [2170, 64] first4: 0.028793 1.462229 0.417478 -1.887184 +[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) +[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Upsample factor: 1920.00 (expected ~1920) +[VAE] Graph: 417 nodes, T_latent=256 +[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) +[VAE Batch0] Decode: 804.3 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000481 0.000872 0.000838 0.001216 +[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) +[Request 1/1] Done +[Pipeline] All done +2026-03-01 19:54:23.682 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. +2026-03-01 19:54:23.683 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora +2026-03-01 19:54:23.683 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. +2026-03-01 19:54:23.683 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. +2026-03-01 19:54:23.683 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. +2026-03-01 19:54:24.419 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +`torch_dtype` is deprecated! Use `dtype` instead! +2026-03-01 19:54:25.992 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... +2026-03-01 19:54:25.992 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... +2026-03-01 19:54:25.998 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-01 19:54:26.157 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-01 19:54:26.159 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +====================================================================== +2026-03-01 19:54:26.159 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-01 19:54:26.159 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-01 19:54:26.159 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +# Instruction +Generate audio semantic tokens based on the given conditions: + +# Caption +An upbeat and anthemic pop-rock track driven by bright, slightly overdriven + +# Metas +- bpm: 83 +- timesignature: 4 +- keyscale: G major +- duration: 88 seconds +<|endoftext|> + +2026-03-01 19:54:26.159 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-01 19:54:26.159 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +# Languages +fr + +# Lyric +# Lyric +[Intro - Guitar Riff] +[Verse 1] +Dans le monde des tutos virtuels +G ta toise en nouvelle passion +Avec Ggendoline et Pumbé à midi +La communauté, c'est l'unité +Quel joie, une clé + +[Chorus] +Dans le monde des tutos virtuels +Gândoline et Pumbé à midi +Une famille à connecter, c'est vrai +D'un enfant qui voit toi fusionner + +[Guitar Solo] + +[Verse 2] +Dans le monde des tutos virtuels +Gândoline, Pumbé à midi +Une famille à connecter, c'est vrai +D'un enfant qui voit toi fusionner<|endoftext|> +2026-03-01 19:54:26.159 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== + +2026-03-01 19:54:26.166 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-01 19:54:26.178 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-01 19:54:26.178 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-01 19:54:26.214 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-01 19:54:26.528 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... +2026-03-01 19:54:26.528 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-01 19:54:26.528 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.00680994987487793, 'diffusion_time_cost': 0.30716919898986816, 'diffusion_per_step_time_cost': 0.03839614987373352, 'total_time_cost': 0.3139791488647461, 'offload_time_cost': 0.0} +2026-03-01 19:54:26.543 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-01 19:54:26.545 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-01 19:54:26.545 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.44 GB +2026-03-01 19:54:26.545 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... +2026-03-01 19:54:26.545 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.44 GB +2026-03-01 19:54:26.545 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.44 GB +2026-03-01 19:54:26.545 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-01 19:54:26.821 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-01 19:54:26.824 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-01 19:54:26.828 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +[Request] Loaded request0.json +[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q6_K.gguf +[GGML] Running acestep-v15-turbo-Q6_K.gguf... +[GGML] Done, 47 dump files +[Python] Initializing acestep-v15-turbo... +[Python] Generating (acestep-v15-turbo, 8 steps)... +Using precomputed LM hints +Using precomputed LM hints +[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) +[Python] Done, 40 dump files +[Turbo] Cosine similarities GGML vs Python + stage GGML vs Python + text_hidden 0.999805 + lyric_embed 1.000000 + enc_hidden 0.999638 + detok_output 0.999962 + context 0.999976 + noise 1.000000 + temb_t 0.999990 + hidden_after_proj_in 0.999980 + enc_after_cond_emb 0.999648 + layer0_sa_output 0.999763 + hidden_after_layer0 0.999888 + hidden_after_layer6 0.999853 + hidden_after_layer12 0.998917 + hidden_after_layer18 0.995924 + hidden_after_layer23 0.992281 + dit_step0_vt 0.971207 + dit_step0_xt 0.999937 + dit_step1_vt 0.975354 + dit_step1_xt 0.999803 + dit_step2_vt 0.978312 + dit_step2_xt 0.999479 + dit_step3_vt 0.977879 + dit_step3_xt 0.998730 + dit_step4_vt 0.976291 + dit_step4_xt 0.997040 + dit_step5_vt 0.973193 + dit_step5_xt 0.993208 + dit_step6_vt 0.969738 + dit_step6_xt 0.985862 + dit_step7_vt 0.962454 + dit_x0 0.974866 + vae_audio 0.893678 + vae_audio (STFT cosine) 0.969663 +[Turbo] Error growth GGML vs Python + stage cos max_err mean_err mean_A std_A mean_B std_B + dit_step0_xt 0.999937 0.147590 0.007252 -0.002265 0.972930 -0.002342 0.972003 + dit_step1_xt 0.999803 0.291665 0.012432 -0.005192 0.942660 -0.005313 0.941730 + dit_step2_xt 0.999479 0.474224 0.019215 -0.009147 0.909068 -0.009311 0.908527 + dit_step3_xt 0.998730 0.730810 0.028734 -0.014438 0.873565 -0.014577 0.873624 + dit_step4_xt 0.997040 1.058607 0.042049 -0.021507 0.841532 -0.021660 0.841995 + dit_step5_xt 0.993208 1.534989 0.062024 -0.031604 0.824595 -0.032109 0.824593 + dit_step6_xt 0.985862 2.188862 0.092252 -0.045920 0.855268 -0.046482 0.855546 diff --git a/tests/CUDA-Q8_0.log b/tests/CUDA-Q8_0.log new file mode 100644 index 0000000..3a84ce1 --- /dev/null +++ b/tests/CUDA-Q8_0.log @@ -0,0 +1,259 @@ +ggml_cuda_init: found 1 CUDA devices: + Device 0: NVIDIA RTX PRO 6000 Blackwell Workstation Edition, compute capability 12.0, VMM: yes +[Load] DiT backend: CUDA0 (CPU threads: 16) +[Load] Backend init: 9.5 ms +[GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 +[DiT] Self-attn: Q+K+V fused +[DiT] Cross-attn: Q+K+V fused +[DiT] MLP: gate+up fused +[Load] null_condition_emb found (CFG available) +[WeightCtx] Loaded 478 tensors, 1600.7 MB into backend +[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 +[Load] DiT weight load: 221.9 ms +[GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 +[Load] silence_latent: [15000, 64] from GGUF +[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 +[Load] VAE backend: CUDA0 (CPU threads: 16) +[VAE] Backend: CUDA0, Weight buffer: 161.1 MB +[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations +[Load] VAE weights: 658.9 ms +[Request 1/1] ggml-turbo/request0.json (batch=1) +[Request] parsed ggml-turbo/request0.json (18 fields) +[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) +[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s +[Pipeline] 434 audio codes (86.8s @ 5Hz) +[Pipeline] T=2170, S=1085 +[BPE] Loaded from GGUF: 151643 vocab, 151387 merges +[Load] BPE tokenizer: 31.2 ms +[Pipeline] caption: 70 tokens, lyrics: 167 tokens +[Load] TextEncoder backend: CUDA0 (CPU threads: 16) +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[Load] TextEncoder: 28L, H=1024, Nh=16/8 +[Qwen3] Attn: Q+K+V fused +[Qwen3] MLP: gate+up fused +[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend +[Load] TextEncoder: 127.0 ms +[Encode] TextEncoder (70 tokens): 68.2 ms +[Debug] text_hidden: [70, 1024] first4: 3.652014 1.047935 0.228532 -12.907304 +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[Encode] Lyric vocab lookup (167 tokens): 12.3 ms +[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 +[Load] CondEncoder backend: CUDA0 (CPU threads: 16) +[GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 +[Load] LyricEncoder: 8L +[Qwen3] Attn: Q+K+V fused +[Qwen3] MLP: gate+up fused +[Load] TimbreEncoder: 4L +[Qwen3] Attn: Q+K+V fused +[Qwen3] MLP: gate+up fused +[WeightCtx] Loaded 140 tensors, 616.6 MB into backend +[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond +[Load] ConditionEncoder: 65.2 ms +[CondEnc] Lyric sliding mask: 167x167, window=128 +[CondEnc] Timbre sliding mask: 750x750, window=128 +[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens +[Encode] ConditionEncoder: 8.9 ms, enc_S=238 +[Debug] enc_hidden: [238, 2048] first4: 1.759220 -0.049559 -0.133467 0.058389 +[GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 +[WeightCtx] Loaded 30 tensors, 106.5 MB into backend +[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) +[Load] Detokenizer: 12.1 ms +[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) +[Context] Detokenizer: 104.8 ms +[Debug] detok_output: [2170, 64] first4: -0.120490 1.436288 0.301594 -0.632564 +[Context Batch0] Philox noise seed=42, [2170, 64] +[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 +[Debug] context: [2170, 128] first4: -0.120490 1.436288 0.301594 -0.632564 +[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1 +[DiT] Batch N=1, T=2170, S=1085, enc_S=238 +[DiT] Graph: 1841 nodes +[Debug] tproj: [12288] first4: 0.259485 -0.161550 -0.096885 0.051766 +[Debug] temb: [2048] first4: 0.000214 -0.132557 -0.035428 0.064847 +[Debug] temb_t: [2048] first4: 0.001194 0.026823 -0.052744 0.063762 +[Debug] temb_r: [2048] first4: -0.000980 -0.159380 0.017316 0.001084 +[Debug] sinusoidal_t: [256] first4: 0.562486 0.789701 0.439822 -0.023583 +[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000 +[Debug] temb_lin1_t: [2048] first4: -0.049228 -0.051913 -0.015026 -0.038076 +[Debug] temb_lin1_r: [2048] first4: -0.013066 -0.018835 -0.015731 0.008462 +[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.038152 -0.959088 0.538689 0.447583 +[Debug] proj_in_input: [192, 2170] first4: -0.120490 1.436288 0.301594 -0.632564 +[Debug] enc_after_cond_emb: [2048, 238] first4: -0.179956 0.813643 0.335613 -0.560954 +[Debug] layer0_sa_input: [2048, 1085] first4: -0.718369 -0.758056 -0.046880 0.261627 +[Debug] layer0_q_after_rope: [128, 16] first4: -1.602359 -0.824703 -0.282831 0.487491 +[Debug] layer0_k_after_rope: [128, 8] first4: -0.179956 0.813643 0.335613 -0.560954 +[Debug] layer0_sa_output: [2048, 1085] first4: -1.515045 0.163439 -0.354657 0.502281 +[Debug] layer0_attn_out: [2048, 1085] first4: -1.544158 -1.031644 0.192299 0.456963 +[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.544158 -1.031644 0.192299 0.456963 +[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.602359 -0.824703 -0.282831 0.487491 +[Debug] hidden_after_layer0: [2048, 1085] first4: -9.065077 0.563297 52.194237 -0.851381 +[Debug] hidden_after_layer6: [2048, 1085] first4: -21.390320 0.130250 33.949810 -4.149052 +[Debug] hidden_after_layer12: [2048, 1085] first4: -15.173199 -18.820404 72.616402 28.693943 +[Debug] hidden_after_layer18: [2048, 1085] first4: -25.768595 14.047658 61.759544 20.186539 +[Debug] hidden_after_layer23: [2048, 1085] first4: -4.011688 41.168625 196.180222 144.774246 +[Debug] dit_step0_vt: [2170, 64] first4: 0.018630 1.127245 0.345143 2.384104 +[Debug] dit_step0_xt: [2170, 64] first4: 0.193489 2.105012 -0.187563 0.739288 +[DiT] step 1/8 t=1.000 +[Debug] dit_step1_vt: [2170, 64] first4: -0.199466 1.323973 -0.114465 1.890695 +[Debug] dit_step1_xt: [2170, 64] first4: 0.204369 2.032795 -0.181320 0.636159 +[DiT] step 2/8 t=0.955 +[Debug] dit_step2_vt: [2170, 64] first4: -0.009733 1.241250 0.116473 2.389213 +[Debug] dit_step2_xt: [2170, 64] first4: 0.205018 1.950045 -0.189085 0.476878 +[DiT] step 3/8 t=0.900 +[Debug] dit_step3_vt: [2170, 64] first4: 0.246129 1.078655 0.270095 2.675214 +[Debug] dit_step3_xt: [2170, 64] first4: 0.184507 1.860157 -0.211593 0.253944 +[DiT] step 4/8 t=0.833 +[Debug] dit_step4_vt: [2170, 64] first4: 0.271080 1.036363 0.114070 2.726085 +[Debug] dit_step4_xt: [2170, 64] first4: 0.155463 1.749118 -0.223814 -0.038137 +[DiT] step 5/8 t=0.750 +[Debug] dit_step5_vt: [2170, 64] first4: 0.276045 0.944946 -0.294077 2.780135 +[Debug] dit_step5_xt: [2170, 64] first4: 0.116028 1.614126 -0.181803 -0.435299 +[DiT] step 6/8 t=0.643 +[Debug] dit_step6_vt: [2170, 64] first4: 0.156088 0.649257 -0.836919 2.794098 +[Debug] dit_step6_xt: [2170, 64] first4: 0.084810 1.484275 -0.014420 -0.994119 +[DiT] step 7/8 t=0.500 +[Debug] dit_step7_vt: [2170, 64] first4: -0.007394 0.229067 -1.488817 3.083439 +[Debug] dit_x0: [2170, 64] first4: 0.087028 1.415554 0.432225 -1.919150 +[DiT] step 8/8 t=0.300 +[DiT] Total generation: 242.9 ms (242.9 ms/sample) +[Debug] dit_output: [2170, 64] first4: 0.087028 1.415554 0.432225 -1.919150 +[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) +[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Upsample factor: 1920.00 (expected ~1920) +[VAE] Graph: 417 nodes, T_latent=256 +[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) +[VAE Batch0] Decode: 822.6 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000524 0.000859 0.000752 0.001056 +[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) +[Request 1/1] Done +[Pipeline] All done +2026-03-01 19:54:15.905 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. +2026-03-01 19:54:15.906 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora +2026-03-01 19:54:15.906 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. +2026-03-01 19:54:15.906 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. +2026-03-01 19:54:15.906 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. +2026-03-01 19:54:16.672 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +`torch_dtype` is deprecated! Use `dtype` instead! +2026-03-01 19:54:18.198 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... +2026-03-01 19:54:18.198 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... +2026-03-01 19:54:18.207 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-01 19:54:18.371 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-01 19:54:18.373 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +====================================================================== +2026-03-01 19:54:18.373 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-01 19:54:18.373 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-01 19:54:18.373 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +# Instruction +Generate audio semantic tokens based on the given conditions: + +# Caption +An upbeat and anthemic pop-rock track driven by bright, slightly overdriven + +# Metas +- bpm: 83 +- timesignature: 4 +- keyscale: G major +- duration: 88 seconds +<|endoftext|> + +2026-03-01 19:54:18.373 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-01 19:54:18.373 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +# Languages +fr + +# Lyric +# Lyric +[Intro - Guitar Riff] +[Verse 1] +Dans le monde des tutos virtuels +G ta toise en nouvelle passion +Avec Ggendoline et Pumbé à midi +La communauté, c'est l'unité +Quel joie, une clé + +[Chorus] +Dans le monde des tutos virtuels +Gândoline et Pumbé à midi +Une famille à connecter, c'est vrai +D'un enfant qui voit toi fusionner + +[Guitar Solo] + +[Verse 2] +Dans le monde des tutos virtuels +Gândoline, Pumbé à midi +Une famille à connecter, c'est vrai +D'un enfant qui voit toi fusionner<|endoftext|> +2026-03-01 19:54:18.373 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== + +2026-03-01 19:54:18.380 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-01 19:54:18.392 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-01 19:54:18.392 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-01 19:54:18.418 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-01 19:54:18.724 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... +2026-03-01 19:54:18.724 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-01 19:54:18.724 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006882190704345703, 'diffusion_time_cost': 0.298403263092041, 'diffusion_per_step_time_cost': 0.03730040788650513, 'total_time_cost': 0.3052854537963867, 'offload_time_cost': 0.0} +2026-03-01 19:54:18.739 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-01 19:54:18.741 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-01 19:54:18.741 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.44 GB +2026-03-01 19:54:18.741 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... +2026-03-01 19:54:18.741 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.44 GB +2026-03-01 19:54:18.741 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.44 GB +2026-03-01 19:54:18.741 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-01 19:54:19.031 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-01 19:54:19.034 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-01 19:54:19.037 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +[Request] Loaded request0.json +[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q8_0.gguf +[GGML] Running acestep-v15-turbo-Q8_0.gguf... +[GGML] Done, 47 dump files +[Python] Initializing acestep-v15-turbo... +[Python] Generating (acestep-v15-turbo, 8 steps)... +Using precomputed LM hints +Using precomputed LM hints +[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) +[Python] Done, 40 dump files +[Turbo] Cosine similarities GGML vs Python + stage GGML vs Python + text_hidden 0.999805 + lyric_embed 1.000000 + enc_hidden 0.999784 + detok_output 0.999983 + context 0.999990 + noise 1.000000 + temb_t 0.999997 + hidden_after_proj_in 0.999986 + enc_after_cond_emb 0.999765 + layer0_sa_output 0.999924 + hidden_after_layer0 0.999957 + hidden_after_layer6 0.999892 + hidden_after_layer12 0.999346 + hidden_after_layer18 0.996758 + hidden_after_layer23 0.993881 + dit_step0_vt 0.976421 + dit_step0_xt 0.999948 + dit_step1_vt 0.979128 + dit_step1_xt 0.999834 + dit_step2_vt 0.982059 + dit_step2_xt 0.999561 + dit_step3_vt 0.983029 + dit_step3_xt 0.998948 + dit_step4_vt 0.981353 + dit_step4_xt 0.997565 + dit_step5_vt 0.978860 + dit_step5_xt 0.994480 + dit_step6_vt 0.976051 + dit_step6_xt 0.988641 + dit_step7_vt 0.970144 + dit_x0 0.979969 + vae_audio 0.905525 + vae_audio (STFT cosine) 0.976530 +[Turbo] Error growth GGML vs Python + stage cos max_err mean_err mean_A std_A mean_B std_B + dit_step0_xt 0.999948 0.134961 0.006551 -0.002307 0.972901 -0.002342 0.972003 + dit_step1_xt 0.999834 0.262688 0.011280 -0.005306 0.942604 -0.005313 0.941730 + dit_step2_xt 0.999561 0.448301 0.017428 -0.009351 0.909110 -0.009311 0.908527 + dit_step3_xt 0.998948 0.617858 0.025766 -0.014708 0.873709 -0.014577 0.873624 + dit_step4_xt 0.997565 0.740504 0.037507 -0.021763 0.841873 -0.021660 0.841995 + dit_step5_xt 0.994480 1.211945 0.054863 -0.031844 0.825164 -0.032109 0.824593 + dit_step6_xt 0.988641 2.056566 0.081142 -0.046105 0.856063 -0.046482 0.855546 diff --git a/tests/Metal-Q4_K_M.log b/tests/Metal-Q4_K_M.log new file mode 100644 index 0000000..e1ad24f --- /dev/null +++ b/tests/Metal-Q4_K_M.log @@ -0,0 +1,835 @@ +ggml_metal_device_init: tensor API disabled for pre-M5 and pre-A19 devices +ggml_metal_library_init: using embedded metal library +ggml_metal_library_init: loaded in 0.006 sec +ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s) +ggml_metal_device_init: GPU name: MTL0 +ggml_metal_device_init: GPU family: MTLGPUFamilyApple8 (1008) +ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003) +ggml_metal_device_init: GPU family: MTLGPUFamilyMetal3 (5001) +ggml_metal_device_init: simdgroup reduction = true +ggml_metal_device_init: simdgroup matrix mul. = true +ggml_metal_device_init: has unified memory = true +ggml_metal_device_init: has bfloat = true +ggml_metal_device_init: has tensor = false +ggml_metal_device_init: use residency sets = true +ggml_metal_device_init: use shared buffers = true +ggml_metal_device_init: recommendedMaxWorkingSetSize = 11453.25 MB +ggml_metal_init: allocating +ggml_metal_init: found device: Apple M2 Pro +ggml_metal_init: picking default device: Apple M2 Pro +ggml_metal_init: use fusion = true +ggml_metal_init: use concurrency = true +ggml_metal_init: use graph optimize = true +[Load] DiT backend: MTL0 (CPU threads: 5) +[Load] Backend init: 20.9 ms +[GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 +[DiT] Self-attn: Q+K fused, V separate +[DiT] Cross-attn: all separate +[DiT] MLP: gate+up fused +[Load] null_condition_emb found (CFG available) +[WeightCtx] Loaded 478 tensors, 895.6 MB into backend +[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 +[Load] DiT weight load: 1421.5 ms +[GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 +[Load] silence_latent: [15000, 64] from GGUF +[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 +ggml_metal_init: allocating +ggml_metal_init: found device: Apple M2 Pro +ggml_metal_init: picking default device: Apple M2 Pro +ggml_metal_init: use fusion = true +ggml_metal_init: use concurrency = true +ggml_metal_init: use graph optimize = true +[Load] VAE backend: MTL0 (CPU threads: 5) +[VAE] Backend: MTL0, Weight buffer: 255.7 MB +[VAE] Loaded: 5 blocks, upsample=1920x +[Load] VAE weights: 337.8 ms +[Request 1/1] ggml-turbo/request0.json (batch=1) +[Request] parsed ggml-turbo/request0.json (18 fields) +[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) +[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s +[Pipeline] 434 audio codes (86.8s @ 5Hz) +[Pipeline] T=2170, S=1085 +[BPE] Loaded from GGUF: 151643 vocab, 151387 merges +[Load] BPE tokenizer: 42.3 ms +[Pipeline] caption: 70 tokens, lyrics: 167 tokens +ggml_metal_init: allocating +ggml_metal_init: found device: Apple M2 Pro +ggml_metal_init: picking default device: Apple M2 Pro +ggml_metal_init: use fusion = true +ggml_metal_init: use concurrency = true +ggml_metal_init: use graph optimize = true +[Load] TextEncoder backend: MTL0 (CPU threads: 5) +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend +[Load] TextEncoder: 28L, H=1024, Nh=16/8 +[Load] TextEncoder: 593.9 ms +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_get_rows_bf16', name = 'kernel_get_rows_bf16' +ggml_metal_library_compile_pipeline: loaded kernel_get_rows_bf16 0x11de0dee0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rms_norm_mul_f32_4', name = 'kernel_rms_norm_mul_f32_4' +ggml_metal_library_compile_pipeline: loaded kernel_rms_norm_mul_f32_4 0x11de0e340 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_bf16_f32', name = 'kernel_mul_mm_bf16_f32_bci=0_bco=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_bf16_f32_bci=0_bco=1 0x11de0ebb0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rope_neox_f32', name = 'kernel_rope_neox_f32_imrope=0' +ggml_metal_library_compile_pipeline: loaded kernel_rope_neox_f32_imrope=0 0x11de0f030 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=1_ncpsg=64' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=1_ncpsg=64 0x11de0f8a0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_blk', name = 'kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64 0x11de0fed0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4 0x11de107b0 | th_max = 576 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0 0x11de11170 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_swiglu_f32', name = 'kernel_swiglu_f32' +ggml_metal_library_compile_pipeline: loaded kernel_swiglu_f32 0x11de10350 | th_max = 1024 | th_width = 32 +[Encode] TextEncoder (70 tokens): 44.4 ms +[Debug] text_hidden: [70, 1024] first4: 3.703506 2.446818 0.222721 -13.133463 +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[Encode] Lyric vocab lookup (167 tokens): 33.8 ms +[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 +ggml_metal_init: allocating +ggml_metal_init: found device: Apple M2 Pro +ggml_metal_init: picking default device: Apple M2 Pro +ggml_metal_init: use fusion = true +ggml_metal_init: use concurrency = true +ggml_metal_init: use graph optimize = true +[Load] CondEncoder backend: MTL0 (CPU threads: 5) +[GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 +[WeightCtx] Loaded 140 tensors, 352.5 MB into backend +[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond +[Load] ConditionEncoder: 543.9 ms +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q6_K_f32', name = 'kernel_mul_mm_q6_K_f32_bci=0_bco=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q6_K_f32_bci=0_bco=1 0x11de1b4b0 | th_max = 896 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q4_K_f32', name = 'kernel_mul_mm_q4_K_f32_bci=0_bco=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q4_K_f32_bci=0_bco=1 0x11de1ba60 | th_max = 896 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=64' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=64 0x11de1bea0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4 0x11de1c500 | th_max = 640 | th_width = 32 +[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens +[Encode] ConditionEncoder: 149.3 ms, enc_S=238 +ggml_metal_free: deallocating +ggml_metal_free: deallocating +[Debug] enc_hidden: [238, 2048] first4: 1.751263 -0.045978 -0.129705 0.058765 +[GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 +[WeightCtx] Loaded 30 tensors, 64.7 MB into backend +[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) +[Load] Detokenizer: 113.4 ms +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_bf16_f32_short', name = 'kernel_mul_mv_bf16_f32_short_nsg=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_bf16_f32_short_nsg=1 0x11de10d70 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1 0x11de0aff0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q6_K_f32', name = 'kernel_mul_mv_q6_K_f32_nsg=2' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q6_K_f32_nsg=2 0x11de0b950 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_repeat_f32', name = 'kernel_repeat_f32' +ggml_metal_library_compile_pipeline: loaded kernel_repeat_f32 0x11de1c9a0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q4_K_f32_r1_5', name = 'kernel_mul_mv_ext_q4_K_f32_r1_5_nsg=2_nxpsg=8' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q4_K_f32_r1_5_nsg=2_nxpsg=8 0x11de1d9f0 | th_max = 640 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q6_K_f32_r1_5', name = 'kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8 0x11de1dfa0 | th_max = 640 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=32' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=32 0x11de1e320 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_f32_dk128_dv128', name = 'kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32 0x11de1e580 | th_max = 448 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_reduce', name = 'kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32 0x11de1ef20 | th_max = 1024 | th_width = 32 +[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) +[Context] Detokenizer: 1044.0 ms +[Debug] detok_output: [2170, 64] first4: -0.105288 1.440285 0.304742 -0.636920 +[Context] loaded noise from rng_philox_seed42.bf16: [2170, 64] bf16 +[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 +[Debug] context: [2170, 128] first4: -0.105288 1.440285 0.304742 -0.636920 +[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1 +[DiT] Batch N=1, T=2170, S=1085, enc_S=238 +[DiT] Graph: 1775 nodes +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f32_f32', name = 'kernel_mul_mm_f32_f32_bci=0_bco=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f32_f32_bci=0_bco=1 0x11f008d70 | th_max = 832 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_scale_f32', name = 'kernel_scale_f32' +ggml_metal_library_compile_pipeline: loaded kernel_scale_f32 0x11f009830 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_timestep_embedding_f32', name = 'kernel_timestep_embedding_f32' +ggml_metal_library_compile_pipeline: loaded kernel_timestep_embedding_f32 0x11f009c40 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q4_K_f32', name = 'kernel_mul_mv_q4_K_f32_nsg=2' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q4_K_f32_nsg=2 0x11f00ac80 | th_max = 768 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_silu_f32_4', name = 'kernel_silu_f32_4' +ggml_metal_library_compile_pipeline: loaded kernel_silu_f32_4 0x11f00b000 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1 0x11f00b6c0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1 0x11f00b920 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0 0x11f00bec0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_cpy_f32_f32', name = 'kernel_cpy_f32_f32' +ggml_metal_library_compile_pipeline: loaded kernel_cpy_f32_f32 0x11f00c450 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4 0x11f00cd60 | th_max = 640 | th_width = 32 +[Debug] tproj: [12288] first4: 0.260912 -0.160417 -0.090199 0.048634 +[Debug] temb: [2048] first4: 0.000215 -0.133911 -0.034469 0.065007 +[Debug] temb_t: [2048] first4: 0.000971 0.025677 -0.052124 0.063327 +[Debug] temb_r: [2048] first4: -0.000756 -0.159588 0.017655 0.001680 +[Debug] sinusoidal_t: [256] first4: 0.562407 0.789701 0.439822 -0.023583 +[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000 +[Debug] temb_lin1_t: [2048] first4: -0.049286 -0.053324 -0.012254 -0.047666 +[Debug] temb_lin1_r: [2048] first4: -0.015463 -0.031534 -0.021259 0.006135 +[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.050396 -0.992003 0.526498 0.458000 +[Debug] proj_in_input: [192, 2170] first4: -0.105288 1.440285 0.304742 -0.636920 +[Debug] enc_after_cond_emb: [2048, 238] first4: -0.174268 0.781178 0.275122 -0.515942 +[Debug] layer0_sa_input: [2048, 1085] first4: -0.726228 -0.772737 -0.041859 0.262417 +[Debug] layer0_q_after_rope: [128, 16] first4: -12.136272 0.820533 1.509364 1.799582 +[Debug] layer0_k_after_rope: [128, 8] first4: -0.174268 0.781178 0.275122 -0.515942 +[Debug] layer0_sa_output: [2048, 1085] first4: -1.599759 0.160940 -0.480259 0.455996 +[Debug] layer0_attn_out: [2048, 1085] first4: -12.315464 1.144032 1.760677 1.796125 +[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.579560 -1.062863 0.061853 0.466855 +[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.703488 -0.838320 -0.450424 0.503514 +[Debug] hidden_after_layer0: [2048, 1085] first4: -8.870923 0.423529 48.381233 -0.778579 +[Debug] hidden_after_layer6: [2048, 1085] first4: -21.397562 -1.526012 29.991730 -3.928804 +[Debug] hidden_after_layer12: [2048, 1085] first4: -17.419617 -13.309786 66.317848 28.914410 +[Debug] hidden_after_layer18: [2048, 1085] first4: -16.562674 9.657765 55.222641 17.661957 +[Debug] hidden_after_layer23: [2048, 1085] first4: -19.112629 7.039753 181.464966 133.927719 +[Debug] dit_step0_vt: [2170, 64] first4: -0.112419 1.107940 0.244994 2.200569 +[Debug] dit_step0_xt: [2170, 64] first4: 0.199446 2.105889 -0.183011 0.747630 +[DiT] step 1/8 t=1.000 +[Debug] dit_step1_vt: [2170, 64] first4: -0.082195 1.204432 -0.273788 1.824850 +[Debug] dit_step1_xt: [2170, 64] first4: 0.203929 2.040193 -0.168077 0.648093 +[DiT] step 2/8 t=0.955 +[Debug] dit_step2_vt: [2170, 64] first4: -0.043690 1.209513 0.074423 2.191977 +[Debug] dit_step2_xt: [2170, 64] first4: 0.206842 1.959559 -0.173039 0.501961 +[DiT] step 3/8 t=0.900 +[Debug] dit_step3_vt: [2170, 64] first4: 0.238132 1.171738 0.272480 2.506455 +[Debug] dit_step3_xt: [2170, 64] first4: 0.186998 1.861914 -0.195745 0.293090 +[DiT] step 4/8 t=0.833 +[Debug] dit_step4_vt: [2170, 64] first4: 0.293275 1.147521 0.096848 2.639339 +[Debug] dit_step4_xt: [2170, 64] first4: 0.155575 1.738965 -0.206122 0.010304 +[DiT] step 5/8 t=0.750 +[Debug] dit_step5_vt: [2170, 64] first4: 0.200179 1.089847 -0.403776 2.739777 +[Debug] dit_step5_xt: [2170, 64] first4: 0.126978 1.583273 -0.148440 -0.381093 +[DiT] step 6/8 t=0.643 +[Debug] dit_step6_vt: [2170, 64] first4: -0.078240 0.999644 -1.058107 2.768797 +[Debug] dit_step6_xt: [2170, 64] first4: 0.142626 1.383344 0.063182 -0.934852 +[DiT] step 7/8 t=0.500 +[Debug] dit_step7_vt: [2170, 64] first4: -0.417903 0.862772 -1.662739 3.246292 +[Debug] dit_x0: [2170, 64] first4: 0.267997 1.124512 0.562003 -1.908740 +[DiT] step 8/8 t=0.300 +[DiT] Total generation: 7809.5 ms (7809.5 ms/sample) +[Debug] dit_output: [2170, 64] first4: 0.267997 1.124512 0.562003 -1.908740 +[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) +[VAE] Graph: 474 nodes, T_latent=192 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_im2col_f16', name = 'kernel_im2col_f16' +ggml_metal_library_compile_pipeline: loaded kernel_im2col_f16 0x11de1ab80 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=0' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=0 0x11de1ceb0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0 0x11de1f410 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0 0x11de1f670 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sin_f32_4', name = 'kernel_sin_f32_4' +ggml_metal_library_compile_pipeline: loaded kernel_sin_f32_4 0x11de1fa20 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sqr_f32_4', name = 'kernel_sqr_f32_4' +ggml_metal_library_compile_pipeline: loaded kernel_sqr_f32_4 0x11de20200 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_conv_transpose_1d_f32_f32', name = 'kernel_conv_transpose_1d_f32_f32' +ggml_metal_library_compile_pipeline: loaded kernel_conv_transpose_1d_f32_f32 0x11de20760 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_f16_f16_4', name = 'kernel_mul_mv_f16_f16_4_nsg=4' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_f16_f16_4_nsg=4 0x11de216c0 | th_max = 1024 | th_width = 32 +[VAE] Upsample factor: 1920.00 (expected ~1920) +[VAE] Graph: 474 nodes, T_latent=256 +[VAE] Graph: 474 nodes, T_latent=186 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=1 0x11de21920 | th_max = 896 | th_width = 32 +[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) +[VAE Batch0] Decode: 609663.4 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000307 0.000830 0.000664 0.001050 +[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) +[Request 1/1] Done +ggml_metal_free: deallocating +ggml_metal_free: deallocating +[Pipeline] All done +ggml_metal_device_init: tensor API disabled for pre-M5 and pre-A19 devices +ggml_metal_library_init: using embedded metal library +ggml_metal_library_init: loaded in 0.006 sec +ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s) +ggml_metal_device_init: GPU name: MTL0 +ggml_metal_device_init: GPU family: MTLGPUFamilyApple8 (1008) +ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003) +ggml_metal_device_init: GPU family: MTLGPUFamilyMetal3 (5001) +ggml_metal_device_init: simdgroup reduction = true +ggml_metal_device_init: simdgroup matrix mul. = true +ggml_metal_device_init: has unified memory = true +ggml_metal_device_init: has bfloat = true +ggml_metal_device_init: has tensor = false +ggml_metal_device_init: use residency sets = true +ggml_metal_device_init: use shared buffers = true +ggml_metal_device_init: recommendedMaxWorkingSetSize = 11453.25 MB +ggml_metal_init: allocating +ggml_metal_init: found device: Apple M2 Pro +ggml_metal_init: picking default device: Apple M2 Pro +ggml_metal_init: use fusion = true +ggml_metal_init: use concurrency = true +ggml_metal_init: use graph optimize = true +[Load] DiT backend: MTL0 (CPU threads: 5) +[Load] Backend init: 18.8 ms +[GGUF] ../models/acestep-v15-sft-Q4_K_M.gguf: 678 tensors, data at offset 56800 +[DiT] Self-attn: Q+K fused, V separate +[DiT] Cross-attn: all separate +[DiT] MLP: gate+up fused +[Load] null_condition_emb found (CFG available) +[WeightCtx] Loaded 478 tensors, 895.6 MB into backend +[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 +[Load] DiT weight load: 1269.3 ms +[GGUF] ../models/acestep-v15-sft-Q4_K_M.gguf: 678 tensors, data at offset 56800 +[Load] silence_latent: [15000, 64] from GGUF +[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 +ggml_metal_init: allocating +ggml_metal_init: found device: Apple M2 Pro +ggml_metal_init: picking default device: Apple M2 Pro +ggml_metal_init: use fusion = true +ggml_metal_init: use concurrency = true +ggml_metal_init: use graph optimize = true +[Load] VAE backend: MTL0 (CPU threads: 5) +[VAE] Backend: MTL0, Weight buffer: 255.7 MB +[VAE] Loaded: 5 blocks, upsample=1920x +[Load] VAE weights: 272.1 ms +[Request 1/1] ggml-sft/request0.json (batch=1) +[Request] parsed ggml-sft/request0.json (18 fields) +[Pipeline] seed=42, steps=50, guidance=7.0, shift=1.0, duration=88.0s +[Pipeline] 434 audio codes (86.8s @ 5Hz) +[Pipeline] T=2170, S=1085 +[BPE] Loaded from GGUF: 151643 vocab, 151387 merges +[Load] BPE tokenizer: 41.8 ms +[Pipeline] caption: 70 tokens, lyrics: 167 tokens +ggml_metal_init: allocating +ggml_metal_init: found device: Apple M2 Pro +ggml_metal_init: picking default device: Apple M2 Pro +ggml_metal_init: use fusion = true +ggml_metal_init: use concurrency = true +ggml_metal_init: use graph optimize = true +[Load] TextEncoder backend: MTL0 (CPU threads: 5) +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend +[Load] TextEncoder: 28L, H=1024, Nh=16/8 +[Load] TextEncoder: 231.9 ms +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_get_rows_bf16', name = 'kernel_get_rows_bf16' +ggml_metal_library_compile_pipeline: loaded kernel_get_rows_bf16 0x15570a490 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rms_norm_mul_f32_4', name = 'kernel_rms_norm_mul_f32_4' +ggml_metal_library_compile_pipeline: loaded kernel_rms_norm_mul_f32_4 0x15570a8f0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_bf16_f32', name = 'kernel_mul_mm_bf16_f32_bci=0_bco=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_bf16_f32_bci=0_bco=1 0x15570b160 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rope_neox_f32', name = 'kernel_rope_neox_f32_imrope=0' +ggml_metal_library_compile_pipeline: loaded kernel_rope_neox_f32_imrope=0 0x15570b5e0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=1_ncpsg=64' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=1_ncpsg=64 0x15570be50 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_blk', name = 'kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64 0x15570c480 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4 0x15570cd60 | th_max = 576 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0 0x15570d170 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_swiglu_f32', name = 'kernel_swiglu_f32' +ggml_metal_library_compile_pipeline: loaded kernel_swiglu_f32 0x15570d3d0 | th_max = 1024 | th_width = 32 +[Encode] TextEncoder (70 tokens): 48.9 ms +[Debug] text_hidden: [70, 1024] first4: 3.703506 2.446818 0.222721 -13.133463 +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[Encode] Lyric vocab lookup (167 tokens): 33.9 ms +[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 +ggml_metal_init: allocating +ggml_metal_init: found device: Apple M2 Pro +ggml_metal_init: picking default device: Apple M2 Pro +ggml_metal_init: use fusion = true +ggml_metal_init: use concurrency = true +ggml_metal_init: use graph optimize = true +[Load] CondEncoder backend: MTL0 (CPU threads: 5) +[GGUF] ../models/acestep-v15-sft-Q4_K_M.gguf: 678 tensors, data at offset 56800 +[WeightCtx] Loaded 140 tensors, 352.5 MB into backend +[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond +[Load] ConditionEncoder: 601.2 ms +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q6_K_f32', name = 'kernel_mul_mm_q6_K_f32_bci=0_bco=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q6_K_f32_bci=0_bco=1 0x155717100 | th_max = 896 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q4_K_f32', name = 'kernel_mul_mm_q4_K_f32_bci=0_bco=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q4_K_f32_bci=0_bco=1 0x1557176b0 | th_max = 896 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=64' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=64 0x155717a30 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4 0x155718090 | th_max = 640 | th_width = 32 +[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens +[Encode] ConditionEncoder: 151.9 ms, enc_S=238 +ggml_metal_free: deallocating +ggml_metal_free: deallocating +[Debug] enc_hidden: [238, 2048] first4: 1.751314 -0.046022 -0.129862 0.058756 +[GGUF] ../models/acestep-v15-sft-Q4_K_M.gguf: 678 tensors, data at offset 56800 +[WeightCtx] Loaded 30 tensors, 64.7 MB into backend +[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) +[Load] Detokenizer: 101.7 ms +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_bf16_f32_short', name = 'kernel_mul_mv_bf16_f32_short_nsg=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_bf16_f32_short_nsg=1 0x15570ebf0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1 0x155707790 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q6_K_f32', name = 'kernel_mul_mv_q6_K_f32_nsg=2' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q6_K_f32_nsg=2 0x155707dc0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_repeat_f32', name = 'kernel_repeat_f32' +ggml_metal_library_compile_pipeline: loaded kernel_repeat_f32 0x1557074e0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q4_K_f32_r1_5', name = 'kernel_mul_mv_ext_q4_K_f32_r1_5_nsg=2_nxpsg=8' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q4_K_f32_r1_5_nsg=2_nxpsg=8 0x1557192f0 | th_max = 640 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q6_K_f32_r1_5', name = 'kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8 0x1557198a0 | th_max = 640 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=32' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=32 0x155719c20 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_f32_dk128_dv128', name = 'kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32 0x155719e80 | th_max = 448 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_reduce', name = 'kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32 0x15571a8c0 | th_max = 1024 | th_width = 32 +[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) +[Context] Detokenizer: 1040.2 ms +[Debug] detok_output: [2170, 64] first4: -0.105274 1.439665 0.307319 -0.637002 +[Context] loaded noise from rng_philox_seed42.bf16: [2170, 64] bf16 +[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 +[Debug] context: [2170, 128] first4: -0.105274 1.439665 0.307319 -0.637002 +[DiT] Starting: T=2170, S=1085, enc_S=238, steps=50, batch=1 +[DiT] Batch N=1, T=2170, S=1085, enc_S=238 +[DiT] Graph: 1775 nodes +[Debug] null_condition_emb: [2048] first4: 0.018066 -0.000360 0.005096 -0.000683 +[Debug] null_enc_hidden: [238, 2048] first4: 0.018066 -0.000360 0.005096 -0.000683 +[DiT] CFG enabled: guidance_scale=7.0, 2x forward per step, N=1 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f32_f32', name = 'kernel_mul_mm_f32_f32_bci=0_bco=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f32_f32_bci=0_bco=1 0x15560cd80 | th_max = 832 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_scale_f32', name = 'kernel_scale_f32' +ggml_metal_library_compile_pipeline: loaded kernel_scale_f32 0x15560d720 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_timestep_embedding_f32', name = 'kernel_timestep_embedding_f32' +ggml_metal_library_compile_pipeline: loaded kernel_timestep_embedding_f32 0x15560db30 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q4_K_f32', name = 'kernel_mul_mv_q4_K_f32_nsg=2' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q4_K_f32_nsg=2 0x15560eb70 | th_max = 768 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_silu_f32_4', name = 'kernel_silu_f32_4' +ggml_metal_library_compile_pipeline: loaded kernel_silu_f32_4 0x15560eef0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1 0x15560f5b0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1 0x15560f810 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0 0x15560fdb0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_cpy_f32_f32', name = 'kernel_cpy_f32_f32' +ggml_metal_library_compile_pipeline: loaded kernel_cpy_f32_f32 0x155610340 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4 0x155610d60 | th_max = 640 | th_width = 32 +[Debug] tproj: [12288] first4: 0.154826 -0.114975 -0.093002 0.082122 +[Debug] temb: [2048] first4: -0.003593 -0.176168 0.003892 -0.001352 +[Debug] temb_t: [2048] first4: -0.002002 0.003482 -0.013423 -0.001611 +[Debug] temb_r: [2048] first4: -0.001591 -0.179650 0.017315 0.000259 +[Debug] sinusoidal_t: [256] first4: 0.562407 0.789701 0.439822 -0.023583 +[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000 +[Debug] temb_lin1_t: [2048] first4: -0.026166 0.013606 0.032789 -0.028782 +[Debug] temb_lin1_r: [2048] first4: -0.001795 -0.011535 -0.006725 -0.011136 +[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.101326 -0.948224 0.490204 0.449757 +[Debug] proj_in_input: [192, 2170] first4: -0.105274 1.439665 0.307319 -0.637002 +[Debug] enc_after_cond_emb: [2048, 238] first4: -0.189214 0.805478 0.284418 -0.472295 +[Debug] layer0_sa_input: [2048, 1085] first4: -0.923880 -0.725952 -0.044805 0.297821 +[Debug] layer0_q_after_rope: [128, 16] first4: -12.125128 0.516320 1.460617 1.783048 +[Debug] layer0_k_after_rope: [128, 8] first4: -0.189214 0.805478 0.284418 -0.472295 +[Debug] layer0_sa_output: [2048, 1085] first4: -1.642741 0.751692 -0.708185 0.515940 +[Debug] layer0_attn_out: [2048, 1085] first4: -11.610563 1.032188 1.685498 1.814675 +[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.803507 -1.373816 -0.306776 0.394307 +[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.998315 -1.012332 -0.558752 0.397301 +[Debug] hidden_after_layer0: [2048, 1085] first4: -9.408201 1.261657 57.661659 -1.674409 +[Debug] hidden_after_layer6: [2048, 1085] first4: -13.125732 4.401457 57.923130 1.593087 +[Debug] hidden_after_layer12: [2048, 1085] first4: -12.760151 8.784775 -27.576780 1.266083 +[Debug] hidden_after_layer18: [2048, 1085] first4: -3.029438 18.924910 -37.522003 -17.408060 +[Debug] hidden_after_layer23: [2048, 1085] first4: 25.718348 50.253456 58.487469 -24.616550 +[Debug] dit_step0_vt_cond: [2170, 64] first4: -0.549879 2.587143 -0.199758 1.525680 +[Debug] dit_step0_vt_uncond: [2170, 64] first4: -0.221552 2.068977 -0.854510 1.731250 +[Debug] dit_step0_vt: [2170, 64] first4: -0.770128 3.170936 0.103367 1.213956 +[Debug] dit_step0_xt: [2170, 64] first4: 0.209738 2.092831 -0.173942 0.823377 +[DiT] step 1/50 t=1.000 +[Debug] dit_step1_vt_cond: [2170, 64] first4: -0.553963 2.540515 -0.004453 1.412831 +[Debug] dit_step1_vt_uncond: [2170, 64] first4: -0.507386 2.385438 -0.093360 1.515296 +[Debug] dit_step1_vt: [2170, 64] first4: -0.244245 1.996188 -0.241419 1.557151 +[Debug] dit_step1_xt: [2170, 64] first4: 0.214623 2.052907 -0.169114 0.792234 +[DiT] step 2/50 t=0.980 +[Debug] dit_step2_vt_cond: [2170, 64] first4: -0.537810 2.506870 -0.002615 1.406658 +[Debug] dit_step2_vt_uncond: [2170, 64] first4: -0.493937 2.362162 -0.101829 1.455003 +[Debug] dit_step2_vt: [2170, 64] first4: -0.705231 2.991064 0.252674 1.183649 +[Debug] dit_step2_xt: [2170, 64] first4: 0.228728 1.993086 -0.174167 0.768561 +[DiT] step 3/50 t=0.960 +[Debug] dit_step3_vt_cond: [2170, 64] first4: -0.501613 2.438805 -0.019274 1.410215 +[Debug] dit_step3_vt_uncond: [2170, 64] first4: -0.521661 2.364079 -0.095044 1.376828 +[Debug] dit_step3_vt: [2170, 64] first4: -0.201260 2.055526 -0.239553 1.689172 +[Debug] dit_step3_xt: [2170, 64] first4: 0.232753 1.951976 -0.169376 0.734778 +[DiT] step 4/50 t=0.940 +[Debug] dit_step4_vt_cond: [2170, 64] first4: -0.465795 2.359768 -0.032364 1.399407 +[Debug] dit_step4_vt_uncond: [2170, 64] first4: -0.532122 2.334485 -0.099644 1.341739 +[Debug] dit_step4_vt: [2170, 64] first4: -0.511269 2.624130 0.214392 1.268924 +[Debug] dit_step4_xt: [2170, 64] first4: 0.242979 1.899493 -0.173664 0.709399 +[DiT] step 5/50 t=0.920 +[Debug] dit_step5_vt_cond: [2170, 64] first4: -0.416940 2.273875 -0.055556 1.387350 +[Debug] dit_step5_vt_uncond: [2170, 64] first4: -0.572103 2.299005 -0.092359 1.353066 +[Debug] dit_step5_vt: [2170, 64] first4: 0.057514 1.863401 -0.254107 1.537004 +[Debug] dit_step5_xt: [2170, 64] first4: 0.241828 1.862225 -0.168582 0.678659 +[DiT] step 6/50 t=0.900 +[Debug] dit_step6_vt_cond: [2170, 64] first4: -0.399254 2.210152 -0.071076 1.369134 +[Debug] dit_step6_vt_uncond: [2170, 64] first4: -0.539425 2.227666 -0.114236 1.361075 +[Debug] dit_step6_vt: [2170, 64] first4: -0.380751 2.356979 0.167812 1.213706 +[Debug] dit_step6_xt: [2170, 64] first4: 0.249443 1.815086 -0.171938 0.654385 +[DiT] step 7/50 t=0.880 +[Debug] dit_step7_vt_cond: [2170, 64] first4: -0.339429 2.118412 -0.091855 1.350106 +[Debug] dit_step7_vt_uncond: [2170, 64] first4: -0.475619 2.122700 -0.120878 1.360558 +[Debug] dit_step7_vt: [2170, 64] first4: 0.052995 1.858614 -0.256165 1.379718 +[Debug] dit_step7_xt: [2170, 64] first4: 0.248383 1.777913 -0.166815 0.626791 +[DiT] step 8/50 t=0.860 +[Debug] dit_step8_vt_cond: [2170, 64] first4: -0.274483 2.023758 -0.093816 1.332238 +[Debug] dit_step8_vt_uncond: [2170, 64] first4: -0.393477 2.023790 -0.130114 1.332444 +[Debug] dit_step8_vt: [2170, 64] first4: -0.218486 2.105614 0.134615 1.230365 +[Debug] dit_step8_xt: [2170, 64] first4: 0.252753 1.735801 -0.169507 0.602183 +[DiT] step 9/50 t=0.840 +[Debug] dit_step9_vt_cond: [2170, 64] first4: -0.208702 1.940720 -0.100297 1.317338 +[Debug] dit_step9_vt_uncond: [2170, 64] first4: -0.302713 1.942014 -0.150935 1.306566 +[Debug] dit_step9_vt: [2170, 64] first4: 0.068625 1.756381 -0.163156 1.360642 +[Debug] dit_step9_xt: [2170, 64] first4: 0.251381 1.700673 -0.166244 0.574971 +[DiT] step 10/50 t=0.820 +[Debug] dit_step10_vt_cond: [2170, 64] first4: -0.162154 1.880021 -0.110640 1.303073 +[Debug] dit_step10_vt_uncond: [2170, 64] first4: -0.210358 1.886162 -0.152094 1.303815 +[Debug] dit_step10_vt: [2170, 64] first4: -0.200484 1.879984 0.061434 1.187651 +[Debug] dit_step10_xt: [2170, 64] first4: 0.255390 1.663074 -0.167473 0.551217 +[DiT] step 11/50 t=0.800 +[Debug] dit_step11_vt_cond: [2170, 64] first4: -0.132763 1.841353 -0.152935 1.280443 +[Debug] dit_step11_vt_uncond: [2170, 64] first4: -0.156466 1.839952 -0.166283 1.309973 +[Debug] dit_step11_vt: [2170, 64] first4: -0.006319 1.715424 -0.248815 1.180641 +[Debug] dit_step11_xt: [2170, 64] first4: 0.255517 1.628765 -0.162497 0.527605 +[DiT] step 12/50 t=0.780 +[Debug] dit_step12_vt_cond: [2170, 64] first4: -0.108732 1.804132 -0.204569 1.271017 +[Debug] dit_step12_vt_uncond: [2170, 64] first4: -0.137749 1.799717 -0.174060 1.349185 +[Debug] dit_step12_vt: [2170, 64] first4: -0.093850 1.775385 -0.218540 0.972914 +[Debug] dit_step12_xt: [2170, 64] first4: 0.257394 1.593257 -0.158126 0.508146 +[DiT] step 13/50 t=0.760 +[Debug] dit_step13_vt_cond: [2170, 64] first4: -0.084325 1.755919 -0.251734 1.253830 +[Debug] dit_step13_vt_uncond: [2170, 64] first4: -0.116151 1.744928 -0.223829 1.345488 +[Debug] dit_step13_vt: [2170, 64] first4: 0.034148 1.681178 -0.334965 1.042164 +[Debug] dit_step13_xt: [2170, 64] first4: 0.256711 1.559634 -0.151426 0.487303 +[DiT] step 14/50 t=0.740 +[Debug] dit_step14_vt_cond: [2170, 64] first4: -0.062454 1.706585 -0.275264 1.242871 +[Debug] dit_step14_vt_uncond: [2170, 64] first4: -0.092396 1.687153 -0.270903 1.319513 +[Debug] dit_step14_vt: [2170, 64] first4: -0.030339 1.704105 -0.218537 1.004399 +[Debug] dit_step14_xt: [2170, 64] first4: 0.257318 1.525552 -0.147056 0.467215 +[DiT] step 15/50 t=0.720 +[Debug] dit_step15_vt_cond: [2170, 64] first4: -0.039531 1.653934 -0.274129 1.244472 +[Debug] dit_step15_vt_uncond: [2170, 64] first4: -0.065533 1.623524 -0.308950 1.280105 +[Debug] dit_step15_vt: [2170, 64] first4: 0.042593 1.646848 -0.174753 1.192683 +[Debug] dit_step15_xt: [2170, 64] first4: 0.256466 1.492615 -0.143561 0.443362 +[DiT] step 16/50 t=0.700 +[Debug] dit_step16_vt_cond: [2170, 64] first4: -0.024221 1.582624 -0.288380 1.229998 +[Debug] dit_step16_vt_uncond: [2170, 64] first4: -0.041512 1.552975 -0.330420 1.243577 +[Debug] dit_step16_vt: [2170, 64] first4: -0.014702 1.584471 -0.181940 1.121346 +[Debug] dit_step16_xt: [2170, 64] first4: 0.256760 1.460925 -0.139922 0.420935 +[DiT] step 17/50 t=0.680 +[Debug] dit_step17_vt_cond: [2170, 64] first4: -0.016144 1.507916 -0.306446 1.209517 +[Debug] dit_step17_vt_uncond: [2170, 64] first4: -0.023216 1.483080 -0.342848 1.208134 +[Debug] dit_step17_vt: [2170, 64] first4: 0.010192 1.492126 -0.218166 1.213425 +[Debug] dit_step17_xt: [2170, 64] first4: 0.256556 1.431083 -0.135559 0.396666 +[DiT] step 18/50 t=0.660 +[Debug] dit_step18_vt_cond: [2170, 64] first4: -0.011327 1.429419 -0.322466 1.189975 +[Debug] dit_step18_vt_uncond: [2170, 64] first4: -0.006504 1.414708 -0.351011 1.186830 +[Debug] dit_step18_vt: [2170, 64] first4: -0.055648 1.401301 -0.242752 1.127735 +[Debug] dit_step18_xt: [2170, 64] first4: 0.257669 1.403057 -0.130704 0.374111 +[DiT] step 19/50 t=0.640 +[Debug] dit_step19_vt_cond: [2170, 64] first4: -0.008919 1.352955 -0.336887 1.164963 +[Debug] dit_step19_vt_uncond: [2170, 64] first4: 0.006420 1.358623 -0.354804 1.168313 +[Debug] dit_step19_vt: [2170, 64] first4: -0.054127 1.236317 -0.295143 1.130394 +[Debug] dit_step19_xt: [2170, 64] first4: 0.258751 1.378330 -0.124801 0.351504 +[DiT] step 20/50 t=0.620 +[Debug] dit_step20_vt_cond: [2170, 64] first4: -0.004449 1.272026 -0.345863 1.142193 +[Debug] dit_step20_vt_uncond: [2170, 64] first4: 0.019787 1.305161 -0.354228 1.148333 +[Debug] dit_step20_vt: [2170, 64] first4: -0.100401 1.079987 -0.320124 1.076506 +[Debug] dit_step20_xt: [2170, 64] first4: 0.260759 1.356731 -0.118398 0.329973 +[DiT] step 21/50 t=0.600 +[Debug] dit_step21_vt_cond: [2170, 64] first4: -0.002161 1.194354 -0.356476 1.115376 +[Debug] dit_step21_vt_uncond: [2170, 64] first4: 0.027066 1.258520 -0.355503 1.123235 +[Debug] dit_step21_vt: [2170, 64] first4: -0.089629 0.890893 -0.367078 1.066256 +[Debug] dit_step21_xt: [2170, 64] first4: 0.262552 1.338913 -0.111057 0.308648 +[DiT] step 22/50 t=0.580 +[Debug] dit_step22_vt_cond: [2170, 64] first4: 0.001542 1.116787 -0.366798 1.082653 +[Debug] dit_step22_vt_uncond: [2170, 64] first4: 0.034784 1.215104 -0.359348 1.094688 +[Debug] dit_step22_vt: [2170, 64] first4: -0.114017 0.710875 -0.381058 1.001636 +[Debug] dit_step22_xt: [2170, 64] first4: 0.264832 1.324695 -0.103435 0.288616 +[DiT] step 23/50 t=0.560 +[Debug] dit_step23_vt_cond: [2170, 64] first4: 0.004356 1.043939 -0.376088 1.054782 +[Debug] dit_step23_vt_uncond: [2170, 64] first4: 0.040331 1.176215 -0.358597 1.069999 +[Debug] dit_step23_vt: [2170, 64] first4: -0.106657 0.513238 -0.439613 0.976581 +[Debug] dit_step23_xt: [2170, 64] first4: 0.266965 1.314431 -0.094643 0.269084 +[DiT] step 24/50 t=0.540 +[Debug] dit_step24_vt_cond: [2170, 64] first4: 0.004404 0.961254 -0.387939 1.015311 +[Debug] dit_step24_vt_uncond: [2170, 64] first4: 0.043793 1.129819 -0.356263 1.035491 +[Debug] dit_step24_vt: [2170, 64] first4: -0.131273 0.309370 -0.487982 0.900439 +[Debug] dit_step24_xt: [2170, 64] first4: 0.269591 1.308243 -0.084884 0.251075 +[DiT] step 25/50 t=0.520 +[Debug] dit_step25_vt_cond: [2170, 64] first4: -0.001606 0.858703 -0.396162 0.970976 +[Debug] dit_step25_vt_uncond: [2170, 64] first4: 0.045187 1.067146 -0.350258 0.994534 +[Debug] dit_step25_vt: [2170, 64] first4: -0.160841 0.082930 -0.542274 0.862474 +[Debug] dit_step25_xt: [2170, 64] first4: 0.272808 1.306585 -0.074038 0.233826 +[DiT] step 26/50 t=0.500 +[Debug] dit_step26_vt_cond: [2170, 64] first4: -0.011834 0.743138 -0.406478 0.912916 +[Debug] dit_step26_vt_uncond: [2170, 64] first4: 0.044098 0.988983 -0.348666 0.943761 +[Debug] dit_step26_vt: [2170, 64] first4: -0.203731 -0.135469 -0.575882 0.759197 +[Debug] dit_step26_xt: [2170, 64] first4: 0.276882 1.309294 -0.062520 0.218642 +[DiT] step 27/50 t=0.480 +[Debug] dit_step27_vt_cond: [2170, 64] first4: -0.028043 0.640231 -0.413465 0.856122 +[Debug] dit_step27_vt_uncond: [2170, 64] first4: 0.038067 0.910543 -0.350117 0.887872 +[Debug] dit_step27_vt: [2170, 64] first4: -0.249926 -0.275849 -0.588337 0.733838 +[Debug] dit_step27_xt: [2170, 64] first4: 0.281881 1.314811 -0.050754 0.203965 +[DiT] step 28/50 t=0.460 +[Debug] dit_step28_vt_cond: [2170, 64] first4: -0.048697 0.519480 -0.427048 0.785924 +[Debug] dit_step28_vt_uncond: [2170, 64] first4: 0.029577 0.811304 -0.356754 0.820204 +[Debug] dit_step28_vt: [2170, 64] first4: -0.313111 -0.465662 -0.625360 0.626629 +[Debug] dit_step28_xt: [2170, 64] first4: 0.288143 1.324124 -0.038247 0.191432 +[DiT] step 29/50 t=0.440 +[Debug] dit_step29_vt_cond: [2170, 64] first4: -0.073682 0.390412 -0.435695 0.713586 +[Debug] dit_step29_vt_uncond: [2170, 64] first4: 0.022755 0.688592 -0.366629 0.750458 +[Debug] dit_step29_vt: [2170, 64] first4: -0.404692 -0.558608 -0.601264 0.570632 +[Debug] dit_step29_xt: [2170, 64] first4: 0.296237 1.335296 -0.026221 0.180020 +[DiT] step 30/50 t=0.420 +[Debug] dit_step30_vt_cond: [2170, 64] first4: -0.100612 0.256910 -0.442863 0.643070 +[Debug] dit_step30_vt_uncond: [2170, 64] first4: 0.014270 0.550700 -0.380145 0.680719 +[Debug] dit_step30_vt: [2170, 64] first4: -0.477652 -0.675684 -0.591087 0.486411 +[Debug] dit_step30_xt: [2170, 64] first4: 0.305790 1.348810 -0.014400 0.170292 +[DiT] step 31/50 t=0.400 +[Debug] dit_step31_vt_cond: [2170, 64] first4: -0.127005 0.130974 -0.446946 0.576489 +[Debug] dit_step31_vt_uncond: [2170, 64] first4: 0.003612 0.415976 -0.399074 0.614345 +[Debug] dit_step31_vt: [2170, 64] first4: -0.549710 -0.743030 -0.526327 0.431312 +[Debug] dit_step31_xt: [2170, 64] first4: 0.316784 1.363671 -0.003873 0.161665 +[DiT] step 32/50 t=0.380 +[Debug] dit_step32_vt_cond: [2170, 64] first4: -0.154932 -0.000795 -0.447535 0.511295 +[Debug] dit_step32_vt_uncond: [2170, 64] first4: -0.007317 0.275916 -0.413101 0.549311 +[Debug] dit_step32_vt: [2170, 64] first4: -0.628125 -0.848536 -0.505066 0.360242 +[Debug] dit_step32_xt: [2170, 64] first4: 0.329347 1.380641 0.006228 0.154460 +[DiT] step 33/50 t=0.360 +[Debug] dit_step33_vt_cond: [2170, 64] first4: -0.183072 -0.130801 -0.438493 0.449678 +[Debug] dit_step33_vt_uncond: [2170, 64] first4: -0.021971 0.136892 -0.420384 0.490091 +[Debug] dit_step33_vt: [2170, 64] first4: -0.685087 -0.931651 -0.428386 0.294226 +[Debug] dit_step33_xt: [2170, 64] first4: 0.343048 1.399274 0.014796 0.148576 +[DiT] step 34/50 t=0.340 +[Debug] dit_step34_vt_cond: [2170, 64] first4: -0.207282 -0.251064 -0.429462 0.399560 +[Debug] dit_step34_vt_uncond: [2170, 64] first4: -0.035614 0.010201 -0.426610 0.442224 +[Debug] dit_step34_vt: [2170, 64] first4: -0.740469 -1.039289 -0.393755 0.238626 +[Debug] dit_step34_xt: [2170, 64] first4: 0.357858 1.420060 0.022671 0.143803 +[DiT] step 35/50 t=0.320 +[Debug] dit_step35_vt_cond: [2170, 64] first4: -0.234011 -0.373429 -0.414613 0.349351 +[Debug] dit_step35_vt_uncond: [2170, 64] first4: -0.051328 -0.116322 -0.423153 0.392585 +[Debug] dit_step35_vt: [2170, 64] first4: -0.800518 -1.139187 -0.342183 0.192528 +[Debug] dit_step35_xt: [2170, 64] first4: 0.373868 1.442844 0.029515 0.139953 +[DiT] step 36/50 t=0.300 +[Debug] dit_step36_vt_cond: [2170, 64] first4: -0.261591 -0.503509 -0.392160 0.303680 +[Debug] dit_step36_vt_uncond: [2170, 64] first4: -0.072050 -0.249828 -0.410849 0.351470 +[Debug] dit_step36_vt: [2170, 64] first4: -0.838416 -1.260836 -0.298992 0.122180 +[Debug] dit_step36_xt: [2170, 64] first4: 0.390637 1.468061 0.035495 0.137509 +[DiT] step 37/50 t=0.280 +[Debug] dit_step37_vt_cond: [2170, 64] first4: -0.290611 -0.615966 -0.361295 0.261135 +[Debug] dit_step37_vt_uncond: [2170, 64] first4: -0.095822 -0.367916 -0.388325 0.310791 +[Debug] dit_step37_vt: [2170, 64] first4: -0.893251 -1.349895 -0.245346 0.089192 +[Debug] dit_step37_xt: [2170, 64] first4: 0.408502 1.495059 0.040402 0.135725 +[DiT] step 38/50 t=0.260 +[Debug] dit_step38_vt_cond: [2170, 64] first4: -0.316862 -0.724614 -0.326989 0.221074 +[Debug] dit_step38_vt_uncond: [2170, 64] first4: -0.120406 -0.482601 -0.361356 0.272140 +[Debug] dit_step38_vt: [2170, 64] first4: -0.917953 -1.452874 -0.195436 0.033767 +[Debug] dit_step38_xt: [2170, 64] first4: 0.426861 1.524116 0.044310 0.135050 +[DiT] step 39/50 t=0.240 +[Debug] dit_step39_vt_cond: [2170, 64] first4: -0.344701 -0.840724 -0.280406 0.181682 +[Debug] dit_step39_vt_uncond: [2170, 64] first4: -0.151500 -0.605403 -0.318787 0.232017 +[Debug] dit_step39_vt: [2170, 64] first4: -0.945851 -1.537027 -0.144223 0.008566 +[Debug] dit_step39_xt: [2170, 64] first4: 0.445778 1.554857 0.047195 0.134879 +[DiT] step 40/50 t=0.220 +[Debug] dit_step40_vt_cond: [2170, 64] first4: -0.369051 -0.939547 -0.228334 0.139823 +[Debug] dit_step40_vt_uncond: [2170, 64] first4: -0.182335 -0.713639 -0.270236 0.191739 +[Debug] dit_step40_vt: [2170, 64] first4: -0.958883 -1.593756 -0.082150 -0.057526 +[Debug] dit_step40_xt: [2170, 64] first4: 0.464955 1.586732 0.048838 0.136029 +[DiT] step 41/50 t=0.200 +[Debug] dit_step41_vt_cond: [2170, 64] first4: -0.388759 -1.034758 -0.170808 0.098079 +[Debug] dit_step41_vt_uncond: [2170, 64] first4: -0.215027 -0.818665 -0.212783 0.153622 +[Debug] dit_step41_vt: [2170, 64] first4: -0.929079 -1.656826 -0.036355 -0.101313 +[Debug] dit_step41_xt: [2170, 64] first4: 0.483537 1.619868 0.049565 0.138056 +[DiT] step 42/50 t=0.180 +[Debug] dit_step42_vt_cond: [2170, 64] first4: -0.404481 -1.121373 -0.110304 0.048469 +[Debug] dit_step42_vt_uncond: [2170, 64] first4: -0.250394 -0.918649 -0.148512 0.113292 +[Debug] dit_step42_vt: [2170, 64] first4: -0.870129 -1.689520 0.009394 -0.198920 +[Debug] dit_step42_xt: [2170, 64] first4: 0.500939 1.653659 0.049377 0.142034 +[DiT] step 43/50 t=0.160 +[Debug] dit_step43_vt_cond: [2170, 64] first4: -0.416518 -1.199422 -0.047277 -0.004303 +[Debug] dit_step43_vt_uncond: [2170, 64] first4: -0.285961 -1.014739 -0.080642 0.076449 +[Debug] dit_step43_vt: [2170, 64] first4: -0.799869 -1.709703 0.054214 -0.305297 +[Debug] dit_step43_xt: [2170, 64] first4: 0.516937 1.687853 0.048293 0.148140 +[DiT] step 44/50 t=0.140 +[Debug] dit_step44_vt_cond: [2170, 64] first4: -0.422798 -1.270758 0.022277 -0.058297 +[Debug] dit_step44_vt_uncond: [2170, 64] first4: -0.318056 -1.108378 -0.007512 0.042141 +[Debug] dit_step44_vt: [2170, 64] first4: -0.718613 -1.710690 0.113612 -0.432909 +[Debug] dit_step44_xt: [2170, 64] first4: 0.531309 1.722067 0.046020 0.156798 +[DiT] step 45/50 t=0.120 +[Debug] dit_step45_vt_cond: [2170, 64] first4: -0.430461 -1.334901 0.090295 -0.107751 +[Debug] dit_step45_vt_uncond: [2170, 64] first4: -0.346132 -1.190932 0.060499 0.012419 +[Debug] dit_step45_vt: [2170, 64] first4: -0.676233 -1.740750 0.184198 -0.543741 +[Debug] dit_step45_xt: [2170, 64] first4: 0.544834 1.756882 0.042336 0.167673 +[DiT] step 46/50 t=0.100 +[Debug] dit_step46_vt_cond: [2170, 64] first4: -0.442548 -1.408986 0.177202 -0.124432 +[Debug] dit_step46_vt_uncond: [2170, 64] first4: -0.366053 -1.276834 0.124694 -0.015436 +[Debug] dit_step46_vt: [2170, 64] first4: -0.689058 -1.805405 0.393872 -0.448936 +[Debug] dit_step46_xt: [2170, 64] first4: 0.558615 1.792990 0.034459 0.176652 +[DiT] step 47/50 t=0.080 +[Debug] dit_step47_vt_cond: [2170, 64] first4: -0.439555 -1.466634 0.226367 -0.147289 +[Debug] dit_step47_vt_uncond: [2170, 64] first4: -0.380429 -1.352640 0.169813 -0.038167 +[Debug] dit_step47_vt: [2170, 64] first4: -0.594441 -1.800792 0.366639 -0.559854 +[Debug] dit_step47_xt: [2170, 64] first4: 0.570504 1.829005 0.027126 0.187849 +[DiT] step 48/50 t=0.060 +[Debug] dit_step48_vt_cond: [2170, 64] first4: -0.421519 -1.502992 0.243896 -0.165260 +[Debug] dit_step48_vt_uncond: [2170, 64] first4: -0.386849 -1.417176 0.200885 -0.065191 +[Debug] dit_step48_vt: [2170, 64] first4: -0.516278 -1.762812 0.360980 -0.463950 +[Debug] dit_step48_xt: [2170, 64] first4: 0.580829 1.864262 0.019907 0.197128 +[DiT] step 49/50 t=0.040 +[Debug] dit_step49_vt_cond: [2170, 64] first4: -0.442348 -1.531937 0.237906 -0.192473 +[Debug] dit_step49_vt_uncond: [2170, 64] first4: -0.399571 -1.435245 0.199709 -0.084932 +[Debug] dit_step49_vt: [2170, 64] first4: -0.632891 -1.901084 0.347748 -0.622644 +[Debug] dit_x0: [2170, 64] first4: 0.593487 1.902283 0.012952 0.209581 +[DiT] step 50/50 t=0.020 +[DiT] Total generation: 97237.2 ms (97237.2 ms/sample) +[Debug] dit_output: [2170, 64] first4: 0.593487 1.902283 0.012952 0.209581 +[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) +[VAE] Graph: 474 nodes, T_latent=192 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_im2col_f16', name = 'kernel_im2col_f16' +ggml_metal_library_compile_pipeline: loaded kernel_im2col_f16 0x1556105a0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=0' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=0 0x1556166d0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0 0x155616930 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0 0x155616fc0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sin_f32_4', name = 'kernel_sin_f32_4' +ggml_metal_library_compile_pipeline: loaded kernel_sin_f32_4 0x155617400 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sqr_f32_4', name = 'kernel_sqr_f32_4' +ggml_metal_library_compile_pipeline: loaded kernel_sqr_f32_4 0x155617a00 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_conv_transpose_1d_f32_f32', name = 'kernel_conv_transpose_1d_f32_f32' +ggml_metal_library_compile_pipeline: loaded kernel_conv_transpose_1d_f32_f32 0x155617f60 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_f16_f16_4', name = 'kernel_mul_mv_f16_f16_4_nsg=4' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_f16_f16_4_nsg=4 0x155618e40 | th_max = 1024 | th_width = 32 +[VAE] Upsample factor: 1920.00 (expected ~1920) +[VAE] Graph: 474 nodes, T_latent=256 +[VAE] Graph: 474 nodes, T_latent=186 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=1 0x10b705130 | th_max = 896 | th_width = 32 +[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) +[VAE Batch0] Decode: 609639.3 ms +[Debug] vae_audio: [2, 4166400] first4: -0.001780 -0.001606 -0.001703 -0.001406 +[VAE Batch0] Wrote ggml-sft/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) +[Request 1/1] Done +ggml_metal_free: deallocating +ggml_metal_free: deallocating +[Pipeline] All done +[Request] Loaded request0.json +[Noise] Reusing existing rng_philox_seed42.bf16 +[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q4_K_M.gguf +[GGML] Running acestep-v15-turbo-Q4_K_M.gguf... +[GGML] Done, 47 dump files +[Turbo] Reusing existing Python dumps: python-turbo +[Turbo] Cosine similarities GGML vs Python + stage GGML vs Python + text_hidden 0.999813 + lyric_embed 1.000000 + enc_hidden 0.997096 + detok_output 0.999629 + context 0.999763 + noise 1.000000 + temb_t 0.999906 + hidden_after_proj_in 0.999918 + enc_after_cond_emb 0.997606 + layer0_sa_output 0.998452 + hidden_after_layer0 0.999696 + hidden_after_layer6 0.999330 + hidden_after_layer12 0.995408 + hidden_after_layer18 0.991270 + hidden_after_layer23 0.984826 + dit_step0_vt 0.944528 + dit_step0_xt 0.999878 + dit_step1_vt 0.947871 + dit_step1_xt 0.999609 + dit_step2_vt 0.956355 + dit_step2_xt 0.998980 + dit_step3_vt 0.961293 + dit_step3_xt 0.997669 + dit_step4_vt 0.958834 + dit_step4_xt 0.994713 + dit_step5_vt 0.956132 + dit_step5_xt 0.988221 + dit_step6_vt 0.950838 + dit_step6_xt 0.976124 + dit_step7_vt 0.938802 + dit_x0 0.958347 + vae_audio 0.832313 + vae_audio (log spectral) 0.999533 +[Turbo] Error growth GGML vs Python + stage cos max_err mean_err mean_A std_A mean_B std_B + dit_step0_xt 0.999877 0.165977 0.010464 -0.002251 0.973155 -0.002342 0.972003 + dit_step1_xt 0.999608 0.266862 0.018170 -0.005108 0.943161 -0.005313 0.941730 + dit_step2_xt 0.998979 0.448963 0.028101 -0.009001 0.910184 -0.009311 0.908527 + dit_step3_xt 0.997667 0.610427 0.040689 -0.014279 0.875248 -0.014577 0.873624 + dit_step4_xt 0.994712 0.903635 0.058677 -0.021196 0.843722 -0.021660 0.841995 + dit_step5_xt 0.988220 1.370464 0.085448 -0.031128 0.827283 -0.032109 0.824593 + dit_step6_xt 0.976123 1.998804 0.126069 -0.045345 0.858424 -0.046482 0.855546 +[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-Q4_K_M.gguf +[GGML] Running acestep-v15-sft-Q4_K_M.gguf... +[GGML] Done, 233 dump files +[SFT] Reusing existing Python dumps: python-sft +[SFT] Cosine similarities GGML vs Python + stage GGML vs Python + text_hidden 0.999813 + lyric_embed 1.000000 + enc_hidden 0.997097 + detok_output 0.999629 + context 0.999763 + noise 1.000000 + temb_t 0.999673 + hidden_after_proj_in 0.999917 + enc_after_cond_emb 0.997598 + layer0_sa_output 0.998569 + hidden_after_layer0 0.999686 + hidden_after_layer6 0.999172 + hidden_after_layer12 0.997776 + hidden_after_layer18 0.996818 + hidden_after_layer23 0.997039 + null_condition_emb 1.000000 + null_enc_hidden 1.000000 + dit_step0_vt_cond 0.996934 + dit_step0_vt_uncond 0.996212 + dit_step0_vt 0.990566 + dit_step0_xt 0.999995 + dit_step5_vt_cond 0.995434 + dit_step5_vt 0.980046 + dit_step5_xt 0.999823 + dit_step10_vt_cond 0.991133 + dit_step10_vt 0.971906 + dit_step10_xt 0.999207 + dit_step15_vt_cond 0.982704 + dit_step15_vt 0.950629 + dit_step15_xt 0.997454 + dit_step20_vt_cond 0.968600 + dit_step20_vt 0.929360 + dit_step20_xt 0.993412 + dit_step25_vt_cond 0.951686 + dit_step25_vt 0.903442 + dit_step25_xt 0.986280 + dit_step30_vt_cond 0.931805 + dit_step30_vt 0.881992 + dit_step30_xt 0.976117 + dit_step35_vt_cond 0.911309 + dit_step35_vt 0.858516 + dit_step35_xt 0.964745 + dit_step40_vt_cond 0.898448 + dit_step40_vt 0.843064 + dit_step40_xt 0.954421 + dit_step45_vt_cond 0.908747 + dit_step45_vt 0.865504 + dit_step45_xt 0.947533 + dit_step49_vt_cond 0.927312 + dit_step49_vt 0.885368 + dit_x0 0.945292 + vae_audio 0.825801 + vae_audio (log spectral) 0.999459 +[SFT] Error growth GGML vs Python + stage cos max_err mean_err mean_A std_A mean_B std_B + dit_step0_xt 0.999994 0.035677 0.002825 -0.001840 0.980345 -0.001741 0.980402 + dit_step5_xt 0.999822 0.191921 0.012992 -0.007283 0.890515 -0.007143 0.887999 + dit_step10_xt 0.999206 0.526469 0.024282 -0.012946 0.812557 -0.012603 0.811299 + dit_step15_xt 0.997453 0.836399 0.039177 -0.018559 0.748243 -0.018114 0.745269 + dit_step20_xt 0.993411 1.152330 0.058726 -0.024275 0.703300 -0.023808 0.699582 + dit_step25_xt 0.986279 1.542745 0.081991 -0.030177 0.682229 -0.029311 0.679278 + dit_step30_xt 0.976117 1.915049 0.109049 -0.036245 0.688533 -0.035027 0.685262 + dit_step35_xt 0.964744 2.242426 0.138946 -0.042318 0.720837 -0.040716 0.717196 + dit_step40_xt 0.954421 2.562076 0.170565 -0.048389 0.775001 -0.046462 0.771853 + dit_step45_xt 0.947532 2.889421 0.200672 -0.054787 0.846930 -0.052475 0.843036 diff --git a/tests/Metal-Q5_K_M.log b/tests/Metal-Q5_K_M.log new file mode 100644 index 0000000..a25afc6 --- /dev/null +++ b/tests/Metal-Q5_K_M.log @@ -0,0 +1,835 @@ +ggml_metal_device_init: tensor API disabled for pre-M5 and pre-A19 devices +ggml_metal_library_init: using embedded metal library +ggml_metal_library_init: loaded in 0.007 sec +ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s) +ggml_metal_device_init: GPU name: MTL0 +ggml_metal_device_init: GPU family: MTLGPUFamilyApple8 (1008) +ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003) +ggml_metal_device_init: GPU family: MTLGPUFamilyMetal3 (5001) +ggml_metal_device_init: simdgroup reduction = true +ggml_metal_device_init: simdgroup matrix mul. = true +ggml_metal_device_init: has unified memory = true +ggml_metal_device_init: has bfloat = true +ggml_metal_device_init: has tensor = false +ggml_metal_device_init: use residency sets = true +ggml_metal_device_init: use shared buffers = true +ggml_metal_device_init: recommendedMaxWorkingSetSize = 11453.25 MB +ggml_metal_init: allocating +ggml_metal_init: found device: Apple M2 Pro +ggml_metal_init: picking default device: Apple M2 Pro +ggml_metal_init: use fusion = true +ggml_metal_init: use concurrency = true +ggml_metal_init: use graph optimize = true +[Load] DiT backend: MTL0 (CPU threads: 5) +[Load] Backend init: 21.7 ms +[GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 +[DiT] Self-attn: Q+K fused, V separate +[DiT] Cross-attn: all separate +[DiT] MLP: gate+up fused +[Load] null_condition_emb found (CFG available) +[WeightCtx] Loaded 478 tensors, 1061.2 MB into backend +[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 +[Load] DiT weight load: 1538.7 ms +[GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 +[Load] silence_latent: [15000, 64] from GGUF +[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 +ggml_metal_init: allocating +ggml_metal_init: found device: Apple M2 Pro +ggml_metal_init: picking default device: Apple M2 Pro +ggml_metal_init: use fusion = true +ggml_metal_init: use concurrency = true +ggml_metal_init: use graph optimize = true +[Load] VAE backend: MTL0 (CPU threads: 5) +[VAE] Backend: MTL0, Weight buffer: 255.7 MB +[VAE] Loaded: 5 blocks, upsample=1920x +[Load] VAE weights: 275.7 ms +[Request 1/1] ggml-turbo/request0.json (batch=1) +[Request] parsed ggml-turbo/request0.json (18 fields) +[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) +[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s +[Pipeline] 434 audio codes (86.8s @ 5Hz) +[Pipeline] T=2170, S=1085 +[BPE] Loaded from GGUF: 151643 vocab, 151387 merges +[Load] BPE tokenizer: 42.1 ms +[Pipeline] caption: 70 tokens, lyrics: 167 tokens +ggml_metal_init: allocating +ggml_metal_init: found device: Apple M2 Pro +ggml_metal_init: picking default device: Apple M2 Pro +ggml_metal_init: use fusion = true +ggml_metal_init: use concurrency = true +ggml_metal_init: use graph optimize = true +[Load] TextEncoder backend: MTL0 (CPU threads: 5) +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend +[Load] TextEncoder: 28L, H=1024, Nh=16/8 +[Load] TextEncoder: 230.3 ms +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_get_rows_bf16', name = 'kernel_get_rows_bf16' +ggml_metal_library_compile_pipeline: loaded kernel_get_rows_bf16 0x11cf0b930 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rms_norm_mul_f32_4', name = 'kernel_rms_norm_mul_f32_4' +ggml_metal_library_compile_pipeline: loaded kernel_rms_norm_mul_f32_4 0x11cf0bd90 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_bf16_f32', name = 'kernel_mul_mm_bf16_f32_bci=0_bco=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_bf16_f32_bci=0_bco=1 0x11cf0c600 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rope_neox_f32', name = 'kernel_rope_neox_f32_imrope=0' +ggml_metal_library_compile_pipeline: loaded kernel_rope_neox_f32_imrope=0 0x11cf0ca80 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=1_ncpsg=64' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=1_ncpsg=64 0x11cf0d2f0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_blk', name = 'kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64 0x11cf0d920 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4 0x11cf0e200 | th_max = 576 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0 0x11cf0e610 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_swiglu_f32', name = 'kernel_swiglu_f32' +ggml_metal_library_compile_pipeline: loaded kernel_swiglu_f32 0x11cf0e870 | th_max = 1024 | th_width = 32 +[Encode] TextEncoder (70 tokens): 44.0 ms +[Debug] text_hidden: [70, 1024] first4: 3.703506 2.446818 0.222721 -13.133463 +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[Encode] Lyric vocab lookup (167 tokens): 33.7 ms +[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 +ggml_metal_init: allocating +ggml_metal_init: found device: Apple M2 Pro +ggml_metal_init: picking default device: Apple M2 Pro +ggml_metal_init: use fusion = true +ggml_metal_init: use concurrency = true +ggml_metal_init: use graph optimize = true +[Load] CondEncoder backend: MTL0 (CPU threads: 5) +[GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 +[WeightCtx] Loaded 140 tensors, 412.5 MB into backend +[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond +[Load] ConditionEncoder: 572.5 ms +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q6_K_f32', name = 'kernel_mul_mm_q6_K_f32_bci=0_bco=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q6_K_f32_bci=0_bco=1 0x11ce0c140 | th_max = 896 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q5_K_f32', name = 'kernel_mul_mm_q5_K_f32_bci=0_bco=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q5_K_f32_bci=0_bco=1 0x11cf17e80 | th_max = 832 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=64' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=64 0x11cf18860 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4 0x11cf18ec0 | th_max = 640 | th_width = 32 +[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens +[Encode] ConditionEncoder: 158.6 ms, enc_S=238 +ggml_metal_free: deallocating +ggml_metal_free: deallocating +[Debug] enc_hidden: [238, 2048] first4: 1.751803 -0.051174 -0.133188 0.058982 +[GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 +[WeightCtx] Loaded 30 tensors, 73.2 MB into backend +[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) +[Load] Detokenizer: 113.6 ms +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_bf16_f32_short', name = 'kernel_mul_mv_bf16_f32_short_nsg=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_bf16_f32_short_nsg=1 0x11ce0cf30 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1 0x11ce0d840 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q6_K_f32', name = 'kernel_mul_mv_q6_K_f32_nsg=2' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q6_K_f32_nsg=2 0x11ce0ddf0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_repeat_f32', name = 'kernel_repeat_f32' +ggml_metal_library_compile_pipeline: loaded kernel_repeat_f32 0x11ce0e050 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q5_K_f32_r1_5', name = 'kernel_mul_mv_ext_q5_K_f32_r1_5_nsg=2_nxpsg=8' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q5_K_f32_r1_5_nsg=2_nxpsg=8 0x11ce0ea30 | th_max = 640 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q6_K_f32_r1_5', name = 'kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8 0x11ce0efe0 | th_max = 640 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=32' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=32 0x11ce0f360 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_f32_dk128_dv128', name = 'kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32 0x11ce0f5c0 | th_max = 448 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_reduce', name = 'kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32 0x11ce10000 | th_max = 1024 | th_width = 32 +[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) +[Context] Detokenizer: 1065.0 ms +[Debug] detok_output: [2170, 64] first4: -0.124883 1.453879 0.292856 -0.646204 +[Context] loaded noise from rng_philox_seed42.bf16: [2170, 64] bf16 +[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 +[Debug] context: [2170, 128] first4: -0.124883 1.453879 0.292856 -0.646204 +[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1 +[DiT] Batch N=1, T=2170, S=1085, enc_S=238 +[DiT] Graph: 1775 nodes +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f32_f32', name = 'kernel_mul_mm_f32_f32_bci=0_bco=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f32_f32_bci=0_bco=1 0x11cf09240 | th_max = 832 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_scale_f32', name = 'kernel_scale_f32' +ggml_metal_library_compile_pipeline: loaded kernel_scale_f32 0x11cf19120 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_timestep_embedding_f32', name = 'kernel_timestep_embedding_f32' +ggml_metal_library_compile_pipeline: loaded kernel_timestep_embedding_f32 0x11cf19380 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q5_K_f32', name = 'kernel_mul_mv_q5_K_f32_nsg=2' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q5_K_f32_nsg=2 0x11cf1a3c0 | th_max = 576 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_silu_f32_4', name = 'kernel_silu_f32_4' +ggml_metal_library_compile_pipeline: loaded kernel_silu_f32_4 0x11cf1a740 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1 0x11cf1ae00 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1 0x11cf1b060 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0 0x11cf1b600 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_cpy_f32_f32', name = 'kernel_cpy_f32_f32' +ggml_metal_library_compile_pipeline: loaded kernel_cpy_f32_f32 0x11cf1bb90 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4 0x11cf1c5b0 | th_max = 640 | th_width = 32 +[Debug] tproj: [12288] first4: 0.260409 -0.161609 -0.102203 0.051602 +[Debug] temb: [2048] first4: -0.000151 -0.132293 -0.035516 0.064751 +[Debug] temb_t: [2048] first4: 0.000578 0.026708 -0.052786 0.063514 +[Debug] temb_r: [2048] first4: -0.000729 -0.159001 0.017269 0.001237 +[Debug] sinusoidal_t: [256] first4: 0.562407 0.789701 0.439822 -0.023583 +[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000 +[Debug] temb_lin1_t: [2048] first4: -0.051153 -0.053631 -0.012192 -0.039024 +[Debug] temb_lin1_r: [2048] first4: -0.016165 -0.021121 -0.015801 -0.000525 +[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.044511 -0.951831 0.540187 0.457322 +[Debug] proj_in_input: [192, 2170] first4: -0.124883 1.453879 0.292856 -0.646204 +[Debug] enc_after_cond_emb: [2048, 238] first4: -0.153168 0.787275 0.319340 -0.492001 +[Debug] layer0_sa_input: [2048, 1085] first4: -0.722961 -0.753736 -0.051927 0.265661 +[Debug] layer0_q_after_rope: [128, 16] first4: -12.602057 0.798570 1.518488 1.778495 +[Debug] layer0_k_after_rope: [128, 8] first4: -0.153168 0.787275 0.319340 -0.492001 +[Debug] layer0_sa_output: [2048, 1085] first4: -1.498292 0.150378 -0.398807 0.484326 +[Debug] layer0_attn_out: [2048, 1085] first4: -12.773369 1.105118 1.773309 1.768943 +[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.542001 -1.018193 0.152304 0.468235 +[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.605642 -0.786551 -0.346129 0.499558 +[Debug] hidden_after_layer0: [2048, 1085] first4: -9.171107 0.593998 51.751106 -0.883031 +[Debug] hidden_after_layer6: [2048, 1085] first4: -20.936150 0.582827 29.989494 -4.872031 +[Debug] hidden_after_layer12: [2048, 1085] first4: -18.277052 -17.088211 71.559052 24.992846 +[Debug] hidden_after_layer18: [2048, 1085] first4: -25.915581 10.692349 65.928192 19.066517 +[Debug] hidden_after_layer23: [2048, 1085] first4: -6.799564 38.425339 203.467468 153.140854 +[Debug] dit_step0_vt: [2170, 64] first4: 0.015160 1.163890 0.353989 2.352075 +[Debug] dit_step0_xt: [2170, 64] first4: 0.193647 2.103346 -0.187965 0.740744 +[DiT] step 1/8 t=1.000 +[Debug] dit_step1_vt: [2170, 64] first4: -0.238755 1.372093 -0.135596 1.879695 +[Debug] dit_step1_xt: [2170, 64] first4: 0.206670 2.028504 -0.180569 0.638215 +[DiT] step 2/8 t=0.955 +[Debug] dit_step2_vt: [2170, 64] first4: -0.034453 1.243445 0.102498 2.382742 +[Debug] dit_step2_xt: [2170, 64] first4: 0.208967 1.945608 -0.187402 0.479365 +[DiT] step 3/8 t=0.900 +[Debug] dit_step3_vt: [2170, 64] first4: 0.286766 1.110088 0.263285 2.616079 +[Debug] dit_step3_xt: [2170, 64] first4: 0.185070 1.853101 -0.209343 0.261359 +[DiT] step 4/8 t=0.833 +[Debug] dit_step4_vt: [2170, 64] first4: 0.359540 0.909711 0.077998 2.701789 +[Debug] dit_step4_xt: [2170, 64] first4: 0.146547 1.755632 -0.217700 -0.028118 +[DiT] step 5/8 t=0.750 +[Debug] dit_step5_vt: [2170, 64] first4: 0.361413 0.800675 -0.393300 2.731152 +[Debug] dit_step5_xt: [2170, 64] first4: 0.094917 1.641250 -0.161514 -0.418283 +[DiT] step 6/8 t=0.643 +[Debug] dit_step6_vt: [2170, 64] first4: 0.247442 0.617176 -0.960503 2.789753 +[Debug] dit_step6_xt: [2170, 64] first4: 0.045429 1.517814 0.030587 -0.976234 +[DiT] step 7/8 t=0.500 +[Debug] dit_step7_vt: [2170, 64] first4: 0.019480 0.316869 -1.427597 3.153955 +[Debug] dit_x0: [2170, 64] first4: 0.039585 1.422753 0.458866 -1.922420 +[DiT] step 8/8 t=0.300 +[DiT] Total generation: 8546.5 ms (8546.5 ms/sample) +[Debug] dit_output: [2170, 64] first4: 0.039585 1.422753 0.458866 -1.922420 +[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) +[VAE] Graph: 474 nodes, T_latent=192 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_im2col_f16', name = 'kernel_im2col_f16' +ggml_metal_library_compile_pipeline: loaded kernel_im2col_f16 0x11ce0b610 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=0' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=0 0x11ce10380 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0 0x11ce10870 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0 0x11ce10ad0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sin_f32_4', name = 'kernel_sin_f32_4' +ggml_metal_library_compile_pipeline: loaded kernel_sin_f32_4 0x11ce10e80 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sqr_f32_4', name = 'kernel_sqr_f32_4' +ggml_metal_library_compile_pipeline: loaded kernel_sqr_f32_4 0x11ce11660 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_conv_transpose_1d_f32_f32', name = 'kernel_conv_transpose_1d_f32_f32' +ggml_metal_library_compile_pipeline: loaded kernel_conv_transpose_1d_f32_f32 0x11ce11bc0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_f16_f16_4', name = 'kernel_mul_mv_f16_f16_4_nsg=4' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_f16_f16_4_nsg=4 0x11ce12b20 | th_max = 1024 | th_width = 32 +[VAE] Upsample factor: 1920.00 (expected ~1920) +[VAE] Graph: 474 nodes, T_latent=256 +[VAE] Graph: 474 nodes, T_latent=186 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=1 0x11ce12d80 | th_max = 896 | th_width = 32 +[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) +[VAE Batch0] Decode: 609611.2 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000665 0.001184 0.001013 0.001406 +[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) +[Request 1/1] Done +ggml_metal_free: deallocating +ggml_metal_free: deallocating +[Pipeline] All done +ggml_metal_device_init: tensor API disabled for pre-M5 and pre-A19 devices +ggml_metal_library_init: using embedded metal library +ggml_metal_library_init: loaded in 0.006 sec +ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s) +ggml_metal_device_init: GPU name: MTL0 +ggml_metal_device_init: GPU family: MTLGPUFamilyApple8 (1008) +ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003) +ggml_metal_device_init: GPU family: MTLGPUFamilyMetal3 (5001) +ggml_metal_device_init: simdgroup reduction = true +ggml_metal_device_init: simdgroup matrix mul. = true +ggml_metal_device_init: has unified memory = true +ggml_metal_device_init: has bfloat = true +ggml_metal_device_init: has tensor = false +ggml_metal_device_init: use residency sets = true +ggml_metal_device_init: use shared buffers = true +ggml_metal_device_init: recommendedMaxWorkingSetSize = 11453.25 MB +ggml_metal_init: allocating +ggml_metal_init: found device: Apple M2 Pro +ggml_metal_init: picking default device: Apple M2 Pro +ggml_metal_init: use fusion = true +ggml_metal_init: use concurrency = true +ggml_metal_init: use graph optimize = true +[Load] DiT backend: MTL0 (CPU threads: 5) +[Load] Backend init: 21.5 ms +[GGUF] ../models/acestep-v15-sft-Q5_K_M.gguf: 678 tensors, data at offset 56800 +[DiT] Self-attn: Q+K fused, V separate +[DiT] Cross-attn: all separate +[DiT] MLP: gate+up fused +[Load] null_condition_emb found (CFG available) +[WeightCtx] Loaded 478 tensors, 1061.2 MB into backend +[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 +[Load] DiT weight load: 1513.9 ms +[GGUF] ../models/acestep-v15-sft-Q5_K_M.gguf: 678 tensors, data at offset 56800 +[Load] silence_latent: [15000, 64] from GGUF +[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 +ggml_metal_init: allocating +ggml_metal_init: found device: Apple M2 Pro +ggml_metal_init: picking default device: Apple M2 Pro +ggml_metal_init: use fusion = true +ggml_metal_init: use concurrency = true +ggml_metal_init: use graph optimize = true +[Load] VAE backend: MTL0 (CPU threads: 5) +[VAE] Backend: MTL0, Weight buffer: 255.7 MB +[VAE] Loaded: 5 blocks, upsample=1920x +[Load] VAE weights: 272.4 ms +[Request 1/1] ggml-sft/request0.json (batch=1) +[Request] parsed ggml-sft/request0.json (18 fields) +[Pipeline] seed=42, steps=50, guidance=7.0, shift=1.0, duration=88.0s +[Pipeline] 434 audio codes (86.8s @ 5Hz) +[Pipeline] T=2170, S=1085 +[BPE] Loaded from GGUF: 151643 vocab, 151387 merges +[Load] BPE tokenizer: 42.2 ms +[Pipeline] caption: 70 tokens, lyrics: 167 tokens +ggml_metal_init: allocating +ggml_metal_init: found device: Apple M2 Pro +ggml_metal_init: picking default device: Apple M2 Pro +ggml_metal_init: use fusion = true +ggml_metal_init: use concurrency = true +ggml_metal_init: use graph optimize = true +[Load] TextEncoder backend: MTL0 (CPU threads: 5) +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend +[Load] TextEncoder: 28L, H=1024, Nh=16/8 +[Load] TextEncoder: 225.9 ms +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_get_rows_bf16', name = 'kernel_get_rows_bf16' +ggml_metal_library_compile_pipeline: loaded kernel_get_rows_bf16 0x12de0dc30 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rms_norm_mul_f32_4', name = 'kernel_rms_norm_mul_f32_4' +ggml_metal_library_compile_pipeline: loaded kernel_rms_norm_mul_f32_4 0x12de0e090 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_bf16_f32', name = 'kernel_mul_mm_bf16_f32_bci=0_bco=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_bf16_f32_bci=0_bco=1 0x12de0e900 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rope_neox_f32', name = 'kernel_rope_neox_f32_imrope=0' +ggml_metal_library_compile_pipeline: loaded kernel_rope_neox_f32_imrope=0 0x12de0ed80 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=1_ncpsg=64' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=1_ncpsg=64 0x12de0f5f0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_blk', name = 'kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64 0x12de0fc20 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4 0x12de10500 | th_max = 576 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0 0x12de10910 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_swiglu_f32', name = 'kernel_swiglu_f32' +ggml_metal_library_compile_pipeline: loaded kernel_swiglu_f32 0x12de10b70 | th_max = 1024 | th_width = 32 +[Encode] TextEncoder (70 tokens): 49.1 ms +[Debug] text_hidden: [70, 1024] first4: 3.703506 2.446818 0.222721 -13.133463 +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[Encode] Lyric vocab lookup (167 tokens): 42.5 ms +[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 +ggml_metal_init: allocating +ggml_metal_init: found device: Apple M2 Pro +ggml_metal_init: picking default device: Apple M2 Pro +ggml_metal_init: use fusion = true +ggml_metal_init: use concurrency = true +ggml_metal_init: use graph optimize = true +[Load] CondEncoder backend: MTL0 (CPU threads: 5) +[GGUF] ../models/acestep-v15-sft-Q5_K_M.gguf: 678 tensors, data at offset 56800 +[WeightCtx] Loaded 140 tensors, 412.5 MB into backend +[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond +[Load] ConditionEncoder: 760.1 ms +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q6_K_f32', name = 'kernel_mul_mm_q6_K_f32_bci=0_bco=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q6_K_f32_bci=0_bco=1 0x103e0be70 | th_max = 896 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q5_K_f32', name = 'kernel_mul_mm_q5_K_f32_bci=0_bco=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q5_K_f32_bci=0_bco=1 0x103e0c420 | th_max = 832 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=64' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=64 0x103e0c860 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4 0x103e0cfd0 | th_max = 640 | th_width = 32 +[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens +[Encode] ConditionEncoder: 162.8 ms, enc_S=238 +ggml_metal_free: deallocating +ggml_metal_free: deallocating +[Debug] enc_hidden: [238, 2048] first4: 1.751803 -0.051174 -0.133188 0.058982 +[GGUF] ../models/acestep-v15-sft-Q5_K_M.gguf: 678 tensors, data at offset 56800 +[WeightCtx] Loaded 30 tensors, 73.2 MB into backend +[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) +[Load] Detokenizer: 115.2 ms +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_bf16_f32_short', name = 'kernel_mul_mv_bf16_f32_short_nsg=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_bf16_f32_short_nsg=1 0x103f05ca0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1 0x103f065b0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q6_K_f32', name = 'kernel_mul_mv_q6_K_f32_nsg=2' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q6_K_f32_nsg=2 0x103f06b60 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_repeat_f32', name = 'kernel_repeat_f32' +ggml_metal_library_compile_pipeline: loaded kernel_repeat_f32 0x103f06dc0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q5_K_f32_r1_5', name = 'kernel_mul_mv_ext_q5_K_f32_r1_5_nsg=2_nxpsg=8' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q5_K_f32_r1_5_nsg=2_nxpsg=8 0x103f07ca0 | th_max = 640 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q6_K_f32_r1_5', name = 'kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8 0x103f08250 | th_max = 640 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=32' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=32 0x103f085d0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_f32_dk128_dv128', name = 'kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32 0x103f08830 | th_max = 448 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_reduce', name = 'kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32 0x103f091d0 | th_max = 1024 | th_width = 32 +[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) +[Context] Detokenizer: 1055.5 ms +[Debug] detok_output: [2170, 64] first4: -0.124883 1.453879 0.292856 -0.646204 +[Context] loaded noise from rng_philox_seed42.bf16: [2170, 64] bf16 +[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 +[Debug] context: [2170, 128] first4: -0.124883 1.453879 0.292856 -0.646204 +[DiT] Starting: T=2170, S=1085, enc_S=238, steps=50, batch=1 +[DiT] Batch N=1, T=2170, S=1085, enc_S=238 +[DiT] Graph: 1775 nodes +[Debug] null_condition_emb: [2048] first4: 0.018066 -0.000360 0.005096 -0.000683 +[Debug] null_enc_hidden: [238, 2048] first4: 0.018066 -0.000360 0.005096 -0.000683 +[DiT] CFG enabled: guidance_scale=7.0, 2x forward per step, N=1 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f32_f32', name = 'kernel_mul_mm_f32_f32_bci=0_bco=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f32_f32_bci=0_bco=1 0x103e0dfd0 | th_max = 832 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_scale_f32', name = 'kernel_scale_f32' +ggml_metal_library_compile_pipeline: loaded kernel_scale_f32 0x103e0e530 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_timestep_embedding_f32', name = 'kernel_timestep_embedding_f32' +ggml_metal_library_compile_pipeline: loaded kernel_timestep_embedding_f32 0x103e0e940 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q5_K_f32', name = 'kernel_mul_mv_q5_K_f32_nsg=2' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q5_K_f32_nsg=2 0x103e0f980 | th_max = 576 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_silu_f32_4', name = 'kernel_silu_f32_4' +ggml_metal_library_compile_pipeline: loaded kernel_silu_f32_4 0x103e0fd00 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1 0x103e103c0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1 0x103e10620 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0 0x103e10bc0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_cpy_f32_f32', name = 'kernel_cpy_f32_f32' +ggml_metal_library_compile_pipeline: loaded kernel_cpy_f32_f32 0x103e11150 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4 0x103e11b70 | th_max = 640 | th_width = 32 +[Debug] tproj: [12288] first4: 0.155137 -0.118305 -0.084248 0.082623 +[Debug] temb: [2048] first4: -0.002843 -0.176820 0.004745 -0.001924 +[Debug] temb_t: [2048] first4: -0.001351 0.003023 -0.012552 -0.001712 +[Debug] temb_r: [2048] first4: -0.001491 -0.179843 0.017298 -0.000212 +[Debug] sinusoidal_t: [256] first4: 0.562407 0.789701 0.439822 -0.023583 +[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000 +[Debug] temb_lin1_t: [2048] first4: -0.034754 0.028817 0.027120 -0.031729 +[Debug] temb_lin1_r: [2048] first4: -0.002680 0.004202 0.000655 -0.002088 +[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.095122 -0.908724 0.502637 0.450925 +[Debug] proj_in_input: [192, 2170] first4: -0.124883 1.453879 0.292856 -0.646204 +[Debug] enc_after_cond_emb: [2048, 238] first4: -0.173051 0.876023 0.351566 -0.532545 +[Debug] layer0_sa_input: [2048, 1085] first4: -0.920384 -0.707757 -0.034391 0.299813 +[Debug] layer0_q_after_rope: [128, 16] first4: -12.596228 0.535827 1.482060 1.773901 +[Debug] layer0_k_after_rope: [128, 8] first4: -0.173051 0.876023 0.351566 -0.532545 +[Debug] layer0_sa_output: [2048, 1085] first4: -1.637092 0.777700 -0.593592 0.534410 +[Debug] layer0_attn_out: [2048, 1085] first4: -12.115236 1.011131 1.711030 1.787191 +[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.803575 -1.350455 -0.166552 0.391822 +[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.983645 -0.994904 -0.442195 0.398258 +[Debug] hidden_after_layer0: [2048, 1085] first4: -9.639871 1.314413 59.265984 -1.678902 +[Debug] hidden_after_layer6: [2048, 1085] first4: -15.443645 3.665146 59.001129 -0.430717 +[Debug] hidden_after_layer12: [2048, 1085] first4: -13.175318 0.713974 -24.768734 -0.709223 +[Debug] hidden_after_layer18: [2048, 1085] first4: -5.504215 14.850023 -44.686668 -9.688757 +[Debug] hidden_after_layer23: [2048, 1085] first4: 32.081551 63.384781 49.094582 -14.152830 +[Debug] dit_step0_vt_cond: [2170, 64] first4: -0.659668 2.541722 -0.175072 1.431705 +[Debug] dit_step0_vt_uncond: [2170, 64] first4: -0.363007 2.087823 -0.714051 1.721254 +[Debug] dit_step0_vt: [2170, 64] first4: -0.859789 3.013237 0.095956 1.039126 +[Debug] dit_step0_xt: [2170, 64] first4: 0.211532 2.095985 -0.173794 0.826874 +[DiT] step 1/50 t=1.000 +[Debug] dit_step1_vt_cond: [2170, 64] first4: -0.721169 2.535385 -0.028817 1.265576 +[Debug] dit_step1_vt_uncond: [2170, 64] first4: -0.587318 2.330142 -0.122353 1.467132 +[Debug] dit_step1_vt: [2170, 64] first4: -0.560584 2.181154 -0.231214 1.319647 +[Debug] dit_step1_xt: [2170, 64] first4: 0.222743 2.052362 -0.169170 0.800481 +[DiT] step 2/50 t=0.980 +[Debug] dit_step2_vt_cond: [2170, 64] first4: -0.714072 2.499362 0.020078 1.278304 +[Debug] dit_step2_vt_uncond: [2170, 64] first4: -0.600958 2.309783 -0.102115 1.400073 +[Debug] dit_step2_vt: [2170, 64] first4: -0.861209 2.897427 0.258299 0.998091 +[Debug] dit_step2_xt: [2170, 64] first4: 0.239968 1.994414 -0.174336 0.780519 +[DiT] step 3/50 t=0.960 +[Debug] dit_step3_vt_cond: [2170, 64] first4: -0.674846 2.427423 0.055491 1.297147 +[Debug] dit_step3_vt_uncond: [2170, 64] first4: -0.635843 2.332050 -0.021028 1.341071 +[Debug] dit_step3_vt: [2170, 64] first4: -0.498860 2.133466 -0.141608 1.438968 +[Debug] dit_step3_xt: [2170, 64] first4: 0.249945 1.951744 -0.171504 0.751740 +[DiT] step 4/50 t=0.940 +[Debug] dit_step4_vt_cond: [2170, 64] first4: -0.605752 2.344337 0.051579 1.306835 +[Debug] dit_step4_vt_uncond: [2170, 64] first4: -0.617271 2.293072 -0.026830 1.327208 +[Debug] dit_step4_vt: [2170, 64] first4: -0.645479 2.581793 0.280999 1.082338 +[Debug] dit_step4_xt: [2170, 64] first4: 0.262854 1.900108 -0.177124 0.730093 +[DiT] step 5/50 t=0.920 +[Debug] dit_step5_vt_cond: [2170, 64] first4: -0.529691 2.265976 0.024912 1.308485 +[Debug] dit_step5_vt_uncond: [2170, 64] first4: -0.613722 2.254884 0.007785 1.353816 +[Debug] dit_step5_vt: [2170, 64] first4: -0.220550 1.958779 -0.197940 1.314650 +[Debug] dit_step5_xt: [2170, 64] first4: 0.267265 1.860933 -0.173165 0.703800 +[DiT] step 6/50 t=0.900 +[Debug] dit_step6_vt_cond: [2170, 64] first4: -0.492980 2.220784 0.013978 1.314708 +[Debug] dit_step6_vt_uncond: [2170, 64] first4: -0.574326 2.194118 0.017516 1.377784 +[Debug] dit_step6_vt: [2170, 64] first4: -0.484828 2.424557 0.188767 1.065213 +[Debug] dit_step6_xt: [2170, 64] first4: 0.276962 1.812442 -0.176940 0.682496 +[DiT] step 7/50 t=0.880 +[Debug] dit_step7_vt_cond: [2170, 64] first4: -0.442502 2.140999 -0.004548 1.307788 +[Debug] dit_step7_vt_uncond: [2170, 64] first4: -0.527369 2.112375 0.025084 1.375065 +[Debug] dit_step7_vt: [2170, 64] first4: -0.162851 1.943518 -0.225018 1.235321 +[Debug] dit_step7_xt: [2170, 64] first4: 0.280219 1.773571 -0.172440 0.657789 +[DiT] step 8/50 t=0.860 +[Debug] dit_step8_vt_cond: [2170, 64] first4: -0.402943 2.052812 -0.006712 1.290008 +[Debug] dit_step8_vt_uncond: [2170, 64] first4: -0.459439 2.025463 0.030814 1.342298 +[Debug] dit_step8_vt: [2170, 64] first4: -0.414733 2.160834 0.094455 1.088282 +[Debug] dit_step8_xt: [2170, 64] first4: 0.288514 1.730355 -0.174329 0.636023 +[DiT] step 9/50 t=0.840 +[Debug] dit_step9_vt_cond: [2170, 64] first4: -0.369760 1.969441 -0.010690 1.271576 +[Debug] dit_step9_vt_uncond: [2170, 64] first4: -0.396671 1.951135 0.030780 1.310076 +[Debug] dit_step9_vt: [2170, 64] first4: -0.235488 1.803751 -0.198291 1.206838 +[Debug] dit_step9_xt: [2170, 64] first4: 0.293223 1.694280 -0.170363 0.611887 +[DiT] step 10/50 t=0.820 +[Debug] dit_step10_vt_cond: [2170, 64] first4: -0.344175 1.892885 0.003662 1.257559 +[Debug] dit_step10_vt_uncond: [2170, 64] first4: -0.342085 1.891713 0.047752 1.301667 +[Debug] dit_step10_vt: [2170, 64] first4: -0.420278 1.858245 0.037965 1.052360 +[Debug] dit_step10_xt: [2170, 64] first4: 0.301629 1.657115 -0.171122 0.590840 +[DiT] step 11/50 t=0.800 +[Debug] dit_step11_vt_cond: [2170, 64] first4: -0.321399 1.822005 0.005853 1.241717 +[Debug] dit_step11_vt_uncond: [2170, 64] first4: -0.313690 1.840958 0.057351 1.308882 +[Debug] dit_step11_vt: [2170, 64] first4: -0.256015 1.588119 -0.173446 1.051672 +[Debug] dit_step11_xt: [2170, 64] first4: 0.306749 1.625353 -0.167653 0.569806 +[DiT] step 12/50 t=0.780 +[Debug] dit_step12_vt_cond: [2170, 64] first4: -0.300376 1.758913 0.017258 1.234417 +[Debug] dit_step12_vt_uncond: [2170, 64] first4: -0.284777 1.790303 0.055927 1.316604 +[Debug] dit_step12_vt: [2170, 64] first4: -0.382294 1.631086 0.039774 0.961382 +[Debug] dit_step12_xt: [2170, 64] first4: 0.314395 1.592731 -0.168449 0.550578 +[DiT] step 13/50 t=0.760 +[Debug] dit_step13_vt_cond: [2170, 64] first4: -0.282512 1.699357 0.023821 1.222209 +[Debug] dit_step13_vt_uncond: [2170, 64] first4: -0.256755 1.739260 0.045941 1.313994 +[Debug] dit_step13_vt: [2170, 64] first4: -0.292858 1.451734 -0.071923 0.964463 +[Debug] dit_step13_xt: [2170, 64] first4: 0.320252 1.563696 -0.167011 0.531289 +[DiT] step 14/50 t=0.740 +[Debug] dit_step14_vt_cond: [2170, 64] first4: -0.268355 1.643249 0.035884 1.219110 +[Debug] dit_step14_vt_uncond: [2170, 64] first4: -0.234205 1.685310 0.038125 1.306284 +[Debug] dit_step14_vt: [2170, 64] first4: -0.380481 1.484198 0.105917 0.956716 +[Debug] dit_step14_xt: [2170, 64] first4: 0.327862 1.534012 -0.169129 0.512155 +[DiT] step 15/50 t=0.720 +[Debug] dit_step15_vt_cond: [2170, 64] first4: -0.257674 1.591244 0.046474 1.215229 +[Debug] dit_step15_vt_uncond: [2170, 64] first4: -0.215398 1.632289 0.020497 1.288409 +[Debug] dit_step15_vt: [2170, 64] first4: -0.335985 1.372340 0.098681 1.003970 +[Debug] dit_step15_xt: [2170, 64] first4: 0.334582 1.506565 -0.171102 0.492076 +[DiT] step 16/50 t=0.700 +[Debug] dit_step16_vt_cond: [2170, 64] first4: -0.250808 1.534114 0.052932 1.211927 +[Debug] dit_step16_vt_uncond: [2170, 64] first4: -0.205743 1.580690 0.005785 1.270423 +[Debug] dit_step16_vt: [2170, 64] first4: -0.382949 1.342146 0.222679 1.022127 +[Debug] dit_step16_xt: [2170, 64] first4: 0.342241 1.479722 -0.175556 0.471633 +[DiT] step 17/50 t=0.680 +[Debug] dit_step17_vt_cond: [2170, 64] first4: -0.246631 1.471787 0.045976 1.197702 +[Debug] dit_step17_vt_uncond: [2170, 64] first4: -0.202947 1.526995 -0.016021 1.248045 +[Debug] dit_step17_vt: [2170, 64] first4: -0.343461 1.218771 0.212458 1.019693 +[Debug] dit_step17_xt: [2170, 64] first4: 0.349110 1.455347 -0.179805 0.451239 +[DiT] step 18/50 t=0.660 +[Debug] dit_step18_vt_cond: [2170, 64] first4: -0.243157 1.409443 0.036330 1.184456 +[Debug] dit_step18_vt_uncond: [2170, 64] first4: -0.204529 1.477009 -0.037415 1.231383 +[Debug] dit_step18_vt: [2170, 64] first4: -0.354021 1.150632 0.298290 1.001465 +[Debug] dit_step18_xt: [2170, 64] first4: 0.356190 1.432334 -0.185771 0.431210 +[DiT] step 19/50 t=0.640 +[Debug] dit_step19_vt_cond: [2170, 64] first4: -0.237715 1.343371 0.017683 1.161345 +[Debug] dit_step19_vt_uncond: [2170, 64] first4: -0.204017 1.427205 -0.062926 1.210816 +[Debug] dit_step19_vt: [2170, 64] first4: -0.318427 0.994659 0.274882 0.959663 +[Debug] dit_step19_xt: [2170, 64] first4: 0.362559 1.412441 -0.191269 0.412017 +[DiT] step 20/50 t=0.620 +[Debug] dit_step20_vt_cond: [2170, 64] first4: -0.232230 1.263409 -0.001007 1.136674 +[Debug] dit_step20_vt_uncond: [2170, 64] first4: -0.200904 1.370160 -0.090913 1.187652 +[Debug] dit_step20_vt: [2170, 64] first4: -0.327181 0.859238 0.330215 0.939772 +[Debug] dit_step20_xt: [2170, 64] first4: 0.369102 1.395257 -0.197873 0.393221 +[DiT] step 21/50 t=0.600 +[Debug] dit_step21_vt_cond: [2170, 64] first4: -0.230263 1.180950 -0.026976 1.107196 +[Debug] dit_step21_vt_uncond: [2170, 64] first4: -0.199895 1.312066 -0.122788 1.160427 +[Debug] dit_step21_vt: [2170, 64] first4: -0.309986 0.682107 0.288212 0.884258 +[Debug] dit_step21_xt: [2170, 64] first4: 0.375302 1.381614 -0.203637 0.375536 +[DiT] step 22/50 t=0.580 +[Debug] dit_step22_vt_cond: [2170, 64] first4: -0.226975 1.087024 -0.049604 1.072087 +[Debug] dit_step22_vt_uncond: [2170, 64] first4: -0.197016 1.244136 -0.149074 1.129456 +[Debug] dit_step22_vt: [2170, 64] first4: -0.315666 0.502494 0.302365 0.836432 +[Debug] dit_step22_xt: [2170, 64] first4: 0.381615 1.371564 -0.209685 0.358807 +[DiT] step 23/50 t=0.560 +[Debug] dit_step23_vt_cond: [2170, 64] first4: -0.229945 0.987764 -0.068857 1.041486 +[Debug] dit_step23_vt_uncond: [2170, 64] first4: -0.196528 1.172394 -0.166018 1.101852 +[Debug] dit_step23_vt: [2170, 64] first4: -0.331807 0.301117 0.239278 0.795295 +[Debug] dit_step23_xt: [2170, 64] first4: 0.388252 1.365542 -0.214470 0.342901 +[DiT] step 24/50 t=0.540 +[Debug] dit_step24_vt_cond: [2170, 64] first4: -0.234566 0.878375 -0.086414 0.999502 +[Debug] dit_step24_vt_uncond: [2170, 64] first4: -0.196025 1.088575 -0.178235 1.066777 +[Debug] dit_step24_vt: [2170, 64] first4: -0.353102 0.129063 0.219470 0.719129 +[Debug] dit_step24_xt: [2170, 64] first4: 0.395314 1.362961 -0.218859 0.328519 +[DiT] step 25/50 t=0.520 +[Debug] dit_step25_vt_cond: [2170, 64] first4: -0.244918 0.762549 -0.099009 0.962606 +[Debug] dit_step25_vt_uncond: [2170, 64] first4: -0.200310 0.995220 -0.184174 1.033471 +[Debug] dit_step25_vt: [2170, 64] first4: -0.384523 -0.079339 0.165061 0.686562 +[Debug] dit_step25_xt: [2170, 64] first4: 0.403004 1.364548 -0.222161 0.314788 +[DiT] step 26/50 t=0.500 +[Debug] dit_step26_vt_cond: [2170, 64] first4: -0.256133 0.634451 -0.110018 0.919318 +[Debug] dit_step26_vt_uncond: [2170, 64] first4: -0.204624 0.887617 -0.187237 0.997615 +[Debug] dit_step26_vt: [2170, 64] first4: -0.416561 -0.257153 0.136664 0.594071 +[Debug] dit_step26_xt: [2170, 64] first4: 0.411335 1.369691 -0.224894 0.302906 +[DiT] step 27/50 t=0.480 +[Debug] dit_step27_vt_cond: [2170, 64] first4: -0.269545 0.510911 -0.116178 0.879919 +[Debug] dit_step27_vt_uncond: [2170, 64] first4: -0.210466 0.778243 -0.183774 0.961990 +[Debug] dit_step27_vt: [2170, 64] first4: -0.454081 -0.397413 0.085143 0.568775 +[Debug] dit_step27_xt: [2170, 64] first4: 0.420417 1.377639 -0.226597 0.291531 +[DiT] step 28/50 t=0.460 +[Debug] dit_step28_vt_cond: [2170, 64] first4: -0.282172 0.372450 -0.120048 0.831178 +[Debug] dit_step28_vt_uncond: [2170, 64] first4: -0.216011 0.655762 -0.179339 0.918627 +[Debug] dit_step28_vt: [2170, 64] first4: -0.483525 -0.575904 0.063843 0.478002 +[Debug] dit_step28_xt: [2170, 64] first4: 0.430087 1.389157 -0.227874 0.281971 +[DiT] step 29/50 t=0.440 +[Debug] dit_step29_vt_cond: [2170, 64] first4: -0.293168 0.231214 -0.124586 0.781733 +[Debug] dit_step29_vt_uncond: [2170, 64] first4: -0.219691 0.525349 -0.170166 0.873745 +[Debug] dit_step29_vt: [2170, 64] first4: -0.518206 -0.710899 -0.005456 0.435657 +[Debug] dit_step29_xt: [2170, 64] first4: 0.440451 1.403375 -0.227765 0.273257 +[DiT] step 30/50 t=0.420 +[Debug] dit_step30_vt_cond: [2170, 64] first4: -0.302506 0.086124 -0.124549 0.731757 +[Debug] dit_step30_vt_uncond: [2170, 64] first4: -0.221950 0.385721 -0.158065 0.830404 +[Debug] dit_step30_vt: [2170, 64] first4: -0.544895 -0.854978 -0.023964 0.343625 +[Debug] dit_step30_xt: [2170, 64] first4: 0.451349 1.420475 -0.227285 0.266385 +[DiT] step 31/50 t=0.400 +[Debug] dit_step31_vt_cond: [2170, 64] first4: -0.310194 -0.052540 -0.126955 0.680247 +[Debug] dit_step31_vt_uncond: [2170, 64] first4: -0.222645 0.246419 -0.151881 0.786437 +[Debug] dit_step31_vt: [2170, 64] first4: -0.574628 -0.958966 -0.058730 0.282347 +[Debug] dit_step31_xt: [2170, 64] first4: 0.462842 1.439654 -0.226111 0.260738 +[DiT] step 32/50 t=0.380 +[Debug] dit_step32_vt_cond: [2170, 64] first4: -0.314911 -0.194979 -0.126385 0.629081 +[Debug] dit_step32_vt_uncond: [2170, 64] first4: -0.220352 0.102109 -0.144540 0.743076 +[Debug] dit_step32_vt: [2170, 64] first4: -0.598710 -1.085776 -0.069698 0.196720 +[Debug] dit_step32_xt: [2170, 64] first4: 0.474816 1.461369 -0.224717 0.256804 +[DiT] step 33/50 t=0.360 +[Debug] dit_step33_vt_cond: [2170, 64] first4: -0.318386 -0.334863 -0.120002 0.581452 +[Debug] dit_step33_vt_uncond: [2170, 64] first4: -0.218793 -0.045891 -0.135709 0.701808 +[Debug] dit_step33_vt: [2170, 64] first4: -0.615695 -1.169131 -0.067985 0.140436 +[Debug] dit_step33_xt: [2170, 64] first4: 0.487130 1.484752 -0.223357 0.253995 +[DiT] step 34/50 t=0.340 +[Debug] dit_step34_vt_cond: [2170, 64] first4: -0.319302 -0.465754 -0.110709 0.538110 +[Debug] dit_step34_vt_uncond: [2170, 64] first4: -0.211723 -0.189661 -0.128844 0.664668 +[Debug] dit_step34_vt: [2170, 64] first4: -0.653452 -1.253376 -0.044227 0.068009 +[Debug] dit_step34_xt: [2170, 64] first4: 0.500199 1.509819 -0.222472 0.252635 +[DiT] step 35/50 t=0.320 +[Debug] dit_step35_vt_cond: [2170, 64] first4: -0.319083 -0.598486 -0.097674 0.492459 +[Debug] dit_step35_vt_uncond: [2170, 64] first4: -0.205768 -0.335506 -0.119003 0.623259 +[Debug] dit_step35_vt: [2170, 64] first4: -0.661011 -1.330961 -0.027972 0.020086 +[Debug] dit_step35_xt: [2170, 64] first4: 0.513419 1.536439 -0.221913 0.252233 +[DiT] step 36/50 t=0.300 +[Debug] dit_step36_vt_cond: [2170, 64] first4: -0.318070 -0.730204 -0.080886 0.449659 +[Debug] dit_step36_vt_uncond: [2170, 64] first4: -0.199361 -0.482989 -0.108305 0.581833 +[Debug] dit_step36_vt: [2170, 64] first4: -0.690251 -1.401439 0.013562 -0.032488 +[Debug] dit_step36_xt: [2170, 64] first4: 0.527224 1.564467 -0.222184 0.252883 +[DiT] step 37/50 t=0.280 +[Debug] dit_step37_vt_cond: [2170, 64] first4: -0.316356 -0.845106 -0.054751 0.408449 +[Debug] dit_step37_vt_uncond: [2170, 64] first4: -0.194137 -0.614114 -0.090670 0.540072 +[Debug] dit_step37_vt: [2170, 64] first4: -0.694876 -1.462489 0.059738 -0.070205 +[Debug] dit_step37_xt: [2170, 64] first4: 0.541122 1.593717 -0.223379 0.254287 +[DiT] step 38/50 t=0.260 +[Debug] dit_step38_vt_cond: [2170, 64] first4: -0.317038 -0.954626 -0.027545 0.366891 +[Debug] dit_step38_vt_uncond: [2170, 64] first4: -0.190420 -0.739252 -0.071602 0.495985 +[Debug] dit_step38_vt: [2170, 64] first4: -0.728120 -1.522043 0.107842 -0.107965 +[Debug] dit_step38_xt: [2170, 64] first4: 0.555684 1.624158 -0.225536 0.256446 +[DiT] step 39/50 t=0.240 +[Debug] dit_step39_vt_cond: [2170, 64] first4: -0.317547 -1.061431 0.009151 0.320700 +[Debug] dit_step39_vt_uncond: [2170, 64] first4: -0.189703 -0.862508 -0.047778 0.446086 +[Debug] dit_step39_vt: [2170, 64] first4: -0.718521 -1.567608 0.187925 -0.137347 +[Debug] dit_step39_xt: [2170, 64] first4: 0.570055 1.655510 -0.229294 0.259193 +[DiT] step 40/50 t=0.220 +[Debug] dit_step40_vt_cond: [2170, 64] first4: -0.321349 -1.147692 0.054264 0.273095 +[Debug] dit_step40_vt_uncond: [2170, 64] first4: -0.194555 -0.963666 -0.014199 0.392062 +[Debug] dit_step40_vt: [2170, 64] first4: -0.725549 -1.599213 0.254155 -0.152648 +[Debug] dit_step40_xt: [2170, 64] first4: 0.584566 1.687495 -0.234377 0.262246 +[DiT] step 41/50 t=0.200 +[Debug] dit_step41_vt_cond: [2170, 64] first4: -0.323505 -1.229047 0.104885 0.223181 +[Debug] dit_step41_vt_uncond: [2170, 64] first4: -0.200140 -1.061003 0.024296 0.333073 +[Debug] dit_step41_vt: [2170, 64] first4: -0.706612 -1.620246 0.347929 -0.161344 +[Debug] dit_step41_xt: [2170, 64] first4: 0.598698 1.719899 -0.241336 0.265473 +[DiT] step 42/50 t=0.180 +[Debug] dit_step42_vt_cond: [2170, 64] first4: -0.325457 -1.305037 0.162371 0.163510 +[Debug] dit_step42_vt_uncond: [2170, 64] first4: -0.210729 -1.152184 0.073317 0.265414 +[Debug] dit_step42_vt: [2170, 64] first4: -0.675503 -1.653079 0.417194 -0.191713 +[Debug] dit_step42_xt: [2170, 64] first4: 0.612208 1.752961 -0.249680 0.269307 +[DiT] step 43/50 t=0.160 +[Debug] dit_step43_vt_cond: [2170, 64] first4: -0.327040 -1.367895 0.222307 0.103005 +[Debug] dit_step43_vt_uncond: [2170, 64] first4: -0.223734 -1.229896 0.124878 0.195856 +[Debug] dit_step43_vt: [2170, 64] first4: -0.637198 -1.669221 0.519679 -0.207779 +[Debug] dit_step43_xt: [2170, 64] first4: 0.624952 1.786345 -0.260074 0.273463 +[DiT] step 44/50 t=0.140 +[Debug] dit_step44_vt_cond: [2170, 64] first4: -0.324606 -1.422529 0.282540 0.041568 +[Debug] dit_step44_vt_uncond: [2170, 64] first4: -0.235813 -1.298376 0.179092 0.128338 +[Debug] dit_step44_vt: [2170, 64] first4: -0.582920 -1.697035 0.593491 -0.255212 +[Debug] dit_step44_xt: [2170, 64] first4: 0.636610 1.820286 -0.271943 0.278567 +[DiT] step 45/50 t=0.120 +[Debug] dit_step45_vt_cond: [2170, 64] first4: -0.322754 -1.471227 0.337328 -0.019488 +[Debug] dit_step45_vt_uncond: [2170, 64] first4: -0.253130 -1.357642 0.232497 0.062397 +[Debug] dit_step45_vt: [2170, 64] first4: -0.511094 -1.728875 0.663531 -0.289321 +[Debug] dit_step45_xt: [2170, 64] first4: 0.646832 1.854864 -0.285214 0.284353 +[DiT] step 46/50 t=0.100 +[Debug] dit_step46_vt_cond: [2170, 64] first4: -0.317589 -1.518562 0.387160 -0.074592 +[Debug] dit_step46_vt_uncond: [2170, 64] first4: -0.269982 -1.418198 0.282772 0.001510 +[Debug] dit_step46_vt: [2170, 64] first4: -0.434923 -1.750584 0.706043 -0.325901 +[Debug] dit_step46_xt: [2170, 64] first4: 0.655531 1.889875 -0.299335 0.290871 +[DiT] step 47/50 t=0.080 +[Debug] dit_step47_vt_cond: [2170, 64] first4: -0.316530 -1.561702 0.429371 -0.113134 +[Debug] dit_step47_vt_uncond: [2170, 64] first4: -0.290275 -1.474672 0.330953 -0.045588 +[Debug] dit_step47_vt: [2170, 64] first4: -0.369515 -1.780826 0.721569 -0.327625 +[Debug] dit_step47_xt: [2170, 64] first4: 0.662921 1.925492 -0.313766 0.297424 +[DiT] step 48/50 t=0.060 +[Debug] dit_step48_vt_cond: [2170, 64] first4: -0.304095 -1.593375 0.469693 -0.135493 +[Debug] dit_step48_vt_uncond: [2170, 64] first4: -0.298372 -1.526686 0.379661 -0.093868 +[Debug] dit_step48_vt: [2170, 64] first4: -0.296147 -1.763528 0.744123 -0.229345 +[Debug] dit_step48_xt: [2170, 64] first4: 0.668844 1.960763 -0.328649 0.302011 +[DiT] step 49/50 t=0.040 +[Debug] dit_step49_vt_cond: [2170, 64] first4: -0.330730 -1.622756 0.480628 -0.154374 +[Debug] dit_step49_vt_uncond: [2170, 64] first4: -0.324303 -1.543248 0.396626 -0.105187 +[Debug] dit_step49_vt: [2170, 64] first4: -0.371182 -1.888395 0.739232 -0.345413 +[Debug] dit_x0: [2170, 64] first4: 0.676268 1.998530 -0.343433 0.308919 +[DiT] step 50/50 t=0.020 +[DiT] Total generation: 106456.5 ms (106456.5 ms/sample) +[Debug] dit_output: [2170, 64] first4: 0.676268 1.998530 -0.343433 0.308919 +[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) +[VAE] Graph: 474 nodes, T_latent=192 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_im2col_f16', name = 'kernel_im2col_f16' +ggml_metal_library_compile_pipeline: loaded kernel_im2col_f16 0x103f04c20 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=0' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=0 0x103f072d0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0 0x103f09950 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0 0x103f05240 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sin_f32_4', name = 'kernel_sin_f32_4' +ggml_metal_library_compile_pipeline: loaded kernel_sin_f32_4 0x103f09cd0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sqr_f32_4', name = 'kernel_sqr_f32_4' +ggml_metal_library_compile_pipeline: loaded kernel_sqr_f32_4 0x103f0a8c0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_conv_transpose_1d_f32_f32', name = 'kernel_conv_transpose_1d_f32_f32' +ggml_metal_library_compile_pipeline: loaded kernel_conv_transpose_1d_f32_f32 0x103f0ab20 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_f16_f16_4', name = 'kernel_mul_mv_f16_f16_4_nsg=4' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_f16_f16_4_nsg=4 0x103f0beb0 | th_max = 1024 | th_width = 32 +[VAE] Upsample factor: 1920.00 (expected ~1920) +[VAE] Graph: 474 nodes, T_latent=256 +[VAE] Graph: 474 nodes, T_latent=186 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=1 0x103f04080 | th_max = 896 | th_width = 32 +[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) +[VAE Batch0] Decode: 609604.9 ms +[Debug] vae_audio: [2, 4166400] first4: -0.002491 -0.002402 -0.002394 -0.002024 +[VAE Batch0] Wrote ggml-sft/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) +[Request 1/1] Done +ggml_metal_free: deallocating +ggml_metal_free: deallocating +[Pipeline] All done +[Request] Loaded request0.json +[Noise] Reusing existing rng_philox_seed42.bf16 +[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q5_K_M.gguf +[GGML] Running acestep-v15-turbo-Q5_K_M.gguf... +[GGML] Done, 47 dump files +[Turbo] Reusing existing Python dumps: python-turbo +[Turbo] Cosine similarities GGML vs Python + stage GGML vs Python + text_hidden 0.999813 + lyric_embed 1.000000 + enc_hidden 0.999083 + detok_output 0.999884 + context 0.999927 + noise 1.000000 + temb_t 0.999972 + hidden_after_proj_in 0.999966 + enc_after_cond_emb 0.999209 + layer0_sa_output 0.999459 + hidden_after_layer0 0.999838 + hidden_after_layer6 0.999790 + hidden_after_layer12 0.998662 + hidden_after_layer18 0.995505 + hidden_after_layer23 0.991560 + dit_step0_vt 0.968885 + dit_step0_xt 0.999932 + dit_step1_vt 0.972718 + dit_step1_xt 0.999793 + dit_step2_vt 0.970980 + dit_step2_xt 0.999392 + dit_step3_vt 0.974057 + dit_step3_xt 0.998550 + dit_step4_vt 0.972601 + dit_step4_xt 0.996666 + dit_step5_vt 0.967840 + dit_step5_xt 0.992262 + dit_step6_vt 0.963419 + dit_step6_xt 0.983648 + dit_step7_vt 0.954759 + dit_x0 0.970661 + vae_audio 0.881689 + vae_audio (log spectral) 0.999788 +[Turbo] Error growth GGML vs Python + stage cos max_err mean_err mean_A std_A mean_B std_B + dit_step0_xt 0.999930 0.140512 0.007718 -0.002317 0.973035 -0.002342 0.972003 + dit_step1_xt 0.999791 0.264415 0.013154 -0.005313 0.942911 -0.005313 0.941730 + dit_step2_xt 0.999391 0.457878 0.021002 -0.009296 0.909537 -0.009311 0.908527 + dit_step3_xt 0.998548 0.672565 0.031169 -0.014659 0.874300 -0.014577 0.873624 + dit_step4_xt 0.996664 0.977397 0.045289 -0.021867 0.842610 -0.021660 0.841995 + dit_step5_xt 0.992261 1.456099 0.067099 -0.032222 0.826249 -0.032109 0.824593 + dit_step6_xt 0.983647 2.128287 0.100579 -0.046802 0.857538 -0.046482 0.855546 +[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-Q5_K_M.gguf +[GGML] Running acestep-v15-sft-Q5_K_M.gguf... +[GGML] Done, 233 dump files +[SFT] Reusing existing Python dumps: python-sft +[SFT] Cosine similarities GGML vs Python + stage GGML vs Python + text_hidden 0.999813 + lyric_embed 1.000000 + enc_hidden 0.999083 + detok_output 0.999884 + context 0.999927 + noise 1.000000 + temb_t 0.999900 + hidden_after_proj_in 0.999966 + enc_after_cond_emb 0.999209 + layer0_sa_output 0.999536 + hidden_after_layer0 0.999891 + hidden_after_layer6 0.999626 + hidden_after_layer12 0.998995 + hidden_after_layer18 0.998026 + hidden_after_layer23 0.998535 + null_condition_emb 1.000000 + null_enc_hidden 1.000000 + dit_step0_vt_cond 0.998436 + dit_step0_vt_uncond 0.998344 + dit_step0_vt 0.994668 + dit_step0_xt 0.999997 + dit_step5_vt_cond 0.998676 + dit_step5_vt 0.989830 + dit_step5_xt 0.999935 + dit_step10_vt_cond 0.996806 + dit_step10_vt 0.987585 + dit_step10_xt 0.999744 + dit_step15_vt_cond 0.992244 + dit_step15_vt 0.973038 + dit_step15_xt 0.999108 + dit_step20_vt_cond 0.984474 + dit_step20_vt 0.958153 + dit_step20_xt 0.997397 + dit_step25_vt_cond 0.974096 + dit_step25_vt 0.945640 + dit_step25_xt 0.994154 + dit_step30_vt_cond 0.962790 + dit_step30_vt 0.934107 + dit_step30_xt 0.989253 + dit_step35_vt_cond 0.951958 + dit_step35_vt 0.920426 + dit_step35_xt 0.983572 + dit_step40_vt_cond 0.945880 + dit_step40_vt 0.910054 + dit_step40_xt 0.978292 + dit_step45_vt_cond 0.952542 + dit_step45_vt 0.924831 + dit_step45_xt 0.974685 + dit_step49_vt_cond 0.963084 + dit_step49_vt 0.916267 + dit_x0 0.973449 + vae_audio 0.878623 + vae_audio (log spectral) 0.999566 +[SFT] Error growth GGML vs Python + stage cos max_err mean_err mean_A std_A mean_B std_B + dit_step0_xt 0.999996 0.038422 0.002252 -0.001777 0.980099 -0.001741 0.980402 + dit_step5_xt 0.999933 0.110159 0.007862 -0.006926 0.889483 -0.007143 0.887999 + dit_step10_xt 0.999743 0.216004 0.013603 -0.012446 0.811152 -0.012603 0.811299 + dit_step15_xt 0.999108 0.433603 0.022505 -0.017845 0.746187 -0.018114 0.745269 + dit_step20_xt 0.997397 0.645668 0.035020 -0.023481 0.700583 -0.023808 0.699582 + dit_step25_xt 0.994154 0.894286 0.050478 -0.029250 0.679073 -0.029311 0.679278 + dit_step30_xt 0.989253 1.155794 0.069043 -0.035037 0.684973 -0.035027 0.685262 + dit_step35_xt 0.983572 1.518599 0.089822 -0.040808 0.717172 -0.040716 0.717196 + dit_step40_xt 0.978291 1.917882 0.111854 -0.046581 0.771460 -0.046462 0.771853 + dit_step45_xt 0.974684 2.279520 0.132709 -0.052804 0.843506 -0.052475 0.843036 diff --git a/tests/Metal-Q6_K.log b/tests/Metal-Q6_K.log new file mode 100644 index 0000000..19a2f5c --- /dev/null +++ b/tests/Metal-Q6_K.log @@ -0,0 +1,819 @@ +ggml_metal_device_init: tensor API disabled for pre-M5 and pre-A19 devices +ggml_metal_library_init: using embedded metal library +ggml_metal_library_init: loaded in 0.006 sec +ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s) +ggml_metal_device_init: GPU name: MTL0 +ggml_metal_device_init: GPU family: MTLGPUFamilyApple8 (1008) +ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003) +ggml_metal_device_init: GPU family: MTLGPUFamilyMetal3 (5001) +ggml_metal_device_init: simdgroup reduction = true +ggml_metal_device_init: simdgroup matrix mul. = true +ggml_metal_device_init: has unified memory = true +ggml_metal_device_init: has bfloat = true +ggml_metal_device_init: has tensor = false +ggml_metal_device_init: use residency sets = true +ggml_metal_device_init: use shared buffers = true +ggml_metal_device_init: recommendedMaxWorkingSetSize = 11453.25 MB +ggml_metal_init: allocating +ggml_metal_init: found device: Apple M2 Pro +ggml_metal_init: picking default device: Apple M2 Pro +ggml_metal_init: use fusion = true +ggml_metal_init: use concurrency = true +ggml_metal_init: use graph optimize = true +[Load] DiT backend: MTL0 (CPU threads: 5) +[Load] Backend init: 18.8 ms +[GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 +[DiT] MLP: gate+up fused +[Load] null_condition_emb found (CFG available) +[WeightCtx] Loaded 478 tensors, 1237.2 MB into backend +[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 +[Load] DiT weight load: 1759.5 ms +[GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 +[Load] silence_latent: [15000, 64] from GGUF +[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 +ggml_metal_init: allocating +ggml_metal_init: found device: Apple M2 Pro +ggml_metal_init: picking default device: Apple M2 Pro +ggml_metal_init: use fusion = true +ggml_metal_init: use concurrency = true +ggml_metal_init: use graph optimize = true +[Load] VAE backend: MTL0 (CPU threads: 5) +[VAE] Backend: MTL0, Weight buffer: 255.7 MB +[VAE] Loaded: 5 blocks, upsample=1920x +[Load] VAE weights: 335.9 ms +[Request 1/1] ggml-turbo/request0.json (batch=1) +[Request] parsed ggml-turbo/request0.json (18 fields) +[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) +[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s +[Pipeline] 434 audio codes (86.8s @ 5Hz) +[Pipeline] T=2170, S=1085 +[BPE] Loaded from GGUF: 151643 vocab, 151387 merges +[Load] BPE tokenizer: 42.1 ms +[Pipeline] caption: 70 tokens, lyrics: 167 tokens +ggml_metal_init: allocating +ggml_metal_init: found device: Apple M2 Pro +ggml_metal_init: picking default device: Apple M2 Pro +ggml_metal_init: use fusion = true +ggml_metal_init: use concurrency = true +ggml_metal_init: use graph optimize = true +[Load] TextEncoder backend: MTL0 (CPU threads: 5) +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend +[Load] TextEncoder: 28L, H=1024, Nh=16/8 +[Load] TextEncoder: 294.2 ms +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_get_rows_bf16', name = 'kernel_get_rows_bf16' +ggml_metal_library_compile_pipeline: loaded kernel_get_rows_bf16 0x13a80b9e0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rms_norm_mul_f32_4', name = 'kernel_rms_norm_mul_f32_4' +ggml_metal_library_compile_pipeline: loaded kernel_rms_norm_mul_f32_4 0x13a80be40 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_bf16_f32', name = 'kernel_mul_mm_bf16_f32_bci=0_bco=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_bf16_f32_bci=0_bco=1 0x13a80c6b0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rope_neox_f32', name = 'kernel_rope_neox_f32_imrope=0' +ggml_metal_library_compile_pipeline: loaded kernel_rope_neox_f32_imrope=0 0x13a80cb30 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=1_ncpsg=64' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=1_ncpsg=64 0x13a80d3a0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_blk', name = 'kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64 0x13a80d9d0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4 0x13a80e2b0 | th_max = 576 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0 0x13a80e6c0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_swiglu_f32', name = 'kernel_swiglu_f32' +ggml_metal_library_compile_pipeline: loaded kernel_swiglu_f32 0x13a80e920 | th_max = 1024 | th_width = 32 +[Encode] TextEncoder (70 tokens): 46.1 ms +[Debug] text_hidden: [70, 1024] first4: 3.703506 2.446818 0.222721 -13.133463 +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[Encode] Lyric vocab lookup (167 tokens): 33.8 ms +[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 +ggml_metal_init: allocating +ggml_metal_init: found device: Apple M2 Pro +ggml_metal_init: picking default device: Apple M2 Pro +ggml_metal_init: use fusion = true +ggml_metal_init: use concurrency = true +ggml_metal_init: use graph optimize = true +[Load] CondEncoder backend: MTL0 (CPU threads: 5) +[GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 +[WeightCtx] Loaded 140 tensors, 476.3 MB into backend +[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond +[Load] ConditionEncoder: 652.0 ms +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q6_K_f32', name = 'kernel_mul_mm_q6_K_f32_bci=0_bco=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q6_K_f32_bci=0_bco=1 0x13a818c40 | th_max = 896 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=64' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=64 0x13a819080 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4 0x13a8197f0 | th_max = 640 | th_width = 32 +[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens +[Encode] ConditionEncoder: 146.8 ms, enc_S=238 +ggml_metal_free: deallocating +ggml_metal_free: deallocating +[Debug] enc_hidden: [238, 2048] first4: 1.752129 -0.050073 -0.134015 0.059631 +[GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 +[WeightCtx] Loaded 30 tensors, 82.2 MB into backend +[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) +[Load] Detokenizer: 125.6 ms +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_bf16_f32_short', name = 'kernel_mul_mv_bf16_f32_short_nsg=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_bf16_f32_short_nsg=1 0x13a817f60 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1 0x13a818470 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q6_K_f32', name = 'kernel_mul_mv_q6_K_f32_nsg=2' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q6_K_f32_nsg=2 0x13a808aa0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_repeat_f32', name = 'kernel_repeat_f32' +ggml_metal_library_compile_pipeline: loaded kernel_repeat_f32 0x13a808d00 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q6_K_f32_r1_5', name = 'kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8 0x13a81a190 | th_max = 640 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=32' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=32 0x13a81a3f0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_f32_dk128_dv128', name = 'kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32 0x13a81adc0 | th_max = 448 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_reduce', name = 'kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32 0x13a81b250 | th_max = 1024 | th_width = 32 +[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) +[Context] Detokenizer: 1009.6 ms +[Debug] detok_output: [2170, 64] first4: -0.141063 1.454431 0.315142 -0.623566 +[Context] loaded noise from rng_philox_seed42.bf16: [2170, 64] bf16 +[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 +[Debug] context: [2170, 128] first4: -0.141063 1.454431 0.315142 -0.623566 +[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1 +[DiT] Batch N=1, T=2170, S=1085, enc_S=238 +[DiT] Graph: 1841 nodes +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f32_f32', name = 'kernel_mul_mm_f32_f32_bci=0_bco=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f32_f32_bci=0_bco=1 0x13970a020 | th_max = 832 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_scale_f32', name = 'kernel_scale_f32' +ggml_metal_library_compile_pipeline: loaded kernel_scale_f32 0x13970a280 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_timestep_embedding_f32', name = 'kernel_timestep_embedding_f32' +ggml_metal_library_compile_pipeline: loaded kernel_timestep_embedding_f32 0x13970a4e0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_silu_f32_4', name = 'kernel_silu_f32_4' +ggml_metal_library_compile_pipeline: loaded kernel_silu_f32_4 0x13970b610 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1 0x13970be80 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1 0x13970c0e0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0 0x13970c7b0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_cpy_f32_f32', name = 'kernel_cpy_f32_f32' +ggml_metal_library_compile_pipeline: loaded kernel_cpy_f32_f32 0x13970cc30 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4 0x13970d300 | th_max = 640 | th_width = 32 +[Debug] tproj: [12288] first4: 0.261089 -0.161223 -0.098728 0.051901 +[Debug] temb: [2048] first4: 0.000236 -0.132397 -0.035348 0.064653 +[Debug] temb_t: [2048] first4: 0.001398 0.026957 -0.052741 0.063660 +[Debug] temb_r: [2048] first4: -0.001162 -0.159353 0.017394 0.000993 +[Debug] sinusoidal_t: [256] first4: 0.562407 0.789701 0.439822 -0.023583 +[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000 +[Debug] temb_lin1_t: [2048] first4: -0.049071 -0.051112 -0.017769 -0.037193 +[Debug] temb_lin1_r: [2048] first4: -0.014408 -0.020609 -0.015729 0.003875 +[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.037692 -0.956719 0.540867 0.451860 +[Debug] proj_in_input: [192, 2170] first4: -0.141063 1.454431 0.315142 -0.623566 +[Debug] enc_after_cond_emb: [2048, 238] first4: -0.167564 0.852700 0.309671 -0.538299 +[Debug] layer0_sa_input: [2048, 1085] first4: -0.716202 -0.756050 -0.048455 0.263529 +[Debug] layer0_q_after_rope: [128, 16] first4: -0.167564 0.852700 0.309671 -0.538299 +[Debug] layer0_k_after_rope: [128, 8] first4: -1.214772 -0.856039 -1.908578 -2.256124 +[Debug] layer0_sa_output: [2048, 1085] first4: -1.502833 0.209946 -0.367812 0.520536 +[Debug] layer0_attn_out: [2048, 1085] first4: -1.134820 -0.084089 -34.867664 -0.724257 +[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.540231 -1.049932 0.181504 0.461969 +[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.609974 -0.819551 -0.333653 0.497179 +[Debug] hidden_after_layer0: [2048, 1085] first4: -9.196066 0.534182 52.334564 -0.880322 +[Debug] hidden_after_layer6: [2048, 1085] first4: -21.325979 -0.207006 34.129318 -4.337931 +[Debug] hidden_after_layer12: [2048, 1085] first4: -15.411194 -16.311844 76.549057 29.816362 +[Debug] hidden_after_layer18: [2048, 1085] first4: -28.025963 13.209218 65.994347 20.087559 +[Debug] hidden_after_layer23: [2048, 1085] first4: -19.575611 48.863052 201.092041 136.881271 +[Debug] dit_step0_vt: [2170, 64] first4: 0.099154 1.132388 0.349667 2.375307 +[Debug] dit_step0_xt: [2170, 64] first4: 0.189829 2.104778 -0.187769 0.739688 +[DiT] step 1/8 t=1.000 +[Debug] dit_step1_vt: [2170, 64] first4: -0.140258 1.329038 -0.173978 1.924904 +[Debug] dit_step1_xt: [2170, 64] first4: 0.197479 2.032285 -0.178279 0.634693 +[DiT] step 2/8 t=0.955 +[Debug] dit_step2_vt: [2170, 64] first4: 0.064148 1.236530 0.118618 2.406788 +[Debug] dit_step2_xt: [2170, 64] first4: 0.193203 1.949849 -0.186187 0.474240 +[DiT] step 3/8 t=0.900 +[Debug] dit_step3_vt: [2170, 64] first4: 0.305678 1.101620 0.246811 2.656265 +[Debug] dit_step3_xt: [2170, 64] first4: 0.167730 1.858048 -0.206755 0.252885 +[DiT] step 4/8 t=0.833 +[Debug] dit_step4_vt: [2170, 64] first4: 0.333444 1.032630 0.077940 2.735898 +[Debug] dit_step4_xt: [2170, 64] first4: 0.132004 1.747409 -0.215105 -0.040247 +[DiT] step 5/8 t=0.750 +[Debug] dit_step5_vt: [2170, 64] first4: 0.310135 0.905818 -0.324717 2.786166 +[Debug] dit_step5_xt: [2170, 64] first4: 0.087699 1.618006 -0.168717 -0.438271 +[DiT] step 6/8 t=0.643 +[Debug] dit_step6_vt: [2170, 64] first4: 0.184311 0.624224 -0.863634 2.781863 +[Debug] dit_step6_xt: [2170, 64] first4: 0.050836 1.493161 0.004009 -0.994643 +[DiT] step 7/8 t=0.500 +[Debug] dit_step7_vt: [2170, 64] first4: 0.049488 0.168290 -1.435298 3.015385 +[Debug] dit_x0: [2170, 64] first4: 0.035990 1.442675 0.434599 -1.899259 +[DiT] step 8/8 t=0.300 +[DiT] Total generation: 8015.7 ms (8015.7 ms/sample) +[Debug] dit_output: [2170, 64] first4: 0.035990 1.442675 0.434599 -1.899259 +[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) +[VAE] Graph: 474 nodes, T_latent=192 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_im2col_f16', name = 'kernel_im2col_f16' +ggml_metal_library_compile_pipeline: loaded kernel_im2col_f16 0x13a81b7d0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=0' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=0 0x13a81c1a0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0 0x13a81c400 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0 0x13a81ca90 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sin_f32_4', name = 'kernel_sin_f32_4' +ggml_metal_library_compile_pipeline: loaded kernel_sin_f32_4 0x13a81ccf0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sqr_f32_4', name = 'kernel_sqr_f32_4' +ggml_metal_library_compile_pipeline: loaded kernel_sqr_f32_4 0x13a81d4d0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_conv_transpose_1d_f32_f32', name = 'kernel_conv_transpose_1d_f32_f32' +ggml_metal_library_compile_pipeline: loaded kernel_conv_transpose_1d_f32_f32 0x13a81da30 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_f16_f16_4', name = 'kernel_mul_mv_f16_f16_4_nsg=4' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_f16_f16_4_nsg=4 0x13a81e910 | th_max = 1024 | th_width = 32 +[VAE] Upsample factor: 1920.00 (expected ~1920) +[VAE] Graph: 474 nodes, T_latent=256 +[VAE] Graph: 474 nodes, T_latent=186 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=1 0x13a81eb70 | th_max = 896 | th_width = 32 +[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) +[VAE Batch0] Decode: 609630.7 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000453 0.000980 0.000826 0.001209 +[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) +[Request 1/1] Done +ggml_metal_free: deallocating +ggml_metal_free: deallocating +[Pipeline] All done +ggml_metal_device_init: tensor API disabled for pre-M5 and pre-A19 devices +ggml_metal_library_init: using embedded metal library +ggml_metal_library_init: loaded in 0.006 sec +ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s) +ggml_metal_device_init: GPU name: MTL0 +ggml_metal_device_init: GPU family: MTLGPUFamilyApple8 (1008) +ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003) +ggml_metal_device_init: GPU family: MTLGPUFamilyMetal3 (5001) +ggml_metal_device_init: simdgroup reduction = true +ggml_metal_device_init: simdgroup matrix mul. = true +ggml_metal_device_init: has unified memory = true +ggml_metal_device_init: has bfloat = true +ggml_metal_device_init: has tensor = false +ggml_metal_device_init: use residency sets = true +ggml_metal_device_init: use shared buffers = true +ggml_metal_device_init: recommendedMaxWorkingSetSize = 11453.25 MB +ggml_metal_init: allocating +ggml_metal_init: found device: Apple M2 Pro +ggml_metal_init: picking default device: Apple M2 Pro +ggml_metal_init: use fusion = true +ggml_metal_init: use concurrency = true +ggml_metal_init: use graph optimize = true +[Load] DiT backend: MTL0 (CPU threads: 5) +[Load] Backend init: 20.5 ms +[GGUF] ../models/acestep-v15-sft-Q6_K.gguf: 678 tensors, data at offset 56800 +[DiT] MLP: gate+up fused +[Load] null_condition_emb found (CFG available) +[WeightCtx] Loaded 478 tensors, 1237.2 MB into backend +[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 +[Load] DiT weight load: 1781.2 ms +[GGUF] ../models/acestep-v15-sft-Q6_K.gguf: 678 tensors, data at offset 56800 +[Load] silence_latent: [15000, 64] from GGUF +[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 +ggml_metal_init: allocating +ggml_metal_init: found device: Apple M2 Pro +ggml_metal_init: picking default device: Apple M2 Pro +ggml_metal_init: use fusion = true +ggml_metal_init: use concurrency = true +ggml_metal_init: use graph optimize = true +[Load] VAE backend: MTL0 (CPU threads: 5) +[VAE] Backend: MTL0, Weight buffer: 255.7 MB +[VAE] Loaded: 5 blocks, upsample=1920x +[Load] VAE weights: 275.0 ms +[Request 1/1] ggml-sft/request0.json (batch=1) +[Request] parsed ggml-sft/request0.json (18 fields) +[Pipeline] seed=42, steps=50, guidance=7.0, shift=1.0, duration=88.0s +[Pipeline] 434 audio codes (86.8s @ 5Hz) +[Pipeline] T=2170, S=1085 +[BPE] Loaded from GGUF: 151643 vocab, 151387 merges +[Load] BPE tokenizer: 42.4 ms +[Pipeline] caption: 70 tokens, lyrics: 167 tokens +ggml_metal_init: allocating +ggml_metal_init: found device: Apple M2 Pro +ggml_metal_init: picking default device: Apple M2 Pro +ggml_metal_init: use fusion = true +ggml_metal_init: use concurrency = true +ggml_metal_init: use graph optimize = true +[Load] TextEncoder backend: MTL0 (CPU threads: 5) +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend +[Load] TextEncoder: 28L, H=1024, Nh=16/8 +[Load] TextEncoder: 361.7 ms +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_get_rows_bf16', name = 'kernel_get_rows_bf16' +ggml_metal_library_compile_pipeline: loaded kernel_get_rows_bf16 0x106006bc0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rms_norm_mul_f32_4', name = 'kernel_rms_norm_mul_f32_4' +ggml_metal_library_compile_pipeline: loaded kernel_rms_norm_mul_f32_4 0x106007320 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_bf16_f32', name = 'kernel_mul_mm_bf16_f32_bci=0_bco=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_bf16_f32_bci=0_bco=1 0x106008160 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rope_neox_f32', name = 'kernel_rope_neox_f32_imrope=0' +ggml_metal_library_compile_pipeline: loaded kernel_rope_neox_f32_imrope=0 0x1060083c0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=1_ncpsg=64' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=1_ncpsg=64 0x106008e40 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_blk', name = 'kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64 0x1060093a0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4 0x106009600 | th_max = 576 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0 0x10600a090 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_swiglu_f32', name = 'kernel_swiglu_f32' +ggml_metal_library_compile_pipeline: loaded kernel_swiglu_f32 0x10600a2f0 | th_max = 1024 | th_width = 32 +[Encode] TextEncoder (70 tokens): 45.5 ms +[Debug] text_hidden: [70, 1024] first4: 3.703506 2.446818 0.222721 -13.133463 +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[Encode] Lyric vocab lookup (167 tokens): 35.7 ms +[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 +ggml_metal_init: allocating +ggml_metal_init: found device: Apple M2 Pro +ggml_metal_init: picking default device: Apple M2 Pro +ggml_metal_init: use fusion = true +ggml_metal_init: use concurrency = true +ggml_metal_init: use graph optimize = true +[Load] CondEncoder backend: MTL0 (CPU threads: 5) +[GGUF] ../models/acestep-v15-sft-Q6_K.gguf: 678 tensors, data at offset 56800 +[WeightCtx] Loaded 140 tensors, 476.3 MB into backend +[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond +[Load] ConditionEncoder: 850.5 ms +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q6_K_f32', name = 'kernel_mul_mm_q6_K_f32_bci=0_bco=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q6_K_f32_bci=0_bco=1 0x120606720 | th_max = 896 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=64' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=64 0x120606ba0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4 0x1206071d0 | th_max = 640 | th_width = 32 +[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens +[Encode] ConditionEncoder: 151.8 ms, enc_S=238 +ggml_metal_free: deallocating +ggml_metal_free: deallocating +[Debug] enc_hidden: [238, 2048] first4: 1.752129 -0.050073 -0.134015 0.059631 +[GGUF] ../models/acestep-v15-sft-Q6_K.gguf: 678 tensors, data at offset 56800 +[WeightCtx] Loaded 30 tensors, 82.2 MB into backend +[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) +[Load] Detokenizer: 134.7 ms +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_bf16_f32_short', name = 'kernel_mul_mv_bf16_f32_short_nsg=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_bf16_f32_short_nsg=1 0x106012a70 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1 0x106013260 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q6_K_f32', name = 'kernel_mul_mv_q6_K_f32_nsg=2' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q6_K_f32_nsg=2 0x106013b60 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_repeat_f32', name = 'kernel_repeat_f32' +ggml_metal_library_compile_pipeline: loaded kernel_repeat_f32 0x1060134c0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q6_K_f32_r1_5', name = 'kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8 0x1206079b0 | th_max = 640 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=32' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=32 0x120608040 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_f32_dk128_dv128', name = 'kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32 0x1206082a0 | th_max = 448 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_reduce', name = 'kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32 0x120608730 | th_max = 1024 | th_width = 32 +[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) +[Context] Detokenizer: 1004.2 ms +[Debug] detok_output: [2170, 64] first4: -0.141063 1.454431 0.315142 -0.623566 +[Context] loaded noise from rng_philox_seed42.bf16: [2170, 64] bf16 +[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 +[Debug] context: [2170, 128] first4: -0.141063 1.454431 0.315142 -0.623566 +[DiT] Starting: T=2170, S=1085, enc_S=238, steps=50, batch=1 +[DiT] Batch N=1, T=2170, S=1085, enc_S=238 +[DiT] Graph: 1841 nodes +[Debug] null_condition_emb: [2048] first4: 0.018066 -0.000360 0.005096 -0.000683 +[Debug] null_enc_hidden: [238, 2048] first4: 0.018066 -0.000360 0.005096 -0.000683 +[DiT] CFG enabled: guidance_scale=7.0, 2x forward per step, N=1 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f32_f32', name = 'kernel_mul_mm_f32_f32_bci=0_bco=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f32_f32_bci=0_bco=1 0x120608bb0 | th_max = 832 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_scale_f32', name = 'kernel_scale_f32' +ggml_metal_library_compile_pipeline: loaded kernel_scale_f32 0x120608e10 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_timestep_embedding_f32', name = 'kernel_timestep_embedding_f32' +ggml_metal_library_compile_pipeline: loaded kernel_timestep_embedding_f32 0x120609070 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_silu_f32_4', name = 'kernel_silu_f32_4' +ggml_metal_library_compile_pipeline: loaded kernel_silu_f32_4 0x120609f80 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1 0x12060a7f0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1 0x12060aa50 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0 0x12060b0d0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_cpy_f32_f32', name = 'kernel_cpy_f32_f32' +ggml_metal_library_compile_pipeline: loaded kernel_cpy_f32_f32 0x12060b550 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4 0x12060bc10 | th_max = 640 | th_width = 32 +[Debug] tproj: [12288] first4: 0.153861 -0.117528 -0.090110 0.080834 +[Debug] temb: [2048] first4: -0.002466 -0.176370 0.004369 -0.002069 +[Debug] temb_t: [2048] first4: -0.000999 0.003474 -0.013219 -0.002130 +[Debug] temb_r: [2048] first4: -0.001467 -0.179844 0.017589 0.000062 +[Debug] sinusoidal_t: [256] first4: 0.562407 0.789701 0.439822 -0.023583 +[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000 +[Debug] temb_lin1_t: [2048] first4: -0.041140 0.030274 0.027836 -0.025460 +[Debug] temb_lin1_r: [2048] first4: 0.004272 0.006720 0.000208 -0.005103 +[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.088484 -0.913424 0.502796 0.445566 +[Debug] proj_in_input: [192, 2170] first4: -0.141063 1.454431 0.315142 -0.623566 +[Debug] enc_after_cond_emb: [2048, 238] first4: -0.194042 0.920094 0.309464 -0.544236 +[Debug] layer0_sa_input: [2048, 1085] first4: -0.914448 -0.710483 -0.040214 0.295227 +[Debug] layer0_q_after_rope: [128, 16] first4: -0.194042 0.920094 0.309464 -0.544236 +[Debug] layer0_k_after_rope: [128, 8] first4: -1.341203 -0.993715 -1.828661 -2.252987 +[Debug] layer0_sa_output: [2048, 1085] first4: -1.651907 0.800758 -0.600550 0.531539 +[Debug] layer0_attn_out: [2048, 1085] first4: -1.094031 -0.025671 -33.031021 -0.629337 +[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.804741 -1.365866 -0.176846 0.385942 +[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.975780 -1.029709 -0.454110 0.391604 +[Debug] hidden_after_layer0: [2048, 1085] first4: -9.417660 1.271689 57.716125 -1.718801 +[Debug] hidden_after_layer6: [2048, 1085] first4: -17.205166 2.402088 59.038250 -1.336451 +[Debug] hidden_after_layer12: [2048, 1085] first4: -10.471869 6.708532 -25.396618 -2.966099 +[Debug] hidden_after_layer18: [2048, 1085] first4: -4.594971 20.646416 -42.849018 -14.024486 +[Debug] hidden_after_layer23: [2048, 1085] first4: 34.838955 64.575096 51.865501 -11.288853 +[Debug] dit_step0_vt_cond: [2170, 64] first4: -0.621944 2.533786 -0.220703 1.441472 +[Debug] dit_step0_vt_uncond: [2170, 64] first4: -0.335422 2.090178 -0.712111 1.749312 +[Debug] dit_step0_vt: [2170, 64] first4: -0.817132 3.005553 0.060651 1.037697 +[Debug] dit_step0_xt: [2170, 64] first4: 0.210679 2.096139 -0.173088 0.826902 +[DiT] step 1/50 t=1.000 +[Debug] dit_step1_vt_cond: [2170, 64] first4: -0.663130 2.509250 -0.047312 1.284879 +[Debug] dit_step1_vt_uncond: [2170, 64] first4: -0.535073 2.303810 -0.132373 1.511020 +[Debug] dit_step1_vt: [2170, 64] first4: -0.505551 2.169540 -0.259578 1.319904 +[Debug] dit_step1_xt: [2170, 64] first4: 0.220790 2.052748 -0.167896 0.800504 +[DiT] step 2/50 t=0.980 +[Debug] dit_step2_vt_cond: [2170, 64] first4: -0.658958 2.478008 -0.004274 1.297306 +[Debug] dit_step2_vt_uncond: [2170, 64] first4: -0.555012 2.295219 -0.109707 1.445456 +[Debug] dit_step2_vt: [2170, 64] first4: -0.800255 2.877401 0.236257 0.991010 +[Debug] dit_step2_xt: [2170, 64] first4: 0.236795 1.995200 -0.172622 0.780684 +[DiT] step 3/50 t=0.960 +[Debug] dit_step3_vt_cond: [2170, 64] first4: -0.623685 2.405478 0.016157 1.310913 +[Debug] dit_step3_vt_uncond: [2170, 64] first4: -0.584147 2.331472 -0.052434 1.362097 +[Debug] dit_step3_vt: [2170, 64] first4: -0.460909 2.085223 -0.179917 1.477093 +[Debug] dit_step3_xt: [2170, 64] first4: 0.246013 1.953496 -0.169023 0.751142 +[DiT] step 4/50 t=0.940 +[Debug] dit_step4_vt_cond: [2170, 64] first4: -0.573136 2.336644 0.011644 1.310671 +[Debug] dit_step4_vt_uncond: [2170, 64] first4: -0.579254 2.305885 -0.063045 1.333517 +[Debug] dit_step4_vt: [2170, 64] first4: -0.615409 2.553339 0.248993 1.073298 +[Debug] dit_step4_xt: [2170, 64] first4: 0.258321 1.902429 -0.174003 0.729676 +[DiT] step 5/50 t=0.920 +[Debug] dit_step5_vt_cond: [2170, 64] first4: -0.517031 2.271256 -0.017464 1.306595 +[Debug] dit_step5_vt_uncond: [2170, 64] first4: -0.593495 2.274611 -0.028605 1.340369 +[Debug] dit_step5_vt: [2170, 64] first4: -0.226837 1.944980 -0.246283 1.356041 +[Debug] dit_step5_xt: [2170, 64] first4: 0.262858 1.863529 -0.169077 0.702555 +[DiT] step 6/50 t=0.900 +[Debug] dit_step6_vt_cond: [2170, 64] first4: -0.492334 2.237296 -0.030848 1.308453 +[Debug] dit_step6_vt_uncond: [2170, 64] first4: -0.559225 2.211779 -0.028865 1.356664 +[Debug] dit_step6_vt: [2170, 64] first4: -0.506265 2.451765 0.152689 1.063906 +[Debug] dit_step6_xt: [2170, 64] first4: 0.272983 1.814494 -0.172131 0.681277 +[DiT] step 7/50 t=0.880 +[Debug] dit_step7_vt_cond: [2170, 64] first4: -0.448929 2.166704 -0.051454 1.296641 +[Debug] dit_step7_vt_uncond: [2170, 64] first4: -0.516741 2.132911 -0.036027 1.347799 +[Debug] dit_step7_vt: [2170, 64] first4: -0.201648 1.970928 -0.247107 1.267663 +[Debug] dit_step7_xt: [2170, 64] first4: 0.277016 1.775075 -0.167189 0.655924 +[DiT] step 8/50 t=0.860 +[Debug] dit_step8_vt_cond: [2170, 64] first4: -0.412205 2.087660 -0.059856 1.279323 +[Debug] dit_step8_vt_uncond: [2170, 64] first4: -0.450006 2.049930 -0.047531 1.315284 +[Debug] dit_step8_vt: [2170, 64] first4: -0.455255 2.220170 0.079066 1.097043 +[Debug] dit_step8_xt: [2170, 64] first4: 0.286121 1.730672 -0.168770 0.633983 +[DiT] step 9/50 t=0.840 +[Debug] dit_step9_vt_cond: [2170, 64] first4: -0.378611 2.005139 -0.068357 1.261302 +[Debug] dit_step9_vt_uncond: [2170, 64] first4: -0.379138 1.976703 -0.055227 1.283708 +[Debug] dit_step9_vt: [2170, 64] first4: -0.302897 1.841593 -0.214058 1.239796 +[Debug] dit_step9_xt: [2170, 64] first4: 0.292179 1.693840 -0.164489 0.609187 +[DiT] step 10/50 t=0.820 +[Debug] dit_step10_vt_cond: [2170, 64] first4: -0.355196 1.928296 -0.064251 1.252096 +[Debug] dit_step10_vt_uncond: [2170, 64] first4: -0.318644 1.912086 -0.042776 1.279346 +[Debug] dit_step10_vt: [2170, 64] first4: -0.498521 1.941743 -0.002070 1.076981 +[Debug] dit_step10_xt: [2170, 64] first4: 0.302150 1.655005 -0.164448 0.587648 +[DiT] step 11/50 t=0.800 +[Debug] dit_step11_vt_cond: [2170, 64] first4: -0.335946 1.853130 -0.070414 1.240474 +[Debug] dit_step11_vt_uncond: [2170, 64] first4: -0.288372 1.862547 -0.022430 1.290233 +[Debug] dit_step11_vt: [2170, 64] first4: -0.354338 1.618477 -0.262413 1.094882 +[Debug] dit_step11_xt: [2170, 64] first4: 0.309236 1.622636 -0.159200 0.565750 +[DiT] step 12/50 t=0.780 +[Debug] dit_step12_vt_cond: [2170, 64] first4: -0.320539 1.785762 -0.070432 1.238213 +[Debug] dit_step12_vt_uncond: [2170, 64] first4: -0.262826 1.816614 -0.015609 1.306050 +[Debug] dit_step12_vt: [2170, 64] first4: -0.491342 1.658213 -0.091632 0.993836 +[Debug] dit_step12_xt: [2170, 64] first4: 0.319063 1.589471 -0.157367 0.545873 +[DiT] step 13/50 t=0.760 +[Debug] dit_step13_vt_cond: [2170, 64] first4: -0.304664 1.719325 -0.071341 1.231198 +[Debug] dit_step13_vt_uncond: [2170, 64] first4: -0.239698 1.764744 -0.021893 1.311546 +[Debug] dit_step13_vt: [2170, 64] first4: -0.399484 1.443110 -0.224588 1.008114 +[Debug] dit_step13_xt: [2170, 64] first4: 0.327053 1.560609 -0.152875 0.525711 +[DiT] step 14/50 t=0.740 +[Debug] dit_step14_vt_cond: [2170, 64] first4: -0.287200 1.663083 -0.055876 1.237650 +[Debug] dit_step14_vt_uncond: [2170, 64] first4: -0.221389 1.709672 -0.033314 1.310252 +[Debug] dit_step14_vt: [2170, 64] first4: -0.462595 1.498639 -0.013459 1.015139 +[Debug] dit_step14_xt: [2170, 64] first4: 0.336305 1.530637 -0.152606 0.505408 +[DiT] step 15/50 t=0.720 +[Debug] dit_step15_vt_cond: [2170, 64] first4: -0.268932 1.604917 -0.048117 1.238302 +[Debug] dit_step15_vt_uncond: [2170, 64] first4: -0.201852 1.656007 -0.050695 1.299129 +[Debug] dit_step15_vt: [2170, 64] first4: -0.405852 1.327548 -0.070650 1.046717 +[Debug] dit_step15_xt: [2170, 64] first4: 0.344422 1.504086 -0.151193 0.484474 +[DiT] step 16/50 t=0.700 +[Debug] dit_step16_vt_cond: [2170, 64] first4: -0.252512 1.546370 -0.039756 1.239714 +[Debug] dit_step16_vt_uncond: [2170, 64] first4: -0.180964 1.599575 -0.068729 1.281288 +[Debug] dit_step16_vt: [2170, 64] first4: -0.465245 1.345990 0.113584 1.099017 +[Debug] dit_step16_xt: [2170, 64] first4: 0.353727 1.477166 -0.153465 0.462493 +[DiT] step 17/50 t=0.680 +[Debug] dit_step17_vt_cond: [2170, 64] first4: -0.235390 1.484958 -0.035930 1.232975 +[Debug] dit_step17_vt_uncond: [2170, 64] first4: -0.159705 1.538287 -0.087686 1.257131 +[Debug] dit_step17_vt: [2170, 64] first4: -0.434245 1.219246 0.101116 1.128435 +[Debug] dit_step17_xt: [2170, 64] first4: 0.362412 1.452781 -0.155487 0.439925 +[DiT] step 18/50 t=0.660 +[Debug] dit_step18_vt_cond: [2170, 64] first4: -0.216501 1.424307 -0.036471 1.226410 +[Debug] dit_step18_vt_uncond: [2170, 64] first4: -0.139621 1.481410 -0.105683 1.236565 +[Debug] dit_step18_vt: [2170, 64] first4: -0.455215 1.183846 0.234644 1.168681 +[Debug] dit_step18_xt: [2170, 64] first4: 0.371516 1.429104 -0.160180 0.416551 +[DiT] step 19/50 t=0.640 +[Debug] dit_step19_vt_cond: [2170, 64] first4: -0.193781 1.365146 -0.052930 1.209294 +[Debug] dit_step19_vt_uncond: [2170, 64] first4: -0.117725 1.431031 -0.131161 1.210289 +[Debug] dit_step19_vt: [2170, 64] first4: -0.413012 1.059516 0.190559 1.170309 +[Debug] dit_step19_xt: [2170, 64] first4: 0.379776 1.407914 -0.163991 0.393145 +[DiT] step 20/50 t=0.620 +[Debug] dit_step20_vt_cond: [2170, 64] first4: -0.171132 1.297945 -0.074904 1.190113 +[Debug] dit_step20_vt_uncond: [2170, 64] first4: -0.094949 1.381890 -0.154719 1.182902 +[Debug] dit_step20_vt: [2170, 64] first4: -0.416916 0.961257 0.217113 1.187745 +[Debug] dit_step20_xt: [2170, 64] first4: 0.388114 1.388689 -0.168333 0.369390 +[DiT] step 21/50 t=0.600 +[Debug] dit_step21_vt_cond: [2170, 64] first4: -0.156818 1.217449 -0.102774 1.165961 +[Debug] dit_step21_vt_uncond: [2170, 64] first4: -0.076230 1.330654 -0.181784 1.153171 +[Debug] dit_step21_vt: [2170, 64] first4: -0.413541 0.733536 0.153392 1.169193 +[Debug] dit_step21_xt: [2170, 64] first4: 0.396385 1.374018 -0.171401 0.346006 +[DiT] step 22/50 t=0.580 +[Debug] dit_step22_vt_cond: [2170, 64] first4: -0.149490 1.124467 -0.129160 1.136317 +[Debug] dit_step22_vt_uncond: [2170, 64] first4: -0.066301 1.272416 -0.203485 1.126914 +[Debug] dit_step22_vt: [2170, 64] first4: -0.414485 0.550591 0.128185 1.113165 +[Debug] dit_step22_xt: [2170, 64] first4: 0.404675 1.363006 -0.173965 0.323743 +[DiT] step 23/50 t=0.560 +[Debug] dit_step23_vt_cond: [2170, 64] first4: -0.151829 1.034549 -0.154075 1.109805 +[Debug] dit_step23_vt_uncond: [2170, 64] first4: -0.064656 1.215351 -0.221178 1.104513 +[Debug] dit_step23_vt: [2170, 64] first4: -0.436045 0.335528 0.059138 1.081705 +[Debug] dit_step23_xt: [2170, 64] first4: 0.413396 1.356296 -0.175148 0.302109 +[DiT] step 24/50 t=0.540 +[Debug] dit_step24_vt_cond: [2170, 64] first4: -0.158201 0.930576 -0.174559 1.077020 +[Debug] dit_step24_vt_uncond: [2170, 64] first4: -0.070645 1.149243 -0.231212 1.077814 +[Debug] dit_step24_vt: [2170, 64] first4: -0.428632 0.115268 0.009393 1.014248 +[Debug] dit_step24_xt: [2170, 64] first4: 0.421968 1.353990 -0.175335 0.281824 +[DiT] step 25/50 t=0.520 +[Debug] dit_step25_vt_cond: [2170, 64] first4: -0.172706 0.820171 -0.187534 1.046402 +[Debug] dit_step25_vt_uncond: [2170, 64] first4: -0.084898 1.070370 -0.233334 1.052725 +[Debug] dit_step25_vt: [2170, 64] first4: -0.451610 -0.083828 -0.046615 0.975977 +[Debug] dit_step25_xt: [2170, 64] first4: 0.431001 1.355667 -0.174403 0.262304 +[DiT] step 26/50 t=0.500 +[Debug] dit_step26_vt_cond: [2170, 64] first4: -0.189538 0.693448 -0.199819 1.008348 +[Debug] dit_step26_vt_uncond: [2170, 64] first4: -0.101718 0.976297 -0.234447 1.021465 +[Debug] dit_step26_vt: [2170, 64] first4: -0.457462 -0.318421 -0.092668 0.902675 +[Debug] dit_step26_xt: [2170, 64] first4: 0.440150 1.362035 -0.172550 0.244251 +[DiT] step 27/50 t=0.480 +[Debug] dit_step27_vt_cond: [2170, 64] first4: -0.208769 0.570284 -0.208205 0.971380 +[Debug] dit_step27_vt_uncond: [2170, 64] first4: -0.121884 0.876404 -0.234553 0.991057 +[Debug] dit_step27_vt: [2170, 64] first4: -0.474894 -0.475314 -0.124745 0.852438 +[Debug] dit_step27_xt: [2170, 64] first4: 0.449648 1.371541 -0.170055 0.227202 +[DiT] step 28/50 t=0.460 +[Debug] dit_step28_vt_cond: [2170, 64] first4: -0.231451 0.438197 -0.217041 0.925335 +[Debug] dit_step28_vt_uncond: [2170, 64] first4: -0.142976 0.763591 -0.236660 0.951937 +[Debug] dit_step28_vt: [2170, 64] first4: -0.503044 -0.662942 -0.154526 0.775360 +[Debug] dit_step28_xt: [2170, 64] first4: 0.459709 1.384800 -0.166964 0.211695 +[DiT] step 29/50 t=0.440 +[Debug] dit_step29_vt_cond: [2170, 64] first4: -0.255602 0.305935 -0.225167 0.878463 +[Debug] dit_step29_vt_uncond: [2170, 64] first4: -0.161997 0.641782 -0.240440 0.915606 +[Debug] dit_step29_vt: [2170, 64] first4: -0.546771 -0.770888 -0.172051 0.693937 +[Debug] dit_step29_xt: [2170, 64] first4: 0.470644 1.400218 -0.163523 0.197816 +[DiT] step 30/50 t=0.420 +[Debug] dit_step30_vt_cond: [2170, 64] first4: -0.279856 0.166208 -0.230015 0.829064 +[Debug] dit_step30_vt_uncond: [2170, 64] first4: -0.179567 0.505654 -0.246080 0.880209 +[Debug] dit_step30_vt: [2170, 64] first4: -0.586251 -0.913478 -0.165068 0.591419 +[Debug] dit_step30_xt: [2170, 64] first4: 0.482369 1.418488 -0.160222 0.185988 +[DiT] step 31/50 t=0.400 +[Debug] dit_step31_vt_cond: [2170, 64] first4: -0.303020 0.026116 -0.234342 0.778238 +[Debug] dit_step31_vt_uncond: [2170, 64] first4: -0.194565 0.361688 -0.252981 0.842483 +[Debug] dit_step31_vt: [2170, 64] first4: -0.634687 -1.004404 -0.163206 0.505303 +[Debug] dit_step31_xt: [2170, 64] first4: 0.495063 1.438576 -0.156958 0.175882 +[DiT] step 32/50 t=0.380 +[Debug] dit_step32_vt_cond: [2170, 64] first4: -0.323948 -0.114179 -0.233989 0.723499 +[Debug] dit_step32_vt_uncond: [2170, 64] first4: -0.205043 0.215050 -0.257018 0.804404 +[Debug] dit_step32_vt: [2170, 64] first4: -0.683278 -1.112943 -0.143677 0.386194 +[Debug] dit_step32_xt: [2170, 64] first4: 0.508728 1.460835 -0.154084 0.168158 +[DiT] step 33/50 t=0.360 +[Debug] dit_step33_vt_cond: [2170, 64] first4: -0.343586 -0.257977 -0.227555 0.671680 +[Debug] dit_step33_vt_uncond: [2170, 64] first4: -0.213907 0.060018 -0.258127 0.766574 +[Debug] dit_step33_vt: [2170, 64] first4: -0.732613 -1.188033 -0.112312 0.307609 +[Debug] dit_step33_xt: [2170, 64] first4: 0.523381 1.484595 -0.151838 0.162006 +[DiT] step 34/50 t=0.340 +[Debug] dit_step34_vt_cond: [2170, 64] first4: -0.359997 -0.390317 -0.220456 0.624948 +[Debug] dit_step34_vt_uncond: [2170, 64] first4: -0.218993 -0.085222 -0.256678 0.732610 +[Debug] dit_step34_vt: [2170, 64] first4: -0.782051 -1.277164 -0.095120 0.209992 +[Debug] dit_step34_xt: [2170, 64] first4: 0.539022 1.510138 -0.149936 0.157806 +[DiT] step 35/50 t=0.320 +[Debug] dit_step35_vt_cond: [2170, 64] first4: -0.376117 -0.519578 -0.205748 0.575179 +[Debug] dit_step35_vt_uncond: [2170, 64] first4: -0.225790 -0.227651 -0.246590 0.695063 +[Debug] dit_step35_vt: [2170, 64] first4: -0.824555 -1.344921 -0.065901 0.128571 +[Debug] dit_step35_xt: [2170, 64] first4: 0.555513 1.537037 -0.148618 0.155234 +[DiT] step 36/50 t=0.300 +[Debug] dit_step36_vt_cond: [2170, 64] first4: -0.388908 -0.645821 -0.189103 0.531176 +[Debug] dit_step36_vt_uncond: [2170, 64] first4: -0.231184 -0.367522 -0.234059 0.657502 +[Debug] dit_step36_vt: [2170, 64] first4: -0.864100 -1.422112 -0.041499 0.066854 +[Debug] dit_step36_xt: [2170, 64] first4: 0.572795 1.565479 -0.147788 0.153897 +[DiT] step 37/50 t=0.280 +[Debug] dit_step37_vt_cond: [2170, 64] first4: -0.403025 -0.763416 -0.163423 0.484469 +[Debug] dit_step37_vt_uncond: [2170, 64] first4: -0.241143 -0.498980 -0.214114 0.616995 +[Debug] dit_step37_vt: [2170, 64] first4: -0.892219 -1.488433 0.005198 -0.010545 +[Debug] dit_step37_xt: [2170, 64] first4: 0.590639 1.595248 -0.147892 0.154108 +[DiT] step 38/50 t=0.260 +[Debug] dit_step38_vt_cond: [2170, 64] first4: -0.413133 -0.876420 -0.134134 0.440490 +[Debug] dit_step38_vt_uncond: [2170, 64] first4: -0.251082 -0.625665 -0.187976 0.573381 +[Debug] dit_step38_vt: [2170, 64] first4: -0.908542 -1.561219 0.031824 -0.045067 +[Debug] dit_step38_xt: [2170, 64] first4: 0.608810 1.626472 -0.148528 0.155009 +[DiT] step 39/50 t=0.240 +[Debug] dit_step39_vt_cond: [2170, 64] first4: -0.423588 -0.987294 -0.096889 0.393114 +[Debug] dit_step39_vt_uncond: [2170, 64] first4: -0.265262 -0.752590 -0.154346 0.525557 +[Debug] dit_step39_vt: [2170, 64] first4: -0.912287 -1.609328 0.086172 -0.106501 +[Debug] dit_step39_xt: [2170, 64] first4: 0.627056 1.658659 -0.150252 0.157140 +[DiT] step 40/50 t=0.220 +[Debug] dit_step40_vt_cond: [2170, 64] first4: -0.432563 -1.079476 -0.052369 0.343707 +[Debug] dit_step40_vt_uncond: [2170, 64] first4: -0.280351 -0.859098 -0.115334 0.472040 +[Debug] dit_step40_vt: [2170, 64] first4: -0.898876 -1.653373 0.143320 -0.117817 +[Debug] dit_step40_xt: [2170, 64] first4: 0.645033 1.691726 -0.153118 0.159496 +[DiT] step 41/50 t=0.200 +[Debug] dit_step41_vt_cond: [2170, 64] first4: -0.437202 -1.169207 -0.002996 0.295201 +[Debug] dit_step41_vt_uncond: [2170, 64] first4: -0.297226 -0.965837 -0.069953 0.414322 +[Debug] dit_step41_vt: [2170, 64] first4: -0.848467 -1.668746 0.201099 -0.128392 +[Debug] dit_step41_xt: [2170, 64] first4: 0.662003 1.725101 -0.157140 0.162064 +[DiT] step 42/50 t=0.180 +[Debug] dit_step42_vt_cond: [2170, 64] first4: -0.440767 -1.252195 0.052645 0.236928 +[Debug] dit_step42_vt_uncond: [2170, 64] first4: -0.315933 -1.066464 -0.015775 0.349839 +[Debug] dit_step42_vt: [2170, 64] first4: -0.805727 -1.702440 0.256392 -0.165544 +[Debug] dit_step42_xt: [2170, 64] first4: 0.678117 1.759150 -0.162268 0.165375 +[DiT] step 43/50 t=0.160 +[Debug] dit_step43_vt_cond: [2170, 64] first4: -0.441682 -1.322971 0.111570 0.178405 +[Debug] dit_step43_vt_uncond: [2170, 64] first4: -0.332472 -1.154346 0.037873 0.279796 +[Debug] dit_step43_vt: [2170, 64] first4: -0.752842 -1.718463 0.345657 -0.158267 +[Debug] dit_step43_xt: [2170, 64] first4: 0.693174 1.793519 -0.169181 0.168540 +[DiT] step 44/50 t=0.140 +[Debug] dit_step44_vt_cond: [2170, 64] first4: -0.436481 -1.389212 0.170327 0.116380 +[Debug] dit_step44_vt_uncond: [2170, 64] first4: -0.347603 -1.238272 0.093730 0.210065 +[Debug] dit_step44_vt: [2170, 64] first4: -0.673781 -1.742915 0.402893 -0.211575 +[Debug] dit_step44_xt: [2170, 64] first4: 0.706650 1.828377 -0.177239 0.172771 +[DiT] step 45/50 t=0.120 +[Debug] dit_step45_vt_cond: [2170, 64] first4: -0.432089 -1.448218 0.222042 0.052259 +[Debug] dit_step45_vt_uncond: [2170, 64] first4: -0.365667 -1.310638 0.145240 0.138891 +[Debug] dit_step45_vt: [2170, 64] first4: -0.598716 -1.780947 0.463354 -0.230193 +[Debug] dit_step45_xt: [2170, 64] first4: 0.718624 1.863996 -0.186506 0.177375 +[DiT] step 46/50 t=0.100 +[Debug] dit_step46_vt_cond: [2170, 64] first4: -0.419096 -1.509236 0.267908 -0.006720 +[Debug] dit_step46_vt_uncond: [2170, 64] first4: -0.381439 -1.387897 0.192344 0.071553 +[Debug] dit_step46_vt: [2170, 64] first4: -0.477449 -1.801432 0.493477 -0.266383 +[Debug] dit_step46_xt: [2170, 64] first4: 0.728173 1.900025 -0.196375 0.182703 +[DiT] step 47/50 t=0.080 +[Debug] dit_step47_vt_cond: [2170, 64] first4: -0.407120 -1.565253 0.302819 -0.051005 +[Debug] dit_step47_vt_uncond: [2170, 64] first4: -0.401163 -1.460867 0.230993 0.012604 +[Debug] dit_step47_vt: [2170, 64] first4: -0.369522 -1.829498 0.516088 -0.236691 +[Debug] dit_step47_xt: [2170, 64] first4: 0.735563 1.936615 -0.206697 0.187437 +[DiT] step 48/50 t=0.060 +[Debug] dit_step48_vt_cond: [2170, 64] first4: -0.382883 -1.607141 0.333374 -0.083609 +[Debug] dit_step48_vt_uncond: [2170, 64] first4: -0.407261 -1.531759 0.269964 -0.041764 +[Debug] dit_step48_vt: [2170, 64] first4: -0.250388 -1.766134 0.508857 -0.194591 +[Debug] dit_step48_xt: [2170, 64] first4: 0.740571 1.971938 -0.216874 0.191329 +[DiT] step 49/50 t=0.040 +[Debug] dit_step49_vt_cond: [2170, 64] first4: -0.416988 -1.643981 0.337042 -0.115695 +[Debug] dit_step49_vt_uncond: [2170, 64] first4: -0.434090 -1.549805 0.279877 -0.060649 +[Debug] dit_step49_vt: [2170, 64] first4: -0.398854 -1.970749 0.508508 -0.360412 +[Debug] dit_x0: [2170, 64] first4: 0.748548 2.011353 -0.227044 0.198537 +[DiT] step 50/50 t=0.020 +[DiT] Total generation: 99823.1 ms (99823.1 ms/sample) +[Debug] dit_output: [2170, 64] first4: 0.748548 2.011353 -0.227044 0.198537 +[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) +[VAE] Graph: 474 nodes, T_latent=192 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_im2col_f16', name = 'kernel_im2col_f16' +ggml_metal_library_compile_pipeline: loaded kernel_im2col_f16 0x12060b7b0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=0' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=0 0x1206126e0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0 0x120612940 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0 0x120612fd0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sin_f32_4', name = 'kernel_sin_f32_4' +ggml_metal_library_compile_pipeline: loaded kernel_sin_f32_4 0x120613410 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sqr_f32_4', name = 'kernel_sqr_f32_4' +ggml_metal_library_compile_pipeline: loaded kernel_sqr_f32_4 0x120613a10 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_conv_transpose_1d_f32_f32', name = 'kernel_conv_transpose_1d_f32_f32' +ggml_metal_library_compile_pipeline: loaded kernel_conv_transpose_1d_f32_f32 0x120613f70 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_f16_f16_4', name = 'kernel_mul_mv_f16_f16_4_nsg=4' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_f16_f16_4_nsg=4 0x120614e50 | th_max = 1024 | th_width = 32 +[VAE] Upsample factor: 1920.00 (expected ~1920) +[VAE] Graph: 474 nodes, T_latent=256 +[VAE] Graph: 474 nodes, T_latent=186 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=1 0x10600ac10 | th_max = 896 | th_width = 32 +[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) +[VAE Batch0] Decode: 609612.5 ms +[Debug] vae_audio: [2, 4166400] first4: -0.003173 -0.003180 -0.003117 -0.002677 +[VAE Batch0] Wrote ggml-sft/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) +[Request 1/1] Done +ggml_metal_free: deallocating +ggml_metal_free: deallocating +[Pipeline] All done +[Request] Loaded request0.json +[Noise] Reusing existing rng_philox_seed42.bf16 +[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q6_K.gguf +[GGML] Running acestep-v15-turbo-Q6_K.gguf... +[GGML] Done, 47 dump files +[Turbo] Reusing existing Python dumps: python-turbo +[Turbo] Cosine similarities GGML vs Python + stage GGML vs Python + text_hidden 0.999813 + lyric_embed 1.000000 + enc_hidden 0.999631 + detok_output 0.999969 + context 0.999982 + noise 1.000000 + temb_t 0.999991 + hidden_after_proj_in 0.999987 + enc_after_cond_emb 0.999648 + layer0_sa_output 0.999791 + hidden_after_layer0 0.999898 + hidden_after_layer6 0.999877 + hidden_after_layer12 0.998721 + hidden_after_layer18 0.995721 + hidden_after_layer23 0.992012 + dit_step0_vt 0.970006 + dit_step0_xt 0.999934 + dit_step1_vt 0.973568 + dit_step1_xt 0.999795 + dit_step2_vt 0.976942 + dit_step2_xt 0.999458 + dit_step3_vt 0.977714 + dit_step3_xt 0.998700 + dit_step4_vt 0.976433 + dit_step4_xt 0.997003 + dit_step5_vt 0.973498 + dit_step5_xt 0.993187 + dit_step6_vt 0.970259 + dit_step6_xt 0.985910 + dit_step7_vt 0.963169 + dit_x0 0.975098 + vae_audio 0.894235 + vae_audio (log spectral) 0.999805 +[Turbo] Error growth GGML vs Python + stage cos max_err mean_err mean_A std_A mean_B std_B + dit_step0_xt 0.999933 0.149343 0.007343 -0.002290 0.972926 -0.002342 0.972003 + dit_step1_xt 0.999794 0.293469 0.012678 -0.005225 0.942670 -0.005313 0.941730 + dit_step2_xt 0.999456 0.477608 0.019544 -0.009177 0.909085 -0.009311 0.908527 + dit_step3_xt 0.998699 0.734476 0.028962 -0.014472 0.873547 -0.014577 0.873624 + dit_step4_xt 0.997001 1.052176 0.042099 -0.021523 0.841546 -0.021660 0.841995 + dit_step5_xt 0.993185 1.523836 0.061805 -0.031603 0.824654 -0.032109 0.824593 + dit_step6_xt 0.985908 2.172513 0.091680 -0.045910 0.855362 -0.046482 0.855546 +[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-Q6_K.gguf +[GGML] Running acestep-v15-sft-Q6_K.gguf... +[GGML] Done, 233 dump files +[SFT] Reusing existing Python dumps: python-sft +[SFT] Cosine similarities GGML vs Python + stage GGML vs Python + text_hidden 0.999813 + lyric_embed 1.000000 + enc_hidden 0.999631 + detok_output 0.999969 + context 0.999982 + noise 1.000000 + temb_t 0.999973 + hidden_after_proj_in 0.999987 + enc_after_cond_emb 0.999652 + layer0_sa_output 0.999803 + hidden_after_layer0 0.999920 + hidden_after_layer6 0.999785 + hidden_after_layer12 0.999350 + hidden_after_layer18 0.998528 + hidden_after_layer23 0.998828 + null_condition_emb 1.000000 + null_enc_hidden 1.000000 + dit_step0_vt_cond 0.998729 + dit_step0_vt_uncond 0.998412 + dit_step0_vt 0.995061 + dit_step0_xt 0.999998 + dit_step5_vt_cond 0.999147 + dit_step5_vt 0.992746 + dit_step5_xt 0.999953 + dit_step10_vt_cond 0.997986 + dit_step10_vt 0.991731 + dit_step10_xt 0.999832 + dit_step15_vt_cond 0.995896 + dit_step15_vt 0.984377 + dit_step15_xt 0.999476 + dit_step20_vt_cond 0.992581 + dit_step20_vt 0.975428 + dit_step20_xt 0.998605 + dit_step25_vt_cond 0.986288 + dit_step25_vt 0.963191 + dit_step25_xt 0.996899 + dit_step30_vt_cond 0.979478 + dit_step30_vt 0.956129 + dit_step30_xt 0.994252 + dit_step35_vt_cond 0.972963 + dit_step35_vt 0.947489 + dit_step35_xt 0.991078 + dit_step40_vt_cond 0.968903 + dit_step40_vt 0.939482 + dit_step40_xt 0.987990 + dit_step45_vt_cond 0.973091 + dit_step45_vt 0.949768 + dit_step45_xt 0.985825 + dit_step49_vt_cond 0.979346 + dit_step49_vt 0.959720 + dit_x0 0.985104 + vae_audio 0.940564 + vae_audio (log spectral) 0.999648 +[SFT] Error growth GGML vs Python + stage cos max_err mean_err mean_A std_A mean_B std_B + dit_step0_xt 0.999996 0.039016 0.002154 -0.001750 0.980178 -0.001741 0.980402 + dit_step5_xt 0.999952 0.136674 0.006709 -0.006940 0.889822 -0.007143 0.887999 + dit_step10_xt 0.999831 0.203842 0.011045 -0.012357 0.811533 -0.012603 0.811299 + dit_step15_xt 0.999475 0.335757 0.017566 -0.017603 0.746439 -0.018114 0.745269 + dit_step20_xt 0.998605 0.555654 0.026541 -0.022932 0.700822 -0.023808 0.699582 + dit_step25_xt 0.996899 0.830926 0.037973 -0.028358 0.679564 -0.029311 0.679278 + dit_step30_xt 0.994252 1.135793 0.051746 -0.033803 0.685565 -0.035027 0.685262 + dit_step35_xt 0.991078 1.467212 0.067373 -0.039173 0.717556 -0.040716 0.717196 + dit_step40_xt 0.987990 1.880554 0.084328 -0.044527 0.771174 -0.046462 0.771853 + dit_step45_xt 0.985824 2.238589 0.100473 -0.050335 0.842316 -0.052475 0.843036 diff --git a/tests/Metal-Q8_0.log b/tests/Metal-Q8_0.log new file mode 100644 index 0000000..3ddbb85 --- /dev/null +++ b/tests/Metal-Q8_0.log @@ -0,0 +1,823 @@ +ggml_metal_device_init: tensor API disabled for pre-M5 and pre-A19 devices +ggml_metal_library_init: using embedded metal library +ggml_metal_library_init: loaded in 0.006 sec +ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s) +ggml_metal_device_init: GPU name: MTL0 +ggml_metal_device_init: GPU family: MTLGPUFamilyApple8 (1008) +ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003) +ggml_metal_device_init: GPU family: MTLGPUFamilyMetal3 (5001) +ggml_metal_device_init: simdgroup reduction = true +ggml_metal_device_init: simdgroup matrix mul. = true +ggml_metal_device_init: has unified memory = true +ggml_metal_device_init: has bfloat = true +ggml_metal_device_init: has tensor = false +ggml_metal_device_init: use residency sets = true +ggml_metal_device_init: use shared buffers = true +ggml_metal_device_init: recommendedMaxWorkingSetSize = 11453.25 MB +ggml_metal_init: allocating +ggml_metal_init: found device: Apple M2 Pro +ggml_metal_init: picking default device: Apple M2 Pro +ggml_metal_init: use fusion = true +ggml_metal_init: use concurrency = true +ggml_metal_init: use graph optimize = true +[Load] DiT backend: MTL0 (CPU threads: 5) +[Load] Backend init: 21.3 ms +[GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 +[DiT] MLP: gate+up fused +[Load] null_condition_emb found (CFG available) +[WeightCtx] Loaded 478 tensors, 1600.7 MB into backend +[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 +[Load] DiT weight load: 1779.3 ms +[GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 +[Load] silence_latent: [15000, 64] from GGUF +[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 +ggml_metal_init: allocating +ggml_metal_init: found device: Apple M2 Pro +ggml_metal_init: picking default device: Apple M2 Pro +ggml_metal_init: use fusion = true +ggml_metal_init: use concurrency = true +ggml_metal_init: use graph optimize = true +[Load] VAE backend: MTL0 (CPU threads: 5) +[VAE] Backend: MTL0, Weight buffer: 255.7 MB +[VAE] Loaded: 5 blocks, upsample=1920x +[Load] VAE weights: 272.0 ms +[Request 1/1] ggml-turbo/request0.json (batch=1) +[Request] parsed ggml-turbo/request0.json (18 fields) +[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) +[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s +[Pipeline] 434 audio codes (86.8s @ 5Hz) +[Pipeline] T=2170, S=1085 +[BPE] Loaded from GGUF: 151643 vocab, 151387 merges +[Load] BPE tokenizer: 41.5 ms +[Pipeline] caption: 70 tokens, lyrics: 167 tokens +ggml_metal_init: allocating +ggml_metal_init: found device: Apple M2 Pro +ggml_metal_init: picking default device: Apple M2 Pro +ggml_metal_init: use fusion = true +ggml_metal_init: use concurrency = true +ggml_metal_init: use graph optimize = true +[Load] TextEncoder backend: MTL0 (CPU threads: 5) +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend +[Load] TextEncoder: 28L, H=1024, Nh=16/8 +[Load] TextEncoder: 228.7 ms +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_get_rows_bf16', name = 'kernel_get_rows_bf16' +ggml_metal_library_compile_pipeline: loaded kernel_get_rows_bf16 0x14170b900 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rms_norm_mul_f32_4', name = 'kernel_rms_norm_mul_f32_4' +ggml_metal_library_compile_pipeline: loaded kernel_rms_norm_mul_f32_4 0x14170bd60 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_bf16_f32', name = 'kernel_mul_mm_bf16_f32_bci=0_bco=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_bf16_f32_bci=0_bco=1 0x14170c5d0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rope_neox_f32', name = 'kernel_rope_neox_f32_imrope=0' +ggml_metal_library_compile_pipeline: loaded kernel_rope_neox_f32_imrope=0 0x14170ca50 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=1_ncpsg=64' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=1_ncpsg=64 0x14170d2c0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_blk', name = 'kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64 0x14170d8f0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4 0x14170e1d0 | th_max = 576 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0 0x14170e5e0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_swiglu_f32', name = 'kernel_swiglu_f32' +ggml_metal_library_compile_pipeline: loaded kernel_swiglu_f32 0x14170e840 | th_max = 1024 | th_width = 32 +[Encode] TextEncoder (70 tokens): 44.9 ms +[Debug] text_hidden: [70, 1024] first4: 3.703506 2.446818 0.222721 -13.133463 +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[Encode] Lyric vocab lookup (167 tokens): 34.4 ms +[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 +ggml_metal_init: allocating +ggml_metal_init: found device: Apple M2 Pro +ggml_metal_init: picking default device: Apple M2 Pro +ggml_metal_init: use fusion = true +ggml_metal_init: use concurrency = true +ggml_metal_init: use graph optimize = true +[Load] CondEncoder backend: MTL0 (CPU threads: 5) +[GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 +[WeightCtx] Loaded 140 tensors, 616.6 MB into backend +[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond +[Load] ConditionEncoder: 787.4 ms +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q8_0_f32', name = 'kernel_mul_mm_q8_0_f32_bci=0_bco=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q8_0_f32_bci=0_bco=1 0x141718bd0 | th_max = 896 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=64' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=64 0x141719010 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4 0x141719670 | th_max = 640 | th_width = 32 +[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens +[Encode] ConditionEncoder: 132.0 ms, enc_S=238 +ggml_metal_free: deallocating +ggml_metal_free: deallocating +[Debug] enc_hidden: [238, 2048] first4: 1.750375 -0.049236 -0.134516 0.059443 +[GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 +[WeightCtx] Loaded 30 tensors, 106.5 MB into backend +[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) +[Load] Detokenizer: 165.9 ms +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_bf16_f32_short', name = 'kernel_mul_mv_bf16_f32_short_nsg=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_bf16_f32_short_nsg=1 0x14160b720 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1 0x14160c030 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q8_0_f32', name = 'kernel_mul_mv_q8_0_f32_nsg=4' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q8_0_f32_nsg=4 0x14160c760 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_repeat_f32', name = 'kernel_repeat_f32' +ggml_metal_library_compile_pipeline: loaded kernel_repeat_f32 0x14160c9c0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_cpy_q8_0_f32', name = 'kernel_cpy_q8_0_f32' +ggml_metal_library_compile_pipeline: loaded kernel_cpy_q8_0_f32 0x14160ce00 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q8_0_f32_r1_5', name = 'kernel_mul_mv_ext_q8_0_f32_r1_5_nsg=2_nxpsg=8' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q8_0_f32_r1_5_nsg=2_nxpsg=8 0x14160da60 | th_max = 640 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=32' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=32 0x14160dde0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_f32_dk128_dv128', name = 'kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32 0x14160e040 | th_max = 448 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_reduce', name = 'kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32 0x14160ea80 | th_max = 1024 | th_width = 32 +[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) +[Context] Detokenizer: 664.8 ms +[Debug] detok_output: [2170, 64] first4: -0.124953 1.437660 0.307949 -0.624704 +[Context] loaded noise from rng_philox_seed42.bf16: [2170, 64] bf16 +[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 +[Debug] context: [2170, 128] first4: -0.124953 1.437660 0.307949 -0.624704 +[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1 +[DiT] Batch N=1, T=2170, S=1085, enc_S=238 +[DiT] Graph: 1841 nodes +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f32_f32', name = 'kernel_mul_mm_f32_f32_bci=0_bco=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f32_f32_bci=0_bco=1 0x14160f030 | th_max = 832 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_scale_f32', name = 'kernel_scale_f32' +ggml_metal_library_compile_pipeline: loaded kernel_scale_f32 0x14160aa00 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_timestep_embedding_f32', name = 'kernel_timestep_embedding_f32' +ggml_metal_library_compile_pipeline: loaded kernel_timestep_embedding_f32 0x14160f9a0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_silu_f32_4', name = 'kernel_silu_f32_4' +ggml_metal_library_compile_pipeline: loaded kernel_silu_f32_4 0x1416102f0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1 0x1416109b0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1 0x1416113c0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0 0x1416118b0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_cpy_f32_f32', name = 'kernel_cpy_f32_f32' +ggml_metal_library_compile_pipeline: loaded kernel_cpy_f32_f32 0x141610670 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4 0x141611dc0 | th_max = 640 | th_width = 32 +[Debug] tproj: [12288] first4: 0.260124 -0.161873 -0.097043 0.052039 +[Debug] temb: [2048] first4: 0.000130 -0.132501 -0.035452 0.064788 +[Debug] temb_t: [2048] first4: 0.001146 0.026826 -0.052770 0.063722 +[Debug] temb_r: [2048] first4: -0.001015 -0.159327 0.017318 0.001066 +[Debug] sinusoidal_t: [256] first4: 0.562407 0.789701 0.439822 -0.023583 +[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000 +[Debug] temb_lin1_t: [2048] first4: -0.048950 -0.051683 -0.015299 -0.038721 +[Debug] temb_lin1_r: [2048] first4: -0.013066 -0.018836 -0.015732 0.008463 +[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.039670 -0.968864 0.535370 0.447502 +[Debug] proj_in_input: [192, 2170] first4: -0.124953 1.437660 0.307949 -0.624704 +[Debug] enc_after_cond_emb: [2048, 238] first4: -0.166626 0.855863 0.327675 -0.524847 +[Debug] layer0_sa_input: [2048, 1085] first4: -0.719151 -0.764016 -0.047420 0.261850 +[Debug] layer0_q_after_rope: [128, 16] first4: -0.166626 0.855863 0.327675 -0.524847 +[Debug] layer0_k_after_rope: [128, 8] first4: -1.224136 -0.868663 -1.921617 -2.258156 +[Debug] layer0_sa_output: [2048, 1085] first4: -1.509784 0.173032 -0.350482 0.513236 +[Debug] layer0_attn_out: [2048, 1085] first4: -1.198180 -0.062361 -34.349155 -0.672093 +[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.543300 -1.045637 0.193163 0.457042 +[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.610051 -0.820831 -0.300355 0.492100 +[Debug] hidden_after_layer0: [2048, 1085] first4: -9.086482 0.559607 52.406876 -0.905369 +[Debug] hidden_after_layer6: [2048, 1085] first4: -21.433996 -0.091888 33.781910 -4.433238 +[Debug] hidden_after_layer12: [2048, 1085] first4: -15.201079 -18.070684 72.561172 28.713606 +[Debug] hidden_after_layer18: [2048, 1085] first4: -26.474438 14.961594 62.515419 20.237282 +[Debug] hidden_after_layer23: [2048, 1085] first4: -7.978052 44.256046 198.826355 145.129532 +[Debug] dit_step0_vt: [2170, 64] first4: 0.022187 1.144711 0.357881 2.375370 +[Debug] dit_step0_xt: [2170, 64] first4: 0.193327 2.104218 -0.188142 0.739685 +[DiT] step 1/8 t=1.000 +[Debug] dit_step1_vt: [2170, 64] first4: -0.218329 1.318116 -0.102946 1.902612 +[Debug] dit_step1_xt: [2170, 64] first4: 0.205236 2.032320 -0.182527 0.635906 +[DiT] step 2/8 t=0.955 +[Debug] dit_step2_vt: [2170, 64] first4: 0.013556 1.214101 0.135533 2.387155 +[Debug] dit_step2_xt: [2170, 64] first4: 0.204333 1.951380 -0.191563 0.476762 +[DiT] step 3/8 t=0.900 +[Debug] dit_step3_vt: [2170, 64] first4: 0.268002 1.073703 0.267121 2.643928 +[Debug] dit_step3_xt: [2170, 64] first4: 0.181999 1.861905 -0.213823 0.256435 +[DiT] step 4/8 t=0.833 +[Debug] dit_step4_vt: [2170, 64] first4: 0.304715 1.021377 0.118111 2.720495 +[Debug] dit_step4_xt: [2170, 64] first4: 0.149351 1.752472 -0.226477 -0.035047 +[DiT] step 5/8 t=0.750 +[Debug] dit_step5_vt: [2170, 64] first4: 0.279773 0.924189 -0.283976 2.780081 +[Debug] dit_step5_xt: [2170, 64] first4: 0.109384 1.620445 -0.185910 -0.432201 +[DiT] step 6/8 t=0.643 +[Debug] dit_step6_vt: [2170, 64] first4: 0.163348 0.641980 -0.841978 2.816087 +[Debug] dit_step6_xt: [2170, 64] first4: 0.076714 1.492049 -0.017514 -0.995418 +[DiT] step 7/8 t=0.500 +[Debug] dit_step7_vt: [2170, 64] first4: -0.026257 0.197844 -1.519455 3.080479 +[Debug] dit_x0: [2170, 64] first4: 0.084591 1.432696 0.438323 -1.919562 +[DiT] step 8/8 t=0.300 +[DiT] Total generation: 7097.2 ms (7097.2 ms/sample) +[Debug] dit_output: [2170, 64] first4: 0.084591 1.432696 0.438323 -1.919562 +[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) +[VAE] Graph: 474 nodes, T_latent=192 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_im2col_f16', name = 'kernel_im2col_f16' +ggml_metal_library_compile_pipeline: loaded kernel_im2col_f16 0x141717870 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=0' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=0 0x1417100a0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0 0x1417089d0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0 0x141708e50 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sin_f32_4', name = 'kernel_sin_f32_4' +ggml_metal_library_compile_pipeline: loaded kernel_sin_f32_4 0x141719b10 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sqr_f32_4', name = 'kernel_sqr_f32_4' +ggml_metal_library_compile_pipeline: loaded kernel_sqr_f32_4 0x14171a1d0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_conv_transpose_1d_f32_f32', name = 'kernel_conv_transpose_1d_f32_f32' +ggml_metal_library_compile_pipeline: loaded kernel_conv_transpose_1d_f32_f32 0x14171a730 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_f16_f16_4', name = 'kernel_mul_mv_f16_f16_4_nsg=4' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_f16_f16_4_nsg=4 0x14171b660 | th_max = 1024 | th_width = 32 +[VAE] Upsample factor: 1920.00 (expected ~1920) +[VAE] Graph: 474 nodes, T_latent=256 +[VAE] Graph: 474 nodes, T_latent=186 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=1 0x141618f80 | th_max = 896 | th_width = 32 +[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) +[VAE Batch0] Decode: 609553.7 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000462 0.000971 0.000803 0.001170 +[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) +[Request 1/1] Done +ggml_metal_free: deallocating +ggml_metal_free: deallocating +[Pipeline] All done +ggml_metal_device_init: tensor API disabled for pre-M5 and pre-A19 devices +ggml_metal_library_init: using embedded metal library +ggml_metal_library_init: loaded in 0.006 sec +ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s) +ggml_metal_device_init: GPU name: MTL0 +ggml_metal_device_init: GPU family: MTLGPUFamilyApple8 (1008) +ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003) +ggml_metal_device_init: GPU family: MTLGPUFamilyMetal3 (5001) +ggml_metal_device_init: simdgroup reduction = true +ggml_metal_device_init: simdgroup matrix mul. = true +ggml_metal_device_init: has unified memory = true +ggml_metal_device_init: has bfloat = true +ggml_metal_device_init: has tensor = false +ggml_metal_device_init: use residency sets = true +ggml_metal_device_init: use shared buffers = true +ggml_metal_device_init: recommendedMaxWorkingSetSize = 11453.25 MB +ggml_metal_init: allocating +ggml_metal_init: found device: Apple M2 Pro +ggml_metal_init: picking default device: Apple M2 Pro +ggml_metal_init: use fusion = true +ggml_metal_init: use concurrency = true +ggml_metal_init: use graph optimize = true +[Load] DiT backend: MTL0 (CPU threads: 5) +[Load] Backend init: 20.2 ms +[GGUF] ../models/acestep-v15-sft-Q8_0.gguf: 678 tensors, data at offset 56800 +[DiT] MLP: gate+up fused +[Load] null_condition_emb found (CFG available) +[WeightCtx] Loaded 478 tensors, 1600.7 MB into backend +[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 +[Load] DiT weight load: 2506.1 ms +[GGUF] ../models/acestep-v15-sft-Q8_0.gguf: 678 tensors, data at offset 56800 +[Load] silence_latent: [15000, 64] from GGUF +[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 +ggml_metal_init: allocating +ggml_metal_init: found device: Apple M2 Pro +ggml_metal_init: picking default device: Apple M2 Pro +ggml_metal_init: use fusion = true +ggml_metal_init: use concurrency = true +ggml_metal_init: use graph optimize = true +[Load] VAE backend: MTL0 (CPU threads: 5) +[VAE] Backend: MTL0, Weight buffer: 255.7 MB +[VAE] Loaded: 5 blocks, upsample=1920x +[Load] VAE weights: 340.1 ms +[Request 1/1] ggml-sft/request0.json (batch=1) +[Request] parsed ggml-sft/request0.json (18 fields) +[Pipeline] seed=42, steps=50, guidance=7.0, shift=1.0, duration=88.0s +[Pipeline] 434 audio codes (86.8s @ 5Hz) +[Pipeline] T=2170, S=1085 +[BPE] Loaded from GGUF: 151643 vocab, 151387 merges +[Load] BPE tokenizer: 40.9 ms +[Pipeline] caption: 70 tokens, lyrics: 167 tokens +ggml_metal_init: allocating +ggml_metal_init: found device: Apple M2 Pro +ggml_metal_init: picking default device: Apple M2 Pro +ggml_metal_init: use fusion = true +ggml_metal_init: use concurrency = true +ggml_metal_init: use graph optimize = true +[Load] TextEncoder backend: MTL0 (CPU threads: 5) +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend +[Load] TextEncoder: 28L, H=1024, Nh=16/8 +[Load] TextEncoder: 238.6 ms +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_get_rows_bf16', name = 'kernel_get_rows_bf16' +ggml_metal_library_compile_pipeline: loaded kernel_get_rows_bf16 0x13060e0d0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rms_norm_mul_f32_4', name = 'kernel_rms_norm_mul_f32_4' +ggml_metal_library_compile_pipeline: loaded kernel_rms_norm_mul_f32_4 0x13060e830 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_bf16_f32', name = 'kernel_mul_mm_bf16_f32_bci=0_bco=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_bf16_f32_bci=0_bco=1 0x13060f670 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rope_neox_f32', name = 'kernel_rope_neox_f32_imrope=0' +ggml_metal_library_compile_pipeline: loaded kernel_rope_neox_f32_imrope=0 0x13060f8d0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=1_ncpsg=64' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=1_ncpsg=64 0x130610350 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_blk', name = 'kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64 0x1306108b0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4 0x130610b10 | th_max = 576 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0 0x1306115a0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_swiglu_f32', name = 'kernel_swiglu_f32' +ggml_metal_library_compile_pipeline: loaded kernel_swiglu_f32 0x130611800 | th_max = 1024 | th_width = 32 +[Encode] TextEncoder (70 tokens): 49.2 ms +[Debug] text_hidden: [70, 1024] first4: 3.703506 2.446818 0.222721 -13.133463 +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[Encode] Lyric vocab lookup (167 tokens): 34.1 ms +[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 +ggml_metal_init: allocating +ggml_metal_init: found device: Apple M2 Pro +ggml_metal_init: picking default device: Apple M2 Pro +ggml_metal_init: use fusion = true +ggml_metal_init: use concurrency = true +ggml_metal_init: use graph optimize = true +[Load] CondEncoder backend: MTL0 (CPU threads: 5) +[GGUF] ../models/acestep-v15-sft-Q8_0.gguf: 678 tensors, data at offset 56800 +[WeightCtx] Loaded 140 tensors, 616.6 MB into backend +[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond +[Load] ConditionEncoder: 615.4 ms +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q8_0_f32', name = 'kernel_mul_mm_q8_0_f32_bci=0_bco=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q8_0_f32_bci=0_bco=1 0x130709710 | th_max = 896 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=64' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=64 0x130709b90 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4 0x13070a1c0 | th_max = 640 | th_width = 32 +[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens +[Encode] ConditionEncoder: 129.6 ms, enc_S=238 +ggml_metal_free: deallocating +ggml_metal_free: deallocating +[Debug] enc_hidden: [238, 2048] first4: 1.750375 -0.049236 -0.134516 0.059443 +[GGUF] ../models/acestep-v15-sft-Q8_0.gguf: 678 tensors, data at offset 56800 +[WeightCtx] Loaded 30 tensors, 106.5 MB into backend +[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) +[Load] Detokenizer: 77.5 ms +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_bf16_f32_short', name = 'kernel_mul_mv_bf16_f32_short_nsg=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_bf16_f32_short_nsg=1 0x130708890 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1 0x13070ab80 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q8_0_f32', name = 'kernel_mul_mv_q8_0_f32_nsg=4' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q8_0_f32_nsg=4 0x13070ade0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_repeat_f32', name = 'kernel_repeat_f32' +ggml_metal_library_compile_pipeline: loaded kernel_repeat_f32 0x13070b260 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_cpy_q8_0_f32', name = 'kernel_cpy_q8_0_f32' +ggml_metal_library_compile_pipeline: loaded kernel_cpy_q8_0_f32 0x13070b970 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q8_0_f32_r1_5', name = 'kernel_mul_mv_ext_q8_0_f32_r1_5_nsg=2_nxpsg=8' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q8_0_f32_r1_5_nsg=2_nxpsg=8 0x13070c5e0 | th_max = 640 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=32' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=32 0x13070c840 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_f32_dk128_dv128', name = 'kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32 0x13070cc50 | th_max = 448 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_reduce', name = 'kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32 0x13070d590 | th_max = 1024 | th_width = 32 +[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) +[Context] Detokenizer: 663.7 ms +[Debug] detok_output: [2170, 64] first4: -0.124953 1.437660 0.307949 -0.624704 +[Context] loaded noise from rng_philox_seed42.bf16: [2170, 64] bf16 +[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 +[Debug] context: [2170, 128] first4: -0.124953 1.437660 0.307949 -0.624704 +[DiT] Starting: T=2170, S=1085, enc_S=238, steps=50, batch=1 +[DiT] Batch N=1, T=2170, S=1085, enc_S=238 +[DiT] Graph: 1841 nodes +[Debug] null_condition_emb: [2048] first4: 0.018066 -0.000360 0.005096 -0.000683 +[Debug] null_enc_hidden: [238, 2048] first4: 0.018066 -0.000360 0.005096 -0.000683 +[DiT] CFG enabled: guidance_scale=7.0, 2x forward per step, N=1 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f32_f32', name = 'kernel_mul_mm_f32_f32_bci=0_bco=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f32_f32_bci=0_bco=1 0x130612120 | th_max = 832 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_scale_f32', name = 'kernel_scale_f32' +ggml_metal_library_compile_pipeline: loaded kernel_scale_f32 0x13061a740 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_timestep_embedding_f32', name = 'kernel_timestep_embedding_f32' +ggml_metal_library_compile_pipeline: loaded kernel_timestep_embedding_f32 0x13061af30 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_silu_f32_4', name = 'kernel_silu_f32_4' +ggml_metal_library_compile_pipeline: loaded kernel_silu_f32_4 0x13061b880 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1 0x13061bf40 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1 0x13061c950 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0 0x13061ce40 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_cpy_f32_f32', name = 'kernel_cpy_f32_f32' +ggml_metal_library_compile_pipeline: loaded kernel_cpy_f32_f32 0x13061bc00 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4' +ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4 0x13061d350 | th_max = 640 | th_width = 32 +[Debug] tproj: [12288] first4: 0.154891 -0.116111 -0.086457 0.081949 +[Debug] temb: [2048] first4: -0.002756 -0.176432 0.004178 -0.001982 +[Debug] temb_t: [2048] first4: -0.001185 0.003330 -0.013113 -0.002073 +[Debug] temb_r: [2048] first4: -0.001571 -0.179762 0.017291 0.000091 +[Debug] sinusoidal_t: [256] first4: 0.562407 0.789701 0.439822 -0.023583 +[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000 +[Debug] temb_lin1_t: [2048] first4: -0.038370 0.029875 0.028026 -0.024772 +[Debug] temb_lin1_r: [2048] first4: 0.001863 0.003353 -0.000552 -0.000197 +[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.090297 -0.925707 0.497575 0.441158 +[Debug] proj_in_input: [192, 2170] first4: -0.124953 1.437660 0.307949 -0.624704 +[Debug] enc_after_cond_emb: [2048, 238] first4: -0.185352 0.911478 0.323017 -0.548477 +[Debug] layer0_sa_input: [2048, 1085] first4: -0.915448 -0.716620 -0.037252 0.294421 +[Debug] layer0_q_after_rope: [128, 16] first4: -0.185352 0.911478 0.323017 -0.548477 +[Debug] layer0_k_after_rope: [128, 8] first4: -1.352387 -0.993045 -1.790654 -2.255961 +[Debug] layer0_sa_output: [2048, 1085] first4: -1.646956 0.807681 -0.548860 0.541048 +[Debug] layer0_attn_out: [2048, 1085] first4: -1.095906 0.020644 -33.592499 -0.642108 +[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.803416 -1.383010 -0.122776 0.380531 +[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.967658 -1.044807 -0.403243 0.382801 +[Debug] hidden_after_layer0: [2048, 1085] first4: -9.409370 1.156843 57.423218 -1.617135 +[Debug] hidden_after_layer6: [2048, 1085] first4: -17.478519 4.365310 57.869217 -2.590640 +[Debug] hidden_after_layer12: [2048, 1085] first4: -10.105301 4.450487 -23.010748 -1.911694 +[Debug] hidden_after_layer18: [2048, 1085] first4: -3.554647 18.982800 -38.417198 -11.175929 +[Debug] hidden_after_layer23: [2048, 1085] first4: 34.303474 64.211922 58.212040 -9.279413 +[Debug] dit_step0_vt_cond: [2170, 64] first4: -0.583175 2.543502 -0.173930 1.461996 +[Debug] dit_step0_vt_uncond: [2170, 64] first4: -0.311259 2.094594 -0.691959 1.747500 +[Debug] dit_step0_vt: [2170, 64] first4: -0.765032 3.014946 0.101383 1.079777 +[Debug] dit_step0_xt: [2170, 64] first4: 0.209637 2.095951 -0.173903 0.826061 +[DiT] step 1/50 t=1.000 +[Debug] dit_step1_vt_cond: [2170, 64] first4: -0.635693 2.541408 0.005301 1.299802 +[Debug] dit_step1_vt_uncond: [2170, 64] first4: -0.555907 2.342016 -0.102693 1.478369 +[Debug] dit_step1_vt: [2170, 64] first4: -0.416332 2.156285 -0.189682 1.386406 +[Debug] dit_step1_xt: [2170, 64] first4: 0.217963 2.052825 -0.170109 0.798333 +[DiT] step 2/50 t=0.980 +[Debug] dit_step2_vt_cond: [2170, 64] first4: -0.628899 2.502596 0.041825 1.309368 +[Debug] dit_step2_vt_uncond: [2170, 64] first4: -0.545262 2.302532 -0.090023 1.423282 +[Debug] dit_step2_vt: [2170, 64] first4: -0.777333 2.936136 0.278430 1.015660 +[Debug] dit_step2_xt: [2170, 64] first4: 0.233510 1.994103 -0.175678 0.778019 +[DiT] step 3/50 t=0.960 +[Debug] dit_step3_vt_cond: [2170, 64] first4: -0.587814 2.422845 0.053595 1.321287 +[Debug] dit_step3_vt_uncond: [2170, 64] first4: -0.577692 2.332772 -0.037127 1.357753 +[Debug] dit_step3_vt: [2170, 64] first4: -0.357759 2.100169 -0.131042 1.500563 +[Debug] dit_step3_xt: [2170, 64] first4: 0.240665 1.952099 -0.173057 0.748008 +[DiT] step 4/50 t=0.940 +[Debug] dit_step4_vt_cond: [2170, 64] first4: -0.531520 2.346682 0.048538 1.319634 +[Debug] dit_step4_vt_uncond: [2170, 64] first4: -0.572059 2.299766 -0.047866 1.336576 +[Debug] dit_step4_vt: [2170, 64] first4: -0.562290 2.601383 0.287872 1.079526 +[Debug] dit_step4_xt: [2170, 64] first4: 0.251911 1.900071 -0.178814 0.726417 +[DiT] step 5/50 t=0.920 +[Debug] dit_step5_vt_cond: [2170, 64] first4: -0.471980 2.275284 0.016048 1.313642 +[Debug] dit_step5_vt_uncond: [2170, 64] first4: -0.581892 2.263602 -0.014694 1.345006 +[Debug] dit_step5_vt: [2170, 64] first4: -0.118539 1.956480 -0.197979 1.364062 +[Debug] dit_step5_xt: [2170, 64] first4: 0.254282 1.860942 -0.174855 0.699136 +[DiT] step 6/50 t=0.900 +[Debug] dit_step6_vt_cond: [2170, 64] first4: -0.438105 2.231154 0.001016 1.310053 +[Debug] dit_step6_vt_uncond: [2170, 64] first4: -0.544084 2.196794 -0.016910 1.357073 +[Debug] dit_step6_vt: [2170, 64] first4: -0.411409 2.467071 0.199489 1.062841 +[Debug] dit_step6_xt: [2170, 64] first4: 0.262510 1.811600 -0.178844 0.677879 +[DiT] step 7/50 t=0.880 +[Debug] dit_step7_vt_cond: [2170, 64] first4: -0.388773 2.150816 -0.023557 1.299900 +[Debug] dit_step7_vt_uncond: [2170, 64] first4: -0.488913 2.112037 -0.027399 1.344294 +[Debug] dit_step7_vt: [2170, 64] first4: -0.088803 1.961096 -0.200603 1.289382 +[Debug] dit_step7_xt: [2170, 64] first4: 0.264286 1.772379 -0.174832 0.652092 +[DiT] step 8/50 t=0.860 +[Debug] dit_step8_vt_cond: [2170, 64] first4: -0.350721 2.065278 -0.035497 1.282128 +[Debug] dit_step8_vt_uncond: [2170, 64] first4: -0.410401 2.018277 -0.041600 1.310604 +[Debug] dit_step8_vt: [2170, 64] first4: -0.374781 2.238836 0.127401 1.108719 +[Debug] dit_step8_xt: [2170, 64] first4: 0.271781 1.727602 -0.177380 0.629917 +[DiT] step 9/50 t=0.840 +[Debug] dit_step9_vt_cond: [2170, 64] first4: -0.318524 1.978134 -0.051657 1.264737 +[Debug] dit_step9_vt_uncond: [2170, 64] first4: -0.341550 1.941071 -0.051996 1.283970 +[Debug] dit_step9_vt: [2170, 64] first4: -0.191845 1.828466 -0.186632 1.247382 +[Debug] dit_step9_xt: [2170, 64] first4: 0.275618 1.691033 -0.173648 0.604970 +[DiT] step 10/50 t=0.820 +[Debug] dit_step10_vt_cond: [2170, 64] first4: -0.295512 1.899216 -0.053004 1.254855 +[Debug] dit_step10_vt_uncond: [2170, 64] first4: -0.287151 1.875865 -0.046272 1.283265 +[Debug] dit_step10_vt: [2170, 64] first4: -0.389814 1.941252 0.040587 1.075441 +[Debug] dit_step10_xt: [2170, 64] first4: 0.283415 1.652207 -0.174460 0.583461 +[DiT] step 11/50 t=0.800 +[Debug] dit_step11_vt_cond: [2170, 64] first4: -0.275416 1.819571 -0.061343 1.243315 +[Debug] dit_step11_vt_uncond: [2170, 64] first4: -0.256136 1.821522 -0.037205 1.292460 +[Debug] dit_step11_vt: [2170, 64] first4: -0.238914 1.604097 -0.208856 1.109252 +[Debug] dit_step11_xt: [2170, 64] first4: 0.288193 1.620125 -0.170282 0.561276 +[DiT] step 12/50 t=0.780 +[Debug] dit_step12_vt_cond: [2170, 64] first4: -0.256205 1.750170 -0.060178 1.242222 +[Debug] dit_step12_vt_uncond: [2170, 64] first4: -0.227482 1.778021 -0.026911 1.317358 +[Debug] dit_step12_vt: [2170, 64] first4: -0.364171 1.624805 -0.047159 0.969305 +[Debug] dit_step12_xt: [2170, 64] first4: 0.295476 1.587629 -0.169339 0.541890 +[DiT] step 13/50 t=0.760 +[Debug] dit_step13_vt_cond: [2170, 64] first4: -0.235268 1.684496 -0.057297 1.240694 +[Debug] dit_step13_vt_uncond: [2170, 64] first4: -0.197700 1.726529 -0.035090 1.325942 +[Debug] dit_step13_vt: [2170, 64] first4: -0.273492 1.425781 -0.144172 1.021587 +[Debug] dit_step13_xt: [2170, 64] first4: 0.300946 1.559114 -0.166456 0.521458 +[DiT] step 14/50 t=0.740 +[Debug] dit_step14_vt_cond: [2170, 64] first4: -0.215348 1.630674 -0.040594 1.249328 +[Debug] dit_step14_vt_uncond: [2170, 64] first4: -0.174794 1.672403 -0.046957 1.327406 +[Debug] dit_step14_vt: [2170, 64] first4: -0.334811 1.472660 0.055936 0.996336 +[Debug] dit_step14_xt: [2170, 64] first4: 0.307642 1.529660 -0.167575 0.501531 +[DiT] step 15/50 t=0.720 +[Debug] dit_step15_vt_cond: [2170, 64] first4: -0.197163 1.578918 -0.035665 1.254164 +[Debug] dit_step15_vt_uncond: [2170, 64] first4: -0.153038 1.617873 -0.066125 1.317649 +[Debug] dit_step15_vt: [2170, 64] first4: -0.277163 1.353290 0.013043 1.079428 +[Debug] dit_step15_xt: [2170, 64] first4: 0.313186 1.502595 -0.167835 0.479943 +[DiT] step 16/50 t=0.700 +[Debug] dit_step16_vt_cond: [2170, 64] first4: -0.178978 1.530298 -0.037333 1.260345 +[Debug] dit_step16_vt_uncond: [2170, 64] first4: -0.134279 1.564621 -0.087480 1.303262 +[Debug] dit_step16_vt: [2170, 64] first4: -0.306646 1.373590 0.150592 1.110865 +[Debug] dit_step16_xt: [2170, 64] first4: 0.319319 1.475123 -0.170847 0.457726 +[DiT] step 17/50 t=0.680 +[Debug] dit_step17_vt_cond: [2170, 64] first4: -0.160694 1.478322 -0.055287 1.256185 +[Debug] dit_step17_vt_uncond: [2170, 64] first4: -0.120116 1.510010 -0.115773 1.278657 +[Debug] dit_step17_vt: [2170, 64] first4: -0.246079 1.282344 0.092105 1.178406 +[Debug] dit_step17_xt: [2170, 64] first4: 0.324240 1.449476 -0.172689 0.434157 +[DiT] step 18/50 t=0.660 +[Debug] dit_step18_vt_cond: [2170, 64] first4: -0.143064 1.420482 -0.082808 1.247799 +[Debug] dit_step18_vt_uncond: [2170, 64] first4: -0.106988 1.460190 -0.151634 1.257033 +[Debug] dit_step18_vt: [2170, 64] first4: -0.255780 1.219606 0.175261 1.179049 +[Debug] dit_step18_xt: [2170, 64] first4: 0.329356 1.425084 -0.176195 0.410576 +[DiT] step 19/50 t=0.640 +[Debug] dit_step19_vt_cond: [2170, 64] first4: -0.128746 1.354206 -0.117595 1.227311 +[Debug] dit_step19_vt_uncond: [2170, 64] first4: -0.089307 1.410491 -0.191474 1.230714 +[Debug] dit_step19_vt: [2170, 64] first4: -0.249608 1.072434 0.109807 1.183548 +[Debug] dit_step19_xt: [2170, 64] first4: 0.334348 1.403635 -0.178391 0.386906 +[DiT] step 20/50 t=0.620 +[Debug] dit_step20_vt_cond: [2170, 64] first4: -0.119003 1.272067 -0.154096 1.204746 +[Debug] dit_step20_vt_uncond: [2170, 64] first4: -0.074174 1.352716 -0.224701 1.203207 +[Debug] dit_step20_vt: [2170, 64] first4: -0.270581 0.922657 0.097471 1.173580 +[Debug] dit_step20_xt: [2170, 64] first4: 0.339760 1.385182 -0.180340 0.363434 +[DiT] step 21/50 t=0.600 +[Debug] dit_step21_vt_cond: [2170, 64] first4: -0.119830 1.180927 -0.187708 1.173581 +[Debug] dit_step21_vt_uncond: [2170, 64] first4: -0.067616 1.288429 -0.252699 1.168760 +[Debug] dit_step21_vt: [2170, 64] first4: -0.293490 0.743300 0.018199 1.149608 +[Debug] dit_step21_xt: [2170, 64] first4: 0.345629 1.370316 -0.180704 0.340442 +[DiT] step 22/50 t=0.580 +[Debug] dit_step22_vt_cond: [2170, 64] first4: -0.122855 1.082593 -0.215883 1.140040 +[Debug] dit_step22_vt_uncond: [2170, 64] first4: -0.066099 1.222055 -0.270495 1.136248 +[Debug] dit_step22_vt: [2170, 64] first4: -0.307841 0.532235 -0.036651 1.101102 +[Debug] dit_step22_xt: [2170, 64] first4: 0.351786 1.359671 -0.179971 0.318420 +[DiT] step 23/50 t=0.560 +[Debug] dit_step23_vt_cond: [2170, 64] first4: -0.132629 0.986949 -0.241285 1.111620 +[Debug] dit_step23_vt_uncond: [2170, 64] first4: -0.070734 1.155219 -0.283975 1.102751 +[Debug] dit_step23_vt: [2170, 64] first4: -0.338930 0.377693 -0.112170 1.114765 +[Debug] dit_step23_xt: [2170, 64] first4: 0.358565 1.352118 -0.177728 0.296124 +[DiT] step 24/50 t=0.540 +[Debug] dit_step24_vt_cond: [2170, 64] first4: -0.144419 0.884916 -0.261786 1.077981 +[Debug] dit_step24_vt_uncond: [2170, 64] first4: -0.082416 1.084641 -0.291092 1.068368 +[Debug] dit_step24_vt: [2170, 64] first4: -0.333728 0.169552 -0.174635 1.048605 +[Debug] dit_step24_xt: [2170, 64] first4: 0.365239 1.348727 -0.174235 0.275152 +[DiT] step 25/50 t=0.520 +[Debug] dit_step25_vt_cond: [2170, 64] first4: -0.162952 0.778074 -0.275456 1.049087 +[Debug] dit_step25_vt_uncond: [2170, 64] first4: -0.097955 1.005611 -0.292343 1.037690 +[Debug] dit_step25_vt: [2170, 64] first4: -0.381506 0.021539 -0.229765 1.053622 +[Debug] dit_step25_xt: [2170, 64] first4: 0.372869 1.348296 -0.169640 0.254080 +[DiT] step 26/50 t=0.500 +[Debug] dit_step26_vt_cond: [2170, 64] first4: -0.181652 0.659926 -0.286432 1.013943 +[Debug] dit_step26_vt_uncond: [2170, 64] first4: -0.115043 0.914795 -0.293755 1.005103 +[Debug] dit_step26_vt: [2170, 64] first4: -0.384189 -0.243706 -0.263819 0.974856 +[Debug] dit_step26_xt: [2170, 64] first4: 0.380553 1.353170 -0.164363 0.234583 +[DiT] step 27/50 t=0.480 +[Debug] dit_step27_vt_cond: [2170, 64] first4: -0.201740 0.544023 -0.293109 0.975384 +[Debug] dit_step27_vt_uncond: [2170, 64] first4: -0.133396 0.820908 -0.294792 0.973169 +[Debug] dit_step27_vt: [2170, 64] first4: -0.422354 -0.384602 -0.284394 0.929557 +[Debug] dit_step27_xt: [2170, 64] first4: 0.389000 1.360862 -0.158675 0.215992 +[DiT] step 28/50 t=0.460 +[Debug] dit_step28_vt_cond: [2170, 64] first4: -0.224075 0.417509 -0.297748 0.929072 +[Debug] dit_step28_vt_uncond: [2170, 64] first4: -0.152221 0.713812 -0.298235 0.936752 +[Debug] dit_step28_vt: [2170, 64] first4: -0.444730 -0.577905 -0.283219 0.830288 +[Debug] dit_step28_xt: [2170, 64] first4: 0.397895 1.372420 -0.153011 0.199386 +[DiT] step 29/50 t=0.440 +[Debug] dit_step29_vt_cond: [2170, 64] first4: -0.245692 0.290209 -0.302133 0.880189 +[Debug] dit_step29_vt_uncond: [2170, 64] first4: -0.168230 0.597160 -0.303812 0.900573 +[Debug] dit_step29_vt: [2170, 64] first4: -0.492967 -0.691559 -0.283257 0.750519 +[Debug] dit_step29_xt: [2170, 64] first4: 0.407754 1.386251 -0.147346 0.184375 +[DiT] step 30/50 t=0.420 +[Debug] dit_step30_vt_cond: [2170, 64] first4: -0.267059 0.157186 -0.303854 0.831628 +[Debug] dit_step30_vt_uncond: [2170, 64] first4: -0.182529 0.468759 -0.310389 0.867764 +[Debug] dit_step30_vt: [2170, 64] first4: -0.526538 -0.832075 -0.259881 0.638692 +[Debug] dit_step30_xt: [2170, 64] first4: 0.418285 1.402893 -0.142148 0.171602 +[DiT] step 31/50 t=0.400 +[Debug] dit_step31_vt_cond: [2170, 64] first4: -0.287749 0.019218 -0.305370 0.782136 +[Debug] dit_step31_vt_uncond: [2170, 64] first4: -0.194848 0.329909 -0.318059 0.834346 +[Debug] dit_step31_vt: [2170, 64] first4: -0.575041 -0.931991 -0.245392 0.549215 +[Debug] dit_step31_xt: [2170, 64] first4: 0.429786 1.421533 -0.137240 0.160617 +[DiT] step 32/50 t=0.380 +[Debug] dit_step32_vt_cond: [2170, 64] first4: -0.304426 -0.117694 -0.305028 0.730304 +[Debug] dit_step32_vt_uncond: [2170, 64] first4: -0.204242 0.187789 -0.324302 0.800084 +[Debug] dit_step32_vt: [2170, 64] first4: -0.600474 -1.036970 -0.221470 0.433362 +[Debug] dit_step32_xt: [2170, 64] first4: 0.441795 1.442272 -0.132811 0.151950 +[DiT] step 33/50 t=0.360 +[Debug] dit_step33_vt_cond: [2170, 64] first4: -0.319238 -0.254083 -0.299488 0.682753 +[Debug] dit_step33_vt_uncond: [2170, 64] first4: -0.210244 0.042433 -0.326321 0.765284 +[Debug] dit_step33_vt: [2170, 64] first4: -0.649450 -1.121861 -0.194288 0.366186 +[Debug] dit_step33_xt: [2170, 64] first4: 0.454784 1.464709 -0.128925 0.144626 +[DiT] step 34/50 t=0.340 +[Debug] dit_step34_vt_cond: [2170, 64] first4: -0.329936 -0.381831 -0.294650 0.641700 +[Debug] dit_step34_vt_uncond: [2170, 64] first4: -0.211479 -0.098170 -0.329415 0.733958 +[Debug] dit_step34_vt: [2170, 64] first4: -0.683049 -1.194662 -0.165597 0.286680 +[Debug] dit_step34_xt: [2170, 64] first4: 0.468445 1.488602 -0.125613 0.138893 +[DiT] step 35/50 t=0.320 +[Debug] dit_step35_vt_cond: [2170, 64] first4: -0.339163 -0.510193 -0.285358 0.597418 +[Debug] dit_step35_vt_uncond: [2170, 64] first4: -0.213347 -0.237217 -0.326480 0.696784 +[Debug] dit_step35_vt: [2170, 64] first4: -0.716524 -1.285484 -0.142755 0.230145 +[Debug] dit_step35_xt: [2170, 64] first4: 0.482776 1.514312 -0.122758 0.134290 +[DiT] step 36/50 t=0.300 +[Debug] dit_step36_vt_cond: [2170, 64] first4: -0.344282 -0.639077 -0.274660 0.557109 +[Debug] dit_step36_vt_uncond: [2170, 64] first4: -0.213018 -0.378000 -0.321460 0.659633 +[Debug] dit_step36_vt: [2170, 64] first4: -0.737407 -1.359316 -0.114364 0.177555 +[Debug] dit_step36_xt: [2170, 64] first4: 0.497524 1.541498 -0.120471 0.130739 +[DiT] step 37/50 t=0.280 +[Debug] dit_step37_vt_cond: [2170, 64] first4: -0.350530 -0.754109 -0.251615 0.515208 +[Debug] dit_step37_vt_uncond: [2170, 64] first4: -0.215775 -0.505277 -0.306489 0.618527 +[Debug] dit_step37_vt: [2170, 64] first4: -0.758482 -1.436458 -0.064580 0.134758 +[Debug] dit_step37_xt: [2170, 64] first4: 0.512694 1.570228 -0.119179 0.128044 +[DiT] step 38/50 t=0.260 +[Debug] dit_step38_vt_cond: [2170, 64] first4: -0.351878 -0.862931 -0.225801 0.473748 +[Debug] dit_step38_vt_uncond: [2170, 64] first4: -0.216566 -0.629070 -0.288643 0.572433 +[Debug] dit_step38_vt: [2170, 64] first4: -0.762284 -1.485620 -0.020081 0.114390 +[Debug] dit_step38_xt: [2170, 64] first4: 0.527939 1.599940 -0.118778 0.125756 +[DiT] step 39/50 t=0.240 +[Debug] dit_step39_vt_cond: [2170, 64] first4: -0.353231 -0.974004 -0.188379 0.427506 +[Debug] dit_step39_vt_uncond: [2170, 64] first4: -0.220991 -0.753843 -0.259805 0.519853 +[Debug] dit_step39_vt: [2170, 64] first4: -0.758813 -1.561977 0.045217 0.080611 +[Debug] dit_step39_xt: [2170, 64] first4: 0.543115 1.631179 -0.119682 0.124144 +[DiT] step 40/50 t=0.220 +[Debug] dit_step40_vt_cond: [2170, 64] first4: -0.356234 -1.066793 -0.144915 0.378638 +[Debug] dit_step40_vt_uncond: [2170, 64] first4: -0.228754 -0.860472 -0.222172 0.464689 +[Debug] dit_step40_vt: [2170, 64] first4: -0.749040 -1.587260 0.097200 0.055798 +[Debug] dit_step40_xt: [2170, 64] first4: 0.558096 1.662925 -0.121626 0.123028 +[DiT] step 41/50 t=0.200 +[Debug] dit_step41_vt_cond: [2170, 64] first4: -0.355270 -1.157881 -0.092032 0.327957 +[Debug] dit_step41_vt_uncond: [2170, 64] first4: -0.236702 -0.967551 -0.178051 0.403448 +[Debug] dit_step41_vt: [2170, 64] first4: -0.709683 -1.623054 0.190026 0.058188 +[Debug] dit_step41_xt: [2170, 64] first4: 0.572290 1.695386 -0.125427 0.121864 +[DiT] step 42/50 t=0.180 +[Debug] dit_step42_vt_cond: [2170, 64] first4: -0.352479 -1.244785 -0.033309 0.267733 +[Debug] dit_step42_vt_uncond: [2170, 64] first4: -0.244508 -1.070808 -0.121544 0.336131 +[Debug] dit_step42_vt: [2170, 64] first4: -0.676820 -1.651634 0.233357 0.014431 +[Debug] dit_step42_xt: [2170, 64] first4: 0.585826 1.728418 -0.130094 0.121575 +[DiT] step 43/50 t=0.160 +[Debug] dit_step43_vt_cond: [2170, 64] first4: -0.345241 -1.321976 0.026233 0.204406 +[Debug] dit_step43_vt_uncond: [2170, 64] first4: -0.251274 -1.163180 -0.061907 0.264783 +[Debug] dit_step43_vt: [2170, 64] first4: -0.615402 -1.690314 0.311200 0.000887 +[Debug] dit_step43_xt: [2170, 64] first4: 0.598134 1.762225 -0.136318 0.121558 +[DiT] step 44/50 t=0.140 +[Debug] dit_step44_vt_cond: [2170, 64] first4: -0.330899 -1.390417 0.088632 0.138209 +[Debug] dit_step44_vt_uncond: [2170, 64] first4: -0.253711 -1.249830 -0.000380 0.191808 +[Debug] dit_step44_vt: [2170, 64] first4: -0.551020 -1.700038 0.375316 -0.049211 +[Debug] dit_step44_xt: [2170, 64] first4: 0.609155 1.796225 -0.143824 0.122542 +[DiT] step 45/50 t=0.120 +[Debug] dit_step45_vt_cond: [2170, 64] first4: -0.315962 -1.445453 0.152387 0.071900 +[Debug] dit_step45_vt_uncond: [2170, 64] first4: -0.260821 -1.326647 0.053483 0.118309 +[Debug] dit_step45_vt: [2170, 64] first4: -0.444048 -1.697294 0.502791 -0.074117 +[Debug] dit_step45_xt: [2170, 64] first4: 0.618036 1.830171 -0.153880 0.124024 +[DiT] step 46/50 t=0.100 +[Debug] dit_step46_vt_cond: [2170, 64] first4: -0.294778 -1.496297 0.212962 0.010717 +[Debug] dit_step46_vt_uncond: [2170, 64] first4: -0.258755 -1.395077 0.130780 0.026842 +[Debug] dit_step46_vt: [2170, 64] first4: -0.331878 -1.748994 0.398840 0.016598 +[Debug] dit_step46_xt: [2170, 64] first4: 0.624673 1.865151 -0.161857 0.123692 +[DiT] step 47/50 t=0.080 +[Debug] dit_step47_vt_cond: [2170, 64] first4: -0.281226 -1.541478 0.262625 -0.022201 +[Debug] dit_step47_vt_uncond: [2170, 64] first4: -0.279224 -1.453849 0.178946 -0.018997 +[Debug] dit_step47_vt: [2170, 64] first4: -0.212368 -1.759161 0.579829 -0.049871 +[Debug] dit_step47_xt: [2170, 64] first4: 0.628921 1.900334 -0.173453 0.124690 +[DiT] step 48/50 t=0.060 +[Debug] dit_step48_vt_cond: [2170, 64] first4: -0.256958 -1.573266 0.310890 -0.048733 +[Debug] dit_step48_vt_uncond: [2170, 64] first4: -0.281065 -1.517397 0.260529 -0.069026 +[Debug] dit_step48_vt: [2170, 64] first4: -0.191601 -1.679712 0.330919 0.028046 +[Debug] dit_step48_xt: [2170, 64] first4: 0.632753 1.933929 -0.180072 0.124129 +[DiT] step 49/50 t=0.040 +[Debug] dit_step49_vt_cond: [2170, 64] first4: -0.282571 -1.604237 0.314485 -0.067221 +[Debug] dit_step49_vt_uncond: [2170, 64] first4: -0.305600 -1.528070 0.251822 -0.083237 +[Debug] dit_step49_vt: [2170, 64] first4: -0.212302 -1.898327 0.637213 -0.078416 +[Debug] dit_x0: [2170, 64] first4: 0.636999 1.971895 -0.192816 0.125697 +[DiT] step 50/50 t=0.020 +[DiT] Total generation: 88329.8 ms (88329.8 ms/sample) +[Debug] dit_output: [2170, 64] first4: 0.636999 1.971895 -0.192816 0.125697 +[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) +[VAE] Graph: 474 nodes, T_latent=192 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_im2col_f16', name = 'kernel_im2col_f16' +ggml_metal_library_compile_pipeline: loaded kernel_im2col_f16 0x13070d7f0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=0' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=0 0x13070e360 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0 0x13070e5c0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0' +ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0 0x13070eb20 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sin_f32_4', name = 'kernel_sin_f32_4' +ggml_metal_library_compile_pipeline: loaded kernel_sin_f32_4 0x13070eea0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sqr_f32_4', name = 'kernel_sqr_f32_4' +ggml_metal_library_compile_pipeline: loaded kernel_sqr_f32_4 0x13070f4a0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_conv_transpose_1d_f32_f32', name = 'kernel_conv_transpose_1d_f32_f32' +ggml_metal_library_compile_pipeline: loaded kernel_conv_transpose_1d_f32_f32 0x13070f8c0 | th_max = 1024 | th_width = 32 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_f16_f16_4', name = 'kernel_mul_mv_f16_f16_4_nsg=4' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_f16_f16_4_nsg=4 0x130710c50 | th_max = 1024 | th_width = 32 +[VAE] Upsample factor: 1920.00 (expected ~1920) +[VAE] Graph: 474 nodes, T_latent=256 +[VAE] Graph: 474 nodes, T_latent=186 +ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=1' +ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=1 0x130710eb0 | th_max = 896 | th_width = 32 +[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) +[VAE Batch0] Decode: 609578.6 ms +[Debug] vae_audio: [2, 4166400] first4: -0.002759 -0.002685 -0.002611 -0.002214 +[VAE Batch0] Wrote ggml-sft/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) +[Request 1/1] Done +ggml_metal_free: deallocating +ggml_metal_free: deallocating +[Pipeline] All done +[Request] Loaded request0.json +[Noise] Reusing existing rng_philox_seed42.bf16 +[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q8_0.gguf +[GGML] Running acestep-v15-turbo-Q8_0.gguf... +[GGML] Done, 47 dump files +[Turbo] Reusing existing Python dumps: python-turbo +[Turbo] Cosine similarities GGML vs Python + stage GGML vs Python + text_hidden 0.999813 + lyric_embed 1.000000 + enc_hidden 0.999792 + detok_output 0.999991 + context 0.999996 + noise 1.000000 + temb_t 0.999998 + hidden_after_proj_in 0.999992 + enc_after_cond_emb 0.999776 + layer0_sa_output 0.999956 + hidden_after_layer0 0.999975 + hidden_after_layer6 0.999915 + hidden_after_layer12 0.999185 + hidden_after_layer18 0.996490 + hidden_after_layer23 0.993375 + dit_step0_vt 0.974442 + dit_step0_xt 0.999944 + dit_step1_vt 0.976442 + dit_step1_xt 0.999818 + dit_step2_vt 0.978398 + dit_step2_xt 0.999498 + dit_step3_vt 0.979729 + dit_step3_xt 0.998787 + dit_step4_vt 0.979038 + dit_step4_xt 0.997189 + dit_step5_vt 0.976705 + dit_step5_xt 0.993692 + dit_step6_vt 0.973710 + dit_step6_xt 0.987147 + dit_step7_vt 0.967471 + dit_x0 0.977589 + vae_audio 0.899969 + vae_audio (log spectral) 0.999797 +[Turbo] Error growth GGML vs Python + stage cos max_err mean_err mean_A std_A mean_B std_B + dit_step0_xt 0.999943 0.142558 0.006797 -0.002331 0.972917 -0.002342 0.972003 + dit_step1_xt 0.999817 0.272611 0.011800 -0.005326 0.942672 -0.005313 0.941730 + dit_step2_xt 0.999496 0.463653 0.018437 -0.009355 0.909197 -0.009311 0.908527 + dit_step3_xt 0.998785 0.659420 0.027271 -0.014709 0.873849 -0.014577 0.873624 + dit_step4_xt 0.997188 0.977590 0.039587 -0.021771 0.842053 -0.021660 0.841995 + dit_step5_xt 0.993691 1.450203 0.057714 -0.031846 0.825442 -0.032109 0.824593 + dit_step6_xt 0.987145 2.144326 0.085260 -0.046128 0.856513 -0.046482 0.855546 +[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-Q8_0.gguf +[GGML] Running acestep-v15-sft-Q8_0.gguf... +[GGML] Done, 233 dump files +[SFT] Reusing existing Python dumps: python-sft +[SFT] Cosine similarities GGML vs Python + stage GGML vs Python + text_hidden 0.999813 + lyric_embed 1.000000 + enc_hidden 0.999792 + detok_output 0.999991 + context 0.999996 + noise 1.000000 + temb_t 0.999994 + hidden_after_proj_in 0.999993 + enc_after_cond_emb 0.999779 + layer0_sa_output 0.999948 + hidden_after_layer0 0.999975 + hidden_after_layer6 0.999842 + hidden_after_layer12 0.999467 + hidden_after_layer18 0.998721 + hidden_after_layer23 0.998987 + null_condition_emb 1.000000 + null_enc_hidden 1.000000 + dit_step0_vt_cond 0.998936 + dit_step0_vt_uncond 0.998589 + dit_step0_vt 0.995617 + dit_step0_xt 0.999998 + dit_step5_vt_cond 0.999453 + dit_step5_vt 0.993749 + dit_step5_xt 0.999962 + dit_step10_vt_cond 0.998786 + dit_step10_vt 0.993632 + dit_step10_xt 0.999883 + dit_step15_vt_cond 0.996925 + dit_step15_vt 0.985179 + dit_step15_xt 0.999650 + dit_step20_vt_cond 0.993789 + dit_step20_vt 0.978156 + dit_step20_xt 0.998994 + dit_step25_vt_cond 0.988666 + dit_step25_vt 0.968588 + dit_step25_xt 0.997635 + dit_step30_vt_cond 0.983353 + dit_step30_vt 0.963692 + dit_step30_xt 0.995502 + dit_step35_vt_cond 0.978311 + dit_step35_vt 0.954994 + dit_step35_xt 0.992900 + dit_step40_vt_cond 0.975242 + dit_step40_vt 0.949054 + dit_step40_xt 0.990408 + dit_step45_vt_cond 0.977875 + dit_step45_vt 0.949872 + dit_step45_xt 0.988656 + dit_step49_vt_cond 0.980007 + dit_step49_vt 0.943555 + dit_x0 0.988056 + vae_audio 0.945079 + vae_audio (log spectral) 0.999659 +[SFT] Error growth GGML vs Python + stage cos max_err mean_err mean_A std_A mean_B std_B + dit_step0_xt 0.999997 0.038313 0.002069 -0.001710 0.980019 -0.001741 0.980402 + dit_step5_xt 0.999960 0.128136 0.005945 -0.006874 0.889005 -0.007143 0.887999 + dit_step10_xt 0.999882 0.212035 0.009258 -0.012273 0.810355 -0.012603 0.811299 + dit_step15_xt 0.999649 0.310457 0.014288 -0.017479 0.745211 -0.018114 0.745269 + dit_step20_xt 0.998994 0.579346 0.021839 -0.022740 0.699641 -0.023808 0.699582 + dit_step25_xt 0.997635 0.874619 0.031657 -0.028120 0.678310 -0.029311 0.679278 + dit_step30_xt 0.995501 1.140020 0.043494 -0.033543 0.684534 -0.035027 0.685262 + dit_step35_xt 0.992900 1.761304 0.057050 -0.038898 0.716898 -0.040716 0.717196 + dit_step40_xt 0.990407 2.128224 0.071732 -0.044231 0.770985 -0.046462 0.771853 + dit_step45_xt 0.988655 2.420490 0.085663 -0.050087 0.842526 -0.052475 0.843036 diff --git a/tests/Vulkan-BF16.log b/tests/Vulkan-BF16.log new file mode 100644 index 0000000..2d955d7 --- /dev/null +++ b/tests/Vulkan-BF16.log @@ -0,0 +1,259 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | uma: 0 | fp16: 1 | bf16: 0 | warp size: 32 | shared memory: 49152 | int dot: 1 | matrix cores: NV_coopmat2 +[Load] DiT backend: Vulkan0 (CPU threads: 16) +[Load] Backend init: 260.3 ms +[GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 +[DiT] Self-attn: Q+K+V fused +[DiT] Cross-attn: Q+K+V fused +[DiT] MLP: gate+up fused +[Load] null_condition_emb found (CFG available) +[WeightCtx] Loaded 478 tensors, 3007.9 MB into backend +[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 +[Load] DiT weight load: 397.7 ms +[GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 +[Load] silence_latent: [15000, 64] from GGUF +[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 +[Load] VAE backend: Vulkan0 (CPU threads: 16) +[VAE] Backend: Vulkan0, Weight buffer: 161.1 MB +[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations +[Load] VAE weights: 672.5 ms +[Request 1/1] ggml-turbo/request0.json (batch=1) +[Request] parsed ggml-turbo/request0.json (18 fields) +[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) +[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s +[Pipeline] 434 audio codes (86.8s @ 5Hz) +[Pipeline] T=2170, S=1085 +[BPE] Loaded from GGUF: 151643 vocab, 151387 merges +[Load] BPE tokenizer: 32.1 ms +[Pipeline] caption: 70 tokens, lyrics: 167 tokens +[Load] TextEncoder backend: Vulkan0 (CPU threads: 16) +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[Load] TextEncoder: 28L, H=1024, Nh=16/8 +[Qwen3] Attn: Q+K+V fused +[Qwen3] MLP: gate+up fused +[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend +[Load] TextEncoder: 166.9 ms +[Encode] TextEncoder (70 tokens): 30.9 ms +[Debug] text_hidden: [70, 1024] first4: 3.705836 2.395382 0.221845 -13.145830 +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[Encode] Lyric vocab lookup (167 tokens): 11.2 ms +[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 +[Load] CondEncoder backend: Vulkan0 (CPU threads: 16) +[GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 +[Load] LyricEncoder: 8L +[Qwen3] Attn: Q+K+V fused +[Qwen3] MLP: gate+up fused +[Load] TimbreEncoder: 4L +[Qwen3] Attn: Q+K+V fused +[Qwen3] MLP: gate+up fused +[WeightCtx] Loaded 140 tensors, 1160.5 MB into backend +[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond +[Load] ConditionEncoder: 163.7 ms +[CondEnc] Lyric sliding mask: 167x167, window=128 +[CondEnc] Timbre sliding mask: 750x750, window=128 +[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens +[Encode] ConditionEncoder: 22.5 ms, enc_S=238 +[Debug] enc_hidden: [238, 2048] first4: 1.758148 -0.049593 -0.132730 0.058488 +[GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 +[WeightCtx] Loaded 30 tensors, 200.3 MB into backend +[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) +[Load] Detokenizer: 28.1 ms +[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) +[Context] Detokenizer: 229.8 ms +[Debug] detok_output: [2170, 64] first4: -0.125193 1.435010 0.308190 -0.624228 +[Context Batch0] Philox noise seed=42, [2170, 64] +[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 +[Debug] context: [2170, 128] first4: -0.125193 1.435010 0.308190 -0.624228 +[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1 +[DiT] Batch N=1, T=2170, S=1085, enc_S=238 +[DiT] Graph: 1841 nodes +[Debug] tproj: [12288] first4: 0.260062 -0.161562 -0.097030 0.052313 +[Debug] temb: [2048] first4: 0.000069 -0.132499 -0.035430 0.064753 +[Debug] temb_t: [2048] first4: 0.001065 0.026818 -0.052754 0.063717 +[Debug] temb_r: [2048] first4: -0.000996 -0.159317 0.017323 0.001036 +[Debug] sinusoidal_t: [256] first4: 0.562486 0.789701 0.439822 -0.023583 +[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000 +[Debug] temb_lin1_t: [2048] first4: -0.049318 -0.051829 -0.014251 -0.038444 +[Debug] temb_lin1_r: [2048] first4: -0.013266 -0.018319 -0.016375 0.008532 +[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.039551 -0.969299 0.536133 0.446747 +[Debug] proj_in_input: [192, 2170] first4: -0.125193 1.435010 0.308190 -0.624228 +[Debug] enc_after_cond_emb: [2048, 238] first4: -0.168464 0.814954 0.327714 -0.561971 +[Debug] layer0_sa_input: [2048, 1085] first4: -0.719110 -0.764019 -0.047328 0.261808 +[Debug] layer0_q_after_rope: [128, 16] first4: -2.424376 -0.094810 -0.411903 1.007324 +[Debug] layer0_k_after_rope: [128, 8] first4: -12.712339 1.106410 1.775920 1.780798 +[Debug] layer0_sa_output: [2048, 1085] first4: -1.501171 0.169176 -0.355798 0.513027 +[Debug] layer0_attn_out: [2048, 1085] first4: -1.540742 -1.044333 0.188720 0.456093 +[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.540742 -1.044333 0.188720 0.456093 +[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.598325 -0.820241 -0.296337 0.493580 +[Debug] hidden_after_layer0: [2048, 1085] first4: -9.091503 0.566892 52.584164 -0.903901 +[Debug] hidden_after_layer6: [2048, 1085] first4: -21.192070 0.040278 33.599442 -4.442998 +[Debug] hidden_after_layer12: [2048, 1085] first4: -15.068191 -18.118078 71.999359 28.597229 +[Debug] hidden_after_layer18: [2048, 1085] first4: -27.132679 15.867422 60.847614 20.940519 +[Debug] hidden_after_layer23: [2048, 1085] first4: -12.584854 45.152912 198.753845 145.517029 +[Debug] dit_step0_vt: [2170, 64] first4: 0.014936 1.119046 0.345802 2.379982 +[Debug] dit_step0_xt: [2170, 64] first4: 0.193657 2.105384 -0.187593 0.739475 +[DiT] step 1/8 t=1.000 +[Debug] dit_step1_vt: [2170, 64] first4: 0.086700 0.854980 -0.273651 1.728149 +[Debug] dit_step1_xt: [2170, 64] first4: 0.188928 2.058749 -0.172667 0.645212 +[DiT] step 2/8 t=0.955 +[Debug] dit_step2_vt: [2170, 64] first4: 0.180420 0.837399 -0.150421 2.056976 +[Debug] dit_step2_xt: [2170, 64] first4: 0.176900 2.002922 -0.162639 0.508081 +[DiT] step 3/8 t=0.900 +[Debug] dit_step3_vt: [2170, 64] first4: 0.130821 0.833313 0.053528 2.193359 +[Debug] dit_step3_xt: [2170, 64] first4: 0.165998 1.933480 -0.167099 0.325301 +[DiT] step 4/8 t=0.833 +[Debug] dit_step4_vt: [2170, 64] first4: 0.273712 0.866425 0.216686 2.274872 +[Debug] dit_step4_xt: [2170, 64] first4: 0.136672 1.840648 -0.190316 0.081565 +[DiT] step 5/8 t=0.750 +[Debug] dit_step5_vt: [2170, 64] first4: 0.347900 0.772171 0.542953 2.248352 +[Debug] dit_step5_xt: [2170, 64] first4: 0.086972 1.730338 -0.267881 -0.239629 +[DiT] step 6/8 t=0.643 +[Debug] dit_step6_vt: [2170, 64] first4: 0.132820 0.664673 0.218246 2.387787 +[Debug] dit_step6_xt: [2170, 64] first4: 0.060408 1.597404 -0.311530 -0.717186 +[DiT] step 7/8 t=0.500 +[Debug] dit_step7_vt: [2170, 64] first4: -0.335976 0.323303 0.198029 2.726624 +[Debug] dit_x0: [2170, 64] first4: 0.161200 1.500413 -0.370938 -1.535173 +[DiT] step 8/8 t=0.300 +[DiT] Total generation: 740.5 ms (740.5 ms/sample) +[Debug] dit_output: [2170, 64] first4: 0.161200 1.500413 -0.370938 -1.535173 +[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) +[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Upsample factor: 1920.00 (expected ~1920) +[VAE] Graph: 417 nodes, T_latent=256 +[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) +[VAE Batch0] Decode: 9812.1 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000591 0.001078 0.000929 0.001296 +[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) +[Request 1/1] Done +[Pipeline] All done +2026-03-01 19:55:13.398 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. +2026-03-01 19:55:13.398 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora +2026-03-01 19:55:13.399 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. +2026-03-01 19:55:13.399 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. +2026-03-01 19:55:13.399 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. +2026-03-01 19:55:14.155 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +`torch_dtype` is deprecated! Use `dtype` instead! +2026-03-01 19:55:15.664 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... +2026-03-01 19:55:15.664 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... +2026-03-01 19:55:15.669 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-01 19:55:15.830 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-01 19:55:15.831 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +====================================================================== +2026-03-01 19:55:15.831 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-01 19:55:15.831 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-01 19:55:15.831 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +# Instruction +Generate audio semantic tokens based on the given conditions: + +# Caption +An upbeat and anthemic pop-rock track driven by bright, slightly overdriven + +# Metas +- bpm: 83 +- timesignature: 4 +- keyscale: G major +- duration: 88 seconds +<|endoftext|> + +2026-03-01 19:55:15.831 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-01 19:55:15.831 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +# Languages +fr + +# Lyric +# Lyric +[Intro - Guitar Riff] +[Verse 1] +Dans le monde des tutos virtuels +G ta toise en nouvelle passion +Avec Ggendoline et Pumbé à midi +La communauté, c'est l'unité +Quel joie, une clé + +[Chorus] +Dans le monde des tutos virtuels +Gândoline et Pumbé à midi +Une famille à connecter, c'est vrai +D'un enfant qui voit toi fusionner + +[Guitar Solo] + +[Verse 2] +Dans le monde des tutos virtuels +Gândoline, Pumbé à midi +Une famille à connecter, c'est vrai +D'un enfant qui voit toi fusionner<|endoftext|> +2026-03-01 19:55:15.831 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== + +2026-03-01 19:55:15.838 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-01 19:55:15.850 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-01 19:55:15.851 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-01 19:55:15.885 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-01 19:55:16.193 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... +2026-03-01 19:55:16.193 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-01 19:55:16.193 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006814241409301758, 'diffusion_time_cost': 0.30007076263427734, 'diffusion_per_step_time_cost': 0.03750884532928467, 'total_time_cost': 0.3068850040435791, 'offload_time_cost': 0.0} +2026-03-01 19:55:16.208 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-01 19:55:16.210 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-01 19:55:16.210 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB +2026-03-01 19:55:16.210 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... +2026-03-01 19:55:16.210 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB +2026-03-01 19:55:16.210 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB +2026-03-01 19:55:16.210 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-01 19:55:16.485 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-01 19:55:16.488 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-01 19:55:16.491 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +[Request] Loaded request0.json +[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-BF16.gguf +[GGML] Running acestep-v15-turbo-BF16.gguf... +[GGML] Done, 47 dump files +[Python] Initializing acestep-v15-turbo... +[Python] Generating (acestep-v15-turbo, 8 steps)... +Using precomputed LM hints +Using precomputed LM hints +[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) +[Python] Done, 40 dump files +[Turbo] Cosine similarities GGML vs Python + stage GGML vs Python + text_hidden 0.999812 + lyric_embed 1.000000 + enc_hidden 0.999834 + detok_output 0.999997 + context 0.999998 + noise 1.000000 + temb_t 0.999999 + hidden_after_proj_in 0.999987 + enc_after_cond_emb 0.999825 + layer0_sa_output 0.999959 + hidden_after_layer0 0.999982 + hidden_after_layer6 0.999916 + hidden_after_layer12 0.999276 + hidden_after_layer18 0.996645 + hidden_after_layer23 0.993735 + dit_step0_vt 0.975502 + dit_step0_xt 0.999946 + dit_step1_vt 0.898326 + dit_step1_xt 0.999578 + dit_step2_vt 0.893586 + dit_step2_xt 0.998276 + dit_step3_vt 0.881101 + dit_step3_xt 0.994720 + dit_step4_vt 0.869138 + dit_step4_xt 0.986137 + dit_step5_vt 0.854878 + dit_step5_xt 0.965846 + dit_step6_vt 0.840298 + dit_step6_xt 0.925771 + dit_step7_vt 0.818271 + dit_x0 0.867399 + vae_audio 0.680412 + vae_audio (STFT cosine) 0.855380 +[Turbo] Error growth GGML vs Python + stage cos max_err mean_err mean_A std_A mean_B std_B + dit_step0_xt 0.999946 0.135811 0.006633 -0.002316 0.972919 -0.002342 0.972003 + dit_step1_xt 0.999578 0.413265 0.019706 -0.005121 0.942541 -0.005313 0.941730 + dit_step2_xt 0.998276 0.811472 0.038208 -0.008968 0.908957 -0.009311 0.908527 + dit_step3_xt 0.994720 1.481150 0.064047 -0.014385 0.872574 -0.014577 0.873624 + dit_step4_xt 0.986137 1.857148 0.100272 -0.021489 0.837038 -0.021660 0.841995 + dit_step5_xt 0.965846 1.439633 0.154129 -0.031859 0.812819 -0.032109 0.824593 + dit_step6_xt 0.925771 2.125688 0.235367 -0.046759 0.832442 -0.046482 0.855546 diff --git a/tests/Vulkan-CPU_Q6_K.log b/tests/Vulkan-CPU_Q6_K.log new file mode 100644 index 0000000..8912047 --- /dev/null +++ b/tests/Vulkan-CPU_Q6_K.log @@ -0,0 +1,54 @@ +[Request] Loaded request0.json +[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q6_K.gguf +[GGML] Running acestep-v15-turbo-Q6_K.gguf... +[GGML] Done, 47 dump files +[Python] Initializing acestep-v15-turbo... +[Python] Generating (acestep-v15-turbo, 8 steps)... +Using precomputed LM hints +Using precomputed LM hints +[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) +[Python] Done, 40 dump files +[Turbo] Cosine similarities GGML vs Python + stage GGML vs Python + text_hidden 0.999812 + lyric_embed 1.000000 + enc_hidden 0.999665 + detok_output 0.999972 + context 0.999982 + noise 1.000000 + temb_t 0.999990 + hidden_after_proj_in 0.999982 + enc_after_cond_emb 0.999691 + layer0_sa_output 0.999774 + hidden_after_layer0 0.999710 + hidden_after_layer6 0.999855 + hidden_after_layer12 0.998856 + hidden_after_layer18 0.995803 + hidden_after_layer23 0.992072 + dit_step0_vt 0.970064 + dit_step0_xt 0.999934 + dit_step1_vt 0.924564 + dit_step1_xt 0.999651 + dit_step2_vt 0.915541 + dit_step2_xt 0.998650 + dit_step3_vt 0.915489 + dit_step3_xt 0.996123 + dit_step4_vt 0.916835 + dit_step4_xt 0.990527 + dit_step5_vt 0.909275 + dit_step5_xt 0.977470 + dit_step6_vt 0.899986 + dit_step6_xt 0.952353 + dit_step7_vt 0.880023 + dit_x0 0.915268 + vae_audio 0.753562 + vae_audio (STFT cosine) 0.882452 +[Turbo] Error growth GGML vs Python + stage cos max_err mean_err mean_A std_A mean_B std_B + dit_step0_xt 0.999934 0.147239 0.007394 -0.002260 0.973056 -0.002342 0.972003 + dit_step1_xt 0.999651 0.410402 0.017745 -0.005286 0.943565 -0.005313 0.941730 + dit_step2_xt 0.998650 0.806730 0.033672 -0.009524 0.911097 -0.009311 0.908527 + dit_step3_xt 0.996123 1.479887 0.054500 -0.015235 0.876469 -0.014577 0.873624 + dit_step4_xt 0.990527 2.298363 0.081794 -0.022731 0.844225 -0.021660 0.841995 + dit_step5_xt 0.977470 3.296017 0.123177 -0.033626 0.825405 -0.032109 0.824593 + dit_step6_xt 0.952353 4.545029 0.185597 -0.049157 0.851892 -0.046482 0.855546 diff --git a/tests/Vulkan-Q4_K_M.log b/tests/Vulkan-Q4_K_M.log new file mode 100644 index 0000000..011c0c3 --- /dev/null +++ b/tests/Vulkan-Q4_K_M.log @@ -0,0 +1,259 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | uma: 0 | fp16: 1 | bf16: 0 | warp size: 32 | shared memory: 49152 | int dot: 1 | matrix cores: NV_coopmat2 +[Load] DiT backend: Vulkan0 (CPU threads: 16) +[Load] Backend init: 115.6 ms +[GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 +[DiT] Self-attn: Q+K fused, V separate +[DiT] Cross-attn: all separate +[DiT] MLP: gate+up fused +[Load] null_condition_emb found (CFG available) +[WeightCtx] Loaded 478 tensors, 895.6 MB into backend +[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 +[Load] DiT weight load: 126.7 ms +[GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 +[Load] silence_latent: [15000, 64] from GGUF +[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 +[Load] VAE backend: Vulkan0 (CPU threads: 16) +[VAE] Backend: Vulkan0, Weight buffer: 161.1 MB +[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations +[Load] VAE weights: 667.9 ms +[Request 1/1] ggml-turbo/request0.json (batch=1) +[Request] parsed ggml-turbo/request0.json (18 fields) +[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) +[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s +[Pipeline] 434 audio codes (86.8s @ 5Hz) +[Pipeline] T=2170, S=1085 +[BPE] Loaded from GGUF: 151643 vocab, 151387 merges +[Load] BPE tokenizer: 31.0 ms +[Pipeline] caption: 70 tokens, lyrics: 167 tokens +[Load] TextEncoder backend: Vulkan0 (CPU threads: 16) +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[Load] TextEncoder: 28L, H=1024, Nh=16/8 +[Qwen3] Attn: Q+K+V fused +[Qwen3] MLP: gate+up fused +[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend +[Load] TextEncoder: 166.1 ms +[Encode] TextEncoder (70 tokens): 18.4 ms +[Debug] text_hidden: [70, 1024] first4: 3.705836 2.395382 0.221845 -13.145830 +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[Encode] Lyric vocab lookup (167 tokens): 11.3 ms +[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 +[Load] CondEncoder backend: Vulkan0 (CPU threads: 16) +[GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 +[Load] LyricEncoder: 8L +[Qwen3] Attn: Q+K fused, V separate +[Qwen3] MLP: gate+up fused +[Load] TimbreEncoder: 4L +[Qwen3] Attn: Q+K fused, V separate +[Qwen3] MLP: gate+up fused +[WeightCtx] Loaded 140 tensors, 352.5 MB into backend +[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond +[Load] ConditionEncoder: 43.9 ms +[CondEnc] Lyric sliding mask: 167x167, window=128 +[CondEnc] Timbre sliding mask: 750x750, window=128 +[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens +[Encode] ConditionEncoder: 18.2 ms, enc_S=238 +[Debug] enc_hidden: [238, 2048] first4: 1.760519 -0.046675 -0.129011 0.057651 +[GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 +[WeightCtx] Loaded 30 tensors, 64.7 MB into backend +[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) +[Load] Detokenizer: 8.9 ms +[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) +[Context] Detokenizer: 152.2 ms +[Debug] detok_output: [2170, 64] first4: -0.107345 1.442038 0.300564 -0.641466 +[Context Batch0] Philox noise seed=42, [2170, 64] +[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 +[Debug] context: [2170, 128] first4: -0.107345 1.442038 0.300564 -0.641466 +[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1 +[DiT] Batch N=1, T=2170, S=1085, enc_S=238 +[DiT] Graph: 1775 nodes +[Debug] tproj: [12288] first4: 0.260934 -0.160421 -0.090493 0.048629 +[Debug] temb: [2048] first4: 0.000206 -0.133914 -0.034444 0.065020 +[Debug] temb_t: [2048] first4: 0.000970 0.025693 -0.052101 0.063331 +[Debug] temb_r: [2048] first4: -0.000764 -0.159607 0.017657 0.001690 +[Debug] sinusoidal_t: [256] first4: 0.562486 0.789701 0.439822 -0.023583 +[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000 +[Debug] temb_lin1_t: [2048] first4: -0.049286 -0.053324 -0.012254 -0.047666 +[Debug] temb_lin1_r: [2048] first4: -0.015463 -0.031534 -0.021259 0.006135 +[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.048340 -0.991272 0.525635 0.454071 +[Debug] proj_in_input: [192, 2170] first4: -0.107345 1.442038 0.300564 -0.641466 +[Debug] enc_after_cond_emb: [2048, 238] first4: -0.176880 0.743576 0.273499 -0.548842 +[Debug] layer0_sa_input: [2048, 1085] first4: -0.723765 -0.772117 -0.042278 0.260597 +[Debug] layer0_q_after_rope: [128, 16] first4: -3.943359 0.398682 0.213257 0.700195 +[Debug] layer0_k_after_rope: [128, 8] first4: -0.176880 0.743576 0.273499 -0.548842 +[Debug] layer0_sa_output: [2048, 1085] first4: -1.613281 0.155151 -0.481201 0.457520 +[Debug] layer0_attn_out: [2048, 1085] first4: -12.139185 0.824881 1.501430 1.799707 +[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.581965 -1.059581 0.060089 0.462956 +[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.685481 -0.828136 -0.442840 0.506230 +[Debug] hidden_after_layer0: [2048, 1085] first4: -8.767639 0.404994 47.213272 -0.751820 +[Debug] hidden_after_layer6: [2048, 1085] first4: -11.862045 -4.874043 33.389240 -6.747426 +[Debug] hidden_after_layer12: [2048, 1085] first4: -0.032505 3.430909 11.062031 -3.459812 +[Debug] hidden_after_layer18: [2048, 1085] first4: -3.097944 5.710473 -3.142628 -23.355347 +[Debug] hidden_after_layer23: [2048, 1085] first4: -48.737732 95.176071 35.848183 73.305969 +[Debug] dit_step0_vt: [2170, 64] first4: 0.669312 0.442215 1.300629 2.101841 +[Debug] dit_step0_xt: [2170, 64] first4: 0.163913 2.136149 -0.230995 0.752118 +[DiT] step 1/8 t=1.000 +[Debug] dit_step1_vt: [2170, 64] first4: 1.120422 0.593113 1.031189 1.813599 +[Debug] dit_step1_xt: [2170, 64] first4: 0.102799 2.103798 -0.287241 0.653194 +[DiT] step 2/8 t=0.955 +[Debug] dit_step2_vt: [2170, 64] first4: 1.381363 0.295410 1.456146 1.949341 +[Debug] dit_step2_xt: [2170, 64] first4: 0.010708 2.084104 -0.384318 0.523238 +[DiT] step 3/8 t=0.900 +[Debug] dit_step3_vt: [2170, 64] first4: 1.440727 0.067017 1.481567 2.158554 +[Debug] dit_step3_xt: [2170, 64] first4: -0.109353 2.078519 -0.507782 0.343359 +[DiT] step 4/8 t=0.833 +[Debug] dit_step4_vt: [2170, 64] first4: 1.377216 0.234177 1.413437 2.181564 +[Debug] dit_step4_xt: [2170, 64] first4: -0.256912 2.053428 -0.659221 0.109620 +[DiT] step 5/8 t=0.750 +[Debug] dit_step5_vt: [2170, 64] first4: 1.135239 0.376801 1.055233 2.272675 +[Debug] dit_step5_xt: [2170, 64] first4: -0.419089 1.999600 -0.809969 -0.215048 +[DiT] step 6/8 t=0.643 +[Debug] dit_step6_vt: [2170, 64] first4: 0.948242 0.399368 0.426941 2.645081 +[Debug] dit_step6_xt: [2170, 64] first4: -0.608737 1.919726 -0.895357 -0.744064 +[DiT] step 7/8 t=0.500 +[Debug] dit_step7_vt: [2170, 64] first4: 0.549133 -0.167076 0.379578 2.984619 +[Debug] dit_x0: [2170, 64] first4: -0.773477 1.969849 -1.009230 -1.639450 +[DiT] step 8/8 t=0.300 +[DiT] Total generation: 263.6 ms (263.6 ms/sample) +[Debug] dit_output: [2170, 64] first4: -0.773477 1.969849 -1.009230 -1.639450 +[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) +[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Upsample factor: 1920.00 (expected ~1920) +[VAE] Graph: 417 nodes, T_latent=256 +[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) +[VAE Batch0] Decode: 9686.3 ms +[Debug] vae_audio: [2, 4166400] first4: 0.015021 0.018215 0.017495 0.016521 +[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) +[Request 1/1] Done +[Pipeline] All done +2026-03-01 19:56:19.059 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. +2026-03-01 19:56:19.060 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora +2026-03-01 19:56:19.060 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. +2026-03-01 19:56:19.060 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. +2026-03-01 19:56:19.060 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. +2026-03-01 19:56:19.832 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +`torch_dtype` is deprecated! Use `dtype` instead! +2026-03-01 19:56:21.417 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... +2026-03-01 19:56:21.417 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... +2026-03-01 19:56:21.428 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-01 19:56:21.589 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-01 19:56:21.591 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +====================================================================== +2026-03-01 19:56:21.591 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-01 19:56:21.591 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-01 19:56:21.591 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +# Instruction +Generate audio semantic tokens based on the given conditions: + +# Caption +An upbeat and anthemic pop-rock track driven by bright, slightly overdriven + +# Metas +- bpm: 83 +- timesignature: 4 +- keyscale: G major +- duration: 88 seconds +<|endoftext|> + +2026-03-01 19:56:21.591 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-01 19:56:21.591 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +# Languages +fr + +# Lyric +# Lyric +[Intro - Guitar Riff] +[Verse 1] +Dans le monde des tutos virtuels +G ta toise en nouvelle passion +Avec Ggendoline et Pumbé à midi +La communauté, c'est l'unité +Quel joie, une clé + +[Chorus] +Dans le monde des tutos virtuels +Gândoline et Pumbé à midi +Une famille à connecter, c'est vrai +D'un enfant qui voit toi fusionner + +[Guitar Solo] + +[Verse 2] +Dans le monde des tutos virtuels +Gândoline, Pumbé à midi +Une famille à connecter, c'est vrai +D'un enfant qui voit toi fusionner<|endoftext|> +2026-03-01 19:56:21.591 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== + +2026-03-01 19:56:21.597 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-01 19:56:21.610 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-01 19:56:21.610 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-01 19:56:21.642 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-01 19:56:21.955 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... +2026-03-01 19:56:21.956 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-01 19:56:21.956 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006905794143676758, 'diffusion_time_cost': 0.3056776523590088, 'diffusion_per_step_time_cost': 0.0382097065448761, 'total_time_cost': 0.31258344650268555, 'offload_time_cost': 0.0} +2026-03-01 19:56:21.970 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-01 19:56:21.973 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-01 19:56:21.973 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB +2026-03-01 19:56:21.973 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... +2026-03-01 19:56:21.973 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB +2026-03-01 19:56:21.973 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB +2026-03-01 19:56:21.973 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-01 19:56:22.249 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-01 19:56:22.252 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-01 19:56:22.255 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +[Request] Loaded request0.json +[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q4_K_M.gguf +[GGML] Running acestep-v15-turbo-Q4_K_M.gguf... +[GGML] Done, 47 dump files +[Python] Initializing acestep-v15-turbo... +[Python] Generating (acestep-v15-turbo, 8 steps)... +Using precomputed LM hints +Using precomputed LM hints +[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) +[Python] Done, 40 dump files +[Turbo] Cosine similarities GGML vs Python + stage GGML vs Python + text_hidden 0.999812 + lyric_embed 1.000000 + enc_hidden 0.997128 + detok_output 0.999611 + context 0.999751 + noise 1.000000 + temb_t 0.999906 + hidden_after_proj_in 0.999907 + enc_after_cond_emb 0.997645 + layer0_sa_output 0.998432 + hidden_after_layer0 0.999545 + hidden_after_layer6 0.923275 + hidden_after_layer12 0.969957 + hidden_after_layer18 0.964919 + hidden_after_layer23 0.947132 + dit_step0_vt 0.790630 + dit_step0_xt 0.999550 + dit_step1_vt 0.812267 + dit_step1_xt 0.998316 + dit_step2_vt 0.797855 + dit_step2_xt 0.994982 + dit_step3_vt 0.785550 + dit_step3_xt 0.987155 + dit_step4_vt 0.777677 + dit_step4_xt 0.969894 + dit_step5_vt 0.765554 + dit_step5_xt 0.933268 + dit_step6_vt 0.748164 + dit_step6_xt 0.865654 + dit_step7_vt 0.704997 + dit_x0 0.768990 + vae_audio 0.377954 + vae_audio (STFT cosine) 0.669489 +[Turbo] Error growth GGML vs Python + stage cos max_err mean_err mean_A std_A mean_B std_B + dit_step0_xt 0.999550 0.201120 0.022082 -0.002496 0.972768 -0.002342 0.972003 + dit_step1_xt 0.998316 0.415084 0.041258 -0.005641 0.942202 -0.005313 0.941730 + dit_step2_xt 0.994982 0.710340 0.068500 -0.010236 0.907728 -0.009311 0.908527 + dit_step3_xt 0.987155 1.070455 0.105302 -0.016404 0.870181 -0.014577 0.873624 + dit_step4_xt 0.969894 1.456633 0.155292 -0.024587 0.833834 -0.021660 0.841995 + dit_step5_xt 0.933268 1.997366 0.225911 -0.035903 0.808944 -0.032109 0.824593 + dit_step6_xt 0.865654 3.020976 0.331484 -0.051668 0.828925 -0.046482 0.855546 diff --git a/tests/Vulkan-Q5_K_M.log b/tests/Vulkan-Q5_K_M.log new file mode 100644 index 0000000..ec38ab3 --- /dev/null +++ b/tests/Vulkan-Q5_K_M.log @@ -0,0 +1,259 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | uma: 0 | fp16: 1 | bf16: 0 | warp size: 32 | shared memory: 49152 | int dot: 1 | matrix cores: NV_coopmat2 +[Load] DiT backend: Vulkan0 (CPU threads: 16) +[Load] Backend init: 114.1 ms +[GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 +[DiT] Self-attn: Q+K fused, V separate +[DiT] Cross-attn: all separate +[DiT] MLP: gate+up fused +[Load] null_condition_emb found (CFG available) +[WeightCtx] Loaded 478 tensors, 1061.2 MB into backend +[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 +[Load] DiT weight load: 151.9 ms +[GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 +[Load] silence_latent: [15000, 64] from GGUF +[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 +[Load] VAE backend: Vulkan0 (CPU threads: 16) +[VAE] Backend: Vulkan0, Weight buffer: 161.1 MB +[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations +[Load] VAE weights: 677.1 ms +[Request 1/1] ggml-turbo/request0.json (batch=1) +[Request] parsed ggml-turbo/request0.json (18 fields) +[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) +[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s +[Pipeline] 434 audio codes (86.8s @ 5Hz) +[Pipeline] T=2170, S=1085 +[BPE] Loaded from GGUF: 151643 vocab, 151387 merges +[Load] BPE tokenizer: 32.6 ms +[Pipeline] caption: 70 tokens, lyrics: 167 tokens +[Load] TextEncoder backend: Vulkan0 (CPU threads: 16) +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[Load] TextEncoder: 28L, H=1024, Nh=16/8 +[Qwen3] Attn: Q+K+V fused +[Qwen3] MLP: gate+up fused +[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend +[Load] TextEncoder: 167.6 ms +[Encode] TextEncoder (70 tokens): 18.0 ms +[Debug] text_hidden: [70, 1024] first4: 3.705836 2.395382 0.221845 -13.145830 +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[Encode] Lyric vocab lookup (167 tokens): 11.1 ms +[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 +[Load] CondEncoder backend: Vulkan0 (CPU threads: 16) +[GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 +[Load] LyricEncoder: 8L +[Qwen3] Attn: Q+K fused, V separate +[Qwen3] MLP: gate+up fused +[Load] TimbreEncoder: 4L +[Qwen3] Attn: Q+K fused, V separate +[Qwen3] MLP: gate+up fused +[WeightCtx] Loaded 140 tensors, 412.5 MB into backend +[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond +[Load] ConditionEncoder: 55.7 ms +[CondEnc] Lyric sliding mask: 167x167, window=128 +[CondEnc] Timbre sliding mask: 750x750, window=128 +[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens +[Encode] ConditionEncoder: 17.4 ms, enc_S=238 +[Debug] enc_hidden: [238, 2048] first4: 1.760480 -0.051691 -0.132144 0.058144 +[GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 +[WeightCtx] Loaded 30 tensors, 73.2 MB into backend +[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) +[Load] Detokenizer: 14.2 ms +[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) +[Context] Detokenizer: 176.8 ms +[Debug] detok_output: [2170, 64] first4: -0.125636 1.455599 0.291766 -0.651349 +[Context Batch0] Philox noise seed=42, [2170, 64] +[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 +[Debug] context: [2170, 128] first4: -0.125636 1.455599 0.291766 -0.651349 +[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1 +[DiT] Batch N=1, T=2170, S=1085, enc_S=238 +[DiT] Graph: 1775 nodes +[Debug] tproj: [12288] first4: 0.260409 -0.161609 -0.102203 0.051602 +[Debug] temb: [2048] first4: -0.000151 -0.132293 -0.035516 0.064751 +[Debug] temb_t: [2048] first4: 0.000578 0.026708 -0.052786 0.063514 +[Debug] temb_r: [2048] first4: -0.000729 -0.159001 0.017269 0.001237 +[Debug] sinusoidal_t: [256] first4: 0.562486 0.789701 0.439822 -0.023583 +[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000 +[Debug] temb_lin1_t: [2048] first4: -0.051153 -0.053631 -0.012192 -0.039024 +[Debug] temb_lin1_r: [2048] first4: -0.016165 -0.021121 -0.015801 -0.000525 +[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.043457 -0.948303 0.538086 0.454315 +[Debug] proj_in_input: [192, 2170] first4: -0.125636 1.455599 0.291766 -0.651349 +[Debug] enc_after_cond_emb: [2048, 238] first4: -0.156174 0.748947 0.319763 -0.524475 +[Debug] layer0_sa_input: [2048, 1085] first4: -0.721755 -0.751598 -0.052189 0.264294 +[Debug] layer0_q_after_rope: [128, 16] first4: -3.849609 0.403564 0.117188 0.729004 +[Debug] layer0_k_after_rope: [128, 8] first4: -0.156174 0.748947 0.319763 -0.524475 +[Debug] layer0_sa_output: [2048, 1085] first4: -1.502930 0.143799 -0.399902 0.485840 +[Debug] layer0_attn_out: [2048, 1085] first4: -12.621027 0.802575 1.516849 1.778620 +[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.542487 -1.011762 0.149138 0.465263 +[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.584631 -0.767133 -0.342805 0.501823 +[Debug] hidden_after_layer0: [2048, 1085] first4: -9.051172 0.588318 50.418579 -0.862462 +[Debug] hidden_after_layer6: [2048, 1085] first4: -17.400093 -1.418044 30.339943 -5.945173 +[Debug] hidden_after_layer12: [2048, 1085] first4: 6.109352 -15.584214 49.778614 -0.069897 +[Debug] hidden_after_layer18: [2048, 1085] first4: -11.684156 5.829335 7.772402 -2.692122 +[Debug] hidden_after_layer23: [2048, 1085] first4: -44.213371 57.440056 122.126839 44.268806 +[Debug] dit_step0_vt: [2170, 64] first4: -0.006317 1.190186 0.280113 2.456451 +[Debug] dit_step0_xt: [2170, 64] first4: 0.194623 2.102151 -0.184607 0.735999 +[DiT] step 1/8 t=1.000 +[Debug] dit_step1_vt: [2170, 64] first4: -0.053368 1.748116 -0.894806 1.618408 +[Debug] dit_step1_xt: [2170, 64] first4: 0.197534 2.006799 -0.135800 0.647723 +[DiT] step 2/8 t=0.955 +[Debug] dit_step2_vt: [2170, 64] first4: -0.025024 1.326050 -0.792084 2.043884 +[Debug] dit_step2_xt: [2170, 64] first4: 0.199202 1.918396 -0.082994 0.511464 +[DiT] step 3/8 t=0.900 +[Debug] dit_step3_vt: [2170, 64] first4: -0.000458 1.126770 -0.795593 2.254120 +[Debug] dit_step3_xt: [2170, 64] first4: 0.199240 1.824498 -0.016695 0.323620 +[DiT] step 4/8 t=0.833 +[Debug] dit_step4_vt: [2170, 64] first4: 0.174652 1.253662 -1.125977 2.441956 +[Debug] dit_step4_xt: [2170, 64] first4: 0.180528 1.690177 0.103946 0.061982 +[DiT] step 5/8 t=0.750 +[Debug] dit_step5_vt: [2170, 64] first4: 0.205261 1.640076 -1.795410 2.452087 +[Debug] dit_step5_xt: [2170, 64] first4: 0.151205 1.455881 0.360433 -0.288316 +[DiT] step 6/8 t=0.643 +[Debug] dit_step6_vt: [2170, 64] first4: -0.158905 1.750122 -2.412979 2.419128 +[Debug] dit_step6_xt: [2170, 64] first4: 0.182986 1.105856 0.843029 -0.772142 +[DiT] step 7/8 t=0.500 +[Debug] dit_step7_vt: [2170, 64] first4: -0.636047 1.672760 -3.485062 2.600891 +[Debug] dit_x0: [2170, 64] first4: 0.373800 0.604028 1.888547 -1.552409 +[DiT] step 8/8 t=0.300 +[DiT] Total generation: 269.9 ms (269.9 ms/sample) +[Debug] dit_output: [2170, 64] first4: 0.373800 0.604028 1.888547 -1.552409 +[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) +[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Upsample factor: 1920.00 (expected ~1920) +[VAE] Graph: 417 nodes, T_latent=256 +[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) +[VAE Batch0] Decode: 9630.7 ms +[Debug] vae_audio: [2, 4166400] first4: 0.001367 0.001844 0.001533 0.001892 +[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) +[Request 1/1] Done +[Pipeline] All done +2026-03-01 19:56:02.727 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. +2026-03-01 19:56:02.728 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora +2026-03-01 19:56:02.728 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. +2026-03-01 19:56:02.728 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. +2026-03-01 19:56:02.728 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. +2026-03-01 19:56:03.499 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +`torch_dtype` is deprecated! Use `dtype` instead! +2026-03-01 19:56:05.072 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... +2026-03-01 19:56:05.072 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... +2026-03-01 19:56:05.078 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-01 19:56:05.239 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-01 19:56:05.241 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +====================================================================== +2026-03-01 19:56:05.241 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-01 19:56:05.241 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-01 19:56:05.241 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +# Instruction +Generate audio semantic tokens based on the given conditions: + +# Caption +An upbeat and anthemic pop-rock track driven by bright, slightly overdriven + +# Metas +- bpm: 83 +- timesignature: 4 +- keyscale: G major +- duration: 88 seconds +<|endoftext|> + +2026-03-01 19:56:05.241 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-01 19:56:05.241 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +# Languages +fr + +# Lyric +# Lyric +[Intro - Guitar Riff] +[Verse 1] +Dans le monde des tutos virtuels +G ta toise en nouvelle passion +Avec Ggendoline et Pumbé à midi +La communauté, c'est l'unité +Quel joie, une clé + +[Chorus] +Dans le monde des tutos virtuels +Gândoline et Pumbé à midi +Une famille à connecter, c'est vrai +D'un enfant qui voit toi fusionner + +[Guitar Solo] + +[Verse 2] +Dans le monde des tutos virtuels +Gândoline, Pumbé à midi +Une famille à connecter, c'est vrai +D'un enfant qui voit toi fusionner<|endoftext|> +2026-03-01 19:56:05.241 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== + +2026-03-01 19:56:05.247 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-01 19:56:05.260 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-01 19:56:05.260 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-01 19:56:05.285 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-01 19:56:05.592 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... +2026-03-01 19:56:05.593 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-01 19:56:05.593 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.00687718391418457, 'diffusion_time_cost': 0.3001282215118408, 'diffusion_per_step_time_cost': 0.0375160276889801, 'total_time_cost': 0.3070054054260254, 'offload_time_cost': 0.0} +2026-03-01 19:56:05.607 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-01 19:56:05.609 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-01 19:56:05.610 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.40 GB +2026-03-01 19:56:05.610 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... +2026-03-01 19:56:05.610 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.40 GB +2026-03-01 19:56:05.610 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.40 GB +2026-03-01 19:56:05.610 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-01 19:56:05.884 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-01 19:56:05.888 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-01 19:56:05.891 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +[Request] Loaded request0.json +[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q5_K_M.gguf +[GGML] Running acestep-v15-turbo-Q5_K_M.gguf... +[GGML] Done, 47 dump files +[Python] Initializing acestep-v15-turbo... +[Python] Generating (acestep-v15-turbo, 8 steps)... +Using precomputed LM hints +Using precomputed LM hints +[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) +[Python] Done, 40 dump files +[Turbo] Cosine similarities GGML vs Python + stage GGML vs Python + text_hidden 0.999812 + lyric_embed 1.000000 + enc_hidden 0.999132 + detok_output 0.999876 + context 0.999921 + noise 1.000000 + temb_t 0.999972 + hidden_after_proj_in 0.999959 + enc_after_cond_emb 0.999270 + layer0_sa_output 0.999442 + hidden_after_layer0 0.999638 + hidden_after_layer6 0.996691 + hidden_after_layer12 0.982345 + hidden_after_layer18 0.974400 + hidden_after_layer23 0.959738 + dit_step0_vt 0.838705 + dit_step0_xt 0.999650 + dit_step1_vt 0.854589 + dit_step1_xt 0.998725 + dit_step2_vt 0.841602 + dit_step2_xt 0.996217 + dit_step3_vt 0.832748 + dit_step3_xt 0.990342 + dit_step4_vt 0.826828 + dit_step4_xt 0.977304 + dit_step5_vt 0.815977 + dit_step5_xt 0.948497 + dit_step6_vt 0.803425 + dit_step6_xt 0.895308 + dit_step7_vt 0.770195 + dit_x0 0.820447 + vae_audio 0.478241 + vae_audio (STFT cosine) 0.753764 +[Turbo] Error growth GGML vs Python + stage cos max_err mean_err mean_A std_A mean_B std_B + dit_step0_xt 0.999650 0.235954 0.018872 -0.002255 0.973213 -0.002342 0.972003 + dit_step1_xt 0.998725 0.437235 0.034677 -0.005176 0.942982 -0.005313 0.941730 + dit_step2_xt 0.996217 0.735376 0.057569 -0.009210 0.909169 -0.009311 0.908527 + dit_step3_xt 0.990342 1.115564 0.088544 -0.014811 0.872820 -0.014577 0.873624 + dit_step4_xt 0.977304 1.463506 0.131044 -0.022213 0.838526 -0.021660 0.841995 + dit_step5_xt 0.948497 2.208427 0.193557 -0.032833 0.817339 -0.032109 0.824593 + dit_step6_xt 0.895308 3.287671 0.286241 -0.047639 0.842369 -0.046482 0.855546 diff --git a/tests/Vulkan-Q6_K.log b/tests/Vulkan-Q6_K.log new file mode 100644 index 0000000..eff680f --- /dev/null +++ b/tests/Vulkan-Q6_K.log @@ -0,0 +1,259 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | uma: 0 | fp16: 1 | bf16: 0 | warp size: 32 | shared memory: 49152 | int dot: 1 | matrix cores: NV_coopmat2 +[Load] DiT backend: Vulkan0 (CPU threads: 16) +[Load] Backend init: 114.2 ms +[GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 +[DiT] Self-attn: Q+K+V fused +[DiT] Cross-attn: Q+K+V fused +[DiT] MLP: gate+up fused +[Load] null_condition_emb found (CFG available) +[WeightCtx] Loaded 478 tensors, 1237.2 MB into backend +[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 +[Load] DiT weight load: 181.3 ms +[GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 +[Load] silence_latent: [15000, 64] from GGUF +[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 +[Load] VAE backend: Vulkan0 (CPU threads: 16) +[VAE] Backend: Vulkan0, Weight buffer: 161.1 MB +[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations +[Load] VAE weights: 670.0 ms +[Request 1/1] ggml-turbo/request0.json (batch=1) +[Request] parsed ggml-turbo/request0.json (18 fields) +[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) +[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s +[Pipeline] 434 audio codes (86.8s @ 5Hz) +[Pipeline] T=2170, S=1085 +[BPE] Loaded from GGUF: 151643 vocab, 151387 merges +[Load] BPE tokenizer: 32.2 ms +[Pipeline] caption: 70 tokens, lyrics: 167 tokens +[Load] TextEncoder backend: Vulkan0 (CPU threads: 16) +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[Load] TextEncoder: 28L, H=1024, Nh=16/8 +[Qwen3] Attn: Q+K+V fused +[Qwen3] MLP: gate+up fused +[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend +[Load] TextEncoder: 165.9 ms +[Encode] TextEncoder (70 tokens): 17.6 ms +[Debug] text_hidden: [70, 1024] first4: 3.705836 2.395382 0.221845 -13.145830 +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[Encode] Lyric vocab lookup (167 tokens): 11.2 ms +[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 +[Load] CondEncoder backend: Vulkan0 (CPU threads: 16) +[GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 +[Load] LyricEncoder: 8L +[Qwen3] Attn: Q+K+V fused +[Qwen3] MLP: gate+up fused +[Load] TimbreEncoder: 4L +[Qwen3] Attn: Q+K+V fused +[Qwen3] MLP: gate+up fused +[WeightCtx] Loaded 140 tensors, 476.3 MB into backend +[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond +[Load] ConditionEncoder: 61.6 ms +[CondEnc] Lyric sliding mask: 167x167, window=128 +[CondEnc] Timbre sliding mask: 750x750, window=128 +[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens +[Encode] ConditionEncoder: 15.6 ms, enc_S=238 +[Debug] enc_hidden: [238, 2048] first4: 1.761356 -0.050570 -0.133026 0.058500 +[GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 +[WeightCtx] Loaded 30 tensors, 82.2 MB into backend +[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) +[Load] Detokenizer: 10.8 ms +[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) +[Context] Detokenizer: 143.8 ms +[Debug] detok_output: [2170, 64] first4: -0.141024 1.454365 0.315089 -0.623565 +[Context Batch0] Philox noise seed=42, [2170, 64] +[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 +[Debug] context: [2170, 128] first4: -0.141024 1.454365 0.315089 -0.623565 +[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1 +[DiT] Batch N=1, T=2170, S=1085, enc_S=238 +[DiT] Graph: 1841 nodes +[Debug] tproj: [12288] first4: 0.261089 -0.161223 -0.098727 0.051901 +[Debug] temb: [2048] first4: 0.000236 -0.132397 -0.035347 0.064653 +[Debug] temb_t: [2048] first4: 0.001398 0.026957 -0.052741 0.063660 +[Debug] temb_r: [2048] first4: -0.001162 -0.159353 0.017394 0.000993 +[Debug] sinusoidal_t: [256] first4: 0.562486 0.789701 0.439822 -0.023583 +[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000 +[Debug] temb_lin1_t: [2048] first4: -0.049071 -0.051112 -0.017769 -0.037193 +[Debug] temb_lin1_r: [2048] first4: -0.014408 -0.020609 -0.015729 0.003875 +[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.037598 -0.956604 0.541748 0.451630 +[Debug] proj_in_input: [192, 2170] first4: -0.141024 1.454365 0.315089 -0.623565 +[Debug] enc_after_cond_emb: [2048, 238] first4: -0.170166 0.815842 0.310486 -0.571373 +[Debug] layer0_sa_input: [2048, 1085] first4: -0.716080 -0.755969 -0.048350 0.263422 +[Debug] layer0_q_after_rope: [128, 16] first4: -2.400391 -0.081909 -0.397461 1.011719 +[Debug] layer0_k_after_rope: [128, 8] first4: -12.581572 1.117675 1.774897 1.788774 +[Debug] layer0_sa_output: [2048, 1085] first4: -1.503906 0.211304 -0.366943 0.520996 +[Debug] layer0_attn_out: [2048, 1085] first4: -1.540494 -1.050420 0.183235 0.461747 +[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.540494 -1.050420 0.183235 0.461747 +[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.586454 -0.808233 -0.324089 0.502214 +[Debug] hidden_after_layer0: [2048, 1085] first4: -9.155503 0.531986 51.823910 -0.865276 +[Debug] hidden_after_layer6: [2048, 1085] first4: -20.861578 -0.240065 34.589954 -4.288221 +[Debug] hidden_after_layer12: [2048, 1085] first4: -14.692959 -16.975090 77.250595 30.676491 +[Debug] hidden_after_layer18: [2048, 1085] first4: -28.082283 13.370504 64.661263 19.941170 +[Debug] hidden_after_layer23: [2048, 1085] first4: -16.195175 45.294254 196.766129 138.065048 +[Debug] dit_step0_vt: [2170, 64] first4: 0.098133 1.125458 0.338135 2.349396 +[Debug] dit_step0_xt: [2170, 64] first4: 0.189875 2.105093 -0.187245 0.740865 +[DiT] step 1/8 t=1.000 +[Debug] dit_step1_vt: [2170, 64] first4: -0.018386 1.071533 -0.402077 1.814056 +[Debug] dit_step1_xt: [2170, 64] first4: 0.190878 2.046645 -0.165313 0.641917 +[DiT] step 2/8 t=0.955 +[Debug] dit_step2_vt: [2170, 64] first4: -0.052032 1.017303 -0.201233 2.115219 +[Debug] dit_step2_xt: [2170, 64] first4: 0.194347 1.978825 -0.151898 0.500902 +[DiT] step 3/8 t=0.900 +[Debug] dit_step3_vt: [2170, 64] first4: 0.052856 1.105988 0.072205 2.288803 +[Debug] dit_step3_xt: [2170, 64] first4: 0.189942 1.886660 -0.157915 0.310169 +[DiT] step 4/8 t=0.833 +[Debug] dit_step4_vt: [2170, 64] first4: 0.097982 1.134430 0.083038 2.362534 +[Debug] dit_step4_xt: [2170, 64] first4: 0.179444 1.765114 -0.166812 0.057040 +[DiT] step 5/8 t=0.750 +[Debug] dit_step5_vt: [2170, 64] first4: 0.122574 1.016464 0.173828 2.333248 +[Debug] dit_step5_xt: [2170, 64] first4: 0.161934 1.619904 -0.191644 -0.276281 +[DiT] step 6/8 t=0.643 +[Debug] dit_step6_vt: [2170, 64] first4: 0.070358 0.866913 -0.005890 2.297897 +[Debug] dit_step6_xt: [2170, 64] first4: 0.147862 1.446522 -0.190466 -0.735860 +[DiT] step 7/8 t=0.500 +[Debug] dit_step7_vt: [2170, 64] first4: -0.360962 0.376282 -0.314270 2.626526 +[Debug] dit_x0: [2170, 64] first4: 0.256151 1.333637 -0.096185 -1.523818 +[DiT] step 8/8 t=0.300 +[DiT] Total generation: 276.6 ms (276.6 ms/sample) +[Debug] dit_output: [2170, 64] first4: 0.256151 1.333637 -0.096185 -1.523818 +[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) +[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Upsample factor: 1920.00 (expected ~1920) +[VAE] Graph: 417 nodes, T_latent=256 +[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) +[VAE Batch0] Decode: 9723.7 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000254 0.000880 0.000782 0.001025 +[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) +[Request 1/1] Done +[Pipeline] All done +2026-03-01 19:55:46.361 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. +2026-03-01 19:55:46.361 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora +2026-03-01 19:55:46.361 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. +2026-03-01 19:55:46.362 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. +2026-03-01 19:55:46.362 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. +2026-03-01 19:55:47.150 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +`torch_dtype` is deprecated! Use `dtype` instead! +2026-03-01 19:55:48.700 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... +2026-03-01 19:55:48.700 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... +2026-03-01 19:55:48.705 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-01 19:55:48.864 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-01 19:55:48.866 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +====================================================================== +2026-03-01 19:55:48.866 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-01 19:55:48.866 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-01 19:55:48.866 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +# Instruction +Generate audio semantic tokens based on the given conditions: + +# Caption +An upbeat and anthemic pop-rock track driven by bright, slightly overdriven + +# Metas +- bpm: 83 +- timesignature: 4 +- keyscale: G major +- duration: 88 seconds +<|endoftext|> + +2026-03-01 19:55:48.866 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-01 19:55:48.866 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +# Languages +fr + +# Lyric +# Lyric +[Intro - Guitar Riff] +[Verse 1] +Dans le monde des tutos virtuels +G ta toise en nouvelle passion +Avec Ggendoline et Pumbé à midi +La communauté, c'est l'unité +Quel joie, une clé + +[Chorus] +Dans le monde des tutos virtuels +Gândoline et Pumbé à midi +Une famille à connecter, c'est vrai +D'un enfant qui voit toi fusionner + +[Guitar Solo] + +[Verse 2] +Dans le monde des tutos virtuels +Gândoline, Pumbé à midi +Une famille à connecter, c'est vrai +D'un enfant qui voit toi fusionner<|endoftext|> +2026-03-01 19:55:48.866 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== + +2026-03-01 19:55:48.872 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-01 19:55:48.885 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-01 19:55:48.885 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-01 19:55:48.917 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-01 19:55:49.229 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... +2026-03-01 19:55:49.230 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-01 19:55:49.230 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006822347640991211, 'diffusion_time_cost': 0.3050048351287842, 'diffusion_per_step_time_cost': 0.03812560439109802, 'total_time_cost': 0.3118271827697754, 'offload_time_cost': 0.0} +2026-03-01 19:55:49.244 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-01 19:55:49.267 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-01 19:55:49.267 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB +2026-03-01 19:55:49.267 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... +2026-03-01 19:55:49.267 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB +2026-03-01 19:55:49.267 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB +2026-03-01 19:55:49.267 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-01 19:55:49.543 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-01 19:55:49.546 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-01 19:55:49.549 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +[Request] Loaded request0.json +[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q6_K.gguf +[GGML] Running acestep-v15-turbo-Q6_K.gguf... +[GGML] Done, 47 dump files +[Python] Initializing acestep-v15-turbo... +[Python] Generating (acestep-v15-turbo, 8 steps)... +Using precomputed LM hints +Using precomputed LM hints +[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) +[Python] Done, 40 dump files +[Turbo] Cosine similarities GGML vs Python + stage GGML vs Python + text_hidden 0.999812 + lyric_embed 1.000000 + enc_hidden 0.999665 + detok_output 0.999972 + context 0.999982 + noise 1.000000 + temb_t 0.999990 + hidden_after_proj_in 0.999982 + enc_after_cond_emb 0.999691 + layer0_sa_output 0.999774 + hidden_after_layer0 0.999710 + hidden_after_layer6 0.999855 + hidden_after_layer12 0.998856 + hidden_after_layer18 0.995803 + hidden_after_layer23 0.992072 + dit_step0_vt 0.970064 + dit_step0_xt 0.999934 + dit_step1_vt 0.924403 + dit_step1_xt 0.999650 + dit_step2_vt 0.915580 + dit_step2_xt 0.998651 + dit_step3_vt 0.914431 + dit_step3_xt 0.996098 + dit_step4_vt 0.913750 + dit_step4_xt 0.990344 + dit_step5_vt 0.906205 + dit_step5_xt 0.976856 + dit_step6_vt 0.897054 + dit_step6_xt 0.950943 + dit_step7_vt 0.876737 + dit_x0 0.912738 + vae_audio 0.744947 + vae_audio (STFT cosine) 0.875717 +[Turbo] Error growth GGML vs Python + stage cos max_err mean_err mean_A std_A mean_B std_B + dit_step0_xt 0.999934 0.147239 0.007394 -0.002260 0.973056 -0.002342 0.972003 + dit_step1_xt 0.999650 0.408757 0.017759 -0.005276 0.943557 -0.005313 0.941730 + dit_step2_xt 0.998651 0.803721 0.033644 -0.009510 0.911087 -0.009311 0.908527 + dit_step3_xt 0.996098 1.476888 0.054660 -0.015226 0.876460 -0.014577 0.873624 + dit_step4_xt 0.990344 2.294700 0.082632 -0.022702 0.844225 -0.021660 0.841995 + dit_step5_xt 0.976856 3.284146 0.125042 -0.033545 0.825286 -0.032109 0.824593 + dit_step6_xt 0.950943 4.445529 0.188707 -0.049081 0.851111 -0.046482 0.855546 diff --git a/tests/Vulkan-Q8_0.log b/tests/Vulkan-Q8_0.log new file mode 100644 index 0000000..774bc8a --- /dev/null +++ b/tests/Vulkan-Q8_0.log @@ -0,0 +1,259 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | uma: 0 | fp16: 1 | bf16: 0 | warp size: 32 | shared memory: 49152 | int dot: 1 | matrix cores: NV_coopmat2 +[Load] DiT backend: Vulkan0 (CPU threads: 16) +[Load] Backend init: 113.5 ms +[GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 +[DiT] Self-attn: Q+K+V fused +[DiT] Cross-attn: Q+K+V fused +[DiT] MLP: gate+up fused +[Load] null_condition_emb found (CFG available) +[WeightCtx] Loaded 478 tensors, 1600.7 MB into backend +[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 +[Load] DiT weight load: 214.1 ms +[GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 +[Load] silence_latent: [15000, 64] from GGUF +[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 +[Load] VAE backend: Vulkan0 (CPU threads: 16) +[VAE] Backend: Vulkan0, Weight buffer: 161.1 MB +[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations +[Load] VAE weights: 671.7 ms +[Request 1/1] ggml-turbo/request0.json (batch=1) +[Request] parsed ggml-turbo/request0.json (18 fields) +[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) +[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s +[Pipeline] 434 audio codes (86.8s @ 5Hz) +[Pipeline] T=2170, S=1085 +[BPE] Loaded from GGUF: 151643 vocab, 151387 merges +[Load] BPE tokenizer: 31.9 ms +[Pipeline] caption: 70 tokens, lyrics: 167 tokens +[Load] TextEncoder backend: Vulkan0 (CPU threads: 16) +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[Load] TextEncoder: 28L, H=1024, Nh=16/8 +[Qwen3] Attn: Q+K+V fused +[Qwen3] MLP: gate+up fused +[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend +[Load] TextEncoder: 176.0 ms +[Encode] TextEncoder (70 tokens): 17.6 ms +[Debug] text_hidden: [70, 1024] first4: 3.705836 2.395382 0.221845 -13.145830 +[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 +[Encode] Lyric vocab lookup (167 tokens): 11.2 ms +[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 +[Load] CondEncoder backend: Vulkan0 (CPU threads: 16) +[GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 +[Load] LyricEncoder: 8L +[Qwen3] Attn: Q+K+V fused +[Qwen3] MLP: gate+up fused +[Load] TimbreEncoder: 4L +[Qwen3] Attn: Q+K+V fused +[Qwen3] MLP: gate+up fused +[WeightCtx] Loaded 140 tensors, 616.6 MB into backend +[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond +[Load] ConditionEncoder: 84.7 ms +[CondEnc] Lyric sliding mask: 167x167, window=128 +[CondEnc] Timbre sliding mask: 750x750, window=128 +[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens +[Encode] ConditionEncoder: 19.4 ms, enc_S=238 +[Debug] enc_hidden: [238, 2048] first4: 1.759194 -0.049729 -0.133332 0.058435 +[GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 +[WeightCtx] Loaded 30 tensors, 106.5 MB into backend +[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) +[Load] Detokenizer: 15.5 ms +[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) +[Context] Detokenizer: 85.1 ms +[Debug] detok_output: [2170, 64] first4: -0.121505 1.434749 0.303808 -0.627535 +[Context Batch0] Philox noise seed=42, [2170, 64] +[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 +[Debug] context: [2170, 128] first4: -0.121505 1.434749 0.303808 -0.627535 +[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1 +[DiT] Batch N=1, T=2170, S=1085, enc_S=238 +[DiT] Graph: 1841 nodes +[Debug] tproj: [12288] first4: 0.260124 -0.161873 -0.097043 0.052039 +[Debug] temb: [2048] first4: 0.000130 -0.132501 -0.035452 0.064788 +[Debug] temb_t: [2048] first4: 0.001145 0.026826 -0.052770 0.063722 +[Debug] temb_r: [2048] first4: -0.001015 -0.159327 0.017318 0.001066 +[Debug] sinusoidal_t: [256] first4: 0.562486 0.789701 0.439822 -0.023583 +[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000 +[Debug] temb_lin1_t: [2048] first4: -0.048950 -0.051683 -0.015299 -0.038721 +[Debug] temb_lin1_r: [2048] first4: -0.013066 -0.018836 -0.015732 0.008463 +[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.038574 -0.957581 0.536377 0.445770 +[Debug] proj_in_input: [192, 2170] first4: -0.121505 1.434749 0.303808 -0.627535 +[Debug] enc_after_cond_emb: [2048, 238] first4: -0.169861 0.817307 0.328308 -0.558397 +[Debug] layer0_sa_input: [2048, 1085] first4: -0.718007 -0.757392 -0.047301 0.261071 +[Debug] layer0_q_after_rope: [128, 16] first4: -2.423828 -0.099304 -0.408203 1.004883 +[Debug] layer0_k_after_rope: [128, 8] first4: -12.718538 1.122484 1.774887 1.790079 +[Debug] layer0_sa_output: [2048, 1085] first4: -1.510742 0.165771 -0.347900 0.511230 +[Debug] layer0_attn_out: [2048, 1085] first4: -1.542524 -1.031132 0.196691 0.455273 +[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.542524 -1.031132 0.196691 0.455273 +[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.585310 -0.791508 -0.290125 0.495190 +[Debug] hidden_after_layer0: [2048, 1085] first4: -8.926053 0.558007 51.172398 -0.877717 +[Debug] hidden_after_layer6: [2048, 1085] first4: -20.768745 -0.272222 34.170349 -4.416629 +[Debug] hidden_after_layer12: [2048, 1085] first4: -14.358247 -18.625305 73.571915 30.079784 +[Debug] hidden_after_layer18: [2048, 1085] first4: -26.789474 14.346137 62.040115 19.708126 +[Debug] hidden_after_layer23: [2048, 1085] first4: -2.927731 38.887718 192.805542 144.255524 +[Debug] dit_step0_vt: [2170, 64] first4: 0.027340 1.115875 0.350609 2.345856 +[Debug] dit_step0_xt: [2170, 64] first4: 0.193093 2.105528 -0.187812 0.741026 +[DiT] step 1/8 t=1.000 +[Debug] dit_step1_vt: [2170, 64] first4: 0.002377 1.005737 -0.352661 1.768188 +[Debug] dit_step1_xt: [2170, 64] first4: 0.192964 2.050670 -0.168576 0.644580 +[DiT] step 2/8 t=0.955 +[Debug] dit_step2_vt: [2170, 64] first4: -0.063080 1.061218 -0.344177 1.926041 +[Debug] dit_step2_xt: [2170, 64] first4: 0.197169 1.979922 -0.145631 0.516177 +[DiT] step 3/8 t=0.900 +[Debug] dit_step3_vt: [2170, 64] first4: -0.072388 1.144592 -0.184326 2.069214 +[Debug] dit_step3_xt: [2170, 64] first4: 0.203201 1.884539 -0.130270 0.343743 +[DiT] step 4/8 t=0.833 +[Debug] dit_step4_vt: [2170, 64] first4: 0.004288 1.147110 0.001495 2.068916 +[Debug] dit_step4_xt: [2170, 64] first4: 0.202742 1.761635 -0.130430 0.122073 +[DiT] step 5/8 t=0.750 +[Debug] dit_step5_vt: [2170, 64] first4: 0.070211 1.173462 0.080673 2.086014 +[Debug] dit_step5_xt: [2170, 64] first4: 0.192712 1.593997 -0.141955 -0.175929 +[DiT] step 6/8 t=0.643 +[Debug] dit_step6_vt: [2170, 64] first4: -0.010117 1.145203 0.186996 2.198898 +[Debug] dit_step6_xt: [2170, 64] first4: 0.194735 1.364957 -0.179354 -0.615709 +[DiT] step 7/8 t=0.500 +[Debug] dit_step7_vt: [2170, 64] first4: -0.244629 0.644890 0.358635 2.446594 +[Debug] dit_x0: [2170, 64] first4: 0.268124 1.171490 -0.286945 -1.349687 +[DiT] step 8/8 t=0.300 +[DiT] Total generation: 252.0 ms (252.0 ms/sample) +[Debug] dit_output: [2170, 64] first4: 0.268124 1.171490 -0.286945 -1.349687 +[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) +[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Upsample factor: 1920.00 (expected ~1920) +[VAE] Graph: 417 nodes, T_latent=256 +[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) +[VAE Batch0] Decode: 9843.4 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000170 0.000825 0.000784 0.001115 +[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) +[Request 1/1] Done +[Pipeline] All done +2026-03-01 19:55:29.948 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. +2026-03-01 19:55:29.948 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora +2026-03-01 19:55:29.948 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. +2026-03-01 19:55:29.948 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. +2026-03-01 19:55:29.948 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. +2026-03-01 19:55:30.699 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +`torch_dtype` is deprecated! Use `dtype` instead! +2026-03-01 19:55:32.273 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... +2026-03-01 19:55:32.274 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... +2026-03-01 19:55:32.279 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-01 19:55:32.442 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-01 19:55:32.443 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +====================================================================== +2026-03-01 19:55:32.443 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-01 19:55:32.443 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-01 19:55:32.444 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +# Instruction +Generate audio semantic tokens based on the given conditions: + +# Caption +An upbeat and anthemic pop-rock track driven by bright, slightly overdriven + +# Metas +- bpm: 83 +- timesignature: 4 +- keyscale: G major +- duration: 88 seconds +<|endoftext|> + +2026-03-01 19:55:32.444 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-01 19:55:32.444 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +# Languages +fr + +# Lyric +# Lyric +[Intro - Guitar Riff] +[Verse 1] +Dans le monde des tutos virtuels +G ta toise en nouvelle passion +Avec Ggendoline et Pumbé à midi +La communauté, c'est l'unité +Quel joie, une clé + +[Chorus] +Dans le monde des tutos virtuels +Gândoline et Pumbé à midi +Une famille à connecter, c'est vrai +D'un enfant qui voit toi fusionner + +[Guitar Solo] + +[Verse 2] +Dans le monde des tutos virtuels +Gândoline, Pumbé à midi +Une famille à connecter, c'est vrai +D'un enfant qui voit toi fusionner<|endoftext|> +2026-03-01 19:55:32.444 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== + +2026-03-01 19:55:32.450 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-01 19:55:32.462 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-01 19:55:32.463 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-01 19:55:32.484 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-01 19:55:32.791 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... +2026-03-01 19:55:32.791 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-01 19:55:32.791 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006818294525146484, 'diffusion_time_cost': 0.2995321750640869, 'diffusion_per_step_time_cost': 0.037441521883010864, 'total_time_cost': 0.3063504695892334, 'offload_time_cost': 0.0} +2026-03-01 19:55:32.806 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-01 19:55:32.808 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-01 19:55:32.808 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.40 GB +2026-03-01 19:55:32.808 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... +2026-03-01 19:55:32.808 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.40 GB +2026-03-01 19:55:32.808 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.40 GB +2026-03-01 19:55:32.808 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-01 19:55:33.083 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-01 19:55:33.084 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-01 19:55:33.088 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +[Request] Loaded request0.json +[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q8_0.gguf +[GGML] Running acestep-v15-turbo-Q8_0.gguf... +[GGML] Done, 47 dump files +[Python] Initializing acestep-v15-turbo... +[Python] Generating (acestep-v15-turbo, 8 steps)... +Using precomputed LM hints +Using precomputed LM hints +[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) +[Python] Done, 40 dump files +[Turbo] Cosine similarities GGML vs Python + stage GGML vs Python + text_hidden 0.999812 + lyric_embed 1.000000 + enc_hidden 0.999824 + detok_output 0.999983 + context 0.999990 + noise 1.000000 + temb_t 0.999998 + hidden_after_proj_in 0.999985 + enc_after_cond_emb 0.999817 + layer0_sa_output 0.999939 + hidden_after_layer0 0.999858 + hidden_after_layer6 0.999893 + hidden_after_layer12 0.999124 + hidden_after_layer18 0.996403 + hidden_after_layer23 0.993183 + dit_step0_vt 0.973885 + dit_step0_xt 0.999943 + dit_step1_vt 0.915468 + dit_step1_xt 0.999633 + dit_step2_vt 0.912211 + dit_step2_xt 0.998544 + dit_step3_vt 0.912707 + dit_step3_xt 0.995860 + dit_step4_vt 0.906019 + dit_step4_xt 0.989505 + dit_step5_vt 0.896537 + dit_step5_xt 0.974659 + dit_step6_vt 0.886047 + dit_step6_xt 0.945866 + dit_step7_vt 0.869793 + dit_x0 0.905017 + vae_audio 0.746037 + vae_audio (STFT cosine) 0.898352 +[Turbo] Error growth GGML vs Python + stage cos max_err mean_err mean_A std_A mean_B std_B + dit_step0_xt 0.999943 0.140034 0.006943 -0.002318 0.973036 -0.002342 0.972003 + dit_step1_xt 0.999633 0.423125 0.018056 -0.005257 0.943026 -0.005313 0.941730 + dit_step2_xt 0.998544 0.841908 0.034537 -0.009209 0.910286 -0.009311 0.908527 + dit_step3_xt 0.995860 1.521911 0.055719 -0.014626 0.875169 -0.014577 0.873624 + dit_step4_xt 0.989505 2.346452 0.085477 -0.021803 0.842334 -0.021660 0.841995 + dit_step5_xt 0.974659 3.387389 0.130921 -0.032225 0.822365 -0.032109 0.824593 + dit_step6_xt 0.945866 4.812943 0.199910 -0.047290 0.846751 -0.046482 0.855546 diff --git a/tests/debug-dit-cossim.sh b/tests/debug-dit-cossim.sh index f5ad6ed..284f193 100755 --- a/tests/debug-dit-cossim.sh +++ b/tests/debug-dit-cossim.sh @@ -1,7 +1,28 @@ #!/bin/bash -./debug-dit-cossim.py --mode both --quant BF16 > BF16.log -./debug-dit-cossim.py --mode both --quant Q8_0 > Q8_0.log -./debug-dit-cossim.py --mode both --quant Q6_K > Q6_K.log -./debug-dit-cossim.py --mode both --quant Q5_K_M > Q5_K_M.log -./debug-dit-cossim.py --mode both --quant Q4_K_M > Q4_K_M.log +cd .. +./buildcuda.sh +cd tests +./debug-dit-cossim.py --mode turbo --quant BF16 2>&1 | tee CUDA-BF16.log +./debug-dit-cossim.py --mode turbo --quant Q8_0 2>&1 | tee CUDA-Q8_0.log +./debug-dit-cossim.py --mode turbo --quant Q6_K 2>&1 | tee CUDA-Q6_K.log +./debug-dit-cossim.py --mode turbo --quant Q5_K_M 2>&1 | tee CUDA-Q5_K_M.log +./debug-dit-cossim.py --mode turbo --quant Q4_K_M 2>&1 | tee CUDA-Q4_K_M.log + +cd .. +./buildvulkan.sh +cd tests +./debug-dit-cossim.py --mode turbo --quant BF16 2>&1 | tee Vulkan-BF16.log +./debug-dit-cossim.py --mode turbo --quant Q8_0 2>&1 | tee Vulkan-Q8_0.log +./debug-dit-cossim.py --mode turbo --quant Q6_K 2>&1 | tee Vulkan-Q6_K.log +./debug-dit-cossim.py --mode turbo --quant Q5_K_M 2>&1 | tee Vulkan-Q5_K_M.log +./debug-dit-cossim.py --mode turbo --quant Q4_K_M 2>&1 | tee Vulkan-Q4_K_M.log + +cd .. +./buildcpu.sh +cd tests +./debug-dit-cossim.py --mode turbo --quant BF16 2>&1 | tee CPU-BF16.log +./debug-dit-cossim.py --mode turbo --quant Q8_0 2>&1 | tee CPU-Q8_0.log +./debug-dit-cossim.py --mode turbo --quant Q6_K 2>&1 | tee CPU-Q6_K.log +./debug-dit-cossim.py --mode turbo --quant Q5_K_M 2>&1 | tee CPU-Q5_K_M.log +./debug-dit-cossim.py --mode turbo --quant Q4_K_M 2>&1 | tee CPU-Q4_K_M.log diff --git a/tools/ace-qwen3.cpp b/tools/ace-qwen3.cpp index 1094fc9..fbfd049 100644 --- a/tools/ace-qwen3.cpp +++ b/tools/ace-qwen3.cpp @@ -560,6 +560,7 @@ static void usage(const char * prog) { "Debug:\n" " --max-seq KV cache size (default: 8192)\n" " --no-fsm Disable FSM constrained decoding\n" + " --no-fa Disable flash attention\n" " --dump-logits Dump prefill logits (binary f32)\n" " --dump-tokens Dump prompt token IDs (CSV)\n" , prog); @@ -571,6 +572,7 @@ int main(int argc, char ** argv) { int max_seq = 8192; int batch_size = 1; bool use_fsm = true; + bool use_fa = true; const char * dump_logits = nullptr; const char * dump_tokens = nullptr; @@ -590,6 +592,8 @@ int main(int argc, char ** argv) { batch_size = atoi(argv[++i]); else if (!strcmp(argv[i], "--no-fsm")) use_fsm = false; + else if (!strcmp(argv[i], "--no-fa")) + use_fa = false; else if (!strcmp(argv[i], "--dump-logits") && i + 1 < argc) dump_logits = argv[++i]; else if (!strcmp(argv[i], "--dump-tokens") && i + 1 < argc) @@ -651,6 +655,7 @@ int main(int argc, char ** argv) { Timer t_load; Qwen3LM model; if (!qw3lm_load(&model, model_path, max_seq, n_kv_sets)) return 1; + model.use_flash_attn = use_fa; double load_ms = t_load.ms(); // FSM diff --git a/tools/dit-vae.cpp b/tools/dit-vae.cpp index bfe274d..1f4ffac 100644 --- a/tools/dit-vae.cpp +++ b/tools/dit-vae.cpp @@ -77,6 +77,7 @@ static void print_usage(const char * prog) { " --vae-chunk Latent frames per tile (default: 256)\n" " --vae-overlap Overlap frames per side (default: 64)\n\n" "Debug:\n" + " --no-fa Disable flash attention\n" " --dump Dump intermediate tensors\n", prog); } @@ -100,10 +101,11 @@ int main(int argc, char ** argv) { std::vector request_paths; const char * text_enc_gguf = NULL; const char * dit_gguf = NULL; - const char * vae_gguf = NULL; + const char * vae_gguf = NULL; const char * dump_dir = NULL; const char * lora_path = NULL; float lora_scale = 1.0f; + bool use_fa = true; int batch_n = 1; int vae_chunk = 256; int vae_overlap = 64; @@ -118,6 +120,7 @@ int main(int argc, char ** argv) { else if (strcmp(argv[i], "--dit") == 0 && i+1 < argc) dit_gguf = argv[++i]; else if (strcmp(argv[i], "--vae") == 0 && i+1 < argc) vae_gguf = argv[++i]; else if (strcmp(argv[i], "--dump") == 0 && i+1 < argc) dump_dir = argv[++i]; + else if (strcmp(argv[i], "--no-fa") == 0) use_fa = false; else if (strcmp(argv[i], "--batch") == 0 && i+1 < argc) batch_n = atoi(argv[++i]); else if (strcmp(argv[i], "--vae-chunk") == 0 && i+1 < argc) vae_chunk = atoi(argv[++i]); else if (strcmp(argv[i], "--vae-overlap") == 0 && i+1 < argc) vae_overlap = atoi(argv[++i]); @@ -159,6 +162,7 @@ int main(int argc, char ** argv) { // Load DiT model (once for all requests) dit_ggml_init_backend(&model); + if (!use_fa) model.use_flash_attn = false; fprintf(stderr, "[Load] Backend init: %.1f ms\n", timer.ms()); timer.reset(); @@ -375,6 +379,7 @@ int main(int argc, char ** argv) { timer.reset(); Qwen3GGML text_enc = {}; qwen3_init_backend(&text_enc); + if (!use_fa) text_enc.use_flash_attn = false; if (!qwen3_load_text_encoder(&text_enc, text_enc_gguf)) { fprintf(stderr, "FATAL: failed to load text encoder\n"); dit_ggml_free(&model); @@ -391,30 +396,10 @@ int main(int argc, char ** argv) { fprintf(stderr, "[Encode] TextEncoder (%d tokens): %.1f ms\n", S_text, timer.ms()); debug_dump_2d(&dbg, "text_hidden", text_hidden.data(), S_text, H_text); - // 5. Lyric embedding (CPU vocab lookup from text encoder embed table) + // 5. Lyric embedding (vocab lookup via text encoder) timer.reset(); std::vector lyric_embed(H_text * S_lyric); - { - GGUFModel gf_te = {}; - if (!gf_load(&gf_te, text_enc_gguf)) { - fprintf(stderr, "FATAL: cannot reopen text encoder GGUF for lyric embed\n"); - dit_ggml_free(&model); - if (have_vae) vae_ggml_free(&vae); - return 1; - } - const void * embed_data = gf_get_data(gf_te, "embed_tokens.weight"); - if (!embed_data) { - fprintf(stderr, "FATAL: embed_tokens.weight not found\n"); - gf_close(&gf_te); - dit_ggml_free(&model); - if (have_vae) vae_ggml_free(&vae); - return 1; - } - qwen3_cpu_embed_lookup(embed_data, H_text, - lyric_ids.data(), S_lyric, - lyric_embed.data()); - gf_close(&gf_te); - } + qwen3_embed_lookup(&text_enc, lyric_ids.data(), S_lyric, lyric_embed.data()); fprintf(stderr, "[Encode] Lyric vocab lookup (%d tokens): %.1f ms\n", S_lyric, timer.ms()); debug_dump_2d(&dbg, "lyric_embed", lyric_embed.data(), S_lyric, H_text); @@ -422,6 +407,7 @@ int main(int argc, char ** argv) { timer.reset(); CondGGML cond = {}; cond_ggml_init_backend(&cond); + if (!use_fa) cond.use_flash_attn = false; if (!cond_ggml_load(&cond, dit_gguf)) { fprintf(stderr, "FATAL: failed to load condition encoder\n"); dit_ggml_free(&model); @@ -494,6 +480,7 @@ int main(int argc, char ** argv) { if (have_vae) vae_ggml_free(&vae); return 1; } + if (!use_fa) detok.use_flash_attn = false; fprintf(stderr, "[Load] Detokenizer: %.1f ms\n", timer.ms()); int T_5Hz = (int)codes_vec.size();