#0 0x00007fffed4a7a2c in ?? () from /usr/lib/libc.so.6
#1 0x00007fffed44d1a0 in raise () from /usr/lib/libc.so.6
#2 0x00007fffed4345fe in abort () from /usr/lib/libc.so.6
#3 0x00007ffff751e1c4 in ggml_abort (file=0x7ffff4ed5670 "llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu", line=99, fmt=0x7ffff4ed5665 "CUDA error")
at llama.cpp/ggml/src/ggml.c:256
#4 0x00007ffff4041c20 in ggml_cuda_error (
stmt=0x7ffff4ed6fe8 "cublasSgemm_v2(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N, row_diff, src1_ncols, ne10, &alpha, src0_ddf_i, ne00, src1_ddf1_i, ne10, &beta, dst_dd_i, ldc)", func=0x7ffff4ed6c74 "ggml_cuda_op_mul_mat_cublas", file=0x7ffff4ed5670 "llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu", line=1542,
msg=0x7fffbca19cb7 "an internal operation failed") at llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu:99
#5 0x00007ffff4046e4b in ggml_cuda_op_mul_mat_cublas (ctx=..., src0=0x55555b7691a0, src1=0x555559620930, dst=0x555559620c10, src0_dd_i=0x7ff4da000000 "", src1_ddf_i=0x3602000000,
src1_ddq_i=0x0, dst_dd_i=0x3602a00000, row_low=22016, row_high=27648, src1_ncols=128, src1_padded_row_size=5120, stream=0x555563b330e0)
at llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu:1542
#6 0x00007ffff4049355 in ggml_cuda_op_mul_mat (ctx=..., src0=0x55555b7691a0, src1=0x555559620930, dst=0x555559620c10,
op=0x7ffff4045d2a <ggml_cuda_op_mul_mat_cublas(ggml_backend_cuda_context&, ggml_tensor const*, ggml_tensor const*, ggml_tensor*, char const*, float const*, char const*, float*, int64_t, int64_t, int64_t, int64_t, cudaStream_t)>, quantize_src1=0x0) at llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu:1842
#7 0x00007ffff404ab23 in ggml_cuda_mul_mat (ctx=..., src0=0x55555b7691a0, src1=0x555559620930, dst=0x555559620c10)
at llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu:2390
#8 0x00007ffff404c73b in ggml_cuda_compute_forward (ctx=..., dst=0x555559620c10) at llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu:2742
#9 0x00007ffff4052b10 in ggml_cuda_graph_evaluate_and_capture (cuda_ctx=0x555558e8a9e0, cgraph=0x555558e7c828, use_cuda_graph=false, cuda_graph_update_required=false,
graph_key=0x55555961e0f0) at llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu:4035
#10 0x00007ffff40532d9 in ggml_backend_cuda_graph_compute (backend=0x555558fe7ef0, cgraph=0x555558e7c828)
at llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu:4155
#11 0x00007ffff753ad68 in ggml_backend_graph_compute_async (backend=0x555558fe7ef0, cgraph=0x555558e7c828) at llama.cpp/ggml/src/ggml-backend.cpp:452
#12 0x00007ffff753fc52 in ggml_backend_sched_compute_splits (sched=0x555558e7e8a0) at llama.cpp/ggml/src/ggml-backend.cpp:1671
#13 0x00007ffff7540b95 in ggml_backend_sched_graph_compute_async (sched=0x555558e7e8a0, graph=0x5555595851f0)
at llama.cpp/ggml/src/ggml-backend.cpp:1894
#14 0x00007ffff7a12ea1 in llama_context::graph_compute (this=0x555558f1f240, gf=0x5555595851f0, batched=true) at llama.cpp/src/llama-context.cpp:2191
#15 0x00007ffff7a0de01 in llama_context::process_ubatch (this=0x555558f1f240, ubatch=..., gtype=LLM_GRAPH_TYPE_DECODER, mctx=0x555558e74ed0, ret=@0x7fffffffc5c0: -14656)
at llama.cpp/src/llama-context.cpp:1231
--Type <RET> for more, q to quit, c to continue without paging--w
#16 0x00007ffff7a0ff09 in llama_context::decode (this=0x555558f1f240, batch_inp=...) at llama.cpp/src/llama-context.cpp:1692
#17 0x00007ffff7a17962 in llama_decode (ctx=0x555558f1f240, batch=...) at llama.cpp/src/llama-context.cpp:3454
#18 0x00005555555e4f09 in test_prompt (ctx=0x555558f1f240, n_prompt=512, n_batch=2048, n_threads=16)
at llama.cpp/tools/llama-bench/llama-bench.cpp:2078
#19 0x00005555555e5f59 in main (argc=15, argv=0x7fffffffd778) at llama.cpp/tools/llama-bench/llama-bench.cpp:2302
Name and Version
version: 8763 (ff5ef82)
built with GNU 15.2.1 for Linux x86_64
Operating systems
Linux
GGML backends
CUDA
Hardware
Ryzen Threadripper 1950X + 1 RTX 3090 + 3 P100
Models
mradermacher/Seed-OSS-36B-Instruct-abliterated-i1-GGUF-Q6_K
Problem description & steps to reproduce
When running with split mode row on multiple GPUs at and after commit d6f3030, it fails with a CUDA error of "the launch timed out and was terminated". Please see the gdb backtrace below.
Here's how I build llama.cpp
It gives different backtraces depending on whether
CMAKE_CUDA_ARCHITECTURES is specified or not; see below.
First Bad Commit
Bisected to d6f3030 (#19378)
Relevant log output
Backtrace with CMAKE_CUDA_ARCHITECTURES specified
Backtrace without CMAKE_CUDA_ARCHITECTURES specified