diff --git a/llama-cpp-bindings-build/src/android_ndk.rs b/llama-cpp-bindings-build/src/android_ndk.rs index 5c6c193f..0d0123c1 100644 --- a/llama-cpp-bindings-build/src/android_ndk.rs +++ b/llama-cpp-bindings-build/src/android_ndk.rs @@ -27,8 +27,6 @@ pub enum AndroidNdkDetectionError { UnsupportedAndroidTarget { target_triple: String }, } -/// Consolidated Android NDK configuration, computed once and shared between -/// bindgen and `CMake` configuration steps. #[derive(Debug)] pub struct AndroidNdk { pub ndk_path: String, diff --git a/llama-cpp-bindings-build/src/cmake_config.rs b/llama-cpp-bindings-build/src/cmake_config.rs index 90b608d4..7d306df2 100644 --- a/llama-cpp-bindings-build/src/cmake_config.rs +++ b/llama-cpp-bindings-build/src/cmake_config.rs @@ -217,9 +217,6 @@ fn configure_platform_specific( } } -/// Work around a cmake-rs bug where debug Rust builds under MSVC strip -/// optimization flags from Release-profile C/C++ builds. -/// See: fn configure_msvc_release_workaround(config: &mut Config, profile: &str) { let is_release_profile = matches!(profile, "Release" | "RelWithDebInfo" | "MinSizeRel"); @@ -269,14 +266,6 @@ fn configure_android_cmake(config: &mut Config, ndk: &AndroidNdk, _target_triple println!("cargo:rustc-link-lib=android"); } -/// macOS BSD ar (from cctools) does not accept GNU ar's `-D` (deterministic) -/// flag. cmake's default archive recipe is `<CMAKE_AR> qcD …`, which produces -/// `illegal option -- D` warnings during every static-library link. -/// -/// We override the archive command for every language used by llama.cpp's -/// build — C, C++, Objective-C and Objective-C++ (the latter two appear once -/// `GGML_METAL=ON` enables the Metal backend). Plain `qc` keeps the -/// quick-create semantics; `<CMAKE_RANLIB>` still runs as ARCHIVE_FINISH. 
fn override_archive_commands_for_apple_ar(config: &mut Config) { for language in ["C", "CXX", "OBJC", "OBJCXX"] { config.define( diff --git a/llama-cpp-bindings-build/src/lib.rs b/llama-cpp-bindings-build/src/lib.rs index f9336583..48809244 100644 --- a/llama-cpp-bindings-build/src/lib.rs +++ b/llama-cpp-bindings-build/src/lib.rs @@ -1,5 +1,3 @@ -//! Build system for llama-cpp-bindings-sys FFI bindings to llama.cpp. - mod android_ndk; mod bindgen_config; mod cmake_config; @@ -30,7 +28,6 @@ macro_rules! debug_log { }; } -/// Shared state passed between build phases. #[derive(Debug)] pub struct BuildContext { pub out_dir: PathBuf, @@ -124,9 +121,6 @@ fn set_cmake_parallelism() { } } -/// Main entry point for the llama.cpp build system. -/// -/// Call this from `build.rs` in `llama-cpp-bindings-sys`. pub fn build() { let context = BuildContext::detect(); diff --git a/llama-cpp-bindings-build/src/stable_cmake_build_dir.rs b/llama-cpp-bindings-build/src/stable_cmake_build_dir.rs index 79186be5..87877e80 100644 --- a/llama-cpp-bindings-build/src/stable_cmake_build_dir.rs +++ b/llama-cpp-bindings-build/src/stable_cmake_build_dir.rs @@ -17,13 +17,6 @@ const CMAKE_AFFECTING_FEATURES: &[(&str, bool)] = &[ ("static-stdcxx", cfg!(feature = "static-stdcxx")), ]; -/// Compute a stable, persistent cmake build directory under the workspace -/// `target/` tree, keyed only by inputs that materially change cmake compile -/// commands. Toggling features that don't affect cmake (e.g. `mtmd`, `llguidance`) -/// returns the same path, allowing cmake's incremental build (and ccache) to -/// reuse all prior artifacts — including `nvcc`-built CUDA kernels. -/// -/// `LLAMA_CMAKE_BUILD_DIR_OVERRIDE` overrides the path entirely when set. 
pub fn stable_cmake_build_dir( target_dir: &Path, target_triple: &str, diff --git a/llama-cpp-bindings-sys/src/lib.rs b/llama-cpp-bindings-sys/src/lib.rs index e3dbbeba..6b429eb4 100644 --- a/llama-cpp-bindings-sys/src/lib.rs +++ b/llama-cpp-bindings-sys/src/lib.rs @@ -1,5 +1,3 @@ -//! See [llama-cpp-bindings](https://crates.io/crates/llama-cpp-bindings) for a documented and safe API. - #![expect( non_camel_case_types, reason = "bindgen emits C struct and enum names verbatim and they don't follow Rust naming" diff --git a/llama-cpp-bindings-tests/src/classify_sample_loop.rs b/llama-cpp-bindings-tests/src/classify_sample_loop.rs index d5b070c4..8240c74f 100644 --- a/llama-cpp-bindings-tests/src/classify_sample_loop.rs +++ b/llama-cpp-bindings-tests/src/classify_sample_loop.rs @@ -7,14 +7,6 @@ use llama_cpp_bindings::sampled_token::SampledToken; use llama_cpp_bindings::sampled_token_classifier::SampledTokenClassifier; use llama_cpp_bindings::sampling::LlamaSampler; -/// Drives a classifier through the full sample/decode/flush loop. -/// -/// Suppresses EOG outcomes (so `generated_raw` and the per-section streams -/// never contain end-of-generation marker text) and captures per-section -/// counts. Tests that need to exercise classifier behaviour during real -/// inference should construct one of these and call -/// [`ClassifySampleLoop::run`] instead of re-implementing the loop. The -/// strict per-test assertions then run on [`ClassifySampleLoopOutcome`]. pub struct ClassifySampleLoop<'borrow, 'model, 'tokens> { pub model: &'model LlamaModel, pub classifier: &'borrow mut SampledTokenClassifier<'model>, @@ -59,10 +51,6 @@ impl ClassifySampleLoop<'_, '_, '_> { } else { outcome.generated_raw.push_str(&ingest_outcome.raw_piece); } - // Counters always include EOG so they match the classifier's - // internal usage counters (which include every sampled token). 
- // EOG text is suppressed from `generated_raw` and the per-section - // streams so callers can assert exact textual equality. record_outcome(ingest_outcome, &mut outcome, is_eog); } @@ -115,3 +103,30 @@ fn record_outcome(ingest: &IngestOutcome, outcome: &mut ClassifySampleLoopOutcom } } } + +#[cfg(test)] +mod tests { + use llama_cpp_bindings::ingest_outcome::IngestOutcome; + use llama_cpp_bindings::sampled_token::SampledToken; + use llama_cpp_bindings::token::LlamaToken; + + use super::ClassifySampleLoopOutcome; + use super::record_outcome; + + #[test] + fn record_outcome_tool_call_token() { + let ingest = IngestOutcome { + sampled_token: SampledToken::ToolCall(LlamaToken(42)), + visible_piece: String::new(), + raw_piece: String::new(), + }; + let mut outcome = ClassifySampleLoopOutcome::default(); + + record_outcome(&ingest, &mut outcome, false); + + assert_eq!(outcome.observed_tool_call, 1); + assert_eq!(outcome.observed_content, 0); + assert_eq!(outcome.observed_reasoning, 0); + assert_eq!(outcome.observed_undeterminable, 0); + } +} diff --git a/llama-cpp-bindings-tests/src/lib.rs b/llama-cpp-bindings-tests/src/lib.rs index 00686c59..b48fe749 100644 --- a/llama-cpp-bindings-tests/src/lib.rs +++ b/llama-cpp-bindings-tests/src/lib.rs @@ -1,8 +1,2 @@ -//! Integration test fixtures for `llama-cpp-bindings`. -//! -//! This crate hosts test-only helpers used by the integration tests in `tests/`: -//! [`classify_sample_loop`] for sampling-loop drivers and [`test_model::fixtures_dir`] -//! for locating image fixtures. 
- pub mod classify_sample_loop; -pub mod test_model; +pub mod prime_kv_cache; diff --git a/llama-cpp-bindings-tests/src/prime_kv_cache.rs b/llama-cpp-bindings-tests/src/prime_kv_cache.rs new file mode 100644 index 00000000..570cf77c --- /dev/null +++ b/llama-cpp-bindings-tests/src/prime_kv_cache.rs @@ -0,0 +1,15 @@ +use anyhow::Result; +use llama_cpp_bindings::context::LlamaContext; +use llama_cpp_bindings::llama_batch::LlamaBatch; +use llama_cpp_bindings::model::AddBos; +use llama_cpp_test_harness::LlamaFixture; + +/// # Errors +/// Forwards tokenization, batch construction, and [`LlamaContext::decode`] errors verbatim. +pub fn prime_kv_cache(fixture: &LlamaFixture<'_>, context: &mut LlamaContext<'_>) -> Result<()> { + let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + Ok(()) +} diff --git a/llama-cpp-bindings-tests/tests/chat_template_and_message_parsing.rs b/llama-cpp-bindings-tests/tests/chat_template_and_message_parsing.rs index a7e18245..a6bf7ce3 100644 --- a/llama-cpp-bindings-tests/tests/chat_template_and_message_parsing.rs +++ b/llama-cpp-bindings-tests/tests/chat_template_and_message_parsing.rs @@ -1,567 +1,551 @@ -use llama_cpp_test_harness::llama_tests_main; - -mod model_chat_template { - #![expect( - clippy::unnecessary_wraps, - reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" - )] +#![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" +)] - use anyhow::Result; - use llama_cpp_bindings::ChatTemplateError; - use llama_cpp_bindings::model::LlamaChatMessage; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - 
n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn chat_template_returns_non_empty(fixture: &LlamaFixture<'_>) -> Result<()> { - let template = fixture.model.chat_template(None); - assert!(template.is_ok()); - Ok(()) - } +use anyhow::Result; +use anyhow::bail; +use llama_cpp_bindings::ChatMessageParseOutcome; +use llama_cpp_bindings::ChatTemplateError; +use llama_cpp_bindings::model::LlamaChatMessage; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = 
HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn apply_chat_template_produces_prompt(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let template = model.chat_template(None)?; - let message = LlamaChatMessage::new("user".to_string(), "hello".to_string())?; - let prompt = model.apply_chat_template(&template, &[message], true); +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn chat_template_returns_non_empty(fixture: &LlamaFixture<'_>) -> Result<()> { + let template = fixture.model.chat_template(None); + assert!(template.is_ok()); + Ok(()) +} - assert!(prompt.is_ok()); - assert!(!prompt?.is_empty()); - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", 
"GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn apply_chat_template_produces_prompt(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let template = model.chat_template(None)?; + let message = LlamaChatMessage::new("user".to_string(), "hello".to_string())?; + let prompt = model.apply_chat_template(&template, &[message], true); - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn apply_chat_template_buffer_resize_with_long_messages( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let model = fixture.model; - let template = model.chat_template(None)?; 
- let long_content = "a".repeat(2000); - let message = LlamaChatMessage::new("user".to_string(), long_content)?; - let prompt = model.apply_chat_template(&template, &[message], true); + assert!(prompt.is_ok()); + assert!(!prompt?.is_empty()); + Ok(()) +} - assert!(prompt.is_ok()); - assert!(!prompt?.is_empty()); - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn apply_chat_template_buffer_resize_with_long_messages(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let template = model.chat_template(None)?; + let long_content = "a".repeat(2000); + let message = LlamaChatMessage::new("user".to_string(), long_content)?; + let prompt = model.apply_chat_template(&template, &[message], true); - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - 
use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn chat_template_with_nonexistent_name_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { - let result = fixture - .model - .chat_template(Some("nonexistent_template_name_xyz")); - assert_eq!(result.unwrap_err(), ChatTemplateError::MissingTemplate); - Ok(()) - } + assert!(prompt.is_ok()); + assert!(!prompt?.is_empty()); + Ok(()) } -mod parse_chat_message { - #![expect( - clippy::unnecessary_wraps, - reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" - )] - - use anyhow::Result; - use anyhow::bail; - use llama_cpp_bindings::ChatMessageParseOutcome; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", 
"Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn chat_template_with_nonexistent_name_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let result = fixture + .model + .chat_template(Some("nonexistent_template_name_xyz")); + assert_eq!(result.unwrap_err(), ChatTemplateError::MissingTemplate); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn parses_pure_content_response(fixture: &LlamaFixture<'_>) -> Result<()> { - let outcome = fixture - .model - .parse_chat_message("[]", "hello world", false)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source 
= HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn parses_pure_content_response(fixture: &LlamaFixture<'_>) -> Result<()> { + let outcome = fixture + .model + .parse_chat_message("[]", "hello world", false)?; - let ChatMessageParseOutcome::Recognized(parsed) = outcome else { - bail!("expected Recognized for plain content; got Unrecognized"); - }; - assert!(parsed.tool_calls.is_empty()); - assert!(!parsed.is_empty()); - assert!(parsed.content.contains("hello world")); + let ChatMessageParseOutcome::Recognized(parsed) = outcome else { + bail!("expected Recognized for plain content; got Unrecognized"); + }; + assert!(parsed.tool_calls.is_empty()); + assert!(!parsed.is_empty()); + assert!(parsed.content.contains("hello world")); - Ok(()) - } + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx 
= 512, - n_batch = 128, - n_ubatch = 64, - )] - fn parses_reasoning_section_into_reasoning_content(fixture: &LlamaFixture<'_>) -> Result<()> { - let input = "step one, step two\n\nactual response"; - let outcome = fixture.model.parse_chat_message("[]", input, false)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn parses_reasoning_section_into_reasoning_content(fixture: &LlamaFixture<'_>) -> Result<()> { + let input = "step one, step two\n\nactual response"; + let outcome = fixture.model.parse_chat_message("[]", input, false)?; - let ChatMessageParseOutcome::Recognized(parsed) = outcome else { - bail!("expected Recognized for reasoning section; got Unrecognized"); - }; - assert!( - parsed.reasoning_content.contains("step") || parsed.content.contains("step"), - "neither content nor reasoning contains 'step'; content={:?} reasoning={:?}", - parsed.content, - parsed.reasoning_content - ); + let ChatMessageParseOutcome::Recognized(parsed) = outcome else { + bail!("expected Recognized for reasoning section; got Unrecognized"); + }; + assert!( + parsed.reasoning_content.contains("step") || 
parsed.content.contains("step"), + "neither content nor reasoning contains 'step'; content={:?} reasoning={:?}", + parsed.content, + parsed.reasoning_content + ); - Ok(()) - } + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn parses_empty_input_yields_empty_message(fixture: &LlamaFixture<'_>) -> Result<()> { - let outcome = fixture.model.parse_chat_message("[]", "", false)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = 
HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn parses_empty_input_yields_empty_message(fixture: &LlamaFixture<'_>) -> Result<()> { + let outcome = fixture.model.parse_chat_message("[]", "", false)?; - let ChatMessageParseOutcome::Recognized(parsed) = outcome else { - bail!("expected Recognized for empty input; got Unrecognized"); - }; - assert!(parsed.tool_calls.is_empty()); + let ChatMessageParseOutcome::Recognized(parsed) = outcome else { + bail!("expected Recognized for empty input; got Unrecognized"); + }; + assert!(parsed.tool_calls.is_empty()); - Ok(()) - } + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn parses_malformed_tools_json_returns_tools_json_invalid_error( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let result = fixture - .model - .parse_chat_message("not_a_json[}", "hello", false); +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + 
n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn parses_malformed_tools_json_returns_tools_json_invalid_error( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let result = fixture + .model + .parse_chat_message("not_a_json[}", "hello", false); - assert!(matches!( - result, - Err(llama_cpp_bindings::ParseChatMessageError::ToolsJsonInvalid( - _ - )) - )); - Ok(()) - } + assert!(matches!( + result, + Err(llama_cpp_bindings::ParseChatMessageError::ToolsJsonInvalid( + _ + )) + )); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", 
"Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn parses_non_array_tools_json_returns_tools_json_not_array_error( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let result = fixture - .model - .parse_chat_message("{\"foo\": 1}", "hello", false); +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn parses_non_array_tools_json_returns_tools_json_not_array_error( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let result = fixture + .model + .parse_chat_message("{\"foo\": 1}", "hello", false); - assert!(matches!( - result, - Err(llama_cpp_bindings::ParseChatMessageError::ToolsJsonNotArray) - )); - Ok(()) - } + assert!(matches!( + result, + Err(llama_cpp_bindings::ParseChatMessageError::ToolsJsonNotArray) + )); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = 
HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn parses_with_tools_null_byte_returns_tools_json_invalid_error( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let result = fixture - .model - .parse_chat_message("[]\0extra", "hello", false); +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn parses_with_tools_null_byte_returns_tools_json_invalid_error( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let result = fixture + .model + .parse_chat_message("[]\0extra", "hello", false); - assert!(matches!( - result, - 
Err(llama_cpp_bindings::ParseChatMessageError::ToolsJsonInvalid( - _ - )) - )); - Ok(()) - } + assert!(matches!( + result, + Err(llama_cpp_bindings::ParseChatMessageError::ToolsJsonInvalid( + _ + )) + )); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn parses_with_input_null_byte_returns_tools_serialization_error( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let result = fixture - .model - .parse_chat_message("[]", "hello\0world", false); +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 
128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn parses_with_input_null_byte_returns_tools_serialization_error( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let result = fixture + .model + .parse_chat_message("[]", "hello\0world", false); - assert!(matches!( - result, - Err(llama_cpp_bindings::ParseChatMessageError::ToolsSerialization(_)) - )); - Ok(()) - } + assert!(matches!( + result, + Err(llama_cpp_bindings::ParseChatMessageError::ToolsSerialization(_)) + )); + Ok(()) } llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/embedding_and_encoder.rs b/llama-cpp-bindings-tests/tests/embedding_and_encoder.rs index cebd47c1..2d5e5823 100644 --- a/llama-cpp-bindings-tests/tests/embedding_and_encoder.rs +++ b/llama-cpp-bindings-tests/tests/embedding_and_encoder.rs @@ -1,707 +1,630 @@ +#![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" +)] + +use std::num::NonZeroU8; +use std::time::Duration; + +use anyhow::Context; +use anyhow::Result; +use anyhow::bail; +use llama_cpp_bindings::context::LlamaContext; +use llama_cpp_bindings::ggml_time_us; +use llama_cpp_bindings::llama_batch::LlamaBatch; +use llama_cpp_bindings::model::AddBos; +use llama_cpp_bindings_tests::prime_kv_cache::prime_kv_cache; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; use llama_cpp_test_harness::llama_tests_main; -mod embeddings { - use std::time::Duration; +fn normalize(input: &[f32]) -> Vec { + let magnitude = input + .iter() + .fold(0.0, |accumulator, &value| value.mul_add(value, accumulator)) + .sqrt(); - use anyhow::{Context, Result}; - use llama_cpp_bindings::context::LlamaContext; - use llama_cpp_bindings::ggml_time_us; - use 
llama_cpp_bindings::llama_batch::LlamaBatch; - use llama_cpp_bindings::model::AddBos; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; + input.iter().map(|&value| value / magnitude).collect() +} - fn normalize(input: &[f32]) -> Vec { - let magnitude = input - .iter() - .fold(0.0, |accumulator, &value| value.mul_add(value, accumulator)) - .sqrt(); +fn cosine_similarity(vec_a: &[f32], vec_b: &[f32]) -> f32 { + vec_a + .iter() + .zip(vec_b.iter()) + .map(|(left, right)| left * right) + .sum::() +} - input.iter().map(|&value| value / magnitude).collect() +#[llama_test( + model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + n_threads_batch = 8, + embeddings = true, +)] +fn embedding_generation_produces_vectors(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + + let mut ctx = LlamaContext::from_model( + model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + ) + .with_context(|| "unable to create context")?; + + let prompt = "Hello my name is"; + let tokens = model + .str_to_token(prompt, AddBos::Always) + .with_context(|| format!("failed to tokenize {prompt}"))?; + let prompt_token_count = u64::try_from(tokens.len())?; + + let n_ctx = usize::try_from(ctx.n_ctx())?; + assert!(tokens.len() <= n_ctx, "prompt exceeds context window size"); + + let t_main_start = ggml_time_us(); + + let mut classifier = model.sampled_token_classifier(); + let mut batch = LlamaBatch::new(n_ctx, 1)?; + classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?; + + assert_eq!(classifier.pending_prompt_tokens(), prompt_token_count); + assert_eq!(classifier.usage().prompt_tokens, 0); + + ctx.clear_kv_cache(); + ctx.decode(&mut batch) + .with_context(|| "llama_decode() failed")?; + + let promoted = classifier.commit_prompt_tokens(); + 
assert_eq!(promoted, prompt_token_count); + + let embedding = ctx + .embeddings_seq_ith(0) + .with_context(|| "failed to get embeddings")?; + let normalized = normalize(embedding); + + let t_main_end = ggml_time_us(); + let duration = Duration::from_micros(u64::try_from(t_main_end - t_main_start)?); + + eprintln!( + "created embedding with {} dimensions in {:.2} s", + normalized.len(), + duration.as_secs_f32() + ); + + assert!( + !normalized.is_empty(), + "embedding should have at least one dimension" + ); + + let magnitude: f32 = normalized + .iter() + .map(|value| value * value) + .sum::() + .sqrt(); + assert!( + (magnitude - 1.0).abs() < 0.01, + "normalized embedding magnitude should be approximately 1.0, got {magnitude}" + ); + + let usage = classifier.into_usage(); + assert_eq!(usage.prompt_tokens, prompt_token_count); + assert_eq!(usage.completion_tokens(), 0); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + n_seq_max = 2, + n_threads_batch = 8, + embeddings = true, +)] +fn reranking_produces_scores(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + + let query = "What is machine learning?"; + let documents = [ + "Machine learning is a subset of artificial intelligence.", + "The weather today is sunny and warm.", + ]; + + let document_count = documents.len(); + assert_eq!( + u32::try_from(document_count)?, + fixture.context_params.n_seq_max, + "attribute n_seq_max must match the document count this trial expects", + ); + + let mut ctx = LlamaContext::from_model( + model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + ) + .with_context(|| "unable to create context")?; + + let prompt_lines: Vec = documents + .iter() + .map(|document| format!("{query}{document}")) + .collect(); + + let tokens_lines_list = 
prompt_lines + .iter() + .map(|line| model.str_to_token(line, AddBos::Always)) + .collect::, _>>() + .with_context(|| "failed to tokenize prompts")?; + + let n_ctx = usize::try_from(ctx.n_ctx())?; + + if tokens_lines_list.iter().any(|tokens| n_ctx < tokens.len()) { + bail!("one of the provided prompts exceeds the size of the context window"); } - #[llama_test( - model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - n_threads_batch = 8, - embeddings = true, - )] - fn embedding_generation_produces_vectors(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; + let mut classifier = model.sampled_token_classifier(); + let mut batch = LlamaBatch::new(2048, i32::try_from(document_count)?)?; + let t_main_start = ggml_time_us(); - let mut ctx = LlamaContext::from_model( - model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - ) - .with_context(|| "unable to create context")?; + for (sequence_index, tokens) in tokens_lines_list.iter().enumerate() { + classifier.feed_prompt_sequence_to_batch( + &mut batch, + tokens, + i32::try_from(sequence_index)?, + false, + )?; + } - let prompt = "Hello my name is"; - let tokens = model - .str_to_token(prompt, AddBos::Always) - .with_context(|| format!("failed to tokenize {prompt}"))?; - let prompt_token_count = u64::try_from(tokens.len())?; + let total_tokens: usize = tokens_lines_list.iter().map(Vec::len).sum(); + let total_token_count = u64::try_from(total_tokens)?; - let n_ctx = usize::try_from(ctx.n_ctx())?; - assert!(tokens.len() <= n_ctx, "prompt exceeds context window size"); + assert_eq!(classifier.pending_prompt_tokens(), total_token_count); + assert_eq!(classifier.usage().prompt_tokens, 0); - let t_main_start = ggml_time_us(); + ctx.clear_kv_cache(); + ctx.decode(&mut batch) + .with_context(|| "llama_decode() failed")?; - let 
mut classifier = model.sampled_token_classifier(); - let mut batch = LlamaBatch::new(n_ctx, 1)?; - classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?; + let promoted = classifier.commit_prompt_tokens(); + assert_eq!(promoted, total_token_count); - assert_eq!(classifier.pending_prompt_tokens(), prompt_token_count); - assert_eq!(classifier.usage().prompt_tokens, 0); + let mut embeddings = Vec::with_capacity(document_count); - ctx.clear_kv_cache(); - ctx.decode(&mut batch) - .with_context(|| "llama_decode() failed")?; + for sequence_index in 0..document_count { + let raw_embedding = ctx + .embeddings_seq_ith(i32::try_from(sequence_index)?) + .with_context(|| "failed to get sequence embeddings")?; + embeddings.push(normalize(raw_embedding)); + } - let promoted = classifier.commit_prompt_tokens(); - assert_eq!(promoted, prompt_token_count); + let t_main_end = ggml_time_us(); + let duration = Duration::from_micros(u64::try_from(t_main_end - t_main_start)?); - let embedding = ctx - .embeddings_seq_ith(0) - .with_context(|| "failed to get embeddings")?; - let normalized = normalize(embedding); + #[expect( + clippy::cast_precision_loss, + reason = "logged throughput tolerates f32 precision" + )] + let tokens_per_second = total_tokens as f32 / duration.as_secs_f32(); - let t_main_end = ggml_time_us(); - let duration = Duration::from_micros(u64::try_from(t_main_end - t_main_start)?); + eprintln!( + "created embeddings for {total_tokens} tokens in {:.2} s, speed {tokens_per_second:.2} t/s", + duration.as_secs_f32(), + ); - eprintln!( - "created embedding with {} dimensions in {:.2} s", - normalized.len(), - duration.as_secs_f32() - ); + assert_eq!( + embeddings.len(), + document_count, + "should produce one embedding per document" + ); + for (index, embedding) in embeddings.iter().enumerate() { assert!( - !normalized.is_empty(), - "embedding should have at least one dimension" + !embedding.is_empty(), + "embedding {index} should not be empty" ); + } - let 
magnitude: f32 = normalized - .iter() - .map(|value| value * value) - .sum::() - .sqrt(); - assert!( - (magnitude - 1.0).abs() < 0.01, - "normalized embedding magnitude should be approximately 1.0, got {magnitude}" - ); + let similarity = cosine_similarity(&embeddings[0], &embeddings[1]); + eprintln!("cosine similarity between document embeddings: {similarity:.4}"); - let usage = classifier.into_usage(); - assert_eq!(usage.prompt_tokens, prompt_token_count); - assert_eq!(usage.completion_tokens(), 0); + assert!( + similarity.is_finite(), + "cosine similarity should be a finite number" + ); - Ok(()) - } -} + let usage = classifier.into_usage(); + assert_eq!(usage.prompt_tokens, total_token_count); + assert_eq!(usage.completion_tokens(), 0); -mod reranker { - use std::time::Duration; + Ok(()) +} - use anyhow::{Context, Result, bail}; - use llama_cpp_bindings::context::LlamaContext; - use llama_cpp_bindings::ggml_time_us; - use llama_cpp_bindings::llama_batch::LlamaBatch; - use llama_cpp_bindings::model::AddBos; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; +#[llama_test( + model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + embeddings = true, +)] +fn decode_with_embeddings_enabled(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + + let result = context.decode(&mut batch); + + assert!(result.is_ok()); + + Ok(()) +} - fn normalize(input: &[f32]) -> Vec { - let magnitude = input - .iter() - .fold(0.0, |accumulator, &value| value.mul_add(value, accumulator)) - .sqrt(); +#[llama_test( + 
model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + embeddings = true, +)] +fn embeddings_seq_ith_returns_valid_embeddings(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + + let embeddings = context.embeddings_seq_ith(0)?; + + assert_eq!(embeddings.len(), usize::try_from(fixture.model.n_embd())?); + + Ok(()) +} - input.iter().map(|&value| value / magnitude).collect() +#[llama_test( + model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + n_seq_max = 4, + embeddings = true, +)] +fn multi_sequence_embeddings_returns_one_embedding_per_sequence( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let inputs = [ + "alpha is here", + "beta runs fast", + "gamma waits", + "delta jumps", + ]; + let mut batch = LlamaBatch::new(64, 4)?; + + for (sequence_index, text) in inputs.iter().enumerate() { + let tokens = fixture.model.str_to_token(text, AddBos::Always)?; + let sequence_id = i32::try_from(sequence_index)?; + + batch.add_sequence(&tokens, sequence_id, true)?; } - fn cosine_similarity(vec_a: &[f32], vec_b: &[f32]) -> f32 { - vec_a - .iter() - .zip(vec_b.iter()) - .map(|(left, right)| left * right) - .sum::() - } + context.decode(&mut batch)?; - #[llama_test( - model_source = 
HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - n_seq_max = 2, - n_threads_batch = 8, - embeddings = true, - )] - fn reranking_produces_scores(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; + let n_embd = usize::try_from(fixture.model.n_embd())?; + let mut collected: Vec> = Vec::with_capacity(inputs.len()); - let query = "What is machine learning?"; - let documents = [ - "Machine learning is a subset of artificial intelligence.", - "The weather today is sunny and warm.", - ]; + for sequence_index in 0..inputs.len() { + let sequence_id = i32::try_from(sequence_index)?; + let embedding = context.embeddings_seq_ith(sequence_id)?; - let document_count = documents.len(); assert_eq!( - u32::try_from(document_count)?, - fixture.context_params.n_seq_max, - "attribute n_seq_max must match the document count this trial expects", + embedding.len(), + n_embd, + "sequence {sequence_index} embedding length mismatch" ); - let mut ctx = LlamaContext::from_model( - model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - ) - .with_context(|| "unable to create context")?; - - let prompt_lines: Vec = documents - .iter() - .map(|document| format!("{query}{document}")) - .collect(); - - let tokens_lines_list = prompt_lines - .iter() - .map(|line| model.str_to_token(line, AddBos::Always)) - .collect::, _>>() - .with_context(|| "failed to tokenize prompts")?; - - let n_ctx = usize::try_from(ctx.n_ctx())?; - - if tokens_lines_list.iter().any(|tokens| n_ctx < tokens.len()) { - bail!("one of the provided prompts exceeds the size of the context window"); - } - - let mut classifier = model.sampled_token_classifier(); - let mut batch = LlamaBatch::new(2048, i32::try_from(document_count)?)?; - let t_main_start = ggml_time_us(); - - for (sequence_index, tokens) in 
tokens_lines_list.iter().enumerate() { - classifier.feed_prompt_sequence_to_batch( - &mut batch, - tokens, - i32::try_from(sequence_index)?, - false, - )?; - } - - let total_tokens: usize = tokens_lines_list.iter().map(Vec::len).sum(); - let total_token_count = u64::try_from(total_tokens)?; - - assert_eq!(classifier.pending_prompt_tokens(), total_token_count); - assert_eq!(classifier.usage().prompt_tokens, 0); - - ctx.clear_kv_cache(); - ctx.decode(&mut batch) - .with_context(|| "llama_decode() failed")?; - - let promoted = classifier.commit_prompt_tokens(); - assert_eq!(promoted, total_token_count); - - let mut embeddings = Vec::with_capacity(document_count); - - for sequence_index in 0..document_count { - let raw_embedding = ctx - .embeddings_seq_ith(i32::try_from(sequence_index)?) - .with_context(|| "failed to get sequence embeddings")?; - embeddings.push(normalize(raw_embedding)); - } - - let t_main_end = ggml_time_us(); - let duration = Duration::from_micros(u64::try_from(t_main_end - t_main_start)?); - - #[expect( - clippy::cast_precision_loss, - reason = "logged throughput tolerates f32 precision" - )] - let tokens_per_second = total_tokens as f32 / duration.as_secs_f32(); - - eprintln!( - "created embeddings for {total_tokens} tokens in {:.2} s, speed {tokens_per_second:.2} t/s", - duration.as_secs_f32(), - ); - - assert_eq!( - embeddings.len(), - document_count, - "should produce one embedding per document" - ); + collected.push(embedding.to_vec()); + } - for (index, embedding) in embeddings.iter().enumerate() { - assert!( - !embedding.is_empty(), - "embedding {index} should not be empty" + for (left_index, left) in collected.iter().enumerate() { + for (right_index, right) in collected.iter().enumerate().skip(left_index + 1) { + assert_ne!( + left, right, + "embedding for sequence {left_index} must differ from sequence {right_index}", ); } - - let similarity = cosine_similarity(&embeddings[0], &embeddings[1]); - eprintln!("cosine similarity between 
document embeddings: {similarity:.4}"); - - assert!( - similarity.is_finite(), - "cosine similarity should be a finite number" - ); - - let usage = classifier.into_usage(); - assert_eq!(usage.prompt_tokens, total_token_count); - assert_eq!(usage.completion_tokens(), 0); - - Ok(()) } -} - -mod context_embedding_and_encoder { - - use anyhow::Result; - - use llama_cpp_bindings::context::LlamaContext; - use llama_cpp_bindings::llama_batch::LlamaBatch; - use llama_cpp_bindings::model::AddBos; - - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - - // ========================================================================================= - // Group A: default Qwen model, embeddings=false. Most context tests fall here. - // ========================================================================================= - - #[llama_test( - model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - embeddings = true, - )] - fn decode_with_embeddings_enabled(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - - let result = context.decode(&mut batch); - - assert!(result.is_ok()); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - embeddings = true, - )] - fn embeddings_seq_ith_returns_valid_embeddings(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - 
fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - - let embeddings = context.embeddings_seq_ith(0)?; - - assert_eq!(embeddings.len(), usize::try_from(fixture.model.n_embd())?); - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - n_seq_max = 4, - embeddings = true, - )] - fn multi_sequence_embeddings_returns_one_embedding_per_sequence( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let inputs = [ - "alpha is here", - "beta runs fast", - "gamma waits", - "delta jumps", - ]; - let mut batch = LlamaBatch::new(64, 4)?; + Ok(()) +} - for (sequence_index, text) in inputs.iter().enumerate() { +#[llama_test( + model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + n_seq_max = 4, + embeddings = true, +)] +fn embeddings_returns_distinct_values_when_reused_batch_has_extra_capacity( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let iterations = [ + [ + "This is the first document with enough content to contribute meaningfully to the batch size calculation", + "This is the second document that should be processed in a potentially different batch from the first", + ], + [ + "This is the third document adding more content to 
ensure the total exceeds the configured chunk limit", + "This is the fourth document which should demonstrate that batching distributes across agent requests", + ], + ]; + + let n_embd = usize::try_from(fixture.model.n_embd())?; + let mut batch = LlamaBatch::new(64, 4)?; + let mut collected: Vec> = Vec::new(); + + for iteration_inputs in iterations { + for (sequence_index, text) in iteration_inputs.iter().enumerate() { let tokens = fixture.model.str_to_token(text, AddBos::Always)?; let sequence_id = i32::try_from(sequence_index)?; batch.add_sequence(&tokens, sequence_id, true)?; } + context.clear_kv_cache(); context.decode(&mut batch)?; - let n_embd = usize::try_from(fixture.model.n_embd())?; - let mut collected: Vec> = Vec::with_capacity(inputs.len()); - - for sequence_index in 0..inputs.len() { + for sequence_index in 0..iteration_inputs.len() { let sequence_id = i32::try_from(sequence_index)?; let embedding = context.embeddings_seq_ith(sequence_id)?; assert_eq!( embedding.len(), n_embd, - "sequence {sequence_index} embedding length mismatch" + "iteration sequence {sequence_index} embedding length mismatch" ); collected.push(embedding.to_vec()); } - for (left_index, left) in collected.iter().enumerate() { - for (right_index, right) in collected.iter().enumerate().skip(left_index + 1) { - assert_ne!( - left, right, - "embedding for sequence {left_index} must differ from sequence {right_index}", - ); - } - } - - Ok(()) + batch.clear(); } - #[llama_test( - model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - n_seq_max = 4, - embeddings = true, - )] - fn embeddings_returns_distinct_values_when_reused_batch_has_extra_capacity( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), 
- )?; - - let iterations = [ - [ - "This is the first document with enough content to contribute meaningfully to the batch size calculation", - "This is the second document that should be processed in a potentially different batch from the first", - ], - [ - "This is the third document adding more content to ensure the total exceeds the configured chunk limit", - "This is the fourth document which should demonstrate that batching distributes across agent requests", - ], - ]; - - let n_embd = usize::try_from(fixture.model.n_embd())?; - let mut batch = LlamaBatch::new(64, 4)?; - let mut collected: Vec> = Vec::new(); - - for iteration_inputs in iterations { - for (sequence_index, text) in iteration_inputs.iter().enumerate() { - let tokens = fixture.model.str_to_token(text, AddBos::Always)?; - let sequence_id = i32::try_from(sequence_index)?; - - batch.add_sequence(&tokens, sequence_id, true)?; - } - - context.clear_kv_cache(); - context.decode(&mut batch)?; - - for sequence_index in 0..iteration_inputs.len() { - let sequence_id = i32::try_from(sequence_index)?; - let embedding = context.embeddings_seq_ith(sequence_id)?; - - assert_eq!( - embedding.len(), - n_embd, - "iteration sequence {sequence_index} embedding length mismatch" - ); - - collected.push(embedding.to_vec()); - } - - batch.clear(); - } - - assert_eq!( - collected.len(), - iterations.iter().flatten().count(), - "expected one embedding per input across every iteration" - ); - - for (left_index, left) in collected.iter().enumerate() { - for (right_index, right) in collected.iter().enumerate().skip(left_index + 1) { - assert_ne!( - left, right, - "embedding {left_index} must differ from embedding {right_index} across reused-batch iterations", - ); - } + assert_eq!( + collected.len(), + iterations.iter().flatten().count(), + "expected one embedding per input across every iteration" + ); + + for (left_index, left) in collected.iter().enumerate() { + for (right_index, right) in 
collected.iter().enumerate().skip(left_index + 1) { + assert_ne!( + left, right, + "embedding {left_index} must differ from embedding {right_index} across reused-batch iterations", + ); } - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - embeddings = true, - )] - fn embeddings_ith_returns_valid_embeddings(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; - let last_index = i32::try_from(tokens.len() - 1)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - - let embeddings = context.embeddings_ith(last_index)?; - - assert_eq!(embeddings.len(), usize::try_from(fixture.model.n_embd())?); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - embeddings = true, - )] - fn embeddings_ith_returns_null_embedding_error_for_non_embedding_token( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let result = context.embeddings_ith(999); - - assert!(result.is_err()); - - Ok(()) } - #[llama_test( - model_source = HuggingFace("Xiaojian9992024/t5-small-GGUF", "t5-small.bf16.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - embeddings = true, - )] - fn encode_succeeds_with_encoder_model(fixture: &LlamaFixture<'_>) -> 
Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let tokens = fixture.model.str_to_token("hello", AddBos::Never)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - - let result = context.encode(&mut batch); - - assert!(result.is_ok()); - - Ok(()) - } + Ok(()) } -mod context_kv_cache_embedding { - use std::num::NonZeroU8; - - use anyhow::Result; - use llama_cpp_bindings::context::LlamaContext; - - use llama_cpp_bindings::llama_batch::LlamaBatch; - use llama_cpp_bindings::model::AddBos; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - - fn build_context<'context>( - fixture: &'context LlamaFixture<'_>, - ) -> Result> { - Ok(LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?) - } - - fn decode_hello_world( - fixture: &LlamaFixture<'_>, - context: &mut LlamaContext<'_>, - ) -> Result<()> { - let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn kv_cache_seq_add_succeeds_on_embedding_model(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - decode_hello_world(fixture, &mut context)?; - - let result = context.kv_cache_seq_add(0, Some(0), None, 1); +#[llama_test( + model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + embeddings = true, +)] 
+fn embeddings_ith_returns_valid_embeddings(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; + let last_index = i32::try_from(tokens.len() - 1)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + + let embeddings = context.embeddings_ith(last_index)?; + + assert_eq!(embeddings.len(), usize::try_from(fixture.model.n_embd())?); + + Ok(()) +} - assert!(result.is_ok()); +#[llama_test( + model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + embeddings = true, +)] +fn embeddings_ith_returns_null_embedding_error_for_non_embedding_token( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let result = context.embeddings_ith(999); + + assert!(result.is_err()); + + Ok(()) +} - Ok(()) - } +#[llama_test( + model_source = HuggingFace("Xiaojian9992024/t5-small-GGUF", "t5-small.bf16.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + embeddings = true, +)] +fn encode_succeeds_with_encoder_model(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = fixture.model.str_to_token("hello", AddBos::Never)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + + let result = context.encode(&mut batch); + + assert!(result.is_ok()); + + Ok(()) +} - #[llama_test( - 
model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn kv_cache_seq_div_succeeds_on_embedding_model(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; +#[llama_test( + model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn kv_cache_seq_add_succeeds_on_embedding_model(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = fixture.build_context()?; - decode_hello_world(fixture, &mut context)?; + prime_kv_cache(fixture, &mut context)?; - let divisor = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?; - let result = context.kv_cache_seq_div(0, Some(0), None, divisor); + let result = context.kv_cache_seq_add(0, Some(0), None, 1); - assert!(result.is_ok()); + assert!(result.is_ok()); - Ok(()) - } + Ok(()) } -mod model_helpers_embedding { - #![expect( - clippy::unnecessary_wraps, - reason = "every trial returns anyhow::Result<()> to match the LlamaTestFn signature" - )] +#[llama_test( + model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn kv_cache_seq_div_succeeds_on_embedding_model(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = fixture.build_context()?; - use anyhow::Result; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - - #[llama_test( - model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 2048, - n_batch = 512, - n_ubatch = 128 - )] - fn 
embedding_model_tool_call_markers_call_does_not_panic( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let _markers = fixture.model.tool_call_markers(); + prime_kv_cache(fixture, &mut context)?; - Ok(()) - } + let divisor = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?; + let result = context.kv_cache_seq_div(0, Some(0), None, divisor); - #[llama_test( - model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 2048, - n_batch = 512, - n_ubatch = 128 - )] - fn embedding_model_streaming_markers_returns_ok_for_a_model_without_tool_calls( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let _markers = fixture.model.streaming_markers()?; + assert!(result.is_ok()); - Ok(()) - } + Ok(()) +} - #[llama_test( - model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 2048, - n_batch = 512, - n_ubatch = 128 - )] - fn approximate_tok_env_falls_back_to_eos_when_eot_unavailable( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let env = fixture.model.approximate_tok_env(); - let env_again = fixture.model.approximate_tok_env(); +#[llama_test( + model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 2048, + n_batch = 512, + n_ubatch = 128 +)] +fn embedding_model_tool_call_markers_call_does_not_panic(fixture: &LlamaFixture<'_>) -> Result<()> { + let _markers = fixture.model.tool_call_markers(); + + Ok(()) +} - assert!( - std::sync::Arc::ptr_eq(&env, &env_again), - "approximate_tok_env must return the same cached Arc for any model, including \ - the embedding model which lacks an EOT token (forcing the fallback-to-EOS path)" - ); +#[llama_test( + model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", 
"Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 2048, + n_batch = 512, + n_ubatch = 128 +)] +fn embedding_model_streaming_markers_returns_ok_for_a_model_without_tool_calls( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let _markers = fixture.model.streaming_markers()?; + + Ok(()) +} - Ok(()) - } +#[llama_test( + model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 2048, + n_batch = 512, + n_ubatch = 128 +)] +fn approximate_tok_env_falls_back_to_eos_when_eot_unavailable( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let env = fixture.model.approximate_tok_env(); + let env_again = fixture.model.approximate_tok_env(); + + assert!( + std::sync::Arc::ptr_eq(&env, &env_again), + "approximate_tok_env must return the same cached Arc for any model, including \ + the embedding model which lacks an EOT token (forcing the fallback-to-EOS path)" + ); + + Ok(()) } llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/kv_cache_and_session.rs b/llama-cpp-bindings-tests/tests/kv_cache_and_session.rs index de316e42..fa20f3a7 100644 --- a/llama-cpp-bindings-tests/tests/kv_cache_and_session.rs +++ b/llama-cpp-bindings-tests/tests/kv_cache_and_session.rs @@ -1,2836 +1,2752 @@ +#![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" +)] + +use std::num::NonZeroU8; +use std::ptr::NonNull; +use std::sync::Arc; +use std::sync::atomic::AtomicBool; + +use anyhow::Result; +use llama_cpp_bindings::DecodeError; +use llama_cpp_bindings::LogitsError; +use llama_cpp_bindings::context::LlamaContext; +use llama_cpp_bindings::context::kv_cache::KvCacheConversionError; +use llama_cpp_bindings::error::KvCacheSeqAddError; +use llama_cpp_bindings::error::KvCacheSeqDivError; +use llama_cpp_bindings::llama_batch::LlamaBatch; +use 
llama_cpp_bindings::model::AddBos; +use llama_cpp_bindings::model::LlamaLoraAdapter; +use llama_cpp_bindings_tests::prime_kv_cache::prime_kv_cache; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; use llama_cpp_test_harness::llama_tests_main; -mod model_context_creation { - #![expect( - clippy::unnecessary_wraps, - reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" - )] - - use anyhow::Result; - use llama_cpp_bindings::context::LlamaContext; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 256, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 256, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 256, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 256, - n_batch = 128, - n_ubatch = 64, - )] - fn new_context_returns_valid_context(fixture: &LlamaFixture<'_>) -> Result<()> { - let context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - assert!(context.n_ctx() > 0); - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = 
false, - n_ctx = 4294967295, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 4294967295, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 4294967295, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 4294967295, - n_batch = 128, - n_ubatch = 64, - )] - fn new_context_with_huge_ctx_returns_null_error(fixture: &LlamaFixture<'_>) -> Result<()> { - let result = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - ); - - assert!(result.is_err()); - Ok(()) - } -} - -mod context { - use std::ptr::NonNull; - use std::sync::Arc; - use std::sync::atomic::AtomicBool; - - use anyhow::Result; - use llama_cpp_bindings::DecodeError; - use llama_cpp_bindings::LogitsError; - use llama_cpp_bindings::context::LlamaContext; - use llama_cpp_bindings::llama_batch::LlamaBatch; - use llama_cpp_bindings::model::AddBos; - use llama_cpp_bindings::model::LlamaLoraAdapter; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - - // ========================================================================================= - // Group A: default Qwen model, embeddings=false. Most context tests fall here. 
- // ========================================================================================= - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - )] - fn context_creation_and_properties(fixture: &LlamaFixture<'_>) -> Result<()> { - let context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - assert!(context.n_ctx() > 0); - assert!(context.n_batch() > 0); - assert!(context.n_ubatch() > 0); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - )] - fn decode_and_get_logits(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - - let decode_result = context.decode(&mut batch); - assert!(decode_result.is_ok()); - - let logits = context.get_logits()?; - assert!(!logits.is_empty()); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - )] - fn timings_work(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - context.reset_timings(); - let timings = context.timings(); - assert!(timings.t_start_ms() >= 0.0); - - Ok(()) - } - - #[llama_test( - 
model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - )] - fn token_data_array_has_entries_after_decode(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - - let token_data_array = context.token_data_array()?; - - assert!(!token_data_array.data.is_empty()); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - )] - fn get_logits_ith_returns_valid_slice(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; - let last_index = i32::try_from(tokens.len() - 1)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - - let logits = context.get_logits_ith(last_index)?; - - assert_eq!(logits.len(), usize::try_from(fixture.model.n_vocab())?); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - )] - fn token_data_array_ith_returns_valid_data(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - 
(*fixture.context_params).into_llama_context_params(), - )?; - let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; - let last_index = i32::try_from(tokens.len() - 1)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - - let token_data_array = context.token_data_array_ith(last_index)?; - - assert_eq!( - token_data_array.data.len(), - usize::try_from(fixture.model.n_vocab())? - ); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - )] - fn embeddings_ith_returns_error_when_embeddings_disabled( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let result = context.embeddings_ith(0); - - assert!(result.is_err()); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - )] - fn embeddings_seq_ith_returns_error_when_embeddings_disabled( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let result = context.embeddings_seq_ith(0); - - assert!(result.is_err()); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - )] - fn candidates_returns_n_vocab_entries(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - 
fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - - let count = context.candidates()?.count(); - - assert_eq!(count, usize::try_from(fixture.model.n_vocab())?); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - )] - fn debug_format_contains_struct_name(fixture: &LlamaFixture<'_>) -> Result<()> { - let context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let debug_output = format!("{context:?}"); - - assert!(debug_output.contains("LlamaContext")); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - )] - fn candidates_ith_returns_n_vocab_entries(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; - let last_index = i32::try_from(tokens.len() - 1)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - - let count = context.candidates_ith(last_index)?.count(); - - assert_eq!(count, usize::try_from(fixture.model.n_vocab())?); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - 
n_ubatch = 512, - )] - fn lora_adapter_remove_succeeds_with_no_adapters(fixture: &LlamaFixture<'_>) -> Result<()> { - let context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let mut adapter = LlamaLoraAdapter { - lora_adapter: NonNull::dangling(), - }; - - let result = context.lora_adapter_remove(&mut adapter); - - assert!(result.is_ok()); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - )] - fn encode_on_non_encoder_model_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - - let result = context.encode(&mut batch); - - assert!(result.is_err()); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - )] - fn lora_adapter_set_with_dangling_pointer_succeeds_or_errors( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let mut adapter = LlamaLoraAdapter { - lora_adapter: NonNull::dangling(), - }; - - let result = context.lora_adapter_set(&mut adapter, 1.0); - - assert!(result.is_ok()); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - 
n_batch = 2048, - n_ubatch = 512, - embeddings = true, - )] - fn embeddings_seq_ith_returns_null_embedding_error_for_invalid_seq( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - - let result = context.embeddings_seq_ith(999); - - assert!(result.is_err()); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - )] - fn decode_empty_batch_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let mut batch = LlamaBatch::new(512, 1)?; - - let result = context.decode(&mut batch); - - assert!(result.is_err()); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - )] - fn set_abort_flag_aborts_decode(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let abort_flag = Arc::new(AtomicBool::new(true)); - context.set_abort_flag(abort_flag); - - let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - - let result = context.decode(&mut batch); - - assert_eq!(result, Err(DecodeError::Aborted)); - - Ok(()) - } - - 
#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - )] - fn set_abort_flag_false_allows_decode(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let abort_flag = Arc::new(AtomicBool::new(false)); - context.set_abort_flag(abort_flag); - - let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - - let result = context.decode(&mut batch); - - assert!(result.is_ok()); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - )] - fn clear_abort_callback_allows_decode_with_flag_true(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let abort_flag = Arc::new(AtomicBool::new(true)); - context.set_abort_flag(abort_flag); - context.clear_abort_callback(); - - let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - - let result = context.decode(&mut batch); - - assert!(result.is_ok()); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - )] - fn synchronize_completes_without_panic(fixture: &LlamaFixture<'_>) -> Result<()> { - let context = LlamaContext::from_model( - fixture.model, - 
fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - context.synchronize(); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - )] - fn detach_threadpool_completes_without_panic(fixture: &LlamaFixture<'_>) -> Result<()> { - let context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - context.detach_threadpool(); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - )] - fn get_logits_ith_returns_token_not_initialized_for_unknown_index( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let result = context.get_logits_ith(7); - - assert!(matches!(result, Err(LogitsError::TokenNotInitialized(7)))); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 64, - n_batch = 2048, - n_ubatch = 512, - )] - fn get_logits_ith_returns_token_index_exceeds_context_for_huge_index( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let huge_index = i32::try_from(context.n_ctx())?; - context.mark_logits_initialized(huge_index); - let result = context.get_logits_ith(huge_index); - - assert!(matches!( - result, - Err(LogitsError::TokenIndexExceedsContext { .. 
}) - )); - - Ok(()) - } -} - -mod context_kv_cache { - use std::num::NonZeroU8; - - use anyhow::Result; - use llama_cpp_bindings::context::LlamaContext; - use llama_cpp_bindings::context::kv_cache::KvCacheConversionError; - use llama_cpp_bindings::error::KvCacheSeqAddError; - use llama_cpp_bindings::error::KvCacheSeqDivError; - use llama_cpp_bindings::llama_batch::LlamaBatch; - use llama_cpp_bindings::model::AddBos; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - - fn build_context<'context>( - fixture: &'context LlamaFixture<'_>, - ) -> Result> { - Ok(LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?) - } - - fn decode_hello_world( - fixture: &LlamaFixture<'_>, - context: &mut LlamaContext<'_>, - ) -> Result<()> { - let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn 
clear_kv_cache_resets_positions(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - decode_hello_world(fixture, &mut context)?; - - context.clear_kv_cache(); - assert_eq!(context.kv_cache_seq_pos_max(0), -1); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn kv_cache_seq_pos_max_is_non_negative_after_decode(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - decode_hello_world(fixture, &mut context)?; - - assert!(context.kv_cache_seq_pos_max(0) >= 0); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", 
"Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn clear_kv_cache_seq_with_range(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - decode_hello_world(fixture, &mut context)?; - - let result = context.clear_kv_cache_seq(Some(0), Some(0), Some(1)); - assert!(result.is_ok()); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn copy_kv_cache_seq_succeeds(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - decode_hello_world(fixture, &mut context)?; - - let result = context.copy_kv_cache_seq(0, 1, None, None); - assert!(result.is_ok()); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - 
n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn copy_cache_executes_without_crash(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - decode_hello_world(fixture, &mut context)?; - - let pos_max = context.kv_cache_seq_pos_max(0); - context.copy_cache(0, 1, pos_max + 1); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn kv_cache_seq_add_returns_error_for_mrope_model(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - decode_hello_world(fixture, &mut context)?; - - let result = context.kv_cache_seq_add(0, Some(0), None, 1); - - assert!(matches!( - result.unwrap_err(), - KvCacheSeqAddError::IncompatibleRopeType, - )); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - 
n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn kv_cache_seq_div_returns_error_for_mrope_model(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - decode_hello_world(fixture, &mut context)?; - - let divisor = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?; - let result = context.kv_cache_seq_div(0, Some(0), None, divisor); - - assert!(matches!( - result.unwrap_err(), - KvCacheSeqDivError::IncompatibleRopeType, - )); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn kv_cache_seq_keep_retains_specified_sequence(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - decode_hello_world(fixture, &mut context)?; - - context.kv_cache_seq_keep(0); - - assert!(context.kv_cache_seq_pos_max(0) >= 0); - - 
Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn copy_kv_cache_seq_with_explicit_range(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - decode_hello_world(fixture, &mut context)?; - - let result = context.copy_kv_cache_seq(0, 2, Some(0), Some(1)); - - assert!(result.is_ok()); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = 
HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn kv_cache_seq_pos_max_returns_negative_one_for_unused_seq( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let context = build_context(fixture)?; - - let result = context.kv_cache_seq_pos_max(999); - - assert_eq!(result, -1); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn copy_kv_cache_seq_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - let result = context.copy_kv_cache_seq(0, 1, Some(u32::MAX), None); - - assert!(matches!( - result.unwrap_err(), - KvCacheConversionError::P0TooLarge(_), - )); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = 
HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn copy_kv_cache_seq_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - let result = context.copy_kv_cache_seq(0, 1, Some(0), Some(u32::MAX)); - - assert!(matches!( - result.unwrap_err(), - KvCacheConversionError::P1TooLarge(_), - )); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn clear_kv_cache_seq_rejects_src_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut 
context = build_context(fixture)?; - - let result = context.clear_kv_cache_seq(Some(u32::MAX), None, None); - - assert!(matches!( - result.unwrap_err(), - KvCacheConversionError::SeqIdTooLarge(_), - )); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn clear_kv_cache_seq_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - let result = context.clear_kv_cache_seq(Some(0), Some(u32::MAX), None); - - assert!(matches!( - result.unwrap_err(), - KvCacheConversionError::P0TooLarge(_), - )); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = 
HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn clear_kv_cache_seq_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - let result = context.clear_kv_cache_seq(Some(0), Some(0), Some(u32::MAX)); - - assert!(matches!( - result.unwrap_err(), - KvCacheConversionError::P1TooLarge(_), - )); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn kv_cache_seq_add_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - let result = context.kv_cache_seq_add(0, Some(u32::MAX), None, 1); - - assert!(matches!( - result.unwrap_err(), - KvCacheSeqAddError::P0TooLarge(_), - )); - - Ok(()) - } - - 
#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn kv_cache_seq_add_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - let result = context.kv_cache_seq_add(0, Some(0), Some(u32::MAX), 1); - - assert!(matches!( - result.unwrap_err(), - KvCacheSeqAddError::P1TooLarge(_), - )); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = 
HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn kv_cache_seq_div_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - let divisor = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?; - let result = context.kv_cache_seq_div(0, Some(u32::MAX), None, divisor); - - assert!(matches!( - result.unwrap_err(), - KvCacheSeqDivError::P0TooLarge(_), - )); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn kv_cache_seq_div_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - let divisor = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?; - let result = context.kv_cache_seq_div(0, Some(0), Some(u32::MAX), divisor); - - assert!(matches!( - result.unwrap_err(), - KvCacheSeqDivError::P1TooLarge(_), - )); - - Ok(()) - } -} - -mod context_session { - use anyhow::Result; - use 
llama_cpp_bindings::context::LlamaContext; - use llama_cpp_bindings::llama_batch::LlamaBatch; - use llama_cpp_bindings::model::AddBos; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - - fn build_context<'context>( - fixture: &'context LlamaFixture<'_>, - ) -> Result> { - Ok(LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn save_and_load_session_file(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - - let session_path = std::env::temp_dir().join("llama_test_session.bin"); - context.state_save_file(&session_path, &tokens)?; - - let loaded_tokens = context.state_load_file(&session_path, 512)?; - assert_eq!(loaded_tokens, tokens); - - std::fs::remove_file(&session_path)?; - - Ok(()) - } - - #[llama_test( - 
model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn get_state_size_is_positive(fixture: &LlamaFixture<'_>) -> Result<()> { - let context = build_context(fixture)?; - - assert!(context.get_state_size() > 0); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, 
- )] - fn state_seq_save_and_load_file_roundtrip(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - - let session_path = std::env::temp_dir().join("llama_test_seq_state.bin"); - let bytes_written = context.state_seq_save_file(&session_path, 0, &tokens)?; - assert!(bytes_written > 0); - - let (loaded_tokens, bytes_read) = context.state_seq_load_file(&session_path, 0, 512)?; - assert_eq!(loaded_tokens, tokens); - assert!(bytes_read > 0); - - std::fs::remove_file(&session_path)?; - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn copy_state_data_and_set_state_data_roundtrip(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - - let 
state_size = context.get_state_size(); - let mut state_data = vec![0u8; state_size]; - let bytes_copied = unsafe { context.copy_state_data(&mut state_data) }; - assert!(bytes_copied > 0); - - let bytes_read = unsafe { context.set_state_data(&state_data) }; - assert!(bytes_read > 0); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn state_load_file_with_nonexistent_file_returns_error( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let mut context = build_context(fixture)?; - - let result = context.state_load_file("/nonexistent/session.bin", 512); - - assert!(result.is_err()); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - 
model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn state_seq_load_file_with_nonexistent_file_returns_error( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let mut context = build_context(fixture)?; - - let result = context.state_seq_load_file("/nonexistent/seq_state.bin", 0, 512); - - assert!(result.is_err()); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn state_save_file_to_invalid_directory_returns_failed_to_save( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let context = build_context(fixture)?; - - let result = context.state_save_file("/nonexistent_dir/session.bin", &[]); - - assert!(result.is_err()); - - Ok(()) - } - - #[llama_test( - model_source = 
HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn state_seq_save_file_to_invalid_directory_returns_failed_to_save( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let context = build_context(fixture)?; - - let result = context.state_seq_save_file("/nonexistent_dir/seq_state.bin", 0, &[]); - - assert!(result.is_err()); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - 
n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn state_load_file_with_zero_max_tokens_returns_error( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let mut context = build_context(fixture)?; - - let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - - let session_path = std::env::temp_dir().join("llama_test_session_zero_max.bin"); - context.state_save_file(&session_path, &tokens)?; - - let result = context.state_load_file(&session_path, 0); - - assert!(result.is_err()); - let _ = std::fs::remove_file(&session_path); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn state_seq_load_file_with_zero_max_tokens_returns_error( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let mut context = build_context(fixture)?; - - let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - 
context.decode(&mut batch)?; - - let session_path = std::env::temp_dir().join("llama_test_seq_state_zero_max.bin"); - context.state_seq_save_file(&session_path, 0, &tokens)?; - - let result = context.state_seq_load_file(&session_path, 0, 0); - - assert!(result.is_err()); - let _ = std::fs::remove_file(&session_path); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn state_load_file_with_insufficient_max_tokens_returns_length_error( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let mut context = build_context(fixture)?; - - let tokens = fixture.model.str_to_token( - "Hello world this is a longer string for more tokens", - AddBos::Always, - )?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - - let session_path = std::env::temp_dir().join("llama_test_session_insuf.bin"); - context.state_save_file(&session_path, &tokens)?; - - let result = context.state_load_file(&session_path, 1); - - assert!(result.is_err()); - let _ = std::fs::remove_file(&session_path); - - Ok(()) - } - - #[llama_test( - model_source = 
HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn state_seq_load_file_with_insufficient_max_tokens_returns_length_error( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let mut context = build_context(fixture)?; - - let tokens = fixture.model.str_to_token( - "Hello world this is a longer string for more tokens", - AddBos::Always, - )?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - - let session_path = std::env::temp_dir().join("llama_test_seq_state_insuf.bin"); - context.state_seq_save_file(&session_path, 0, &tokens)?; - - let result = context.state_seq_load_file(&session_path, 0, 1); - - assert!(result.is_err()); - let _ = std::fs::remove_file(&session_path); - - Ok(()) - } - - #[cfg(unix)] - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 
999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn state_save_file_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { - use std::ffi::OsStr; - use std::os::unix::ffi::OsStrExt; - - let context = build_context(fixture)?; - - let non_utf8_path = std::path::Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.bin")); - let result = context.state_save_file(non_utf8_path, &[]); - - assert!(result.is_err()); - - Ok(()) - } - - #[cfg(unix)] - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn state_load_file_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { - use 
std::ffi::OsStr; - use std::os::unix::ffi::OsStrExt; - - let mut context = build_context(fixture)?; - - let non_utf8_path = std::path::Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.bin")); - let result = context.state_load_file(non_utf8_path, 512); - - assert!(result.is_err()); - - Ok(()) - } - - #[cfg(unix)] - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn state_seq_save_file_with_non_utf8_path_returns_error( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - use std::ffi::OsStr; - use std::os::unix::ffi::OsStrExt; - - let context = build_context(fixture)?; - - let non_utf8_path = std::path::Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.bin")); - let result = context.state_seq_save_file(non_utf8_path, 0, &[]); - - assert!(result.is_err()); - - Ok(()) - } - - #[cfg(unix)] - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", 
"GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn state_seq_load_file_with_non_utf8_path_returns_error( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - use std::ffi::OsStr; - use std::os::unix::ffi::OsStrExt; - - let mut context = build_context(fixture)?; - - let non_utf8_path = std::path::Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.bin")); - let result = context.state_seq_load_file(non_utf8_path, 0, 512); - - assert!(result.is_err()); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn state_save_file_with_null_byte_in_path_returns_error( - 
fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let context = build_context(fixture)?; - - let path_with_null = std::path::Path::new("/tmp/foo\0bar.bin"); - let result = context.state_save_file(path_with_null, &[]); - - assert!(result.is_err()); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn state_load_file_with_null_byte_in_path_returns_error( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let mut context = build_context(fixture)?; - - let path_with_null = std::path::Path::new("/tmp/foo\0bar.bin"); - let result = context.state_load_file(path_with_null, 512); - - assert!(result.is_err()); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - 
#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn state_seq_save_file_with_null_byte_in_path_returns_error( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let context = build_context(fixture)?; - - let path_with_null = std::path::Path::new("/tmp/foo\0bar.bin"); - let result = context.state_seq_save_file(path_with_null, 0, &[]); - - assert!(result.is_err()); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn state_seq_load_file_with_null_byte_in_path_returns_error( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let mut context = build_context(fixture)?; - - let path_with_null = std::path::Path::new("/tmp/foo\0bar.bin"); - let result = context.state_seq_load_file(path_with_null, 0, 
512); - - assert!(result.is_err()); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn state_seq_get_size_ext_returns_size_for_decoded_sequence( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - use llama_cpp_bindings::context::llama_state_seq_flags::LlamaStateSeqFlags; - - let mut context = build_context(fixture)?; - - let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - - let flags = LlamaStateSeqFlags::empty(); - let size = context.state_seq_get_size_ext(0, &flags); - - assert!(size > 0); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, 
- n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn state_seq_get_data_ext_and_set_data_ext_round_trip( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - use llama_cpp_bindings::context::llama_state_seq_flags::LlamaStateSeqFlags; - - let mut context = build_context(fixture)?; - - let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - - let flags = LlamaStateSeqFlags::empty(); - let size = context.state_seq_get_size_ext(0, &flags); - let mut buffer = vec![0u8; size]; - let bytes_written = unsafe { context.state_seq_get_data_ext(&mut buffer, 0, &flags) }; - - assert!(bytes_written > 0); - - let bytes_read = unsafe { context.state_seq_set_data_ext(&buffer, 0, &flags) }; - - assert!(bytes_read > 0); - - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 256, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 256, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 256, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source 
= HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 256, + n_batch = 128, + n_ubatch = 64, +)] +fn new_context_returns_valid_context(fixture: &LlamaFixture<'_>) -> Result<()> { + let context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + assert!(context.n_ctx() > 0); + Ok(()) } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 4294967295, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 4294967295, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 4294967295, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 4294967295, + n_batch = 128, + n_ubatch = 64, +)] +fn new_context_with_huge_ctx_returns_null_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let result = fixture.build_context(); + + assert!(result.is_err()); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn context_creation_and_properties(fixture: &LlamaFixture<'_>) -> Result<()> { + let context = LlamaContext::from_model( + fixture.model, + fixture.backend, + 
(*fixture.context_params).into_llama_context_params(), + )?; + + assert!(context.n_ctx() > 0); + assert!(context.n_batch() > 0); + assert!(context.n_ubatch() > 0); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn decode_and_get_logits(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + + let decode_result = context.decode(&mut batch); + assert!(decode_result.is_ok()); + + let logits = context.get_logits()?; + assert!(!logits.is_empty()); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn timings_work(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + context.reset_timings(); + let timings = context.timings(); + assert!(timings.t_start_ms() >= 0.0); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn token_data_array_has_entries_after_decode(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = fixture.model.str_to_token("hello", 
AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + + let token_data_array = context.token_data_array()?; + + assert!(!token_data_array.data.is_empty()); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn get_logits_ith_returns_valid_slice(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; + let last_index = i32::try_from(tokens.len() - 1)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + + let logits = context.get_logits_ith(last_index)?; + + assert_eq!(logits.len(), usize::try_from(fixture.model.n_vocab())?); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn token_data_array_ith_returns_valid_data(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; + let last_index = i32::try_from(tokens.len() - 1)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + + let token_data_array = context.token_data_array_ith(last_index)?; + + assert_eq!( + token_data_array.data.len(), + usize::try_from(fixture.model.n_vocab())? 
+ ); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn embeddings_ith_returns_error_when_embeddings_disabled(fixture: &LlamaFixture<'_>) -> Result<()> { + let context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let result = context.embeddings_ith(0); + + assert!(result.is_err()); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn embeddings_seq_ith_returns_error_when_embeddings_disabled( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let result = context.embeddings_seq_ith(0); + + assert!(result.is_err()); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn candidates_returns_n_vocab_entries(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + + let count = context.candidates()?.count(); + + assert_eq!(count, usize::try_from(fixture.model.n_vocab())?); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", 
"Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn debug_format_contains_struct_name(fixture: &LlamaFixture<'_>) -> Result<()> { + let context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let debug_output = format!("{context:?}"); + + assert!(debug_output.contains("LlamaContext")); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn candidates_ith_returns_n_vocab_entries(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; + let last_index = i32::try_from(tokens.len() - 1)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + + let count = context.candidates_ith(last_index)?.count(); + + assert_eq!(count, usize::try_from(fixture.model.n_vocab())?); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn lora_adapter_remove_succeeds_with_no_adapters(fixture: &LlamaFixture<'_>) -> Result<()> { + let context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let mut adapter = LlamaLoraAdapter { + lora_adapter: NonNull::dangling(), + }; + + let result = context.lora_adapter_remove(&mut adapter); + + assert!(result.is_ok()); + + Ok(()) +} + +#[llama_test( + model_source = 
HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn encode_on_non_encoder_model_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + + let result = context.encode(&mut batch); + + assert!(result.is_err()); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn lora_adapter_set_with_dangling_pointer_succeeds_or_errors( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let mut adapter = LlamaLoraAdapter { + lora_adapter: NonNull::dangling(), + }; + + let result = context.lora_adapter_set(&mut adapter, 1.0); + + assert!(result.is_ok()); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + embeddings = true, +)] +fn embeddings_seq_ith_returns_null_embedding_error_for_invalid_seq( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + 
context.decode(&mut batch)?; + + let result = context.embeddings_seq_ith(999); + + assert!(result.is_err()); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn decode_empty_batch_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let mut batch = LlamaBatch::new(512, 1)?; + + let result = context.decode(&mut batch); + + assert!(result.is_err()); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn set_abort_flag_aborts_decode(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let abort_flag = Arc::new(AtomicBool::new(true)); + context.set_abort_flag(abort_flag); + + let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + + let result = context.decode(&mut batch); + + assert_eq!(result, Err(DecodeError::Aborted)); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn set_abort_flag_false_allows_decode(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let abort_flag = 
Arc::new(AtomicBool::new(false)); + context.set_abort_flag(abort_flag); + + let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + + let result = context.decode(&mut batch); + + assert!(result.is_ok()); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn clear_abort_callback_allows_decode_with_flag_true(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let abort_flag = Arc::new(AtomicBool::new(true)); + context.set_abort_flag(abort_flag); + context.clear_abort_callback(); + + let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + + let result = context.decode(&mut batch); + + assert!(result.is_ok()); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn synchronize_completes_without_panic(fixture: &LlamaFixture<'_>) -> Result<()> { + let context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + context.synchronize(); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn detach_threadpool_completes_without_panic(fixture: &LlamaFixture<'_>) -> Result<()> { + let context = LlamaContext::from_model( + 
fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + context.detach_threadpool(); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn get_logits_ith_returns_token_not_initialized_for_unknown_index( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let result = context.get_logits_ith(7); + + assert!(matches!(result, Err(LogitsError::TokenNotInitialized(7)))); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 64, + n_batch = 2048, + n_ubatch = 512, +)] +fn get_logits_ith_returns_token_index_exceeds_context_for_huge_index( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let huge_index = i32::try_from(context.n_ctx())?; + context.mark_logits_initialized(huge_index); + let result = context.get_logits_ith(huge_index); + + assert!(matches!( + result, + Err(LogitsError::TokenIndexExceedsContext { .. 
}) + )); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn clear_kv_cache_resets_positions(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = fixture.build_context()?; + + prime_kv_cache(fixture, &mut context)?; + + context.clear_kv_cache(); + assert_eq!(context.kv_cache_seq_pos_max(0), -1); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", 
"Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn kv_cache_seq_pos_max_is_non_negative_after_decode(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = fixture.build_context()?; + + prime_kv_cache(fixture, &mut context)?; + + assert!(context.kv_cache_seq_pos_max(0) >= 0); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn clear_kv_cache_seq_with_range(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = fixture.build_context()?; + + prime_kv_cache(fixture, &mut context)?; + + let result = context.clear_kv_cache_seq(Some(0), Some(0), Some(1)); + assert!(result.is_ok()); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, 
+ use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn copy_kv_cache_seq_succeeds(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = fixture.build_context()?; + + prime_kv_cache(fixture, &mut context)?; + + let result = context.copy_kv_cache_seq(0, 1, None, None); + assert!(result.is_ok()); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn copy_cache_executes_without_crash(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = fixture.build_context()?; + + prime_kv_cache(fixture, &mut context)?; + + let pos_max = context.kv_cache_seq_pos_max(0); + context.copy_cache(0, 1, pos_max + 1); + + Ok(()) +} + +#[llama_test( 
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn kv_cache_seq_add_returns_error_for_mrope_model(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = fixture.build_context()?; + + prime_kv_cache(fixture, &mut context)?; + + let result = context.kv_cache_seq_add(0, Some(0), None, 1); + + assert!(matches!( + result.unwrap_err(), + KvCacheSeqAddError::IncompatibleRopeType, + )); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn kv_cache_seq_div_returns_error_for_mrope_model(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = fixture.build_context()?; + + prime_kv_cache(fixture, &mut context)?; + + let divisor = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?; + let result = context.kv_cache_seq_div(0, Some(0), None, divisor); + + assert!(matches!( + result.unwrap_err(), + KvCacheSeqDivError::IncompatibleRopeType, + )); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = 
HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn kv_cache_seq_keep_retains_specified_sequence(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = fixture.build_context()?; + + prime_kv_cache(fixture, &mut context)?; + + context.kv_cache_seq_keep(0); + + assert!(context.kv_cache_seq_pos_max(0) >= 0); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn copy_kv_cache_seq_with_explicit_range(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = fixture.build_context()?; + + prime_kv_cache(fixture, &mut 
context)?; + + let result = context.copy_kv_cache_seq(0, 2, Some(0), Some(1)); + + assert!(result.is_ok()); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn kv_cache_seq_pos_max_returns_negative_one_for_unused_seq( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let context = fixture.build_context()?; + + let result = context.kv_cache_seq_pos_max(999); + + assert_eq!(result, -1); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + 
model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn copy_kv_cache_seq_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = fixture.build_context()?; + + let result = context.copy_kv_cache_seq(0, 1, Some(u32::MAX), None); + + assert!(matches!( + result.unwrap_err(), + KvCacheConversionError::P0TooLarge(_), + )); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn copy_kv_cache_seq_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = fixture.build_context()?; + + let result = context.copy_kv_cache_seq(0, 1, Some(0), Some(u32::MAX)); + + assert!(matches!( + result.unwrap_err(), + KvCacheConversionError::P1TooLarge(_), + )); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + 
n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn clear_kv_cache_seq_rejects_src_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = fixture.build_context()?; + + let result = context.clear_kv_cache_seq(Some(u32::MAX), None, None); + + assert!(matches!( + result.unwrap_err(), + KvCacheConversionError::SeqIdTooLarge(_), + )); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn clear_kv_cache_seq_rejects_p0_exceeding_i32_max(fixture: 
&LlamaFixture<'_>) -> Result<()> { + let mut context = fixture.build_context()?; + + let result = context.clear_kv_cache_seq(Some(0), Some(u32::MAX), None); + + assert!(matches!( + result.unwrap_err(), + KvCacheConversionError::P0TooLarge(_), + )); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn clear_kv_cache_seq_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = fixture.build_context()?; + + let result = context.clear_kv_cache_seq(Some(0), Some(0), Some(u32::MAX)); + + assert!(matches!( + result.unwrap_err(), + KvCacheConversionError::P1TooLarge(_), + )); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + 
model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn kv_cache_seq_add_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = fixture.build_context()?; + + let result = context.kv_cache_seq_add(0, Some(u32::MAX), None, 1); + + assert!(matches!( + result.unwrap_err(), + KvCacheSeqAddError::P0TooLarge(_), + )); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn kv_cache_seq_add_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = fixture.build_context()?; + + let result = context.kv_cache_seq_add(0, Some(0), Some(u32::MAX), 1); + + assert!(matches!( + result.unwrap_err(), + KvCacheSeqAddError::P1TooLarge(_), + )); + + Ok(()) +} + +#[llama_test( + 
model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn kv_cache_seq_div_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = fixture.build_context()?; + + let divisor = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?; + let result = context.kv_cache_seq_div(0, Some(u32::MAX), None, divisor); + + assert!(matches!( + result.unwrap_err(), + KvCacheSeqDivError::P0TooLarge(_), + )); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] 
+#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn kv_cache_seq_div_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = fixture.build_context()?; + + let divisor = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?; + let result = context.kv_cache_seq_div(0, Some(0), Some(u32::MAX), divisor); + + assert!(matches!( + result.unwrap_err(), + KvCacheSeqDivError::P1TooLarge(_), + )); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn save_and_load_session_file(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = fixture.build_context()?; + + let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + + let session_path = std::env::temp_dir().join("llama_test_session.bin"); + context.state_save_file(&session_path, &tokens)?; + + 
let loaded_tokens = context.state_load_file(&session_path, 512)?; + assert_eq!(loaded_tokens, tokens); + + std::fs::remove_file(&session_path)?; + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn get_state_size_is_positive(fixture: &LlamaFixture<'_>) -> Result<()> { + let context = fixture.build_context()?; + + assert!(context.get_state_size() > 0); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = 
HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn state_seq_save_and_load_file_roundtrip(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = fixture.build_context()?; + + let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + + let session_path = std::env::temp_dir().join("llama_test_seq_state.bin"); + let bytes_written = context.state_seq_save_file(&session_path, 0, &tokens)?; + assert!(bytes_written > 0); + + let (loaded_tokens, bytes_read) = context.state_seq_load_file(&session_path, 0, 512)?; + assert_eq!(loaded_tokens, tokens); + assert!(bytes_read > 0); + + std::fs::remove_file(&session_path)?; + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn copy_state_data_and_set_state_data_roundtrip(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = fixture.build_context()?; + + let tokens = 
fixture.model.str_to_token("Hello world", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + + let state_size = context.get_state_size(); + let mut state_data = vec![0u8; state_size]; + let bytes_copied = unsafe { context.copy_state_data(&mut state_data) }; + assert!(bytes_copied > 0); + + let bytes_read = unsafe { context.set_state_data(&state_data) }; + assert!(bytes_read > 0); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn state_load_file_with_nonexistent_file_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = fixture.build_context()?; + + let result = context.state_load_file("/nonexistent/session.bin", 512); + + assert!(result.is_err()); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", 
"GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn state_seq_load_file_with_nonexistent_file_returns_error( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let mut context = fixture.build_context()?; + + let result = context.state_seq_load_file("/nonexistent/seq_state.bin", 0, 512); + + assert!(result.is_err()); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn state_save_file_to_invalid_directory_returns_failed_to_save( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let context = fixture.build_context()?; + + let result = 
context.state_save_file("/nonexistent_dir/session.bin", &[]); + + assert!(result.is_err()); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn state_seq_save_file_to_invalid_directory_returns_failed_to_save( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let context = fixture.build_context()?; + + let result = context.state_seq_save_file("/nonexistent_dir/seq_state.bin", 0, &[]); + + assert!(result.is_err()); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 
128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn state_load_file_with_zero_max_tokens_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = fixture.build_context()?; + + let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + + let session_path = std::env::temp_dir().join("llama_test_session_zero_max.bin"); + context.state_save_file(&session_path, &tokens)?; + + let result = context.state_load_file(&session_path, 0); + + assert!(result.is_err()); + let _ = std::fs::remove_file(&session_path); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn state_seq_load_file_with_zero_max_tokens_returns_error( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let mut context = fixture.build_context()?; + + let tokens = fixture.model.str_to_token("Hello world", 
AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + + let session_path = std::env::temp_dir().join("llama_test_seq_state_zero_max.bin"); + context.state_seq_save_file(&session_path, 0, &tokens)?; + + let result = context.state_seq_load_file(&session_path, 0, 0); + + assert!(result.is_err()); + let _ = std::fs::remove_file(&session_path); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn state_load_file_with_insufficient_max_tokens_returns_length_error( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let mut context = fixture.build_context()?; + + let tokens = fixture.model.str_to_token( + "Hello world this is a longer string for more tokens", + AddBos::Always, + )?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + + let session_path = std::env::temp_dir().join("llama_test_session_insuf.bin"); + context.state_save_file(&session_path, &tokens)?; + + let result = context.state_load_file(&session_path, 1); + + assert!(result.is_err()); + let _ = 
std::fs::remove_file(&session_path); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn state_seq_load_file_with_insufficient_max_tokens_returns_length_error( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let mut context = fixture.build_context()?; + + let tokens = fixture.model.str_to_token( + "Hello world this is a longer string for more tokens", + AddBos::Always, + )?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + + let session_path = std::env::temp_dir().join("llama_test_seq_state_insuf.bin"); + context.state_seq_save_file(&session_path, 0, &tokens)?; + + let result = context.state_seq_load_file(&session_path, 0, 1); + + assert!(result.is_err()); + let _ = std::fs::remove_file(&session_path); + + Ok(()) +} + +#[cfg(unix)] +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = 
HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn state_save_file_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + use std::ffi::OsStr; + use std::os::unix::ffi::OsStrExt; + + let context = fixture.build_context()?; + + let non_utf8_path = std::path::Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.bin")); + let result = context.state_save_file(non_utf8_path, &[]); + + assert!(result.is_err()); + + Ok(()) +} + +#[cfg(unix)] +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn 
state_load_file_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + use std::ffi::OsStr; + use std::os::unix::ffi::OsStrExt; + + let mut context = fixture.build_context()?; + + let non_utf8_path = std::path::Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.bin")); + let result = context.state_load_file(non_utf8_path, 512); + + assert!(result.is_err()); + + Ok(()) +} + +#[cfg(unix)] +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn state_seq_save_file_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + use std::ffi::OsStr; + use std::os::unix::ffi::OsStrExt; + + let context = fixture.build_context()?; + + let non_utf8_path = std::path::Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.bin")); + let result = context.state_seq_save_file(non_utf8_path, 0, &[]); + + assert!(result.is_err()); + + Ok(()) +} + +#[cfg(unix)] +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] 
+#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn state_seq_load_file_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + use std::ffi::OsStr; + use std::os::unix::ffi::OsStrExt; + + let mut context = fixture.build_context()?; + + let non_utf8_path = std::path::Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.bin")); + let result = context.state_seq_load_file(non_utf8_path, 0, 512); + + assert!(result.is_err()); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn 
state_save_file_with_null_byte_in_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let context = fixture.build_context()?; + + let path_with_null = std::path::Path::new("/tmp/foo\0bar.bin"); + let result = context.state_save_file(path_with_null, &[]); + + assert!(result.is_err()); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn state_load_file_with_null_byte_in_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = fixture.build_context()?; + + let path_with_null = std::path::Path::new("/tmp/foo\0bar.bin"); + let result = context.state_load_file(path_with_null, 512); + + assert!(result.is_err()); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, 
+ n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn state_seq_save_file_with_null_byte_in_path_returns_error( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let context = fixture.build_context()?; + + let path_with_null = std::path::Path::new("/tmp/foo\0bar.bin"); + let result = context.state_seq_save_file(path_with_null, 0, &[]); + + assert!(result.is_err()); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn state_seq_load_file_with_null_byte_in_path_returns_error( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let mut context = fixture.build_context()?; + + let path_with_null = std::path::Path::new("/tmp/foo\0bar.bin"); + let result = 
context.state_seq_load_file(path_with_null, 0, 512); + + assert!(result.is_err()); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn state_seq_get_size_ext_returns_size_for_decoded_sequence( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + use llama_cpp_bindings::context::llama_state_seq_flags::LlamaStateSeqFlags; + + let mut context = fixture.build_context()?; + + let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + + let flags = LlamaStateSeqFlags::empty(); + let size = context.state_seq_get_size_ext(0, &flags); + + assert!(size > 0); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = 
false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn state_seq_get_data_ext_and_set_data_ext_round_trip(fixture: &LlamaFixture<'_>) -> Result<()> { + use llama_cpp_bindings::context::llama_state_seq_flags::LlamaStateSeqFlags; + + let mut context = fixture.build_context()?; + + let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + + let flags = LlamaStateSeqFlags::empty(); + let size = context.state_seq_get_size_ext(0, &flags); + let mut buffer = vec![0u8; size]; + let bytes_written = unsafe { context.state_seq_get_data_ext(&mut buffer, 0, &flags) }; + + assert!(bytes_written > 0); + + let bytes_read = unsafe { context.state_seq_set_data_ext(&buffer, 0, &flags) }; + + assert!(bytes_read > 0); + + Ok(()) +} llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/multimodal_vision.rs b/llama-cpp-bindings-tests/tests/multimodal_vision.rs index 7e596be6..a137bb99 100644 --- a/llama-cpp-bindings-tests/tests/multimodal_vision.rs +++ b/llama-cpp-bindings-tests/tests/multimodal_vision.rs @@ -1,1099 +1,1005 @@ +#![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" +)] + +use anyhow::Context; +use anyhow::Result; +use llama_cpp_bindings::SampledToken; +use llama_cpp_bindings::SampledTokenClassifier; +use llama_cpp_bindings::TokenUsage; +use llama_cpp_bindings::context::LlamaContext; +use 
llama_cpp_bindings::ingest_prompt_chunk::ingest_prompt_chunk; +use llama_cpp_bindings::llama_batch::LlamaBatch; +use llama_cpp_bindings::model::LlamaChatMessage; +use llama_cpp_bindings::model::LlamaModel; +use llama_cpp_bindings::mtmd::MtmdBitmap; +use llama_cpp_bindings::mtmd::MtmdContext; +use llama_cpp_bindings::mtmd::MtmdContextParams; +use llama_cpp_bindings::mtmd::MtmdEvalError; +use llama_cpp_bindings::mtmd::MtmdInputChunkType; +use llama_cpp_bindings::mtmd::MtmdInputChunks; +use llama_cpp_bindings::mtmd::MtmdInputText; +use llama_cpp_bindings::mtmd::mtmd_default_marker; +use llama_cpp_bindings::sampling::LlamaSampler; +use llama_cpp_bindings_sys::llama_pos; +use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::fixtures_dir::fixtures_dir; +use llama_cpp_test_harness::llama_test; use llama_cpp_test_harness::llama_tests_main; -mod mtmd_bitmap { - #![expect( - clippy::unnecessary_wraps, - reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" - )] - - use anyhow::Result; - use llama_cpp_bindings::mtmd::MtmdBitmap; - use llama_cpp_bindings_tests::test_model; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), - )] - fn from_buffer_creates_bitmap_from_image_bytes(fixture: &LlamaFixture<'_>) -> 
Result<()> { - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - - let fixtures = test_model::fixtures_dir(); - let image_path = fixtures.join("llamas.jpg"); - let image_bytes = std::fs::read(&image_path)?; - let bitmap = MtmdBitmap::from_buffer(mtmd_ctx, &image_bytes)?; - - assert!(bitmap.nx() > 0); - assert!(bitmap.ny() > 0); - assert!(!bitmap.is_audio()); - - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn from_buffer_creates_bitmap_from_image_bytes(fixture: &LlamaFixture<'_>) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + + let fixtures = fixtures_dir(); + let image_path = fixtures.join("llamas.jpg"); + let image_bytes = std::fs::read(&image_path)?; + let bitmap = MtmdBitmap::from_buffer(mtmd_ctx, &image_bytes)?; + + assert!(bitmap.nx() > 0); + assert!(bitmap.ny() > 0); + assert!(!bitmap.is_audio()); + + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 
128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), - )] - fn from_file_with_null_byte_in_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - let result = MtmdBitmap::from_file(mtmd_ctx, "path\0null"); - - assert!(result.is_err()); - - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn from_file_with_null_byte_in_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + let result = MtmdBitmap::from_file(mtmd_ctx, "path\0null"); + + assert!(result.is_err()); + + Ok(()) } -mod mtmd_chunk_operations { - use anyhow::Result; - use llama_cpp_bindings::mtmd::MtmdBitmap; - use llama_cpp_bindings::mtmd::MtmdInputChunkType; - use llama_cpp_bindings::mtmd::MtmdInputText; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - 
n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), - )] - fn copy_creates_owned_duplicate(fixture: &LlamaFixture<'_>) -> Result<()> { - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - let image_data = vec![128u8; 64 * 64 * 3]; - let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; - let input_text = MtmdInputText { - text: "Hello <__media__>".to_string(), - add_special: true, - parse_special: true, - }; - let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; - let first_chunk = chunks - .get(0) - .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?; - let copied = first_chunk.copy()?; - - assert!(copied.owned); - assert_eq!(copied.n_tokens(), first_chunk.n_tokens()); - - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn copy_creates_owned_duplicate(fixture: &LlamaFixture<'_>) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + let image_data = vec![128u8; 64 * 64 * 3]; + let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; + let input_text = MtmdInputText { + text: "Hello <__media__>".to_string(), + add_special: true, + parse_special: true, + }; + let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; + let first_chunk = chunks + .get(0) + 
.ok_or_else(|| anyhow::anyhow!("missing first chunk"))?; + let copied = first_chunk.copy()?; - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), - )] - fn encode_chunk_succeeds_for_image_chunk(fixture: &LlamaFixture<'_>) -> Result<()> { - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - let image_data = vec![128u8; 64 * 64 * 3]; - let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; - let input_text = MtmdInputText { - text: "Describe: <__media__>".to_string(), - add_special: true, - parse_special: true, - }; - let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; - - for chunk_index in 0..chunks.len() { - let chunk = chunks - .get(chunk_index) - .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?; - if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) { - let result = mtmd_ctx.encode_chunk(&chunk); - assert!(result.is_ok()); - return Ok(()); - } + assert!(copied.owned); + assert_eq!(copied.n_tokens(), first_chunk.n_tokens()); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + 
n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn encode_chunk_succeeds_for_image_chunk(fixture: &LlamaFixture<'_>) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + let image_data = vec![128u8; 64 * 64 * 3]; + let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; + let input_text = MtmdInputText { + text: "Describe: <__media__>".to_string(), + add_special: true, + parse_special: true, + }; + let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; + + for chunk_index in 0..chunks.len() { + let chunk = chunks + .get(chunk_index) + .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?; + if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) { + let result = mtmd_ctx.encode_chunk(&chunk); + assert!(result.is_ok()); + return Ok(()); } - Ok(()) } + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), - )] - fn decode_use_non_causal_returns_bool_for_image_chunk( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - let image_data = vec![128u8; 64 * 64 * 3]; - let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; - let input_text = MtmdInputText { - text: "Describe: <__media__>".to_string(), - 
add_special: true, - parse_special: true, - }; - let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; - for chunk_index in 0..chunks.len() { - let chunk = chunks - .get(chunk_index) - .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?; - if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) { - let value = mtmd_ctx.decode_use_non_causal(&chunk); - let printed = format!("{value:?}"); - assert!( - !printed.is_empty(), - "decode_use_non_causal must return a Debug-printable bool" - ); - return Ok(()); - } +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn decode_use_non_causal_returns_bool_for_image_chunk(fixture: &LlamaFixture<'_>) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + let image_data = vec![128u8; 64 * 64 * 3]; + let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; + let input_text = MtmdInputText { + text: "Describe: <__media__>".to_string(), + add_special: true, + parse_special: true, + }; + let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; + for chunk_index in 0..chunks.len() { + let chunk = chunks + .get(chunk_index) + .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?; + if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) { + let value = mtmd_ctx.decode_use_non_causal(&chunk); + let printed = format!("{value:?}"); + assert!( + !printed.is_empty(), + "decode_use_non_causal must 
return a Debug-printable bool" + ); + return Ok(()); } - anyhow::bail!("tokenization should produce at least one Image chunk"); } + anyhow::bail!("tokenization should produce at least one Image chunk"); } -mod mtmd_chunk_structure { - use anyhow::Result; - use llama_cpp_bindings::mtmd::MtmdBitmap; - use llama_cpp_bindings::mtmd::MtmdInputChunkType; - use llama_cpp_bindings::mtmd::MtmdInputText; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - - fn tokenize_synthetic( - fixture: &LlamaFixture<'_>, - prompt: &str, - ) -> Result { - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - let image_data = vec![128u8; 64 * 64 * 3]; - let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; - let input_text = MtmdInputText { - text: prompt.to_owned(), - add_special: true, - parse_special: true, - }; - Ok(mtmd_ctx.tokenize(input_text, &[&bitmap])?) - } +fn tokenize_synthetic(fixture: &LlamaFixture<'_>, prompt: &str) -> Result { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + let image_data = vec![128u8; 64 * 64 * 3]; + let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; + let input_text = MtmdInputText { + text: prompt.to_owned(), + add_special: true, + parse_special: true, + }; + Ok(mtmd_ctx.tokenize(input_text, &[&bitmap])?) 
+} - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), - )] - fn text_chunk_has_text_type(fixture: &LlamaFixture<'_>) -> Result<()> { - let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?; - let first_chunk = chunks - .get(0) - .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?; - assert_eq!(first_chunk.chunk_type()?, MtmdInputChunkType::Text); - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn text_chunk_has_text_type(fixture: &LlamaFixture<'_>) -> Result<()> { + let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?; + let first_chunk = chunks + .get(0) + .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?; + assert_eq!(first_chunk.chunk_type()?, MtmdInputChunkType::Text); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = 
true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), - )] - fn text_chunk_returns_text_tokens(fixture: &LlamaFixture<'_>) -> Result<()> { - let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?; - let first_chunk = chunks - .get(0) - .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?; - let tokens = first_chunk.text_tokens(); - assert!(tokens.is_some()); - assert!(!tokens.expect("tokens should be some").is_empty()); - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn text_chunk_returns_text_tokens(fixture: &LlamaFixture<'_>) -> Result<()> { + let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?; + let first_chunk = chunks + .get(0) + .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?; + let tokens = first_chunk.text_tokens(); + assert!(tokens.is_some()); + assert!(!tokens.expect("tokens should be some").is_empty()); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = 
true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), - )] - fn chunk_n_tokens_is_positive(fixture: &LlamaFixture<'_>) -> Result<()> { - let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?; - let first_chunk = chunks - .get(0) - .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?; - assert!(first_chunk.n_tokens() > 0); - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn chunk_n_tokens_is_positive(fixture: &LlamaFixture<'_>) -> Result<()> { + let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?; + let first_chunk = chunks + .get(0) + .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?; + assert!(first_chunk.n_tokens() > 0); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), - )] - #[llama_test( - model_source = 
HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), - )] - fn chunk_n_positions_is_positive(fixture: &LlamaFixture<'_>) -> Result<()> { - let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?; - let first_chunk = chunks - .get(0) - .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?; - assert!(first_chunk.n_positions() > 0); - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn chunk_n_positions_is_positive(fixture: &LlamaFixture<'_>) -> Result<()> { + let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?; + let first_chunk = chunks + .get(0) + .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?; + assert!(first_chunk.n_positions() > 0); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 
64, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), - )] - fn text_chunk_id_returns_none(fixture: &LlamaFixture<'_>) -> Result<()> { - let chunks = tokenize_synthetic(fixture, "Hello <__media__>")?; - let first_chunk = chunks - .get(0) - .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?; - assert_eq!(first_chunk.chunk_type()?, MtmdInputChunkType::Text); - assert!(first_chunk.id().is_none()); - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn text_chunk_id_returns_none(fixture: &LlamaFixture<'_>) -> Result<()> { + let chunks = tokenize_synthetic(fixture, "Hello <__media__>")?; + let first_chunk = chunks + .get(0) + .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?; + assert_eq!(first_chunk.chunk_type()?, MtmdInputChunkType::Text); + assert!(first_chunk.id().is_none()); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", 
"mmproj-F16.gguf"), - )] - fn image_chunk_returns_none_for_text_tokens(fixture: &LlamaFixture<'_>) -> Result<()> { - let chunks = tokenize_synthetic(fixture, "Hello <__media__>")?; - for chunk_index in 0..chunks.len() { - let chunk = chunks - .get(chunk_index) - .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?; - if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) { - assert!(chunk.text_tokens().is_none()); - return Ok(()); - } +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn image_chunk_returns_none_for_text_tokens(fixture: &LlamaFixture<'_>) -> Result<()> { + let chunks = tokenize_synthetic(fixture, "Hello <__media__>")?; + for chunk_index in 0..chunks.len() { + let chunk = chunks + .get(chunk_index) + .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?; + if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) { + assert!(chunk.text_tokens().is_none()); + return Ok(()); } - Ok(()) } + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 
512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), - )] - fn image_chunk_id_returns_some(fixture: &LlamaFixture<'_>) -> Result<()> { - let chunks = tokenize_synthetic(fixture, "Hello <__media__>")?; - for chunk_index in 0..chunks.len() { - let chunk = chunks - .get(chunk_index) - .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?; - if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) { - assert!(chunk.id().is_some()); - return Ok(()); - } +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn image_chunk_id_returns_some(fixture: &LlamaFixture<'_>) -> Result<()> { + let chunks = tokenize_synthetic(fixture, "Hello <__media__>")?; + for chunk_index in 0..chunks.len() { + let chunk = chunks + .get(chunk_index) + .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?; + if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) { + assert!(chunk.id().is_some()); + return Ok(()); } - Ok(()) } + Ok(()) } -mod mtmd_context { - #![expect( - clippy::unnecessary_wraps, - reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" - )] - - use anyhow::Result; - use llama_cpp_bindings::mtmd::MtmdContext; - use llama_cpp_bindings::mtmd::MtmdContextParams; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - - #[llama_test( - model_source = 
HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), - )] - fn init_and_supports_vision(fixture: &LlamaFixture<'_>) -> Result<()> { - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - assert!(mtmd_ctx.support_vision()); - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), - )] - fn init_from_file_with_null_byte_in_path_returns_error( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let mtmd_params = MtmdContextParams::default(); - let result = MtmdContext::init_from_file("path\0null", fixture.model, &mtmd_params); - - assert!(result.is_err()); - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), - )] - 
#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), - )] - fn decode_use_mrope_is_true_for_qwen_vision(fixture: &LlamaFixture<'_>) -> Result<()> { - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - assert!( - mtmd_ctx.decode_use_mrope(), - "Qwen 3.5 / 3.6 mmproj uses mrope; decode_use_mrope must return true" - ); - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), - )] - fn support_audio_is_false_for_vision_only_mmproj(fixture: &LlamaFixture<'_>) -> Result<()> { - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - assert!( - !mtmd_ctx.support_audio(), - "Qwen 3.5 / 3.6 mmproj is vision-only; support_audio must return false" - ); - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + 
use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn init_and_supports_vision(fixture: &LlamaFixture<'_>) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + assert!(mtmd_ctx.support_vision()); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), - )] - fn get_audio_sample_rate_is_none_for_vision_only_mmproj( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - assert!( - mtmd_ctx.get_audio_sample_rate().is_none(), - "Qwen 3.5 / 3.6 mmproj has no audio; get_audio_sample_rate must return None" - ); - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn init_from_file_with_null_byte_in_path_returns_error(fixture: 
&LlamaFixture<'_>) -> Result<()> { + let mtmd_params = MtmdContextParams::default(); + let result = MtmdContext::init_from_file("path\0null", fixture.model, &mtmd_params); + + assert!(result.is_err()); + Ok(()) } -mod mtmd_evaluation { - #![expect( - clippy::unnecessary_wraps, - reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" - )] - - use anyhow::Result; - use llama_cpp_bindings::context::LlamaContext; - use llama_cpp_bindings::mtmd::MtmdBitmap; - use llama_cpp_bindings::mtmd::MtmdEvalError; - use llama_cpp_bindings::mtmd::MtmdInputChunks; - use llama_cpp_bindings::mtmd::MtmdInputText; - use llama_cpp_bindings_tests::test_model; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - - fn eval_synthetic_bitmap(fixture: &LlamaFixture<'_>, width: u32, height: u32) -> Result<()> { - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - let image_data = vec![128u8; (width as usize) * (height as usize) * 3]; - let bitmap = MtmdBitmap::from_image_data(width, height, &image_data)?; - let input_text = MtmdInputText { - text: "Describe: <__media__>".to_string(), - add_special: true, - parse_special: true, - }; - let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; - let n_positions = chunks.total_positions(); - let required_n_ctx = u32::try_from(n_positions + 256)?; - if fixture.context_params.n_ctx < required_n_ctx { - anyhow::bail!( - "fixture n_ctx ({}) below required ({}) for {}x{} image", - fixture.context_params.n_ctx, - required_n_ctx, - width, - height, - ); - } +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", 
"Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn decode_use_mrope_is_true_for_qwen_vision(fixture: &LlamaFixture<'_>) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + assert!( + mtmd_ctx.decode_use_mrope(), + "Qwen 3.5 / 3.6 mmproj uses mrope; decode_use_mrope must return true" + ); + Ok(()) +} - let llama_ctx = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let n_batch = i32::try_from(llama_ctx.n_batch())?; - chunks.eval_chunks(mtmd_ctx, &llama_ctx, 0, 0, n_batch, false)?; - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn support_audio_is_false_for_vision_only_mmproj(fixture: &LlamaFixture<'_>) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + assert!( + !mtmd_ctx.support_audio(), + "Qwen 3.5 / 3.6 mmproj is vision-only; support_audio must return false" + ); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 64, - n_batch = 64, - n_ubatch = 32, - mmproj_source = 
HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 64, - n_batch = 64, - n_ubatch = 32, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), - )] - fn eval_chunks_returns_batch_size_exceeds_context_limit_for_huge_batch( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - let llama_ctx = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let chunks = MtmdInputChunks::new()?; - let huge_batch = i32::try_from(llama_ctx.n_batch() + 1)?; - - let result = chunks.eval_chunks(mtmd_ctx, &llama_ctx, 0, 0, huge_batch, false); - - assert!(matches!( - result, - Err(MtmdEvalError::BatchSizeExceedsContextLimit { .. 
}) - )); - - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn get_audio_sample_rate_is_none_for_vision_only_mmproj(fixture: &LlamaFixture<'_>) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + assert!( + mtmd_ctx.get_audio_sample_rate().is_none(), + "Qwen 3.5 / 3.6 mmproj has no audio; get_audio_sample_rate must return None" + ); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 8192, - n_batch = 512, - n_ubatch = 512, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 8192, - n_batch = 512, - n_ubatch = 512, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), - )] - fn eval_chunks_with_standard_image(fixture: &LlamaFixture<'_>) -> Result<()> { - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - - let fixtures = test_model::fixtures_dir(); - let image_path = fixtures.join("llamas.jpg"); - let image_path_str = image_path - .to_str() - .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?; - let bitmap = 
MtmdBitmap::from_file(mtmd_ctx, image_path_str)?; - let input_text = MtmdInputText { - text: "What is in this image? <__media__>".to_string(), - add_special: true, - parse_special: true, - }; - let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; - let n_positions = chunks.total_positions(); - let required_n_ctx = u32::try_from(n_positions + 256)?; - assert!( - fixture.context_params.n_ctx >= required_n_ctx, - "fixture n_ctx ({}) below required ({}); update the attribute literal", +fn eval_synthetic_bitmap(fixture: &LlamaFixture<'_>, width: u32, height: u32) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + let image_data = vec![128u8; (width as usize) * (height as usize) * 3]; + let bitmap = MtmdBitmap::from_image_data(width, height, &image_data)?; + let input_text = MtmdInputText { + text: "Describe: <__media__>".to_string(), + add_special: true, + parse_special: true, + }; + let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; + let n_positions = chunks.total_positions(); + let required_n_ctx = u32::try_from(n_positions + 256)?; + if fixture.context_params.n_ctx < required_n_ctx { + anyhow::bail!( + "fixture n_ctx ({}) below required ({}) for {}x{} image", fixture.context_params.n_ctx, required_n_ctx, + width, + height, ); - - let llama_ctx = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let n_batch = i32::try_from(llama_ctx.n_batch())?; - let result = chunks.eval_chunks(mtmd_ctx, &llama_ctx, 0, 0, n_batch, false); - - assert!(result.is_ok()); - - Ok(()) } - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 8192, - n_batch = 512, - n_ubatch = 512, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), - )] - #[llama_test( - model_source = 
HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 8192, - n_batch = 512, - n_ubatch = 512, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), - )] - fn eval_chunks_with_varied_dimensions(fixture: &LlamaFixture<'_>) -> Result<()> { - let test_dimensions: [(u32, u32); 4] = [(224, 224), (512, 512), (100, 500), (337, 421)]; - - for (width, height) in test_dimensions { - let result = eval_synthetic_bitmap(fixture, width, height); - assert!( - result.is_ok(), - "dimension {width}x{height} should succeed: {result:?}" - ); - } + let llama_ctx = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let n_batch = i32::try_from(llama_ctx.n_batch())?; + chunks.eval_chunks(mtmd_ctx, &llama_ctx, 0, 0, n_batch, false)?; + Ok(()) +} - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 64, + n_batch = 64, + n_ubatch = 32, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 64, + n_batch = 64, + n_ubatch = 32, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn eval_chunks_returns_batch_size_exceeds_context_limit_for_huge_batch( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + let llama_ctx = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let chunks = MtmdInputChunks::new()?; + let huge_batch = i32::try_from(llama_ctx.n_batch() + 
1)?; + + let result = chunks.eval_chunks(mtmd_ctx, &llama_ctx, 0, 0, huge_batch, false); + + assert!(matches!( + result, + Err(MtmdEvalError::BatchSizeExceedsContextLimit { .. }) + )); + + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 8192, - n_batch = 512, - n_ubatch = 512, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 8192, - n_batch = 512, - n_ubatch = 512, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), - )] - fn eval_chunks_with_extreme_dimensions_does_not_crash( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let extreme_dimensions: [(u32, u32); 6] = [ - (1, 1), - (7, 13), - (3, 1000), - (1000, 3), - (1920, 1080), - (4096, 4096), - ]; - - let mut any_reached_eval = false; - - for (width, height) in extreme_dimensions { - match eval_synthetic_bitmap(fixture, width, height) { - Ok(()) => any_reached_eval = true, - Err(error) => eprintln!(" {width}x{height} failed: {error}"), - } - } +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn eval_chunks_with_standard_image(fixture: &LlamaFixture<'_>) -> Result<()> { + let 
mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + + let fixtures = fixtures_dir(); + let image_path = fixtures.join("llamas.jpg"); + let image_path_str = image_path + .to_str() + .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?; + let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?; + let input_text = MtmdInputText { + text: "What is in this image? <__media__>".to_string(), + add_special: true, + parse_special: true, + }; + let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; + let n_positions = chunks.total_positions(); + let required_n_ctx = u32::try_from(n_positions + 256)?; + assert!( + fixture.context_params.n_ctx >= required_n_ctx, + "fixture n_ctx ({}) below required ({}); update the attribute literal", + fixture.context_params.n_ctx, + required_n_ctx, + ); + + let llama_ctx = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let n_batch = i32::try_from(llama_ctx.n_batch())?; + let result = chunks.eval_chunks(mtmd_ctx, &llama_ctx, 0, 0, n_batch, false); + + assert!(result.is_ok()); + + Ok(()) +} +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn eval_chunks_with_varied_dimensions(fixture: &LlamaFixture<'_>) -> Result<()> { + let test_dimensions: [(u32, u32); 4] = [(224, 224), (512, 512), (100, 500), (337, 421)]; + + for (width, height) in test_dimensions { + let 
result = eval_synthetic_bitmap(fixture, width, height); assert!( - any_reached_eval, - "at least one extreme dimension should reach eval_chunks" + result.is_ok(), + "dimension {width}x{height} should succeed: {result:?}" ); - - Ok(()) } + + Ok(()) } -mod mtmd_tokenization { - #![expect( - clippy::unnecessary_wraps, - reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" - )] - - use anyhow::Result; - use llama_cpp_bindings::mtmd::MtmdBitmap; - use llama_cpp_bindings::mtmd::MtmdInputText; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), - )] - fn tokenize_text_with_image(fixture: &LlamaFixture<'_>) -> Result<()> { - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - let image_data = vec![128u8; 64 * 64 * 3]; - let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; - let input_text = MtmdInputText { - text: "Describe this image: <__media__>".to_string(), - add_special: true, - parse_special: true, - }; - let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; - - assert!(!chunks.is_empty()); - assert!(chunks.total_tokens() > 0); - Ok(()) +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 512, + n_ubatch = 
512, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn eval_chunks_with_extreme_dimensions_does_not_crash(fixture: &LlamaFixture<'_>) -> Result<()> { + let extreme_dimensions: [(u32, u32); 6] = [ + (1, 1), + (7, 13), + (3, 1000), + (1000, 3), + (1920, 1080), + (4096, 4096), + ]; + + let mut any_reached_eval = false; + + for (width, height) in extreme_dimensions { + match eval_synthetic_bitmap(fixture, width, height) { + Ok(()) => any_reached_eval = true, + Err(error) => eprintln!(" {width}x{height} failed: {error}"), + } } - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), - )] - fn tokenize_bitmap_count_mismatch_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - let input_text = MtmdInputText { - text: "No media markers here".to_string(), - add_special: true, - parse_special: true, - }; - let image_data = vec![128u8; 64 * 64 * 3]; - let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; - let result = mtmd_ctx.tokenize(input_text, &[&bitmap]); - assert!(result.is_err()); - 
Ok(()) - } + assert!( + any_reached_eval, + "at least one extreme dimension should reach eval_chunks" + ); - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), - )] - fn tokenize_with_null_byte_in_text_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - let input_text = MtmdInputText { - text: "text\0null".to_string(), - add_special: true, - parse_special: true, - }; - let result = mtmd_ctx.tokenize(input_text, &[]); - assert!(result.is_err()); - Ok(()) - } + Ok(()) } -mod multimodal { - use anyhow::{Context, Result}; - use llama_cpp_bindings::SampledTokenClassifier; - use llama_cpp_bindings::context::LlamaContext; - use llama_cpp_bindings::llama_batch::LlamaBatch; - use llama_cpp_bindings::model::{LlamaChatMessage, LlamaModel}; - use llama_cpp_bindings::mtmd::{ - MtmdBitmap, MtmdInputChunkType, MtmdInputChunks, MtmdInputText, +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 
64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn tokenize_text_with_image(fixture: &LlamaFixture<'_>) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + let image_data = vec![128u8; 64 * 64 * 3]; + let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; + let input_text = MtmdInputText { + text: "Describe this image: <__media__>".to_string(), + add_special: true, + parse_special: true, }; - use llama_cpp_bindings::sampled_token::SampledToken; - use llama_cpp_bindings::sampling::LlamaSampler; - use llama_cpp_bindings_sys::llama_pos; - use llama_cpp_bindings_tests::test_model; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - - struct ChunkTokenBreakdown { - text: u64, - image: u64, - audio: u64, - } + let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; - fn count_chunk_tokens_by_type(chunks: &MtmdInputChunks) -> Result { - let mut breakdown = ChunkTokenBreakdown { - text: 0, - image: 0, - audio: 0, - }; - for index in 0..chunks.len() { - let chunk = chunks - .get(index) - .with_context(|| format!("chunk index {index} is missing"))?; - let n_tokens = u64::try_from(chunk.n_tokens())?; - match chunk.chunk_type()? 
{ - MtmdInputChunkType::Text => breakdown.text += n_tokens, - MtmdInputChunkType::Image => breakdown.image += n_tokens, - MtmdInputChunkType::Audio => breakdown.audio += n_tokens, - } - } + assert!(!chunks.is_empty()); + assert!(chunks.total_tokens() > 0); + Ok(()) +} - Ok(breakdown) - } +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn tokenize_bitmap_count_mismatch_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + let input_text = MtmdInputText { + text: "No media markers here".to_string(), + add_special: true, + parse_special: true, + }; + let image_data = vec![128u8; 64 * 64 * 3]; + let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; + let result = mtmd_ctx.tokenize(input_text, &[&bitmap]); + assert!(result.is_err()); + Ok(()) +} - fn build_user_prompt_with_image_marker(model: &LlamaModel, question: &str) -> Result { - let marker = llama_cpp_bindings::mtmd::mtmd_default_marker(); - let user_content = format!("{marker}{question}"); - let chat_template = model.chat_template(None)?; - let messages = [LlamaChatMessage::new("user".to_string(), user_content)?]; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = 
HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn tokenize_with_null_byte_in_text_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + let input_text = MtmdInputText { + text: "text\0null".to_string(), + add_special: true, + parse_special: true, + }; + let result = mtmd_ctx.tokenize(input_text, &[]); + assert!(result.is_err()); + Ok(()) +} +struct ChunkTokenBreakdown { + text: u64, + image: u64, + audio: u64, +} - Ok(model.apply_chat_template(&chat_template, &messages, true)?) +fn count_chunk_tokens_by_type(chunks: &MtmdInputChunks) -> Result { + let mut breakdown = ChunkTokenBreakdown { + text: 0, + image: 0, + audio: 0, + }; + for index in 0..chunks.len() { + let chunk = chunks + .get(index) + .with_context(|| format!("chunk index {index} is missing"))?; + let n_tokens = u64::try_from(chunk.n_tokens())?; + match chunk.chunk_type()? 
{ + MtmdInputChunkType::Text => breakdown.text += n_tokens, + MtmdInputChunkType::Image => breakdown.image += n_tokens, + MtmdInputChunkType::Audio => breakdown.audio += n_tokens, + } } - struct SamplingTotals { - generated: String, - observed_content: u64, - observed_reasoning: u64, - } + Ok(breakdown) +} - fn drive_sampling_loop( - classifier: &mut SampledTokenClassifier, - model: &LlamaModel, - ctx: &mut LlamaContext, - starting_position: llama_pos, - max_tokens: usize, - ) -> Result { - let mut sampler = LlamaSampler::greedy(); - let mut totals = SamplingTotals { - generated: String::new(), - observed_content: 0, - observed_reasoning: 0, - }; - let mut batch = LlamaBatch::new(512, 1)?; - - for (current_position, _) in (starting_position..).zip(0..max_tokens) { - let (raw_token, outcomes) = classifier.sample(&mut sampler, ctx, -1)?; - for outcome in &outcomes { - totals.generated.push_str(&outcome.raw_piece); - match outcome.sampled_token { - SampledToken::Content(_) => totals.observed_content += 1, - SampledToken::Reasoning(_) => totals.observed_reasoning += 1, - SampledToken::ToolCall(_) | SampledToken::Undeterminable(_) => {} - } - } +fn build_user_prompt_with_image_marker(model: &LlamaModel, question: &str) -> Result { + let marker = mtmd_default_marker(); + let user_content = format!("{marker}{question}"); + let chat_template = model.chat_template(None)?; + let messages = [LlamaChatMessage::new("user".to_string(), user_content)?]; - let raw_as_sampled = SampledToken::Content(raw_token); - if model.is_eog_token(&raw_as_sampled) { - break; - } + Ok(model.apply_chat_template(&chat_template, &messages, true)?) 
+} - batch.clear(); - batch.add(&raw_as_sampled, current_position, &[0], true)?; +struct SamplingTotals { + generated: String, + observed_content: u64, + observed_reasoning: u64, +} - ctx.decode(&mut batch) - .with_context(|| "failed to decode generated token")?; - } +fn drive_sampling_loop( + classifier: &mut SampledTokenClassifier, + model: &LlamaModel, + ctx: &mut LlamaContext, + starting_position: llama_pos, + max_tokens: usize, +) -> Result { + let mut sampler = LlamaSampler::greedy(); + let mut totals = SamplingTotals { + generated: String::new(), + observed_content: 0, + observed_reasoning: 0, + }; + let mut batch = LlamaBatch::new(512, 1)?; - for outcome in classifier.flush() { + for (current_position, _) in (starting_position..).zip(0..max_tokens) { + let (raw_token, outcomes) = classifier.sample(&mut sampler, ctx, -1)?; + for outcome in &outcomes { totals.generated.push_str(&outcome.raw_piece); match outcome.sampled_token { SampledToken::Content(_) => totals.observed_content += 1, @@ -1102,900 +1008,797 @@ mod multimodal { } } - Ok(totals) - } - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 4096, - n_batch = 512, - n_ubatch = 512, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), - )] - fn multimodal_vision_inference_produces_output(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - - let mut ctx = LlamaContext::from_model( - model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - ) - .with_context(|| "unable to create llama context")?; - - assert!( - mtmd_ctx.support_vision(), - "model should support vision input" - ); - - let image_path = test_model::fixtures_dir().join("llamas.jpg"); - let image_path_str = image_path - .to_str() - .with_context(|| "image 
path is not valid UTF-8")?; - let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str) - .with_context(|| "failed to load image from file")?; - - let formatted_prompt = - build_user_prompt_with_image_marker(model, "What animals do you see in this image?")?; + let raw_as_sampled = SampledToken::Content(raw_token); + if model.is_eog_token(&raw_as_sampled) { + break; + } - let input_text = MtmdInputText { - text: formatted_prompt, - add_special: false, - parse_special: true, - }; + batch.clear(); + batch.add(&raw_as_sampled, current_position, &[0], true)?; - let chunks = mtmd_ctx - .tokenize(input_text, &[&bitmap]) - .with_context(|| "failed to tokenize multimodal input")?; + ctx.decode(&mut batch) + .with_context(|| "failed to decode generated token")?; + } - assert!( - !chunks.is_empty(), - "tokenization should produce at least one chunk" - ); + for outcome in classifier.flush() { + totals.generated.push_str(&outcome.raw_piece); + match outcome.sampled_token { + SampledToken::Content(_) => totals.observed_content += 1, + SampledToken::Reasoning(_) => totals.observed_reasoning += 1, + SampledToken::ToolCall(_) | SampledToken::Undeterminable(_) => {} + } + } - let expected = count_chunk_tokens_by_type(&chunks)?; + Ok(totals) +} - eprintln!( - "tokenized into {} chunks, text {} image {} audio {}", - chunks.len(), - expected.text, - expected.image, - expected.audio - ); +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 4096, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +fn multimodal_vision_inference_produces_output(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + + let mut ctx = LlamaContext::from_model( + model, + fixture.backend, + 
(*fixture.context_params).into_llama_context_params(), + ) + .with_context(|| "unable to create llama context")?; + + assert!( + mtmd_ctx.support_vision(), + "model should support vision input" + ); + + let image_path = fixtures_dir().join("llamas.jpg"); + let image_path_str = image_path + .to_str() + .with_context(|| "image path is not valid UTF-8")?; + let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str) + .with_context(|| "failed to load image from file")?; + + let formatted_prompt = + build_user_prompt_with_image_marker(model, "What animals do you see in this image?")?; + + let input_text = MtmdInputText { + text: formatted_prompt, + add_special: false, + parse_special: true, + }; - assert!( - expected.image > 0, - "vision input must produce at least one image chunk" - ); + let chunks = mtmd_ctx + .tokenize(input_text, &[&bitmap]) + .with_context(|| "failed to tokenize multimodal input")?; - let mut classifier = model.sampled_token_classifier(); - let n_past = classifier - .eval_multimodal_chunks(&chunks, mtmd_ctx, &ctx, 0, 0, 512, true) - .with_context(|| "failed to evaluate chunks")?; + assert!( + !chunks.is_empty(), + "tokenization should produce at least one chunk" + ); - eprintln!("evaluated chunks, n_past = {n_past}"); + let expected = count_chunk_tokens_by_type(&chunks)?; - { - let usage = classifier.usage(); - assert_eq!(usage.prompt_tokens, expected.text); - assert_eq!(usage.input_image_tokens, expected.image); - assert_eq!(usage.input_audio_tokens, expected.audio); - } + eprintln!( + "tokenized into {} chunks, text {} image {} audio {}", + chunks.len(), + expected.text, + expected.image, + expected.audio + ); - let totals = drive_sampling_loop(&mut classifier, model, &mut ctx, n_past, 512)?; + assert!( + expected.image > 0, + "vision input must produce at least one image chunk" + ); - eprintln!("generated text: {}", totals.generated); + let mut classifier = model.sampled_token_classifier(); + let n_past = classifier + 
.eval_multimodal_chunks(&chunks, mtmd_ctx, &ctx, 0, 0, 512, true) + .with_context(|| "failed to evaluate chunks")?; - assert!( - !totals.generated.is_empty(), - "model should generate at least one token from image input" - ); + eprintln!("evaluated chunks, n_past = {n_past}"); - let usage = classifier.into_usage(); + { + let usage = classifier.usage(); assert_eq!(usage.prompt_tokens, expected.text); assert_eq!(usage.input_image_tokens, expected.image); assert_eq!(usage.input_audio_tokens, expected.audio); - assert_eq!(usage.content_tokens, totals.observed_content); - assert_eq!(usage.reasoning_tokens, totals.observed_reasoning); - assert_eq!( - usage.completion_tokens(), - totals.observed_content + totals.observed_reasoning - ); - - Ok(()) - } -} - -mod eval_multimodal_chunks_records_exact_token_counts { - use anyhow::Result; - use llama_cpp_bindings::TokenUsage; - use llama_cpp_bindings::context::LlamaContext; - use llama_cpp_bindings::mtmd::MtmdBitmap; - use llama_cpp_bindings::mtmd::MtmdInputChunkType; - use llama_cpp_bindings::mtmd::MtmdInputChunks; - use llama_cpp_bindings::mtmd::MtmdInputText; - use llama_cpp_bindings::mtmd::mtmd_default_marker; - use llama_cpp_bindings_tests::test_model::fixtures_dir; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - - const PROMPT_QUESTION: &str = "What animals do you see in this image?"; - - struct ExpectedChunkTotals { - text: u64, - image: u64, - audio: u64, } - fn sum_chunk_token_counts_by_type(chunks: &MtmdInputChunks) -> Result { - let mut totals = ExpectedChunkTotals { - text: 0, - image: 0, - audio: 0, - }; - for index in 0..chunks.len() { - let chunk = chunks - .get(index) - .ok_or_else(|| anyhow::anyhow!("chunk index {index} should exist"))?; - let n_tokens = u64::try_from(chunk.n_tokens())?; - match chunk.chunk_type()? 
{ - MtmdInputChunkType::Text => { - totals.text = totals.text.saturating_add(n_tokens); - } - MtmdInputChunkType::Image => { - totals.image = totals.image.saturating_add(n_tokens); - } - MtmdInputChunkType::Audio => { - totals.audio = totals.audio.saturating_add(n_tokens); - } - } - } - Ok(totals) - } + let totals = drive_sampling_loop(&mut classifier, model, &mut ctx, n_past, 512)?; - fn build_multimodal_chunks_and_eval_into_usage( - fixture: &LlamaFixture<'_>, - ) -> Result<(TokenUsage, ExpectedChunkTotals)> { - let model = fixture.model; - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); + eprintln!("generated text: {}", totals.generated); - let image_path = fixtures_dir().join("llamas.jpg"); - let image_path_str = image_path - .to_str() - .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?; - let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?; + assert!( + !totals.generated.is_empty(), + "model should generate at least one token from image input" + ); - let marker = mtmd_default_marker(); - let prompt = format!("{marker}{PROMPT_QUESTION}"); + let usage = classifier.into_usage(); + assert_eq!(usage.prompt_tokens, expected.text); + assert_eq!(usage.input_image_tokens, expected.image); + assert_eq!(usage.input_audio_tokens, expected.audio); + assert_eq!(usage.content_tokens, totals.observed_content); + assert_eq!(usage.reasoning_tokens, totals.observed_reasoning); + assert_eq!( + usage.completion_tokens(), + totals.observed_content + totals.observed_reasoning + ); - let input_text = MtmdInputText { - text: prompt, - add_special: false, - parse_special: true, - }; + Ok(()) +} - let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; - let expected = sum_chunk_token_counts_by_type(&chunks)?; +const PROMPT_QUESTION: &str = "What animals do you see in this image?"; + +fn build_multimodal_chunks_and_eval_into_usage( + fixture: &LlamaFixture<'_>, +) -> Result<(TokenUsage, ChunkTokenBreakdown)> { + let model 
= fixture.model; + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + + let image_path = fixtures_dir().join("llamas.jpg"); + let image_path_str = image_path + .to_str() + .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?; + let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?; + + let marker = mtmd_default_marker(); + let prompt = format!("{marker}{PROMPT_QUESTION}"); + + let input_text = MtmdInputText { + text: prompt, + add_special: false, + parse_special: true, + }; - let context_params = (*fixture.context_params).into_llama_context_params(); - let context = LlamaContext::from_model(model, fixture.backend, context_params)?; + let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; + let expected = count_chunk_tokens_by_type(&chunks)?; - let mut classifier = model.sampled_token_classifier(); - classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?; + let context_params = (*fixture.context_params).into_llama_context_params(); + let context = LlamaContext::from_model(model, fixture.backend, context_params)?; - Ok((classifier.into_usage(), expected)) - } + let mut classifier = model.sampled_token_classifier(); + classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?; - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 4096, - n_batch = 512, - n_ubatch = 512, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), - )] - fn prompt_tokens_match_text_chunk_total(fixture: &LlamaFixture<'_>) -> Result<()> { - let (usage, expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?; - - if usage.prompt_tokens != expected.text { - anyhow::bail!( - "prompt_tokens must equal sum of text-chunk n_tokens; expected {}, got {}", - expected.text, - usage.prompt_tokens - ); - } + Ok((classifier.into_usage(), expected)) 
+} - Ok(()) +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 4096, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +fn prompt_tokens_match_text_chunk_total(fixture: &LlamaFixture<'_>) -> Result<()> { + let (usage, expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?; + + if usage.prompt_tokens != expected.text { + anyhow::bail!( + "prompt_tokens must equal sum of text-chunk n_tokens; expected {}, got {}", + expected.text, + usage.prompt_tokens + ); } - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 4096, - n_batch = 512, - n_ubatch = 512, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), - )] - fn input_image_tokens_match_image_chunk_total(fixture: &LlamaFixture<'_>) -> Result<()> { - let (usage, expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?; - - if usage.input_image_tokens != expected.image { - anyhow::bail!( - "input_image_tokens must equal sum of image-chunk n_tokens; expected {}, got {}", - expected.image, - usage.input_image_tokens - ); - } + Ok(()) +} - Ok(()) +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 4096, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +fn input_image_tokens_match_image_chunk_total(fixture: &LlamaFixture<'_>) -> Result<()> { + let (usage, expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?; + + if usage.input_image_tokens != expected.image { + anyhow::bail!( + "input_image_tokens must equal sum of image-chunk n_tokens; expected {}, got {}", + 
expected.image, + usage.input_image_tokens + ); } - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 4096, - n_batch = 512, - n_ubatch = 512, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), - )] - fn input_audio_tokens_are_zero_for_image_only_input(fixture: &LlamaFixture<'_>) -> Result<()> { - let (usage, expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?; - - if expected.audio != 0 { - anyhow::bail!( - "fixture invariant: image-only multimodal input should produce zero audio chunk tokens, got {}", - expected.audio - ); - } - if usage.input_audio_tokens != 0 { - anyhow::bail!( - "input_audio_tokens must be zero when no audio chunks are evaluated; got {}", - usage.input_audio_tokens - ); - } + Ok(()) +} - Ok(()) +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 4096, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +fn input_audio_tokens_are_zero_for_image_only_input(fixture: &LlamaFixture<'_>) -> Result<()> { + let (usage, expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?; + + if expected.audio != 0 { + anyhow::bail!( + "fixture invariant: image-only multimodal input should produce zero audio chunk tokens, got {}", + expected.audio + ); + } + if usage.input_audio_tokens != 0 { + anyhow::bail!( + "input_audio_tokens must be zero when no audio chunks are evaluated; got {}", + usage.input_audio_tokens + ); } - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 4096, - n_batch = 512, - n_ubatch = 512, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", 
"mmproj-F16.gguf"), - )] - fn completion_tokens_are_zero_after_eval_before_generation( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let (usage, _expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?; - - if usage.completion_tokens() != 0 { - anyhow::bail!( - "completion_tokens must be zero immediately after eval (no generation has occurred); got {}", - usage.completion_tokens() - ); - } + Ok(()) +} - Ok(()) +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 4096, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +fn completion_tokens_are_zero_after_eval_before_generation( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let (usage, _expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?; + + if usage.completion_tokens() != 0 { + anyhow::bail!( + "completion_tokens must be zero immediately after eval (no generation has occurred); got {}", + usage.completion_tokens() + ); } + + Ok(()) } -mod ingest_prompt_chunk { - use anyhow::Result; - use llama_cpp_bindings::ingest_prompt_chunk::ingest_prompt_chunk; - use llama_cpp_bindings::mtmd::MtmdBitmap; - use llama_cpp_bindings::mtmd::MtmdInputChunkType; - use llama_cpp_bindings::mtmd::MtmdInputText; - use llama_cpp_bindings::mtmd::mtmd_default_marker; - use llama_cpp_bindings_tests::test_model::fixtures_dir; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), - )] - fn text_chunk_records_prompt_tokens(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let 
mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - - let input_text = MtmdInputText { - text: "hello world".to_owned(), - add_special: false, - parse_special: false, - }; - let chunks = mtmd_ctx.tokenize(input_text, &[])?; - - let text_chunk = (0..chunks.len()) - .filter_map(|index| chunks.get(index)) - .find(|chunk| chunk.chunk_type() == Ok(MtmdInputChunkType::Text)) - .ok_or_else(|| { - anyhow::anyhow!("text-only tokenization should produce at least one text chunk") - })?; - - let n_tokens = u64::try_from(text_chunk.n_tokens())?; - - let mut classifier = model.sampled_token_classifier(); - - ingest_prompt_chunk(&mut classifier, &text_chunk)?; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +fn text_chunk_records_prompt_tokens(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + + let input_text = MtmdInputText { + text: "hello world".to_owned(), + add_special: false, + parse_special: false, + }; + let chunks = mtmd_ctx.tokenize(input_text, &[])?; - let usage = classifier.usage(); - if usage.prompt_tokens != n_tokens { - anyhow::bail!( - "text chunk must record n_tokens as prompt_tokens; expected {n_tokens}, got {}", - usage.prompt_tokens - ); - } - if usage.input_image_tokens != 0 { - anyhow::bail!( - "text chunk must not bump input_image_tokens; got {}", - usage.input_image_tokens - ); - } - if usage.input_audio_tokens != 0 { - anyhow::bail!( - "text chunk must not bump input_audio_tokens; got {}", - usage.input_audio_tokens - ); - } + let text_chunk = (0..chunks.len()) + .filter_map(|index| chunks.get(index)) + .find(|chunk| chunk.chunk_type() == 
Ok(MtmdInputChunkType::Text)) + .ok_or_else(|| { + anyhow::anyhow!("text-only tokenization should produce at least one text chunk") + })?; - Ok(()) - } + let n_tokens = u64::try_from(text_chunk.n_tokens())?; - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), - )] - fn image_chunk_records_input_image_tokens_only(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - - let image_path = fixtures_dir().join("llamas.jpg"); - let image_path_str = image_path - .to_str() - .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?; - let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?; - - let marker = mtmd_default_marker(); - let input_text = MtmdInputText { - text: marker.to_owned(), - add_special: false, - parse_special: true, - }; - let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; - - let image_chunk = (0..chunks.len()) - .filter_map(|index| chunks.get(index)) - .find(|chunk| chunk.chunk_type() == Ok(MtmdInputChunkType::Image)) - .ok_or_else(|| { - anyhow::anyhow!("multimodal tokenization should produce an image chunk") - })?; - - let n_tokens = u64::try_from(image_chunk.n_tokens())?; - if n_tokens == 0 { - anyhow::bail!("image chunk should report at least one token"); - } + let mut classifier = model.sampled_token_classifier(); - let mut classifier = model.sampled_token_classifier(); + ingest_prompt_chunk(&mut classifier, &text_chunk)?; - ingest_prompt_chunk(&mut classifier, &image_chunk)?; + let usage = classifier.usage(); + if usage.prompt_tokens != n_tokens { + anyhow::bail!( + "text chunk must record n_tokens as prompt_tokens; expected {n_tokens}, got {}", + usage.prompt_tokens + ); + } + if 
usage.input_image_tokens != 0 { + anyhow::bail!( + "text chunk must not bump input_image_tokens; got {}", + usage.input_image_tokens + ); + } + if usage.input_audio_tokens != 0 { + anyhow::bail!( + "text chunk must not bump input_audio_tokens; got {}", + usage.input_audio_tokens + ); + } - let usage = classifier.usage(); - if usage.input_image_tokens != n_tokens { - anyhow::bail!( - "image chunk must record n_tokens as input_image_tokens; expected {n_tokens}, got {}", - usage.input_image_tokens - ); - } - if usage.prompt_tokens != 0 { - anyhow::bail!( - "image chunk must not bump prompt_tokens; got {}", - usage.prompt_tokens - ); - } - if usage.input_audio_tokens != 0 { - anyhow::bail!( - "image chunk must not bump input_audio_tokens; got {}", - usage.input_audio_tokens - ); - } + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +fn image_chunk_records_input_image_tokens_only(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + + let image_path = fixtures_dir().join("llamas.jpg"); + let image_path_str = image_path + .to_str() + .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?; + let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?; + + let marker = mtmd_default_marker(); + let input_text = MtmdInputText { + text: marker.to_owned(), + add_special: false, + parse_special: true, + }; + let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; + + let image_chunk = (0..chunks.len()) + .filter_map(|index| chunks.get(index)) + .find(|chunk| chunk.chunk_type() == Ok(MtmdInputChunkType::Image)) + .ok_or_else(|| anyhow::anyhow!("multimodal tokenization should produce an image 
chunk"))?; - Ok(()) + let n_tokens = u64::try_from(image_chunk.n_tokens())?; + if n_tokens == 0 { + anyhow::bail!("image chunk should report at least one token"); } - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), - )] - fn text_chunk_drives_marker_state_machine_to_reasoning( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let model = fixture.model; - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - - let input_text = MtmdInputText { - text: "<|im_start|>user\nHi<|im_end|>\n<|im_start|>assistant\n\n".to_owned(), - add_special: false, - parse_special: true, - }; - let chunks = mtmd_ctx.tokenize(input_text, &[])?; - - let mut classifier = model.sampled_token_classifier(); - - for index in 0..chunks.len() { - let chunk = chunks - .get(index) - .ok_or_else(|| anyhow::anyhow!("chunk index {index} must exist"))?; - ingest_prompt_chunk(&mut classifier, &chunk)?; - } + let mut classifier = model.sampled_token_classifier(); - if classifier.current_section() != llama_cpp_bindings::SampledTokenSection::Reasoning { - anyhow::bail!( - "text chunk replay must transition the classifier section to Reasoning when the \ - prompt opens a `` block; got {:?}", - classifier.current_section() - ); - } + ingest_prompt_chunk(&mut classifier, &image_chunk)?; - Ok(()) + let usage = classifier.usage(); + if usage.input_image_tokens != n_tokens { + anyhow::bail!( + "image chunk must record n_tokens as input_image_tokens; expected {n_tokens}, got {}", + usage.input_image_tokens + ); } + if usage.prompt_tokens != 0 { + anyhow::bail!( + "image chunk must not bump prompt_tokens; got {}", + usage.prompt_tokens + ); + } + if usage.input_audio_tokens != 0 { + anyhow::bail!( + "image chunk must not bump 
input_audio_tokens; got {}", + usage.input_audio_tokens + ); + } + + Ok(()) } -mod gemma4_classifier_emits_reasoning_for_multimodal_thinking_prompt { - use anyhow::Result; - use llama_cpp_bindings::context::LlamaContext; - use llama_cpp_bindings::llama_batch::LlamaBatch; - use llama_cpp_bindings::mtmd::MtmdBitmap; - use llama_cpp_bindings::mtmd::MtmdInputText; - use llama_cpp_bindings::mtmd::mtmd_default_marker; - use llama_cpp_bindings::sampling::LlamaSampler; - use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; - use llama_cpp_bindings_tests::test_model::fixtures_dir; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +fn text_chunk_drives_marker_state_machine_to_reasoning(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + + let input_text = MtmdInputText { + text: "<|im_start|>user\nHi<|im_end|>\n<|im_start|>assistant\n\n".to_owned(), + add_special: false, + parse_special: true, + }; + let chunks = mtmd_ctx.tokenize(input_text, &[])?; + + let mut classifier = model.sampled_token_classifier(); - const MAX_GENERATED_TOKENS: i32 = 200; + for index in 0..chunks.len() { + let chunk = chunks + .get(index) + .ok_or_else(|| anyhow::anyhow!("chunk index {index} must exist"))?; + ingest_prompt_chunk(&mut classifier, &chunk)?; + } - #[llama_test( - model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 8192, - n_batch = 512, - n_ubatch = 512, - mmproj_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", 
"mmproj-F16.gguf"), - )] - fn gemma4_classifier_emits_reasoning_for_multimodal_thinking_prompt( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - - let mut context = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let image_path = fixtures_dir().join("llamas.jpg"); - let image_path_str = image_path - .to_str() - .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?; - let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?; - - let marker = mtmd_default_marker(); - let prompt = format!( - "user\n{marker}What animals do you see in this image?\nmodel\n<|channel>thought\n" + if classifier.current_section() != llama_cpp_bindings::SampledTokenSection::Reasoning { + anyhow::bail!( + "text chunk replay must transition the classifier section to Reasoning when the \ + prompt opens a `` block; got {:?}", + classifier.current_section() ); + } - let input_text = MtmdInputText { - text: prompt, - add_special: false, - parse_special: true, - }; - - let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; - - let mut classifier = model.sampled_token_classifier(); - let n_past = - classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?; - - let mut sampler = LlamaSampler::chain_simple([ - LlamaSampler::penalties(64, 1.1, 0.0, 0.0), - LlamaSampler::top_k(40), - LlamaSampler::top_p(0.9, 1), - LlamaSampler::min_p(0.05, 1), - LlamaSampler::temp(0.7), - LlamaSampler::dist(0x00C0_FFEE), - ]); - - let mut batch = LlamaBatch::new(2048, 1)?; - let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut context, - batch: &mut batch, - initial_position: n_past, - max_generated_tokens: MAX_GENERATED_TOKENS, - } - .run()?; + Ok(()) +} - let usage = classifier.usage(); +#[llama_test( 
+ model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "mmproj-F16.gguf"), +)] +fn gemma4_classifier_emits_reasoning_for_multimodal_thinking_prompt( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + const MAX_GENERATED_TOKENS: i32 = 200; - if outcome.observed_reasoning == 0 { - anyhow::bail!( - "Gemma 4 multimodal + thinking: classifier must emit at least one Reasoning token \ - when the prompt opens a `<|channel>thought` block; outcome={outcome:?}" - ); - } - if usage.reasoning_tokens == 0 { - anyhow::bail!( - "Gemma 4 multimodal + thinking: usage.reasoning_tokens must be non-zero; usage={usage:?}" - ); - } + let model = fixture.model; + let backend = fixture.backend; + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let image_path = fixtures_dir().join("llamas.jpg"); + let image_path_str = image_path + .to_str() + .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?; + let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?; + + let marker = mtmd_default_marker(); + let prompt = format!( + "user\n{marker}What animals do you see in this image?\nmodel\n<|channel>thought\n" + ); + + let input_text = MtmdInputText { + text: prompt, + add_special: false, + parse_special: true, + }; - Ok(()) + let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; + + let mut classifier = model.sampled_token_classifier(); + let n_past = classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?; + + let mut sampler = LlamaSampler::chain_simple([ + LlamaSampler::penalties(64, 1.1, 0.0, 0.0), + LlamaSampler::top_k(40), + LlamaSampler::top_p(0.9, 1), + 
LlamaSampler::min_p(0.05, 1), + LlamaSampler::temp(0.7), + LlamaSampler::dist(0x00C0_FFEE), + ]); + + let mut batch = LlamaBatch::new(2048, 1)?; + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position: n_past, + max_generated_tokens: MAX_GENERATED_TOKENS, } -} + .run()?; -mod mistral3_classifier_emits_reasoning_for_multimodal_thinking_prompt { - use anyhow::Result; - use llama_cpp_bindings::context::LlamaContext; - use llama_cpp_bindings::llama_batch::LlamaBatch; - use llama_cpp_bindings::mtmd::MtmdBitmap; - use llama_cpp_bindings::mtmd::MtmdInputText; - use llama_cpp_bindings::mtmd::mtmd_default_marker; - use llama_cpp_bindings::sampling::LlamaSampler; - use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; - use llama_cpp_bindings_tests::test_model::fixtures_dir; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; + let usage = classifier.usage(); - const MAX_GENERATED_TOKENS: i32 = 768; - - #[llama_test( - model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 4096, - n_batch = 512, - n_ubatch = 512, - mmproj_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "mmproj-F16.gguf"), - )] - fn mistral3_classifier_emits_reasoning_for_multimodal_thinking_prompt( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - - let mut context = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let image_path = fixtures_dir().join("llamas.jpg"); - let image_path_str = image_path - .to_str() - .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?; - 
let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?; - - let marker = mtmd_default_marker(); - let prompt = format!( - "[SYSTEM_PROMPT]# HOW YOU SHOULD THINK AND ANSWER\n\n\ - First draft your thinking process (inner monologue) until you arrive at a response. \ - Format your response using Markdown, and use LaTeX for any mathematical equations. \ - Write both your thoughts and the response in the same language as the input.\n\n\ - Your thinking process must follow the template below:\ - [THINK]Your thoughts or/and draft, like working through an exercise on scratch paper. \ - Be as casual and as long as you want until you are confident to generate the response \ - to the user.[/THINK]Here, provide a self-contained response.[/SYSTEM_PROMPT]\ - [INST]{marker}What animals do you see in this image?[/INST]" + if outcome.observed_reasoning == 0 { + anyhow::bail!( + "Gemma 4 multimodal + thinking: classifier must emit at least one Reasoning token \ + when the prompt opens a `<|channel>thought` block; outcome={outcome:?}" + ); + } + if usage.reasoning_tokens == 0 { + anyhow::bail!( + "Gemma 4 multimodal + thinking: usage.reasoning_tokens must be non-zero; usage={usage:?}" ); + } - let input_text = MtmdInputText { - text: prompt, - add_special: true, - parse_special: true, - }; - - let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; - - let mut classifier = model.sampled_token_classifier(); - let n_past = - classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?; - - let mut sampler = LlamaSampler::greedy(); - let mut batch = LlamaBatch::new(2048, 1)?; - let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut context, - batch: &mut batch, - initial_position: n_past, - max_generated_tokens: MAX_GENERATED_TOKENS, - } - .run()?; + Ok(()) +} - let usage = classifier.usage(); +#[llama_test( + model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", 
"Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 4096, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "mmproj-F16.gguf"), +)] +fn mistral3_classifier_emits_reasoning_for_multimodal_thinking_prompt( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + const MAX_GENERATED_TOKENS: i32 = 768; - if outcome.observed_reasoning == 0 { - anyhow::bail!( - "Mistral 3 multimodal + thinking: classifier must emit at least one Reasoning token \ - when the model opens a `[THINK]` block; outcome={outcome:?}" - ); - } - if usage.reasoning_tokens == 0 { - anyhow::bail!( - "Mistral 3 multimodal + thinking: usage.reasoning_tokens must be non-zero; usage={usage:?}" - ); - } + let model = fixture.model; + let backend = fixture.backend; + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let image_path = fixtures_dir().join("llamas.jpg"); + let image_path_str = image_path + .to_str() + .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?; + let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?; + + let marker = mtmd_default_marker(); + let prompt = format!( + "[SYSTEM_PROMPT]# HOW YOU SHOULD THINK AND ANSWER\n\n\ + First draft your thinking process (inner monologue) until you arrive at a response. \ + Format your response using Markdown, and use LaTeX for any mathematical equations. \ + Write both your thoughts and the response in the same language as the input.\n\n\ + Your thinking process must follow the template below:\ + [THINK]Your thoughts or/and draft, like working through an exercise on scratch paper. 
\ + Be as casual and as long as you want until you are confident to generate the response \ + to the user.[/THINK]Here, provide a self-contained response.[/SYSTEM_PROMPT]\ + [INST]{marker}What animals do you see in this image?[/INST]" + ); + + let input_text = MtmdInputText { + text: prompt, + add_special: true, + parse_special: true, + }; - Ok(()) + let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; + + let mut classifier = model.sampled_token_classifier(); + let n_past = classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?; + + let mut sampler = LlamaSampler::greedy(); + let mut batch = LlamaBatch::new(2048, 1)?; + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position: n_past, + max_generated_tokens: MAX_GENERATED_TOKENS, } -} - -mod qwen35_classifier_emits_reasoning_for_multimodal_thinking_prompt { - use anyhow::Result; - use llama_cpp_bindings::context::LlamaContext; - use llama_cpp_bindings::llama_batch::LlamaBatch; - use llama_cpp_bindings::mtmd::MtmdBitmap; - use llama_cpp_bindings::mtmd::MtmdInputText; - use llama_cpp_bindings::mtmd::mtmd_default_marker; - use llama_cpp_bindings::sampling::LlamaSampler; - use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; - use llama_cpp_bindings_tests::test_model::fixtures_dir; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; + .run()?; - const MAX_GENERATED_TOKENS: i32 = 200; + let usage = classifier.usage(); - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 4096, - n_batch = 512, - n_ubatch = 512, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - 
n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 4096, - n_batch = 512, - n_ubatch = 512, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), - )] - fn qwen35_classifier_emits_reasoning_for_multimodal_thinking_prompt( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - - let mut context = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let image_path = fixtures_dir().join("llamas.jpg"); - let image_path_str = image_path - .to_str() - .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?; - let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?; - - let marker = mtmd_default_marker(); - let prompt = format!( - "<|im_start|>user\n{marker}What animals do you see in this image?<|im_end|>\n<|im_start|>assistant\n\n" + if outcome.observed_reasoning == 0 { + anyhow::bail!( + "Mistral 3 multimodal + thinking: classifier must emit at least one Reasoning token \ + when the model opens a `[THINK]` block; outcome={outcome:?}" + ); + } + if usage.reasoning_tokens == 0 { + anyhow::bail!( + "Mistral 3 multimodal + thinking: usage.reasoning_tokens must be non-zero; usage={usage:?}" ); + } - let input_text = MtmdInputText { - text: prompt, - add_special: false, - parse_special: true, - }; - - let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; - - let mut classifier = model.sampled_token_classifier(); - let n_past = - classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?; - - let mut sampler = LlamaSampler::chain_simple([ - LlamaSampler::penalties(64, 1.1, 0.0, 0.0), - LlamaSampler::top_k(40), - LlamaSampler::top_p(0.9, 1), - LlamaSampler::min_p(0.05, 1), - LlamaSampler::temp(0.7), - LlamaSampler::dist(0x00C0_FFEE), - ]); - - let mut batch = LlamaBatch::new(2048, 1)?; - 
let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut context, - batch: &mut batch, - initial_position: n_past, - max_generated_tokens: MAX_GENERATED_TOKENS, - } - .run()?; + Ok(()) +} - let usage = classifier.usage(); +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 4096, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 4096, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn qwen35_classifier_emits_reasoning_for_multimodal_thinking_prompt( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + const MAX_GENERATED_TOKENS: i32 = 200; - if outcome.observed_reasoning == 0 { - anyhow::bail!( - "Qwen 3.5 multimodal + thinking: classifier must emit at least one Reasoning token \ - when the prompt opens a `` block; outcome={outcome:?}" - ); - } - if usage.reasoning_tokens == 0 { - anyhow::bail!( - "Qwen 3.5 multimodal + thinking: usage.reasoning_tokens must be non-zero; usage={usage:?}" - ); - } + let model = fixture.model; + let backend = fixture.backend; + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let image_path = fixtures_dir().join("llamas.jpg"); + let image_path_str = image_path + .to_str() + .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?; + let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?; + + let marker = mtmd_default_marker(); + let prompt = format!( + 
"<|im_start|>user\n{marker}What animals do you see in this image?<|im_end|>\n<|im_start|>assistant\n\n" + ); + + let input_text = MtmdInputText { + text: prompt, + add_special: false, + parse_special: true, + }; - Ok(()) + let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; + + let mut classifier = model.sampled_token_classifier(); + let n_past = classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?; + + let mut sampler = LlamaSampler::chain_simple([ + LlamaSampler::penalties(64, 1.1, 0.0, 0.0), + LlamaSampler::top_k(40), + LlamaSampler::top_p(0.9, 1), + LlamaSampler::min_p(0.05, 1), + LlamaSampler::temp(0.7), + LlamaSampler::dist(0x00C0_FFEE), + ]); + + let mut batch = LlamaBatch::new(2048, 1)?; + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position: n_past, + max_generated_tokens: MAX_GENERATED_TOKENS, } -} + .run()?; -mod qwen36_classifier_emits_reasoning_for_multimodal_thinking_prompt { - use anyhow::Result; - use llama_cpp_bindings::context::LlamaContext; - use llama_cpp_bindings::llama_batch::LlamaBatch; - use llama_cpp_bindings::mtmd::MtmdBitmap; - use llama_cpp_bindings::mtmd::MtmdInputText; - use llama_cpp_bindings::mtmd::mtmd_default_marker; - use llama_cpp_bindings::sampling::LlamaSampler; - use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; - use llama_cpp_bindings_tests::test_model::fixtures_dir; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - - const MAX_GENERATED_TOKENS: i32 = 200; + let usage = classifier.usage(); - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 8192, - n_batch = 512, - n_ubatch = 512, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), - )] - fn 
qwen36_classifier_emits_reasoning_for_multimodal_thinking_prompt( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - - let mut context = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let image_path = fixtures_dir().join("llamas.jpg"); - let image_path_str = image_path - .to_str() - .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?; - let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?; - - let marker = mtmd_default_marker(); - let prompt = format!( - "<|im_start|>user\n{marker}What animals do you see in this image?<|im_end|>\n<|im_start|>assistant\n\n" + if outcome.observed_reasoning == 0 { + anyhow::bail!( + "Qwen 3.5 multimodal + thinking: classifier must emit at least one Reasoning token \ + when the prompt opens a `` block; outcome={outcome:?}" ); + } + if usage.reasoning_tokens == 0 { + anyhow::bail!( + "Qwen 3.5 multimodal + thinking: usage.reasoning_tokens must be non-zero; usage={usage:?}" + ); + } - let input_text = MtmdInputText { - text: prompt, - add_special: false, - parse_special: true, - }; - - let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; - - let mut classifier = model.sampled_token_classifier(); - let n_past = - classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?; - - let mut sampler = LlamaSampler::chain_simple([ - LlamaSampler::penalties(64, 1.1, 0.0, 0.0), - LlamaSampler::top_k(40), - LlamaSampler::top_p(0.9, 1), - LlamaSampler::min_p(0.05, 1), - LlamaSampler::temp(0.7), - LlamaSampler::dist(0x00C0_FFEE), - ]); - - let mut batch = LlamaBatch::new(2048, 1)?; - let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut context, - batch: &mut batch, - initial_position: n_past, - max_generated_tokens: 
MAX_GENERATED_TOKENS, - } - .run()?; + Ok(()) +} - let usage = classifier.usage(); +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn qwen36_classifier_emits_reasoning_for_multimodal_thinking_prompt( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + const MAX_GENERATED_TOKENS: i32 = 200; - if outcome.observed_reasoning == 0 { - anyhow::bail!( - "Qwen 3.6 multimodal + thinking: classifier must emit at least one Reasoning token; outcome={outcome:?}" - ); - } - if usage.reasoning_tokens == 0 { - anyhow::bail!( - "Qwen 3.6 multimodal + thinking: usage.reasoning_tokens must be non-zero; usage={usage:?}" - ); - } + let model = fixture.model; + let backend = fixture.backend; + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let image_path = fixtures_dir().join("llamas.jpg"); + let image_path_str = image_path + .to_str() + .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?; + let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?; + + let marker = mtmd_default_marker(); + let prompt = format!( + "<|im_start|>user\n{marker}What animals do you see in this image?<|im_end|>\n<|im_start|>assistant\n\n" + ); + + let input_text = MtmdInputText { + text: prompt, + add_special: false, + parse_special: true, + }; - Ok(()) + let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; + + let mut classifier = model.sampled_token_classifier(); + let n_past = classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?; + + let mut sampler = LlamaSampler::chain_simple([ + LlamaSampler::penalties(64, 1.1, 0.0, 0.0), + 
LlamaSampler::top_k(40), + LlamaSampler::top_p(0.9, 1), + LlamaSampler::min_p(0.05, 1), + LlamaSampler::temp(0.7), + LlamaSampler::dist(0x00C0_FFEE), + ]); + + let mut batch = LlamaBatch::new(2048, 1)?; + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position: n_past, + max_generated_tokens: MAX_GENERATED_TOKENS, + } + .run()?; + + let usage = classifier.usage(); + + if outcome.observed_reasoning == 0 { + anyhow::bail!( + "Qwen 3.6 multimodal + thinking: classifier must emit at least one Reasoning token; outcome={outcome:?}" + ); + } + if usage.reasoning_tokens == 0 { + anyhow::bail!( + "Qwen 3.6 multimodal + thinking: usage.reasoning_tokens must be non-zero; usage={usage:?}" + ); } -} + Ok(()) +} llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/reasoning_markers_and_tool_calls.rs b/llama-cpp-bindings-tests/tests/reasoning_markers_and_tool_calls.rs index a5aac3d4..d5cad959 100644 --- a/llama-cpp-bindings-tests/tests/reasoning_markers_and_tool_calls.rs +++ b/llama-cpp-bindings-tests/tests/reasoning_markers_and_tool_calls.rs @@ -1,2484 +1,2215 @@ +#![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" +)] + +use anyhow::Result; +use anyhow::bail; +use llama_cpp_bindings::ChatMessageParseOutcome; +use llama_cpp_bindings::ToolCallArgsShape; +use llama_cpp_bindings::ToolCallArguments; +use llama_cpp_bindings::context::LlamaContext; +use llama_cpp_bindings::llama_batch::LlamaBatch; +use llama_cpp_bindings::model::AddBos; +use llama_cpp_bindings::model::LlamaChatMessage; +use llama_cpp_bindings::sampling::LlamaSampler; +use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; use llama_cpp_test_harness::llama_tests_main; - -mod 
deepseek_r1_8b_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt { - use anyhow::Result; - use llama_cpp_bindings::context::LlamaContext; - use llama_cpp_bindings::llama_batch::LlamaBatch; - use llama_cpp_bindings::model::AddBos; - use llama_cpp_bindings::sampling::LlamaSampler; - use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - +use serde_json::Value; +use serde_json::json; + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 2048, + n_ubatch = 512, +)] +fn deepseek_r1_8b_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt( + fixture: &LlamaFixture<'_>, +) -> Result<()> { const MAX_GENERATED_TOKENS: i32 = 200; const DEEPSEEK_R1_8B_THINKING_DISABLED_PROMPT: &str = "\ - <|User|>What is 2 + 2?<|Assistant|> +<|User|>What is 2 + 2?<|Assistant|> - + - "; +"; const FORBIDDEN_MARKERS: &[&str] = &["", ""]; - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 8192, - n_batch = 2048, - n_ubatch = 512, - )] - fn deepseek_r1_8b_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - - let mut classifier = model.sampled_token_classifier(); - let prompt_tokens = - model.str_to_token(DEEPSEEK_R1_8B_THINKING_DISABLED_PROMPT, AddBos::Never)?; - let prompt_token_count = u64::try_from(prompt_tokens.len())?; - - let mut batch = LlamaBatch::new(2048, 1)?; - classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; - - let mut context = LlamaContext::from_model( - model, - backend, - 
(*fixture.context_params).into_llama_context_params(), - )?; - - context.decode(&mut batch)?; - - let promoted = classifier.commit_prompt_tokens(); - assert_eq!(promoted, prompt_token_count); - - let mut sampler = LlamaSampler::chain_simple([ - LlamaSampler::penalties(64, 1.1, 0.0, 0.0), - LlamaSampler::top_k(40), - LlamaSampler::top_p(0.9, 1), - LlamaSampler::min_p(0.05, 1), - LlamaSampler::temp(0.7), - LlamaSampler::dist(0x00C0_FFEE), - ]); - let initial_position = batch.n_tokens(); - let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut context, - batch: &mut batch, - initial_position, - max_generated_tokens: MAX_GENERATED_TOKENS, - } - .run()?; - - let usage = classifier.usage(); - - assert!( - !outcome.generated_raw.is_empty(), - "DeepSeek-R1-8B: must generate at least one token" - ); - assert_eq!( - outcome.observed_reasoning, 0, - "DeepSeek-R1-8B thinking-disabled: classifier must not emit any Reasoning token \ - when the prompt closes the think block before generation begins; \ - generated={:?}", - outcome.generated_raw - ); - assert_eq!( - outcome.observed_undeterminable, 0, - "DeepSeek-R1-8B thinking-disabled: prompt-token replay must move section to Content \ - before generation, so no Undeterminable tokens may be emitted; \ - generated={:?}", - outcome.generated_raw - ); - assert_eq!( - usage.reasoning_tokens, 0, - "DeepSeek-R1-8B thinking-disabled: usage.reasoning_tokens must be zero; usage={usage:?}" - ); - assert_eq!( - usage.undeterminable_tokens, 0, - "DeepSeek-R1-8B thinking-disabled: usage.undeterminable_tokens must be zero; usage={usage:?}" - ); + let model = fixture.model; + let backend = fixture.backend; + + let mut classifier = model.sampled_token_classifier(); + let prompt_tokens = + model.str_to_token(DEEPSEEK_R1_8B_THINKING_DISABLED_PROMPT, AddBos::Never)?; + let prompt_token_count = u64::try_from(prompt_tokens.len())?; + + let mut batch = LlamaBatch::new(2048, 1)?; + 
classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; + + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + context.decode(&mut batch)?; + + let promoted = classifier.commit_prompt_tokens(); + assert_eq!(promoted, prompt_token_count); + + let mut sampler = LlamaSampler::chain_simple([ + LlamaSampler::penalties(64, 1.1, 0.0, 0.0), + LlamaSampler::top_k(40), + LlamaSampler::top_p(0.9, 1), + LlamaSampler::min_p(0.05, 1), + LlamaSampler::temp(0.7), + LlamaSampler::dist(0x00C0_FFEE), + ]); + let initial_position = batch.n_tokens(); + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position, + max_generated_tokens: MAX_GENERATED_TOKENS, + } + .run()?; + + let usage = classifier.usage(); + + assert!( + !outcome.generated_raw.is_empty(), + "DeepSeek-R1-8B: must generate at least one token" + ); + assert_eq!( + outcome.observed_reasoning, 0, + "DeepSeek-R1-8B thinking-disabled: classifier must not emit any Reasoning token \ + when the prompt closes the think block before generation begins; \ + generated={:?}", + outcome.generated_raw + ); + assert_eq!( + outcome.observed_undeterminable, 0, + "DeepSeek-R1-8B thinking-disabled: prompt-token replay must move section to Content \ + before generation, so no Undeterminable tokens may be emitted; \ + generated={:?}", + outcome.generated_raw + ); + assert_eq!( + usage.reasoning_tokens, 0, + "DeepSeek-R1-8B thinking-disabled: usage.reasoning_tokens must be zero; usage={usage:?}" + ); + assert_eq!( + usage.undeterminable_tokens, 0, + "DeepSeek-R1-8B thinking-disabled: usage.undeterminable_tokens must be zero; usage={usage:?}" + ); + assert!( + outcome.observed_content > 0, + "DeepSeek-R1-8B thinking-disabled: classifier must emit at least one Content token" + ); + assert_eq!( + usage.completion_tokens(), + 
outcome.observed_content, + "DeepSeek-R1-8B thinking-disabled: completion tokens must equal observed Content tokens" + ); + + for forbidden in FORBIDDEN_MARKERS { assert!( - outcome.observed_content > 0, - "DeepSeek-R1-8B thinking-disabled: classifier must emit at least one Content token" - ); - assert_eq!( - usage.completion_tokens(), - outcome.observed_content, - "DeepSeek-R1-8B thinking-disabled: completion tokens must equal observed Content tokens" + !outcome.content_stream.contains(forbidden), + "DeepSeek-R1-8B thinking-disabled: content_stream leaked marker {forbidden:?}; \ + content_stream={:?}", + outcome.content_stream ); - - for forbidden in FORBIDDEN_MARKERS { - assert!( - !outcome.content_stream.contains(forbidden), - "DeepSeek-R1-8B thinking-disabled: content_stream leaked marker {forbidden:?}; \ - content_stream={:?}", - outcome.content_stream - ); - } - - Ok(()) } -} -mod deepseek_r1_8b_classifier_emits_reasoning { - use anyhow::Result; - use anyhow::bail; - use llama_cpp_bindings::ChatMessageParseOutcome; - use llama_cpp_bindings::context::LlamaContext; - use llama_cpp_bindings::llama_batch::LlamaBatch; - use llama_cpp_bindings::model::AddBos; - use llama_cpp_bindings::sampling::LlamaSampler; - use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; + Ok(()) +} +#[expect( + clippy::too_many_lines, + reason = "test asserts many distinct properties of DeepSeek-R1-8B reasoning output; shortening messages or splitting the body would reduce diagnostic signal at failure time" +)] +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 2048, + n_ubatch = 512, +)] +fn deepseek_r1_8b_classifier_emits_reasoning_for_thinking_enabled_prompt( + fixture: &LlamaFixture<'_>, +) -> Result<()> { const 
MAX_GENERATED_TOKENS: i32 = 1500; - // DeepSeek-R1-Distill-Llama-8B uses `...` reasoning markers - // and full-width-bar role tokens `<|User|>` / `<|Assistant|>` (U+FF5C, - // not ASCII `|`). The chat template's `add_generation_prompt` ALWAYS appends - // `<|Assistant|>\n` — DeepSeek-R1 is a pure reasoner with no - // thinking-disabled mode — so the model resumes generation already inside - // the reasoning block. const DEEPSEEK_R1_8B_THINKING_PROMPT: &str = "\ - <|User|>What is 2 + 2?<|Assistant|> - "; +<|User|>What is 2 + 2?<|Assistant|> +"; const FORBIDDEN_MARKERS: &[&str] = &["", ""]; - #[expect( - clippy::too_many_lines, - reason = "test asserts many distinct properties of DeepSeek-R1-8B reasoning output; shortening messages or splitting the body would reduce diagnostic signal at failure time" - )] - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 8192, - n_batch = 2048, - n_ubatch = 512, - )] - fn deepseek_r1_8b_classifier_emits_reasoning_for_thinking_enabled_prompt( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - - let mut classifier = model.sampled_token_classifier(); - let prompt_tokens = model.str_to_token(DEEPSEEK_R1_8B_THINKING_PROMPT, AddBos::Never)?; - let prompt_token_count = u64::try_from(prompt_tokens.len())?; - - let mut batch = LlamaBatch::new(2048, 1)?; - classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; - - let mut context = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - context.decode(&mut batch)?; - - let promoted = classifier.commit_prompt_tokens(); - assert_eq!(promoted, prompt_token_count); - - let mut sampler = LlamaSampler::chain_simple([ - LlamaSampler::penalties(64, 1.1, 0.0, 0.0), - LlamaSampler::top_k(40), - 
LlamaSampler::top_p(0.9, 1), - LlamaSampler::min_p(0.05, 1), - LlamaSampler::temp(0.7), - LlamaSampler::dist(0x00C0_FFEE), - ]); - let initial_position = batch.n_tokens(); - let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut context, - batch: &mut batch, - initial_position, - max_generated_tokens: MAX_GENERATED_TOKENS, - } - .run()?; - - let usage = classifier.usage(); - let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?; - let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else { - bail!( - "DeepSeek-R1-8B chat template must be recognised by the parser; got Unrecognized" - ); - }; - - assert!( - !outcome.generated_raw.is_empty(), - "DeepSeek-R1-8B: must generate at least one token" - ); - assert!( - outcome.observed_reasoning > 0, - "DeepSeek-R1-8B: classifier must emit at least one Reasoning token when the prompt \ - opens a block; outcome={outcome:?}", - ); - assert!( - usage.reasoning_tokens > 0, - "DeepSeek-R1-8B: usage.reasoning_tokens must be non-zero when the prompt opens a \ - block; usage was {usage:?}" - ); - assert_eq!( - outcome.observed_undeterminable, 0, - "DeepSeek-R1-8B: prompt-token replay must move section to Reasoning before generation, \ - so no Undeterminable tokens may be emitted; outcome={outcome:?}" + let model = fixture.model; + let backend = fixture.backend; + + let mut classifier = model.sampled_token_classifier(); + let prompt_tokens = model.str_to_token(DEEPSEEK_R1_8B_THINKING_PROMPT, AddBos::Never)?; + let prompt_token_count = u64::try_from(prompt_tokens.len())?; + + let mut batch = LlamaBatch::new(2048, 1)?; + classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; + + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + context.decode(&mut batch)?; + + let promoted = classifier.commit_prompt_tokens(); + assert_eq!(promoted, 
prompt_token_count); + + let mut sampler = LlamaSampler::chain_simple([ + LlamaSampler::penalties(64, 1.1, 0.0, 0.0), + LlamaSampler::top_k(40), + LlamaSampler::top_p(0.9, 1), + LlamaSampler::min_p(0.05, 1), + LlamaSampler::temp(0.7), + LlamaSampler::dist(0x00C0_FFEE), + ]); + let initial_position = batch.n_tokens(); + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position, + max_generated_tokens: MAX_GENERATED_TOKENS, + } + .run()?; + + let usage = classifier.usage(); + let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?; + let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else { + bail!("DeepSeek-R1-8B chat template must be recognised by the parser; got Unrecognized"); + }; + + assert!( + !outcome.generated_raw.is_empty(), + "DeepSeek-R1-8B: must generate at least one token" + ); + assert!( + outcome.observed_reasoning > 0, + "DeepSeek-R1-8B: classifier must emit at least one Reasoning token when the prompt \ + opens a block; outcome={outcome:?}", + ); + assert!( + usage.reasoning_tokens > 0, + "DeepSeek-R1-8B: usage.reasoning_tokens must be non-zero when the prompt opens a \ + block; usage was {usage:?}" + ); + assert_eq!( + outcome.observed_undeterminable, 0, + "DeepSeek-R1-8B: prompt-token replay must move section to Reasoning before generation, \ + so no Undeterminable tokens may be emitted; outcome={outcome:?}" + ); + assert_eq!( + usage.undeterminable_tokens, 0, + "DeepSeek-R1-8B: usage.undeterminable_tokens must be zero; usage={usage:?}" + ); + assert_eq!( + usage.completion_tokens(), + outcome.observed_content + outcome.observed_reasoning, + "DeepSeek-R1-8B: completion tokens must equal observed Content + Reasoning" + ); + + if parsed.reasoning_content.is_empty() { + eprintln!( + "DeepSeek-R1-8B didn't close its reasoning block within {MAX_GENERATED_TOKENS} \ + tokens — skipping strict parser-equality 
assertions" ); + } else { assert_eq!( - usage.undeterminable_tokens, 0, - "DeepSeek-R1-8B: usage.undeterminable_tokens must be zero; usage={usage:?}" + outcome.reasoning_stream, parsed.reasoning_content, + "DeepSeek-R1-8B: per-token reasoning stream must equal parser-side reasoning_content \ + (any difference means a marker leaked into the user-visible stream)", ); assert_eq!( - usage.completion_tokens(), - outcome.observed_content + outcome.observed_reasoning, - "DeepSeek-R1-8B: completion tokens must equal observed Content + Reasoning" + outcome.content_stream, parsed.content, + "DeepSeek-R1-8B: per-token content stream must equal parser-side content \ + (any difference means a marker leaked into the user-visible stream)", ); + } - if parsed.reasoning_content.is_empty() { - eprintln!( - "DeepSeek-R1-8B didn't close its reasoning block within {MAX_GENERATED_TOKENS} \ - tokens — skipping strict parser-equality assertions" - ); - } else { - assert_eq!( - outcome.reasoning_stream, parsed.reasoning_content, - "DeepSeek-R1-8B: per-token reasoning stream must equal parser-side reasoning_content \ - (any difference means a marker leaked into the user-visible stream)", - ); - assert_eq!( - outcome.content_stream, parsed.content, - "DeepSeek-R1-8B: per-token content stream must equal parser-side content \ - (any difference means a marker leaked into the user-visible stream)", - ); - } - - for forbidden in FORBIDDEN_MARKERS { - assert!( - !outcome.reasoning_stream.contains(forbidden), - "DeepSeek-R1-8B: reasoning_stream leaked marker {forbidden:?}; \ - reasoning_stream={:?}", - outcome.reasoning_stream - ); - assert!( - !outcome.content_stream.contains(forbidden), - "DeepSeek-R1-8B: content_stream leaked marker {forbidden:?}; \ - content_stream={:?}", - outcome.content_stream - ); - } - - Ok(()) + for forbidden in FORBIDDEN_MARKERS { + assert!( + !outcome.reasoning_stream.contains(forbidden), + "DeepSeek-R1-8B: reasoning_stream leaked marker {forbidden:?}; \ + 
reasoning_stream={:?}", + outcome.reasoning_stream + ); + assert!( + !outcome.content_stream.contains(forbidden), + "DeepSeek-R1-8B: content_stream leaked marker {forbidden:?}; \ + content_stream={:?}", + outcome.content_stream + ); } -} -mod deepseek_r1_8b_duck_types_gemma_paired_quote { - use anyhow::Result; - use anyhow::bail; - use llama_cpp_bindings::ChatMessageParseOutcome; - use llama_cpp_bindings::ToolCallArguments; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; + Ok(()) +} +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn deepseek_r1_8b_duck_types_gemma_paired_quote(fixture: &LlamaFixture<'_>) -> Result<()> { const TOOLS_JSON: &str = r#"[ - { - "type": "function", - "function": { - "name": "get_weather", - "description": "Get the current weather for a location", - "parameters": { - "type": "object", - "properties": { - "location": {"type": "string", "description": "The city name"} - }, - "required": ["location"] - } + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather for a location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "The city name"} + }, + "required": ["location"] } } - ]"#; + } +]"#; const GEMMA_PAIRED_QUOTE_PAYLOAD: &str = "<|tool_call>call:get_weather{location:<|\"|>Paris<|\"|>}"; - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn deepseek_r1_8b_duck_types_gemma_paired_quote(fixture: &LlamaFixture<'_>) -> Result<()> { - let outcome = - fixture - .model - .parse_chat_message(TOOLS_JSON, 
GEMMA_PAIRED_QUOTE_PAYLOAD, false)?; - - let ChatMessageParseOutcome::Recognized(parsed) = outcome else { - bail!( - "duck-type pass must recognise Gemma paired-quote on a model with no registered \ - template; got Unrecognized" - ); - }; - assert_eq!( - parsed.tool_calls.len(), - 1, - "expected one tool call; got {:?}", - parsed.tool_calls + let outcome = + fixture + .model + .parse_chat_message(TOOLS_JSON, GEMMA_PAIRED_QUOTE_PAYLOAD, false)?; + + let ChatMessageParseOutcome::Recognized(parsed) = outcome else { + bail!( + "duck-type pass must recognise Gemma paired-quote on a model with no registered \ + template; got Unrecognized" ); - assert_eq!(parsed.tool_calls[0].name, "get_weather"); - let location = match &parsed.tool_calls[0].arguments { - ToolCallArguments::ValidJson(value) => value - .get("location") - .and_then(|v| v.as_str()) - .map(str::to_owned), - ToolCallArguments::InvalidJson(raw) => { - bail!("expected ValidJson, got InvalidJson: {raw}"); - } - }; - assert_eq!(location.as_deref(), Some("Paris")); + }; + assert_eq!( + parsed.tool_calls.len(), + 1, + "expected one tool call; got {:?}", + parsed.tool_calls + ); + assert_eq!(parsed.tool_calls[0].name, "get_weather"); + let location = match &parsed.tool_calls[0].arguments { + ToolCallArguments::ValidJson(value) => value + .get("location") + .and_then(|v| v.as_str()) + .map(str::to_owned), + ToolCallArguments::InvalidJson(raw) => { + bail!("expected ValidJson, got InvalidJson: {raw}"); + } + }; + assert_eq!(location.as_deref(), Some("Paris")); - Ok(()) - } + Ok(()) } -mod deepseek_r1_8b_duck_types_glm_key_value_tags { - use anyhow::Result; - use anyhow::bail; - use llama_cpp_bindings::ChatMessageParseOutcome; - use llama_cpp_bindings::ToolCallArguments; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + 
use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn deepseek_r1_8b_duck_types_glm_key_value_tags(fixture: &LlamaFixture<'_>) -> Result<()> { const TOOLS_JSON: &str = r#"[ - { - "type": "function", - "function": { - "name": "get_weather", - "description": "Get the current weather for a location", - "parameters": { - "type": "object", - "properties": { - "location": {"type": "string", "description": "The city name"} - }, - "required": ["location"] - } + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather for a location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "The city name"} + }, + "required": ["location"] } } - ]"#; + } +]"#; const GLM_KEY_VALUE_PAYLOAD: &str = "get_weather\ - location\ - Paris\ - "; - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn deepseek_r1_8b_duck_types_glm_key_value_tags(fixture: &LlamaFixture<'_>) -> Result<()> { - let outcome = fixture - .model - .parse_chat_message(TOOLS_JSON, GLM_KEY_VALUE_PAYLOAD, false)?; - - let ChatMessageParseOutcome::Recognized(parsed) = outcome else { - bail!( - "duck-type pass must recognise GLM key-value tags on a model with no registered \ - template; got Unrecognized" - ); - }; - assert_eq!( - parsed.tool_calls.len(), - 1, - "expected one tool call; got {:?}", - parsed.tool_calls +location\ +Paris\ +"; + + let outcome = fixture + .model + .parse_chat_message(TOOLS_JSON, GLM_KEY_VALUE_PAYLOAD, false)?; + + let ChatMessageParseOutcome::Recognized(parsed) = outcome else { + bail!( + "duck-type pass must recognise GLM key-value tags on a model with no registered \ + template; got Unrecognized" ); - assert_eq!(parsed.tool_calls[0].name, "get_weather"); 
- let location = match &parsed.tool_calls[0].arguments { - ToolCallArguments::ValidJson(value) => value - .get("location") - .and_then(|v| v.as_str()) - .map(str::to_owned), - ToolCallArguments::InvalidJson(raw) => { - bail!("expected ValidJson, got InvalidJson: {raw}"); - } - }; - assert_eq!(location.as_deref(), Some("Paris")); + }; + assert_eq!( + parsed.tool_calls.len(), + 1, + "expected one tool call; got {:?}", + parsed.tool_calls + ); + assert_eq!(parsed.tool_calls[0].name, "get_weather"); + let location = match &parsed.tool_calls[0].arguments { + ToolCallArguments::ValidJson(value) => value + .get("location") + .and_then(|v| v.as_str()) + .map(str::to_owned), + ToolCallArguments::InvalidJson(raw) => { + bail!("expected ValidJson, got InvalidJson: {raw}"); + } + }; + assert_eq!(location.as_deref(), Some("Paris")); - Ok(()) - } + Ok(()) } -mod deepseek_r1_8b_duck_types_mistral_bracketed_json { - use anyhow::Result; - use anyhow::bail; - use llama_cpp_bindings::ChatMessageParseOutcome; - use llama_cpp_bindings::ToolCallArguments; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn deepseek_r1_8b_duck_types_mistral_bracketed_json(fixture: &LlamaFixture<'_>) -> Result<()> { const TOOLS_JSON: &str = r#"[ - { - "type": "function", - "function": { - "name": "get_weather", - "description": "Get the current weather for a location", - "parameters": { - "type": "object", - "properties": { - "location": {"type": "string", "description": "The city name"} - }, - "required": ["location"] - } + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather for a location", + "parameters": { + "type": "object", + "properties": { + "location": 
{"type": "string", "description": "The city name"} + }, + "required": ["location"] } } - ]"#; + } +]"#; const MISTRAL_BRACKETED_JSON_PAYLOAD: &str = r#"[TOOL_CALLS]get_weather[ARGS]{"location":"Paris"}"#; - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn deepseek_r1_8b_duck_types_mistral_bracketed_json(fixture: &LlamaFixture<'_>) -> Result<()> { - let outcome = - fixture - .model - .parse_chat_message(TOOLS_JSON, MISTRAL_BRACKETED_JSON_PAYLOAD, false)?; - - let ChatMessageParseOutcome::Recognized(parsed) = outcome else { - bail!( - "duck-type pass must recognise Mistral bracketed-JSON on a model with no registered \ - template; got Unrecognized" - ); - }; - assert_eq!( - parsed.tool_calls.len(), - 1, - "expected one tool call; got {:?}", - parsed.tool_calls + let outcome = + fixture + .model + .parse_chat_message(TOOLS_JSON, MISTRAL_BRACKETED_JSON_PAYLOAD, false)?; + + let ChatMessageParseOutcome::Recognized(parsed) = outcome else { + bail!( + "duck-type pass must recognise Mistral bracketed-JSON on a model with no registered \ + template; got Unrecognized" ); - assert_eq!(parsed.tool_calls[0].name, "get_weather"); - let location = match &parsed.tool_calls[0].arguments { - ToolCallArguments::ValidJson(value) => value - .get("location") - .and_then(|v| v.as_str()) - .map(str::to_owned), - ToolCallArguments::InvalidJson(raw) => { - bail!("expected ValidJson, got InvalidJson: {raw}"); - } - }; - assert_eq!(location.as_deref(), Some("Paris")); + }; + assert_eq!( + parsed.tool_calls.len(), + 1, + "expected one tool call; got {:?}", + parsed.tool_calls + ); + assert_eq!(parsed.tool_calls[0].name, "get_weather"); + let location = match &parsed.tool_calls[0].arguments { + ToolCallArguments::ValidJson(value) => value + .get("location") + .and_then(|v| v.as_str()) + 
.map(str::to_owned), + ToolCallArguments::InvalidJson(raw) => { + bail!("expected ValidJson, got InvalidJson: {raw}"); + } + }; + assert_eq!(location.as_deref(), Some("Paris")); - Ok(()) - } + Ok(()) } -mod deepseek_r1_8b_duck_types_qwen_xml { - use anyhow::Result; - use anyhow::bail; - use llama_cpp_bindings::ChatMessageParseOutcome; - use llama_cpp_bindings::ToolCallArguments; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn deepseek_r1_8b_duck_types_qwen_xml(fixture: &LlamaFixture<'_>) -> Result<()> { const TOOLS_JSON: &str = r#"[ - { - "type": "function", - "function": { - "name": "get_weather", - "description": "Get the current weather for a location", - "parameters": { - "type": "object", - "properties": { - "location": {"type": "string", "description": "The city name"} - }, - "required": ["location"] - } + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather for a location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "The city name"} + }, + "required": ["location"] } } - ]"#; + } +]"#; const QWEN_XML_PAYLOAD: &str = "\n\ - \n\ - \n\ - Paris\n\ - \n\ - \n\ - "; - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn deepseek_r1_8b_duck_types_qwen_xml(fixture: &LlamaFixture<'_>) -> Result<()> { - let outcome = fixture - .model - .parse_chat_message(TOOLS_JSON, QWEN_XML_PAYLOAD, false)?; - - let ChatMessageParseOutcome::Recognized(parsed) = outcome else { - bail!( 
- "duck-type pass must recognise Qwen XML on a model with no registered template; \ - got Unrecognized" - ); - }; - assert_eq!( - parsed.tool_calls.len(), - 1, - "expected one tool call; got {:?}", - parsed.tool_calls +\n\ +\n\ +Paris\n\ +\n\ +\n\ +"; + + let outcome = fixture + .model + .parse_chat_message(TOOLS_JSON, QWEN_XML_PAYLOAD, false)?; + + let ChatMessageParseOutcome::Recognized(parsed) = outcome else { + bail!( + "duck-type pass must recognise Qwen XML on a model with no registered template; \ + got Unrecognized" ); - assert_eq!(parsed.tool_calls[0].name, "get_weather"); - let location = match &parsed.tool_calls[0].arguments { - ToolCallArguments::ValidJson(value) => value - .get("location") - .and_then(|v| v.as_str()) - .map(str::to_owned), - ToolCallArguments::InvalidJson(raw) => { - bail!("expected ValidJson, got InvalidJson: {raw}"); - } - }; - assert_eq!(location.as_deref(), Some("Paris")); + }; + assert_eq!( + parsed.tool_calls.len(), + 1, + "expected one tool call; got {:?}", + parsed.tool_calls + ); + assert_eq!(parsed.tool_calls[0].name, "get_weather"); + let location = match &parsed.tool_calls[0].arguments { + ToolCallArguments::ValidJson(value) => value + .get("location") + .and_then(|v| v.as_str()) + .map(str::to_owned), + ToolCallArguments::InvalidJson(raw) => { + bail!("expected ValidJson, got InvalidJson: {raw}"); + } + }; + assert_eq!(location.as_deref(), Some("Paris")); - Ok(()) - } + Ok(()) } -mod deepseek_r1_8b_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested { - use anyhow::Result; - use anyhow::bail; - use llama_cpp_bindings::ChatMessageParseOutcome; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn 
deepseek_r1_8b_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested( + fixture: &LlamaFixture<'_>, +) -> Result<()> { const TOOLS_JSON: &str = r#"[ - { - "type": "function", - "function": { - "name": "get_weather", - "description": "Get the current weather for a location", - "parameters": { - "type": "object", - "properties": { - "location": {"type": "string", "description": "The city name"} - }, - "required": ["location"] - } + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather for a location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "The city name"} + }, + "required": ["location"] } } - ]"#; + } +]"#; const PLAIN_CONTENT: &str = "Sorry, I cannot help with that."; - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn deepseek_r1_8b_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let outcome = fixture - .model - .parse_chat_message(TOOLS_JSON, PLAIN_CONTENT, false)?; - - let ChatMessageParseOutcome::Recognized(parsed) = outcome else { - bail!( - "plain content with tools requested must produce Recognized (with empty tool_calls); \ - got Unrecognized" - ); - }; - assert!( - parsed.tool_calls.is_empty(), - "expected no tool calls; got {:?}", - parsed.tool_calls - ); + let outcome = fixture + .model + .parse_chat_message(TOOLS_JSON, PLAIN_CONTENT, false)?; - Ok(()) - } + let ChatMessageParseOutcome::Recognized(parsed) = outcome else { + bail!( + "plain content with tools requested must produce Recognized (with empty tool_calls); \ + got Unrecognized" + ); + }; + assert!( + parsed.tool_calls.is_empty(), + "expected no tool calls; got {:?}", + 
parsed.tool_calls + ); + + Ok(()) } -mod deepseek_r1_8b_recognizes_empty_tool_calls_when_tools_not_requested { - use anyhow::Result; - use anyhow::bail; - use llama_cpp_bindings::ChatMessageParseOutcome; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn deepseek_r1_8b_recognizes_empty_tool_calls_when_tools_not_requested( + fixture: &LlamaFixture<'_>, +) -> Result<()> { const PLAIN_CONTENT: &str = "Hello there."; - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn deepseek_r1_8b_recognizes_empty_tool_calls_when_tools_not_requested( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let outcome = fixture - .model - .parse_chat_message("[]", PLAIN_CONTENT, false)?; + let outcome = fixture + .model + .parse_chat_message("[]", PLAIN_CONTENT, false)?; - let ChatMessageParseOutcome::Recognized(parsed) = outcome else { - bail!("plain content with empty tools array must produce Recognized; got Unrecognized"); - }; - assert!( - parsed.tool_calls.is_empty(), - "expected no tool calls; got {:?}", - parsed.tool_calls - ); + let ChatMessageParseOutcome::Recognized(parsed) = outcome else { + bail!("plain content with empty tools array must produce Recognized; got Unrecognized"); + }; + assert!( + parsed.tool_calls.is_empty(), + "expected no tool calls; got {:?}", + parsed.tool_calls + ); - Ok(()) - } + Ok(()) } -mod gemma4_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt { - use anyhow::Result; - use llama_cpp_bindings::context::LlamaContext; - use 
llama_cpp_bindings::llama_batch::LlamaBatch; - use llama_cpp_bindings::model::AddBos; - use llama_cpp_bindings::sampling::LlamaSampler; - use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - +#[llama_test( + model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 2048, + n_ubatch = 512, +)] +fn gemma4_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt( + fixture: &LlamaFixture<'_>, +) -> Result<()> { const MAX_GENERATED_TOKENS: i32 = 200; const GEMMA4_THINKING_DISABLED_PROMPT: &str = "\ - user\nReply with the single word: four. Do not explain.\n\ - model\n<|channel>thought\n\n"; +user\nReply with the single word: four. Do not explain.\n\ +model\n<|channel>thought\n\n"; const FORBIDDEN_MARKERS: &[&str] = &["<|channel>thought", ""]; - #[llama_test( - model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 8192, - n_batch = 2048, - n_ubatch = 512, - )] - fn gemma4_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - - let mut classifier = model.sampled_token_classifier(); - let prompt_tokens = model.str_to_token(GEMMA4_THINKING_DISABLED_PROMPT, AddBos::Never)?; - let prompt_token_count = u64::try_from(prompt_tokens.len())?; - - let mut batch = LlamaBatch::new(2048, 1)?; - classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; - - let mut context = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - context.decode(&mut batch)?; - - let promoted = classifier.commit_prompt_tokens(); - assert_eq!(promoted, 
prompt_token_count); - - let mut sampler = LlamaSampler::greedy(); - let initial_position = batch.n_tokens(); - let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut context, - batch: &mut batch, - initial_position, - max_generated_tokens: MAX_GENERATED_TOKENS, - } - .run()?; - - let usage = classifier.usage(); - - assert!( - !outcome.generated_raw.is_empty(), - "Gemma 4 must generate at least one token" - ); - assert_eq!( - outcome.observed_reasoning, 0, - "Gemma 4 thinking-disabled: classifier must not emit any Reasoning token \ - when the prompt closes the thought channel before generation begins; \ - generated={:?}", - outcome.generated_raw - ); - assert_eq!( - outcome.observed_undeterminable, 0, - "Gemma 4 thinking-disabled: prompt-token replay must move section to Content \ - before generation, so no Undeterminable tokens may be emitted; \ - generated={:?}", - outcome.generated_raw - ); - assert_eq!( - usage.reasoning_tokens, 0, - "Gemma 4 thinking-disabled: usage.reasoning_tokens must be zero; usage={usage:?}" - ); - assert_eq!( - usage.undeterminable_tokens, 0, - "Gemma 4 thinking-disabled: usage.undeterminable_tokens must be zero; usage={usage:?}" - ); + let model = fixture.model; + let backend = fixture.backend; + + let mut classifier = model.sampled_token_classifier(); + let prompt_tokens = model.str_to_token(GEMMA4_THINKING_DISABLED_PROMPT, AddBos::Never)?; + let prompt_token_count = u64::try_from(prompt_tokens.len())?; + + let mut batch = LlamaBatch::new(2048, 1)?; + classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; + + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + context.decode(&mut batch)?; + + let promoted = classifier.commit_prompt_tokens(); + assert_eq!(promoted, prompt_token_count); + + let mut sampler = LlamaSampler::greedy(); + let initial_position = batch.n_tokens(); 
+ let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position, + max_generated_tokens: MAX_GENERATED_TOKENS, + } + .run()?; + + let usage = classifier.usage(); + + assert!( + !outcome.generated_raw.is_empty(), + "Gemma 4 must generate at least one token" + ); + assert_eq!( + outcome.observed_reasoning, 0, + "Gemma 4 thinking-disabled: classifier must not emit any Reasoning token \ + when the prompt closes the thought channel before generation begins; \ + generated={:?}", + outcome.generated_raw + ); + assert_eq!( + outcome.observed_undeterminable, 0, + "Gemma 4 thinking-disabled: prompt-token replay must move section to Content \ + before generation, so no Undeterminable tokens may be emitted; \ + generated={:?}", + outcome.generated_raw + ); + assert_eq!( + usage.reasoning_tokens, 0, + "Gemma 4 thinking-disabled: usage.reasoning_tokens must be zero; usage={usage:?}" + ); + assert_eq!( + usage.undeterminable_tokens, 0, + "Gemma 4 thinking-disabled: usage.undeterminable_tokens must be zero; usage={usage:?}" + ); + assert!( + outcome.observed_content > 0, + "Gemma 4 thinking-disabled: classifier must emit at least one Content token" + ); + assert_eq!( + usage.completion_tokens(), + outcome.observed_content, + "Gemma 4 thinking-disabled: completion tokens must equal observed Content tokens" + ); + + for forbidden in FORBIDDEN_MARKERS { assert!( - outcome.observed_content > 0, - "Gemma 4 thinking-disabled: classifier must emit at least one Content token" + !outcome.content_stream.contains(forbidden), + "Gemma 4 thinking-disabled: content_stream leaked marker {forbidden:?}; \ + content_stream={:?}", + outcome.content_stream ); - assert_eq!( - usage.completion_tokens(), - outcome.observed_content, - "Gemma 4 thinking-disabled: completion tokens must equal observed Content tokens" - ); - - for forbidden in FORBIDDEN_MARKERS { - assert!( - 
!outcome.content_stream.contains(forbidden), - "Gemma 4 thinking-disabled: content_stream leaked marker {forbidden:?}; \ - content_stream={:?}", - outcome.content_stream - ); - } - - Ok(()) } -} -mod gemma4_classifier_emits_reasoning { - use anyhow::Result; - use anyhow::bail; - use llama_cpp_bindings::ChatMessageParseOutcome; - use llama_cpp_bindings::context::LlamaContext; - use llama_cpp_bindings::llama_batch::LlamaBatch; - use llama_cpp_bindings::model::AddBos; - use llama_cpp_bindings::sampling::LlamaSampler; - use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; + Ok(()) +} +#[llama_test( + model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 2048, + n_ubatch = 512, +)] +fn gemma4_classifier_emits_reasoning_for_thinking_prompt(fixture: &LlamaFixture<'_>) -> Result<()> { const MAX_GENERATED_TOKENS: i32 = 1500; const GEMMA4_THINKING_PROMPT: &str = "\ - user\nReply with the single word: four. Do not explain.\n\ - model\n<|channel>thought\n"; +user\nReply with the single word: four. 
Do not explain.\n\ +model\n<|channel>thought\n"; const FORBIDDEN_MARKERS: &[&str] = &["<|channel>thought", ""]; - #[llama_test( - model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 8192, - n_batch = 2048, - n_ubatch = 512, - )] - fn gemma4_classifier_emits_reasoning_for_thinking_prompt( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - - let mut classifier = model.sampled_token_classifier(); - let prompt_tokens = model.str_to_token(GEMMA4_THINKING_PROMPT, AddBos::Never)?; - let prompt_token_count = u64::try_from(prompt_tokens.len())?; - - let mut batch = LlamaBatch::new(2048, 1)?; - classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; - - let mut context = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - context.decode(&mut batch)?; - - let promoted = classifier.commit_prompt_tokens(); - assert_eq!(promoted, prompt_token_count); - - let mut sampler = LlamaSampler::greedy(); - let initial_position = batch.n_tokens(); - let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut context, - batch: &mut batch, - initial_position, - max_generated_tokens: MAX_GENERATED_TOKENS, - } - .run()?; - - let usage = classifier.usage(); - let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?; - let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else { - bail!("Gemma 4 chat template must be recognised by the parser; got Unrecognized"); - }; - - assert!( - !outcome.generated_raw.is_empty(), - "Gemma 4 must generate at least one token" - ); + let model = fixture.model; + let backend = fixture.backend; + + let mut classifier = model.sampled_token_classifier(); + let prompt_tokens = model.str_to_token(GEMMA4_THINKING_PROMPT, 
AddBos::Never)?; + let prompt_token_count = u64::try_from(prompt_tokens.len())?; + + let mut batch = LlamaBatch::new(2048, 1)?; + classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; + + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + context.decode(&mut batch)?; + + let promoted = classifier.commit_prompt_tokens(); + assert_eq!(promoted, prompt_token_count); + + let mut sampler = LlamaSampler::greedy(); + let initial_position = batch.n_tokens(); + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position, + max_generated_tokens: MAX_GENERATED_TOKENS, + } + .run()?; + + let usage = classifier.usage(); + let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?; + let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else { + bail!("Gemma 4 chat template must be recognised by the parser; got Unrecognized"); + }; + + assert!( + !outcome.generated_raw.is_empty(), + "Gemma 4 must generate at least one token" + ); + assert!( + outcome.observed_reasoning > 0, + "Gemma 4 classifier must emit at least one Reasoning token when the model \ + emits a `<|channel>thought` block; outcome={outcome:?}", + ); + assert!( + usage.reasoning_tokens > 0, + "Gemma 4 usage.reasoning_tokens must be non-zero when the model emits a \ + reasoning block; usage was {usage:?}" + ); + assert_eq!( + outcome.observed_undeterminable, 0, + "Gemma 4: classifier must not emit Undeterminable when the model emits a \ + detected `<|channel>thought` marker; outcome={outcome:?}" + ); + assert_eq!( + usage.undeterminable_tokens, 0, + "Gemma 4: usage.undeterminable_tokens must be zero; usage={usage:?}" + ); + assert_eq!( + usage.completion_tokens(), + outcome.observed_content + outcome.observed_reasoning, + "Gemma 4: completion tokens must equal observed 
Content + Reasoning" + ); + assert!( + !parsed.reasoning_content.is_empty(), + "Gemma 4 must close its reasoning block within {MAX_GENERATED_TOKENS} tokens; \ + increase the budget or pick a more direct prompt. generated={:?}", + outcome.generated_raw, + ); + + for forbidden in FORBIDDEN_MARKERS { assert!( - outcome.observed_reasoning > 0, - "Gemma 4 classifier must emit at least one Reasoning token when the model \ - emits a `<|channel>thought` block; outcome={outcome:?}", + !outcome.reasoning_stream.contains(forbidden), + "Gemma 4: reasoning_stream leaked marker {forbidden:?}; \ + reasoning_stream={:?}", + outcome.reasoning_stream ); assert!( - usage.reasoning_tokens > 0, - "Gemma 4 usage.reasoning_tokens must be non-zero when the model emits a \ - reasoning block; usage was {usage:?}" + !outcome.content_stream.contains(forbidden), + "Gemma 4: content_stream leaked marker {forbidden:?}; \ + content_stream={:?}", + outcome.content_stream ); - assert_eq!( - outcome.observed_undeterminable, 0, - "Gemma 4: classifier must not emit Undeterminable when the model emits a \ - detected `<|channel>thought` marker; outcome={outcome:?}" - ); - assert_eq!( - usage.undeterminable_tokens, 0, - "Gemma 4: usage.undeterminable_tokens must be zero; usage={usage:?}" - ); - assert_eq!( - usage.completion_tokens(), - outcome.observed_content + outcome.observed_reasoning, - "Gemma 4: completion tokens must equal observed Content + Reasoning" - ); - assert!( - !parsed.reasoning_content.is_empty(), - "Gemma 4 must close its reasoning block within {MAX_GENERATED_TOKENS} tokens; \ - increase the budget or pick a more direct prompt. 
generated={:?}", - outcome.generated_raw, - ); - - for forbidden in FORBIDDEN_MARKERS { - assert!( - !outcome.reasoning_stream.contains(forbidden), - "Gemma 4: reasoning_stream leaked marker {forbidden:?}; \ - reasoning_stream={:?}", - outcome.reasoning_stream - ); - assert!( - !outcome.content_stream.contains(forbidden), - "Gemma 4: content_stream leaked marker {forbidden:?}; \ - content_stream={:?}", - outcome.content_stream - ); - } - - Ok(()) } -} -mod gemma4_parses_tool_call_payload { - use anyhow::Result; - use anyhow::bail; - use llama_cpp_bindings::ChatMessageParseOutcome; - use llama_cpp_bindings::ToolCallArguments; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; + Ok(()) +} +#[llama_test( + model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn gemma4_parses_tool_call_payload(fixture: &LlamaFixture<'_>) -> Result<()> { const TOOLS_JSON: &str = r#"[ - { - "type": "function", - "function": { - "name": "get_weather", - "description": "Get the current weather for a location", - "parameters": { - "type": "object", - "properties": { - "location": {"type": "string", "description": "The city name"} - }, - "required": ["location"] - } + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather for a location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "The city name"} + }, + "required": ["location"] } } - ]"#; + } +]"#; const GEMMA4_PAIRED_QUOTE_PAYLOAD: &str = "<|tool_call>call:get_weather{location:<|\"|>Paris<|\"|>}"; - #[llama_test( - model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn 
gemma4_parses_tool_call_payload(fixture: &LlamaFixture<'_>) -> Result<()> { - let outcome = - fixture - .model - .parse_chat_message(TOOLS_JSON, GEMMA4_PAIRED_QUOTE_PAYLOAD, false)?; - - let ChatMessageParseOutcome::Recognized(parsed) = outcome else { - bail!( - "expected Recognized for Gemma 4 PairedQuote on a Gemma-4 model; got Unrecognized" - ); - }; - assert_eq!( - parsed.tool_calls.len(), - 1, - "expected one tool call; got {:?}", - parsed.tool_calls - ); - assert_eq!(parsed.tool_calls[0].name, "get_weather"); - let location = match &parsed.tool_calls[0].arguments { - ToolCallArguments::ValidJson(value) => value - .get("location") - .and_then(|v| v.as_str()) - .map(str::to_owned), - ToolCallArguments::InvalidJson(raw) => { - bail!("expected ValidJson, got InvalidJson: {raw}"); - } - }; - assert_eq!(location.as_deref(), Some("Paris")); + let outcome = + fixture + .model + .parse_chat_message(TOOLS_JSON, GEMMA4_PAIRED_QUOTE_PAYLOAD, false)?; + + let ChatMessageParseOutcome::Recognized(parsed) = outcome else { + bail!("expected Recognized for Gemma 4 PairedQuote on a Gemma-4 model; got Unrecognized"); + }; + assert_eq!( + parsed.tool_calls.len(), + 1, + "expected one tool call; got {:?}", + parsed.tool_calls + ); + assert_eq!(parsed.tool_calls[0].name, "get_weather"); + let location = match &parsed.tool_calls[0].arguments { + ToolCallArguments::ValidJson(value) => value + .get("location") + .and_then(|v| v.as_str()) + .map(str::to_owned), + ToolCallArguments::InvalidJson(raw) => { + bail!("expected ValidJson, got InvalidJson: {raw}"); + } + }; + assert_eq!(location.as_deref(), Some("Paris")); - Ok(()) - } + Ok(()) } -mod gemma4_template_override_returns_full_markers { - #![expect( - clippy::unnecessary_wraps, - reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" - )] - - use anyhow::Result; - use llama_cpp_bindings::ToolCallArgsShape; - use llama_cpp_test_harness::LlamaFixture; - use 
llama_cpp_test_harness::llama_test; - - #[llama_test( - model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn gemma4_template_override_returns_full_markers(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let template = model - .chat_template(None) - .expect("Gemma 4 chat template must be present"); - let template_str = template.to_str().expect("template must be valid UTF-8"); - assert!( - template_str.contains("<|tool_call>call:"), - "Gemma 4 chat template must contain '<|tool_call>call:' fingerprint; \ - template starts with: {:?}", - &template_str[..template_str.len().min(200)], - ); - - let markers = model - .tool_call_markers() - .expect("Gemma 4 must produce ToolCallMarkers via override registry"); - - assert_eq!(markers.open, "<|tool_call>call:"); - assert_eq!(markers.close, "}"); - let ToolCallArgsShape::PairedQuote(shape) = markers.args_shape else { - panic!("expected PairedQuote variant, got {:?}", markers.args_shape); - }; - assert_eq!(shape.name_args_separator, "{"); - assert_eq!(shape.value_quote.open, "<|\"|>"); - assert_eq!(shape.value_quote.close, "<|\"|>"); - - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn gemma4_template_override_returns_full_markers(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let template = model + .chat_template(None) + .expect("Gemma 4 chat template must be present"); + let template_str = template.to_str().expect("template must be valid UTF-8"); + assert!( + template_str.contains("<|tool_call>call:"), + "Gemma 4 chat template must contain '<|tool_call>call:' fingerprint; \ + template starts with: {:?}", + 
&template_str[..template_str.len().min(200)], + ); + + let markers = model + .tool_call_markers() + .expect("Gemma 4 must produce ToolCallMarkers via override registry"); + + assert_eq!(markers.open, "<|tool_call>call:"); + assert_eq!(markers.close, "}"); + let ToolCallArgsShape::PairedQuote(shape) = markers.args_shape else { + panic!("expected PairedQuote variant, got {:?}", markers.args_shape); + }; + assert_eq!(shape.name_args_separator, "{"); + assert_eq!(shape.value_quote.open, "<|\"|>"); + assert_eq!(shape.value_quote.close, "<|\"|>"); + + Ok(()) } -mod glm47_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt { - use anyhow::Result; - use llama_cpp_bindings::context::LlamaContext; - use llama_cpp_bindings::llama_batch::LlamaBatch; - use llama_cpp_bindings::model::AddBos; - use llama_cpp_bindings::sampling::LlamaSampler; - use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 2048, + n_ubatch = 512, +)] +fn glm47_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt( + fixture: &LlamaFixture<'_>, +) -> Result<()> { const MAX_GENERATED_TOKENS: i32 = 200; const GLM47_THINKING_DISABLED_PROMPT: &str = "\ - <|user|> - What is 2 + 2? - <|assistant|> - +<|user|> +What is 2 + 2? 
+<|assistant|> + - "; +"; const FORBIDDEN_MARKERS: &[&str] = &["", ""]; - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 8192, - n_batch = 2048, - n_ubatch = 512, - )] - fn glm47_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - - let mut classifier = model.sampled_token_classifier(); - let prompt_tokens = model.str_to_token(GLM47_THINKING_DISABLED_PROMPT, AddBos::Never)?; - let prompt_token_count = u64::try_from(prompt_tokens.len())?; - - let mut batch = LlamaBatch::new(2048, 1)?; - classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; - - let mut context = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - context.decode(&mut batch)?; - - let promoted = classifier.commit_prompt_tokens(); - assert_eq!(promoted, prompt_token_count); - - let mut sampler = LlamaSampler::chain_simple([ - LlamaSampler::penalties(64, 1.1, 0.0, 0.0), - LlamaSampler::top_k(40), - LlamaSampler::top_p(0.9, 1), - LlamaSampler::min_p(0.05, 1), - LlamaSampler::temp(0.7), - LlamaSampler::dist(0x00C0_FFEE), - ]); - let initial_position = batch.n_tokens(); - let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut context, - batch: &mut batch, - initial_position, - max_generated_tokens: MAX_GENERATED_TOKENS, - } - .run()?; - - let usage = classifier.usage(); + let model = fixture.model; + let backend = fixture.backend; + + let mut classifier = model.sampled_token_classifier(); + let prompt_tokens = model.str_to_token(GLM47_THINKING_DISABLED_PROMPT, AddBos::Never)?; + let prompt_token_count = u64::try_from(prompt_tokens.len())?; + + let mut batch = LlamaBatch::new(2048, 1)?; + 
classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; + + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + context.decode(&mut batch)?; + + let promoted = classifier.commit_prompt_tokens(); + assert_eq!(promoted, prompt_token_count); + + let mut sampler = LlamaSampler::chain_simple([ + LlamaSampler::penalties(64, 1.1, 0.0, 0.0), + LlamaSampler::top_k(40), + LlamaSampler::top_p(0.9, 1), + LlamaSampler::min_p(0.05, 1), + LlamaSampler::temp(0.7), + LlamaSampler::dist(0x00C0_FFEE), + ]); + let initial_position = batch.n_tokens(); + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position, + max_generated_tokens: MAX_GENERATED_TOKENS, + } + .run()?; - assert!(!outcome.generated_raw.is_empty()); - assert_eq!(outcome.observed_reasoning, 0); - assert_eq!(outcome.observed_undeterminable, 0); - assert_eq!(usage.reasoning_tokens, 0); - assert_eq!(usage.undeterminable_tokens, 0); - assert!(outcome.observed_content > 0); - assert_eq!(usage.completion_tokens(), outcome.observed_content); + let usage = classifier.usage(); - for forbidden in FORBIDDEN_MARKERS { - assert!(!outcome.content_stream.contains(forbidden)); - } + assert!(!outcome.generated_raw.is_empty()); + assert_eq!(outcome.observed_reasoning, 0); + assert_eq!(outcome.observed_undeterminable, 0); + assert_eq!(usage.reasoning_tokens, 0); + assert_eq!(usage.undeterminable_tokens, 0); + assert!(outcome.observed_content > 0); + assert_eq!(usage.completion_tokens(), outcome.observed_content); - Ok(()) + for forbidden in FORBIDDEN_MARKERS { + assert!(!outcome.content_stream.contains(forbidden)); } -} -mod glm47_classifier_emits_reasoning { - use anyhow::Result; - use anyhow::bail; - use llama_cpp_bindings::ChatMessageParseOutcome; - use llama_cpp_bindings::context::LlamaContext; - use 
llama_cpp_bindings::llama_batch::LlamaBatch; - use llama_cpp_bindings::model::AddBos; - use llama_cpp_bindings::sampling::LlamaSampler; - use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; + Ok(()) +} +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 2048, + n_ubatch = 512, +)] +fn glm47_classifier_emits_reasoning_for_thinking_enabled_prompt( + fixture: &LlamaFixture<'_>, +) -> Result<()> { const MAX_GENERATED_TOKENS: i32 = 1500; const GLM47_THINKING_PROMPT: &str = "\ - <|user|> - What is 2 + 2? - <|assistant|> - - "; +<|user|> +What is 2 + 2? +<|assistant|> + +"; const FORBIDDEN_MARKERS: &[&str] = &["", ""]; - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 8192, - n_batch = 2048, - n_ubatch = 512, - )] - fn glm47_classifier_emits_reasoning_for_thinking_enabled_prompt( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - - let mut classifier = model.sampled_token_classifier(); - let prompt_tokens = model.str_to_token(GLM47_THINKING_PROMPT, AddBos::Never)?; - let prompt_token_count = u64::try_from(prompt_tokens.len())?; - - let mut batch = LlamaBatch::new(2048, 1)?; - classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; - - let mut context = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - context.decode(&mut batch)?; - - let promoted = classifier.commit_prompt_tokens(); - assert_eq!(promoted, prompt_token_count); - - let mut sampler = LlamaSampler::chain_simple([ - LlamaSampler::penalties(64, 1.1, 0.0, 0.0), - LlamaSampler::top_k(40), - 
LlamaSampler::top_p(0.9, 1), - LlamaSampler::min_p(0.05, 1), - LlamaSampler::temp(0.7), - LlamaSampler::dist(0x00C0_FFEE), - ]); - let initial_position = batch.n_tokens(); - let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut context, - batch: &mut batch, - initial_position, - max_generated_tokens: MAX_GENERATED_TOKENS, - } - .run()?; - - let usage = classifier.usage(); - let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?; - let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else { - bail!("GLM-4.7 chat template must be recognised by the parser; got Unrecognized"); - }; - - assert!(!outcome.generated_raw.is_empty()); - assert!(outcome.observed_reasoning > 0); - assert!(usage.reasoning_tokens > 0); - assert_eq!(outcome.observed_undeterminable, 0); - assert_eq!(usage.undeterminable_tokens, 0); - assert_eq!( - usage.completion_tokens(), - outcome.observed_content + outcome.observed_reasoning + let model = fixture.model; + let backend = fixture.backend; + + let mut classifier = model.sampled_token_classifier(); + let prompt_tokens = model.str_to_token(GLM47_THINKING_PROMPT, AddBos::Never)?; + let prompt_token_count = u64::try_from(prompt_tokens.len())?; + + let mut batch = LlamaBatch::new(2048, 1)?; + classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; + + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + context.decode(&mut batch)?; + + let promoted = classifier.commit_prompt_tokens(); + assert_eq!(promoted, prompt_token_count); + + let mut sampler = LlamaSampler::chain_simple([ + LlamaSampler::penalties(64, 1.1, 0.0, 0.0), + LlamaSampler::top_k(40), + LlamaSampler::top_p(0.9, 1), + LlamaSampler::min_p(0.05, 1), + LlamaSampler::temp(0.7), + LlamaSampler::dist(0x00C0_FFEE), + ]); + let initial_position = batch.n_tokens(); + let outcome = 
ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position, + max_generated_tokens: MAX_GENERATED_TOKENS, + } + .run()?; + + let usage = classifier.usage(); + let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?; + let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else { + bail!("GLM-4.7 chat template must be recognised by the parser; got Unrecognized"); + }; + + assert!(!outcome.generated_raw.is_empty()); + assert!(outcome.observed_reasoning > 0); + assert!(usage.reasoning_tokens > 0); + assert_eq!(outcome.observed_undeterminable, 0); + assert_eq!(usage.undeterminable_tokens, 0); + assert_eq!( + usage.completion_tokens(), + outcome.observed_content + outcome.observed_reasoning + ); + + if parsed.reasoning_content.is_empty() { + eprintln!( + "GLM-4.7 didn't close its reasoning block within {MAX_GENERATED_TOKENS} tokens — \ + skipping strict parser-equality assertions" ); + } else { + assert_eq!(outcome.reasoning_stream, parsed.reasoning_content); + assert_eq!(outcome.content_stream, parsed.content); + } - if parsed.reasoning_content.is_empty() { - eprintln!( - "GLM-4.7 didn't close its reasoning block within {MAX_GENERATED_TOKENS} tokens — \ - skipping strict parser-equality assertions" - ); - } else { - assert_eq!(outcome.reasoning_stream, parsed.reasoning_content); - assert_eq!(outcome.content_stream, parsed.content); - } - - for forbidden in FORBIDDEN_MARKERS { - assert!(!outcome.reasoning_stream.contains(forbidden)); - assert!(!outcome.content_stream.contains(forbidden)); - } - - Ok(()) + for forbidden in FORBIDDEN_MARKERS { + assert!(!outcome.reasoning_stream.contains(forbidden)); + assert!(!outcome.content_stream.contains(forbidden)); } -} -mod glm47_parses_tool_call_payload { - use anyhow::Result; - use anyhow::bail; - use llama_cpp_bindings::ChatMessageParseOutcome; - use llama_cpp_bindings::ToolCallArguments; - use 
llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; + Ok(()) +} +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn glm47_parses_tool_call_payload(fixture: &LlamaFixture<'_>) -> Result<()> { const TOOLS_JSON: &str = r#"[ - { - "type": "function", - "function": { - "name": "get_weather", - "description": "Get the current weather for a location", - "parameters": { - "type": "object", - "properties": { - "location": {"type": "string", "description": "The city name"} - }, - "required": ["location"] - } + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather for a location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "The city name"} + }, + "required": ["location"] } } - ]"#; + } +]"#; const GLM47_KEY_VALUE_PAYLOAD: &str = "get_weather\ - location\ - Paris\ - "; - - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn glm47_parses_tool_call_payload(fixture: &LlamaFixture<'_>) -> Result<()> { - let outcome = - fixture - .model - .parse_chat_message(TOOLS_JSON, GLM47_KEY_VALUE_PAYLOAD, false)?; - - let ChatMessageParseOutcome::Recognized(parsed) = outcome else { - bail!( - "expected Recognized for GLM-4.7 key-value tags on a GLM-4.7-Flash model; got Unrecognized" - ); - }; - assert_eq!(parsed.tool_calls.len(), 1); - assert_eq!(parsed.tool_calls[0].name, "get_weather"); - let location = match &parsed.tool_calls[0].arguments { - ToolCallArguments::ValidJson(value) => value - .get("location") - .and_then(|v| v.as_str()) - .map(str::to_owned), - ToolCallArguments::InvalidJson(raw) => { - 
bail!("expected ValidJson, got InvalidJson: {raw}"); - } - }; - assert_eq!(location.as_deref(), Some("Paris")); +location\ +Paris\ +"; - Ok(()) - } -} + let outcome = fixture + .model + .parse_chat_message(TOOLS_JSON, GLM47_KEY_VALUE_PAYLOAD, false)?; -mod glm47_template_override_returns_full_markers { - #![expect( - clippy::unnecessary_wraps, - reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" - )] - - use anyhow::Result; - use llama_cpp_bindings::ToolCallArgsShape; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn glm47_template_override_returns_full_markers(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let template = model - .chat_template(None) - .expect("GLM-4.7 chat template must be present"); - let template_str = template.to_str().expect("template must be valid UTF-8"); - assert!(template_str.contains("")); - - let markers = model - .tool_call_markers() - .expect("GLM-4.7 must produce ToolCallMarkers via override registry"); - - assert_eq!(markers.open, ""); - assert_eq!(markers.close, ""); - let ToolCallArgsShape::KeyValueXmlTags(shape) = markers.args_shape else { - panic!( - "expected KeyValueXmlTags variant, got {:?}", - markers.args_shape - ); - }; - assert_eq!(shape.key_open, ""); - assert_eq!(shape.key_close, ""); - assert_eq!(shape.value_open, ""); - assert_eq!(shape.value_close, ""); - - Ok(()) - } + let ChatMessageParseOutcome::Recognized(parsed) = outcome else { + bail!( + "expected Recognized for GLM-4.7 key-value tags on a GLM-4.7-Flash model; got Unrecognized" + ); + }; + assert_eq!(parsed.tool_calls.len(), 1); + assert_eq!(parsed.tool_calls[0].name, "get_weather"); + let location = match 
&parsed.tool_calls[0].arguments { + ToolCallArguments::ValidJson(value) => value + .get("location") + .and_then(|v| v.as_str()) + .map(str::to_owned), + ToolCallArguments::InvalidJson(raw) => { + bail!("expected ValidJson, got InvalidJson: {raw}"); + } + }; + assert_eq!(location.as_deref(), Some("Paris")); + + Ok(()) } -mod mistral3_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt { - use anyhow::Result; - use llama_cpp_bindings::context::LlamaContext; - use llama_cpp_bindings::llama_batch::LlamaBatch; - use llama_cpp_bindings::model::AddBos; - use llama_cpp_bindings::sampling::LlamaSampler; - use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn glm47_template_override_returns_full_markers(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let template = model + .chat_template(None) + .expect("GLM-4.7 chat template must be present"); + let template_str = template.to_str().expect("template must be valid UTF-8"); + assert!(template_str.contains("")); + + let markers = model + .tool_call_markers() + .expect("GLM-4.7 must produce ToolCallMarkers via override registry"); + + assert_eq!(markers.open, ""); + assert_eq!(markers.close, ""); + let ToolCallArgsShape::KeyValueXmlTags(shape) = markers.args_shape else { + panic!( + "expected KeyValueXmlTags variant, got {:?}", + markers.args_shape + ); + }; + assert_eq!(shape.key_open, ""); + assert_eq!(shape.key_close, ""); + assert_eq!(shape.value_open, ""); + assert_eq!(shape.value_close, ""); + + Ok(()) +} +#[llama_test( + model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"), + n_gpu_layers = 999, + 
use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 2048, + n_ubatch = 512, +)] +fn mistral3_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt( + fixture: &LlamaFixture<'_>, +) -> Result<()> { const MAX_GENERATED_TOKENS: i32 = 200; const MISTRAL3_THINKING_DISABLED_PROMPT: &str = "\ - [INST]Reply with the single word: four. Do not explain.[/INST][THINK][/THINK]"; +[INST]Reply with the single word: four. Do not explain.[/INST][THINK][/THINK]"; const FORBIDDEN_MARKERS: &[&str] = &["[THINK]", "[/THINK]"]; - #[llama_test( - model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 8192, - n_batch = 2048, - n_ubatch = 512, - )] - fn mistral3_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - - let mut classifier = model.sampled_token_classifier(); - let prompt_tokens = - model.str_to_token(MISTRAL3_THINKING_DISABLED_PROMPT, AddBos::Always)?; - let prompt_token_count = u64::try_from(prompt_tokens.len())?; - - let mut batch = LlamaBatch::new(2048, 1)?; - classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; - - let mut context = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - context.decode(&mut batch)?; - - let promoted = classifier.commit_prompt_tokens(); - assert_eq!(promoted, prompt_token_count); - - let mut sampler = LlamaSampler::greedy(); - let initial_position = batch.n_tokens(); - let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut context, - batch: &mut batch, - initial_position, - max_generated_tokens: MAX_GENERATED_TOKENS, - } - .run()?; + let model = fixture.model; + let backend = fixture.backend; + + let mut classifier = 
model.sampled_token_classifier(); + let prompt_tokens = model.str_to_token(MISTRAL3_THINKING_DISABLED_PROMPT, AddBos::Always)?; + let prompt_token_count = u64::try_from(prompt_tokens.len())?; + + let mut batch = LlamaBatch::new(2048, 1)?; + classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; + + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + context.decode(&mut batch)?; + + let promoted = classifier.commit_prompt_tokens(); + assert_eq!(promoted, prompt_token_count); + + let mut sampler = LlamaSampler::greedy(); + let initial_position = batch.n_tokens(); + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position, + max_generated_tokens: MAX_GENERATED_TOKENS, + } + .run()?; - let usage = classifier.usage(); + let usage = classifier.usage(); - assert!(!outcome.generated_raw.is_empty()); - assert_eq!(outcome.observed_reasoning, 0); - assert_eq!(outcome.observed_undeterminable, 0); - assert_eq!(usage.reasoning_tokens, 0); - assert_eq!(usage.undeterminable_tokens, 0); - assert!(outcome.observed_content > 0); - assert_eq!(usage.completion_tokens(), outcome.observed_content); + assert!(!outcome.generated_raw.is_empty()); + assert_eq!(outcome.observed_reasoning, 0); + assert_eq!(outcome.observed_undeterminable, 0); + assert_eq!(usage.reasoning_tokens, 0); + assert_eq!(usage.undeterminable_tokens, 0); + assert!(outcome.observed_content > 0); + assert_eq!(usage.completion_tokens(), outcome.observed_content); - for forbidden in FORBIDDEN_MARKERS { - assert!(!outcome.content_stream.contains(forbidden)); - } - - Ok(()) + for forbidden in FORBIDDEN_MARKERS { + assert!(!outcome.content_stream.contains(forbidden)); } -} -mod mistral3_classifier_emits_reasoning { - use anyhow::Result; - use anyhow::bail; - use llama_cpp_bindings::ChatMessageParseOutcome; - use 
llama_cpp_bindings::context::LlamaContext; - use llama_cpp_bindings::llama_batch::LlamaBatch; - use llama_cpp_bindings::model::AddBos; - use llama_cpp_bindings::sampling::LlamaSampler; - use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; + Ok(()) +} +#[llama_test( + model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 2048, + n_ubatch = 512, +)] +fn mistral3_classifier_emits_reasoning_for_thinking_prompt( + fixture: &LlamaFixture<'_>, +) -> Result<()> { const MAX_GENERATED_TOKENS: i32 = 768; const MISTRAL3_THINKING_PROMPT: &str = "\ - [SYSTEM_PROMPT]# HOW YOU SHOULD THINK AND ANSWER\n\n\ - First draft your thinking process (inner monologue) until you arrive at a response. \ - Format your response using Markdown, and use LaTeX for any mathematical equations. \ - Write both your thoughts and the response in the same language as the input.\n\n\ - Your thinking process must follow the template below:\ - [THINK]Your thoughts or/and draft, like working through an exercise on scratch paper. \ - Be as casual and as long as you want until you are confident to generate the response \ - to the user.[/THINK]Here, provide a self-contained response.[/SYSTEM_PROMPT]\ - [INST]Reply with the single word: four. Do not explain.[/INST]"; +[SYSTEM_PROMPT]# HOW YOU SHOULD THINK AND ANSWER\n\n\ +First draft your thinking process (inner monologue) until you arrive at a response. \ +Format your response using Markdown, and use LaTeX for any mathematical equations. \ +Write both your thoughts and the response in the same language as the input.\n\n\ +Your thinking process must follow the template below:\ +[THINK]Your thoughts or/and draft, like working through an exercise on scratch paper. 
\ +Be as casual and as long as you want until you are confident to generate the response \ +to the user.[/THINK]Here, provide a self-contained response.[/SYSTEM_PROMPT]\ +[INST]Reply with the single word: four. Do not explain.[/INST]"; const FORBIDDEN_MARKERS: &[&str] = &["[THINK]", "[/THINK]"]; - #[llama_test( - model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 8192, - n_batch = 2048, - n_ubatch = 512, - )] - fn mistral3_classifier_emits_reasoning_for_thinking_prompt( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - - let mut classifier = model.sampled_token_classifier(); - let prompt_tokens = model.str_to_token(MISTRAL3_THINKING_PROMPT, AddBos::Always)?; - let prompt_token_count = u64::try_from(prompt_tokens.len())?; - - let mut batch = LlamaBatch::new(2048, 1)?; - classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; - - let mut context = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - context.decode(&mut batch)?; - - let promoted = classifier.commit_prompt_tokens(); - assert_eq!(promoted, prompt_token_count); - - let mut sampler = LlamaSampler::greedy(); - let initial_position = batch.n_tokens(); - let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut context, - batch: &mut batch, - initial_position, - max_generated_tokens: MAX_GENERATED_TOKENS, - } - .run()?; - - let usage = classifier.usage(); - let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?; - let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else { - bail!("Mistral 3 chat template must be recognised by the parser; got Unrecognized"); - }; - - assert!(!outcome.generated_raw.is_empty()); - 
assert!(outcome.observed_reasoning > 0); - assert!(usage.reasoning_tokens > 0); - assert_eq!(outcome.observed_undeterminable, 0); - assert_eq!(usage.undeterminable_tokens, 0); - assert_eq!( - usage.completion_tokens(), - outcome.observed_content + outcome.observed_reasoning, - ); - assert!(!parsed.reasoning_content.is_empty()); - assert_eq!(outcome.reasoning_stream, parsed.reasoning_content); - assert_eq!(outcome.content_stream, parsed.content); - - for forbidden in FORBIDDEN_MARKERS { - assert!(!outcome.reasoning_stream.contains(forbidden)); - assert!(!outcome.content_stream.contains(forbidden)); - } - - Ok(()) + let model = fixture.model; + let backend = fixture.backend; + + let mut classifier = model.sampled_token_classifier(); + let prompt_tokens = model.str_to_token(MISTRAL3_THINKING_PROMPT, AddBos::Always)?; + let prompt_token_count = u64::try_from(prompt_tokens.len())?; + + let mut batch = LlamaBatch::new(2048, 1)?; + classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; + + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + context.decode(&mut batch)?; + + let promoted = classifier.commit_prompt_tokens(); + assert_eq!(promoted, prompt_token_count); + + let mut sampler = LlamaSampler::greedy(); + let initial_position = batch.n_tokens(); + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position, + max_generated_tokens: MAX_GENERATED_TOKENS, + } + .run()?; + + let usage = classifier.usage(); + let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?; + let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else { + bail!("Mistral 3 chat template must be recognised by the parser; got Unrecognized"); + }; + + assert!(!outcome.generated_raw.is_empty()); + assert!(outcome.observed_reasoning > 0); + 
assert!(usage.reasoning_tokens > 0); + assert_eq!(outcome.observed_undeterminable, 0); + assert_eq!(usage.undeterminable_tokens, 0); + assert_eq!( + usage.completion_tokens(), + outcome.observed_content + outcome.observed_reasoning, + ); + assert!(!parsed.reasoning_content.is_empty()); + assert_eq!(outcome.reasoning_stream, parsed.reasoning_content); + assert_eq!(outcome.content_stream, parsed.content); + + for forbidden in FORBIDDEN_MARKERS { + assert!(!outcome.reasoning_stream.contains(forbidden)); + assert!(!outcome.content_stream.contains(forbidden)); } -} -mod mistral3_parses_tool_call_payload { - use anyhow::Result; - use anyhow::bail; - use llama_cpp_bindings::ChatMessageParseOutcome; - use llama_cpp_bindings::ToolCallArguments; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; + Ok(()) +} +#[llama_test( + model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn mistral3_parses_tool_call_payload(fixture: &LlamaFixture<'_>) -> Result<()> { const TOOLS_JSON: &str = r#"[ - { - "type": "function", - "function": { - "name": "get_weather", - "description": "Get the current weather for a location", - "parameters": { - "type": "object", - "properties": { - "location": {"type": "string", "description": "The city name"} - }, - "required": ["location"] - } + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather for a location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "The city name"} + }, + "required": ["location"] } } - ]"#; + } +]"#; const MISTRAL3_BRACKETED_JSON_PAYLOAD: &str = r#"[TOOL_CALLS]get_weather[ARGS]{"location":"Paris"}"#; - #[llama_test( - model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", 
"Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn mistral3_parses_tool_call_payload(fixture: &LlamaFixture<'_>) -> Result<()> { - let outcome = - fixture - .model - .parse_chat_message(TOOLS_JSON, MISTRAL3_BRACKETED_JSON_PAYLOAD, false)?; - - let ChatMessageParseOutcome::Recognized(parsed) = outcome else { - bail!( - "expected Recognized for Mistral 3 BracketedJson on a Mistral-3 model; got Unrecognized" - ); - }; - assert_eq!(parsed.tool_calls.len(), 1); - assert_eq!(parsed.tool_calls[0].name, "get_weather"); - let location = match &parsed.tool_calls[0].arguments { - ToolCallArguments::ValidJson(value) => value - .get("location") - .and_then(|v| v.as_str()) - .map(str::to_owned), - ToolCallArguments::InvalidJson(raw) => { - bail!("expected ValidJson, got InvalidJson: {raw}"); - } - }; - assert_eq!(location.as_deref(), Some("Paris")); + let outcome = + fixture + .model + .parse_chat_message(TOOLS_JSON, MISTRAL3_BRACKETED_JSON_PAYLOAD, false)?; - Ok(()) - } + let ChatMessageParseOutcome::Recognized(parsed) = outcome else { + bail!( + "expected Recognized for Mistral 3 BracketedJson on a Mistral-3 model; got Unrecognized" + ); + }; + assert_eq!(parsed.tool_calls.len(), 1); + assert_eq!(parsed.tool_calls[0].name, "get_weather"); + let location = match &parsed.tool_calls[0].arguments { + ToolCallArguments::ValidJson(value) => value + .get("location") + .and_then(|v| v.as_str()) + .map(str::to_owned), + ToolCallArguments::InvalidJson(raw) => { + bail!("expected ValidJson, got InvalidJson: {raw}"); + } + }; + assert_eq!(location.as_deref(), Some("Paris")); + + Ok(()) } -mod qwen35_chat_inference_emits_reasoning_when_template_auto_opens { - use anyhow::Result; - use anyhow::bail; - use llama_cpp_bindings::ChatMessageParseOutcome; - use llama_cpp_bindings::context::LlamaContext; - use llama_cpp_bindings::llama_batch::LlamaBatch; - use 
llama_cpp_bindings::model::AddBos; - use llama_cpp_bindings::model::LlamaChatMessage; - use llama_cpp_bindings::sampling::LlamaSampler; - use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 2048, - n_batch = 512, - n_ubatch = 128, - )] - fn qwen35_chat_inference_emits_reasoning_when_template_auto_opens( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - - let mut context = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let chat_template = model.chat_template(None)?; - let messages = vec![LlamaChatMessage::new( - "user".to_owned(), - "Hello! How are you?".to_owned(), - )?]; - let prompt = model.apply_chat_template(&chat_template, &messages, true)?; - - let mut classifier = model.sampled_token_classifier(); - let tokens = model.str_to_token(&prompt, AddBos::Always)?; - let prompt_token_count = u64::try_from(tokens.len())?; - - let mut batch = LlamaBatch::new(512, 1)?; - classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?; - - context.decode(&mut batch)?; - - let promoted = classifier.commit_prompt_tokens(); - assert_eq!(promoted, prompt_token_count); - - let mut sampler = LlamaSampler::greedy(); - let initial_position = batch.n_tokens(); - let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut context, - batch: &mut batch, - initial_position, - max_generated_tokens: 1024, - } - .run()?; - - assert!(!outcome.generated_raw.is_empty()); - assert!(outcome.observed_reasoning > 0); - assert!(outcome.observed_content > 0); - assert_eq!(outcome.observed_undeterminable, 0); - 
assert_eq!(outcome.observed_tool_call, 0); - - let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?; - let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else { - bail!("Qwen3.5 chat template must be recognised by the parser; got Unrecognized"); - }; - assert!(!parsed.content.is_empty()); - - let usage = classifier.into_usage(); - assert_eq!(usage.prompt_tokens, prompt_token_count); - assert_eq!(usage.reasoning_tokens, outcome.observed_reasoning); - assert_eq!(usage.undeterminable_tokens, 0); - - Ok(()) +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 2048, + n_batch = 512, + n_ubatch = 128, +)] +fn qwen35_chat_inference_emits_reasoning_when_template_auto_opens( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; + + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let chat_template = model.chat_template(None)?; + let messages = vec![LlamaChatMessage::new( + "user".to_owned(), + "Hello! 
How are you?".to_owned(), + )?]; + let prompt = model.apply_chat_template(&chat_template, &messages, true)?; + + let mut classifier = model.sampled_token_classifier(); + let tokens = model.str_to_token(&prompt, AddBos::Always)?; + let prompt_token_count = u64::try_from(tokens.len())?; + + let mut batch = LlamaBatch::new(512, 1)?; + classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?; + + context.decode(&mut batch)?; + + let promoted = classifier.commit_prompt_tokens(); + assert_eq!(promoted, prompt_token_count); + + let mut sampler = LlamaSampler::greedy(); + let initial_position = batch.n_tokens(); + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position, + max_generated_tokens: 1024, } + .run()?; + + assert!(!outcome.generated_raw.is_empty()); + assert!(outcome.observed_reasoning > 0); + assert!(outcome.observed_content > 0); + assert_eq!(outcome.observed_undeterminable, 0); + assert_eq!(outcome.observed_tool_call, 0); + + let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?; + let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else { + bail!("Qwen3.5 chat template must be recognised by the parser; got Unrecognized"); + }; + assert!(!parsed.content.is_empty()); + + let usage = classifier.into_usage(); + assert_eq!(usage.prompt_tokens, prompt_token_count); + assert_eq!(usage.reasoning_tokens, outcome.observed_reasoning); + assert_eq!(usage.undeterminable_tokens, 0); + + Ok(()) } -mod qwen35_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt { - use anyhow::Result; - use llama_cpp_bindings::context::LlamaContext; - use llama_cpp_bindings::llama_batch::LlamaBatch; - use llama_cpp_bindings::model::AddBos; - use llama_cpp_bindings::sampling::LlamaSampler; - use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; - use llama_cpp_test_harness::LlamaFixture; - use 
llama_cpp_test_harness::llama_test; - +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 2048, + n_ubatch = 512, +)] +fn qwen35_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt( + fixture: &LlamaFixture<'_>, +) -> Result<()> { const MAX_GENERATED_TOKENS: i32 = 200; const QWEN35_THINKING_DISABLED_PROMPT: &str = "\ - <|im_start|>user - What is 2 + 2?<|im_end|> - <|im_start|>assistant - +<|im_start|>user +What is 2 + 2?<|im_end|> +<|im_start|>assistant + - + - "; +"; const FORBIDDEN_MARKERS: &[&str] = &["", ""]; - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 8192, - n_batch = 2048, - n_ubatch = 512, - )] - fn qwen35_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - - let mut classifier = model.sampled_token_classifier(); - let prompt_tokens = model.str_to_token(QWEN35_THINKING_DISABLED_PROMPT, AddBos::Never)?; - let prompt_token_count = u64::try_from(prompt_tokens.len())?; - - let mut batch = LlamaBatch::new(2048, 1)?; - classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; - - let mut context = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - context.decode(&mut batch)?; - - let promoted = classifier.commit_prompt_tokens(); - assert_eq!(promoted, prompt_token_count); - - let mut sampler = LlamaSampler::chain_simple([ - LlamaSampler::penalties(64, 1.1, 0.0, 0.0), - LlamaSampler::top_k(40), - LlamaSampler::top_p(0.9, 1), - LlamaSampler::min_p(0.05, 1), - LlamaSampler::temp(0.7), - LlamaSampler::dist(0x00C0_FFEE), - ]); - let initial_position = batch.n_tokens(); - let outcome 
= ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut context, - batch: &mut batch, - initial_position, - max_generated_tokens: MAX_GENERATED_TOKENS, - } - .run()?; - - let usage = classifier.usage(); + let model = fixture.model; + let backend = fixture.backend; + + let mut classifier = model.sampled_token_classifier(); + let prompt_tokens = model.str_to_token(QWEN35_THINKING_DISABLED_PROMPT, AddBos::Never)?; + let prompt_token_count = u64::try_from(prompt_tokens.len())?; + + let mut batch = LlamaBatch::new(2048, 1)?; + classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; + + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + context.decode(&mut batch)?; + + let promoted = classifier.commit_prompt_tokens(); + assert_eq!(promoted, prompt_token_count); + + let mut sampler = LlamaSampler::chain_simple([ + LlamaSampler::penalties(64, 1.1, 0.0, 0.0), + LlamaSampler::top_k(40), + LlamaSampler::top_p(0.9, 1), + LlamaSampler::min_p(0.05, 1), + LlamaSampler::temp(0.7), + LlamaSampler::dist(0x00C0_FFEE), + ]); + let initial_position = batch.n_tokens(); + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position, + max_generated_tokens: MAX_GENERATED_TOKENS, + } + .run()?; - assert!(!outcome.generated_raw.is_empty()); - assert_eq!(outcome.observed_reasoning, 0); - assert_eq!(outcome.observed_undeterminable, 0); - assert_eq!(usage.reasoning_tokens, 0); - assert_eq!(usage.undeterminable_tokens, 0); - assert!(outcome.observed_content > 0); - assert_eq!(usage.completion_tokens(), outcome.observed_content); + let usage = classifier.usage(); - for forbidden in FORBIDDEN_MARKERS { - assert!(!outcome.content_stream.contains(forbidden)); - } + assert!(!outcome.generated_raw.is_empty()); + 
assert_eq!(outcome.observed_reasoning, 0); + assert_eq!(outcome.observed_undeterminable, 0); + assert_eq!(usage.reasoning_tokens, 0); + assert_eq!(usage.undeterminable_tokens, 0); + assert!(outcome.observed_content > 0); + assert_eq!(usage.completion_tokens(), outcome.observed_content); - Ok(()) + for forbidden in FORBIDDEN_MARKERS { + assert!(!outcome.content_stream.contains(forbidden)); } -} -mod qwen35_classifier_emits_reasoning { - use anyhow::Result; - use anyhow::bail; - use llama_cpp_bindings::ChatMessageParseOutcome; - use llama_cpp_bindings::context::LlamaContext; - use llama_cpp_bindings::llama_batch::LlamaBatch; - use llama_cpp_bindings::model::AddBos; - use llama_cpp_bindings::sampling::LlamaSampler; - use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; + Ok(()) +} +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 2048, + n_ubatch = 512, +)] +fn qwen35_classifier_emits_reasoning_for_thinking_enabled_prompt( + fixture: &LlamaFixture<'_>, +) -> Result<()> { const MAX_GENERATED_TOKENS: i32 = 1500; const QWEN35_THINKING_PROMPT: &str = "\ - <|im_start|>user - What is 2 + 2?<|im_end|> - <|im_start|>assistant - - "; +<|im_start|>user +What is 2 + 2?<|im_end|> +<|im_start|>assistant + +"; const FORBIDDEN_MARKERS: &[&str] = &["", ""]; - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 8192, - n_batch = 2048, - n_ubatch = 512, - )] - fn qwen35_classifier_emits_reasoning_for_thinking_enabled_prompt( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - - let mut classifier = model.sampled_token_classifier(); - let prompt_tokens = 
model.str_to_token(QWEN35_THINKING_PROMPT, AddBos::Never)?; - let prompt_token_count = u64::try_from(prompt_tokens.len())?; - - let mut batch = LlamaBatch::new(2048, 1)?; - classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; - - let mut context = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - context.decode(&mut batch)?; - - let promoted = classifier.commit_prompt_tokens(); - assert_eq!(promoted, prompt_token_count); - - let mut sampler = LlamaSampler::chain_simple([ - LlamaSampler::penalties(64, 1.1, 0.0, 0.0), - LlamaSampler::top_k(40), - LlamaSampler::top_p(0.9, 1), - LlamaSampler::min_p(0.05, 1), - LlamaSampler::temp(0.7), - LlamaSampler::dist(0x00C0_FFEE), - ]); - let initial_position = batch.n_tokens(); - let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut context, - batch: &mut batch, - initial_position, - max_generated_tokens: MAX_GENERATED_TOKENS, - } - .run()?; - - let usage = classifier.usage(); - let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?; - let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else { - bail!("Qwen3.5 chat template must be recognised by the parser; got Unrecognized"); - }; - - assert!(!outcome.generated_raw.is_empty()); - assert!(outcome.observed_reasoning > 0); - assert!(usage.reasoning_tokens > 0); - assert_eq!(outcome.observed_undeterminable, 0); - assert_eq!(usage.undeterminable_tokens, 0); - assert_eq!( - usage.completion_tokens(), - outcome.observed_content + outcome.observed_reasoning, + let model = fixture.model; + let backend = fixture.backend; + + let mut classifier = model.sampled_token_classifier(); + let prompt_tokens = model.str_to_token(QWEN35_THINKING_PROMPT, AddBos::Never)?; + let prompt_token_count = u64::try_from(prompt_tokens.len())?; + + let mut batch = LlamaBatch::new(2048, 1)?; + 
classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; + + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + context.decode(&mut batch)?; + + let promoted = classifier.commit_prompt_tokens(); + assert_eq!(promoted, prompt_token_count); + + let mut sampler = LlamaSampler::chain_simple([ + LlamaSampler::penalties(64, 1.1, 0.0, 0.0), + LlamaSampler::top_k(40), + LlamaSampler::top_p(0.9, 1), + LlamaSampler::min_p(0.05, 1), + LlamaSampler::temp(0.7), + LlamaSampler::dist(0x00C0_FFEE), + ]); + let initial_position = batch.n_tokens(); + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position, + max_generated_tokens: MAX_GENERATED_TOKENS, + } + .run()?; + + let usage = classifier.usage(); + let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?; + let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else { + bail!("Qwen3.5 chat template must be recognised by the parser; got Unrecognized"); + }; + + assert!(!outcome.generated_raw.is_empty()); + assert!(outcome.observed_reasoning > 0); + assert!(usage.reasoning_tokens > 0); + assert_eq!(outcome.observed_undeterminable, 0); + assert_eq!(usage.undeterminable_tokens, 0); + assert_eq!( + usage.completion_tokens(), + outcome.observed_content + outcome.observed_reasoning, + ); + + if parsed.reasoning_content.is_empty() { + eprintln!( + "Qwen3.5 didn't close its reasoning block within {MAX_GENERATED_TOKENS} tokens — \ + skipping strict parser-equality assertions" ); + } else { + assert_eq!(outcome.reasoning_stream, parsed.reasoning_content); + assert_eq!(outcome.content_stream, parsed.content); + } - if parsed.reasoning_content.is_empty() { - eprintln!( - "Qwen3.5 didn't close its reasoning block within {MAX_GENERATED_TOKENS} tokens — \ - skipping strict parser-equality assertions" 
- ); - } else { - assert_eq!(outcome.reasoning_stream, parsed.reasoning_content); - assert_eq!(outcome.content_stream, parsed.content); - } + for forbidden in FORBIDDEN_MARKERS { + assert!(!outcome.reasoning_stream.contains(forbidden)); + assert!(!outcome.content_stream.contains(forbidden)); + } - for forbidden in FORBIDDEN_MARKERS { - assert!(!outcome.reasoning_stream.contains(forbidden)); - assert!(!outcome.content_stream.contains(forbidden)); - } + Ok(()) +} - Ok(()) +fn arguments_as_json(arguments: &ToolCallArguments) -> Result<&Value> { + match arguments { + ToolCallArguments::ValidJson(value) => Ok(value), + ToolCallArguments::InvalidJson(raw) => { + bail!("expected ValidJson arguments, got InvalidJson: {raw}") + } } } -mod qwen35_parses_constrained_schema_payload { - use anyhow::Result; - use anyhow::bail; - use llama_cpp_bindings::ChatMessageParseOutcome; - use llama_cpp_bindings::ToolCallArguments; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - use serde_json::Value; - use serde_json::json; - +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn qwen35_parses_constrained_schema_payload(fixture: &LlamaFixture<'_>) -> Result<()> { const NEGOTIATE_WITH_CAT_TOOLS_JSON: &str = r#"[ - { - "type": "function", - "function": { - "name": "negotiate_with_cat", - "description": "Attempt to negotiate with a cat. Outcomes are not guaranteed and may include the silent treatment.", - "parameters": { - "type": "object", - "properties": { - "topic": { - "type": "string", - "description": "What you are trying to negotiate, e.g. 
'get off the keyboard' or 'stop knocking things off the table'" - }, - "bribe": { - "type": "string", - "enum": ["tuna", "salmon", "treats", "ear_scritches", "cardboard_box", "none"], - "description": "What you are offering in exchange" - }, - "desperation_level": { - "type": "integer", - "description": "How desperate you are, on a scale from 1 (mildly annoyed human) to 10 (it is 3am)", - "minimum": 1, - "maximum": 10 - } + { + "type": "function", + "function": { + "name": "negotiate_with_cat", + "description": "Attempt to negotiate with a cat. Outcomes are not guaranteed and may include the silent treatment.", + "parameters": { + "type": "object", + "properties": { + "topic": { + "type": "string", + "description": "What you are trying to negotiate, e.g. 'get off the keyboard' or 'stop knocking things off the table'" + }, + "bribe": { + "type": "string", + "enum": ["tuna", "salmon", "treats", "ear_scritches", "cardboard_box", "none"], + "description": "What you are offering in exchange" }, - "required": ["topic"], - "additionalProperties": false - } + "desperation_level": { + "type": "integer", + "description": "How desperate you are, on a scale from 1 (mildly annoyed human) to 10 (it is 3am)", + "minimum": 1, + "maximum": 10 + } + }, + "required": ["topic"], + "additionalProperties": false } } - ]"#; + } +]"#; const NEGOTIATE_WITH_CAT_INPUT: &str = "\n\ - \n\ - \n\ - tuna\n\ - \n\ - \n\ - 8\n\ - \n\ - \n\ - get off the keyboard\n\ - \n\ - \n\ - "; - - fn arguments_as_json(arguments: &ToolCallArguments) -> Result<&Value> { - match arguments { - ToolCallArguments::ValidJson(value) => Ok(value), - ToolCallArguments::InvalidJson(raw) => { - bail!("expected ValidJson arguments, got InvalidJson: {raw}") +\n\ +\n\ +tuna\n\ +\n\ +\n\ +8\n\ +\n\ +\n\ +get off the keyboard\n\ +\n\ +\n\ +"; + + let outcome = fixture.model.parse_chat_message( + NEGOTIATE_WITH_CAT_TOOLS_JSON, + NEGOTIATE_WITH_CAT_INPUT, + false, + )?; + + let ChatMessageParseOutcome::Recognized(parsed) = 
outcome else { + bail!( + "Qwen 3.5's tool-call payload must be parsed by the wrapper-side duck-type pass; \ + got Unrecognized" + ); + }; + + assert_eq!(parsed.tool_calls.len(), 1); + assert_eq!(parsed.tool_calls[0].name, "negotiate_with_cat"); + assert_eq!(parsed.tool_calls[0].id, "call_0"); + assert_eq!( + arguments_as_json(&parsed.tool_calls[0].arguments)?, + &json!({ + "bribe": "tuna", + "desperation_level": 8, + "topic": "get off the keyboard", + }), + ); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn qwen35_parses_tool_call_payload(fixture: &LlamaFixture<'_>) -> Result<()> { + const TOOLS_JSON: &str = r#"[ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather for a location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "The city name"} + }, + "required": ["location"] } } } +]"#; - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn qwen35_parses_constrained_schema_payload(fixture: &LlamaFixture<'_>) -> Result<()> { - let outcome = fixture.model.parse_chat_message( - NEGOTIATE_WITH_CAT_TOOLS_JSON, - NEGOTIATE_WITH_CAT_INPUT, - false, - )?; - - let ChatMessageParseOutcome::Recognized(parsed) = outcome else { - bail!( - "Qwen 3.5's tool-call payload must be parsed by the wrapper-side duck-type pass; \ - got Unrecognized" - ); - }; - - assert_eq!(parsed.tool_calls.len(), 1); - assert_eq!(parsed.tool_calls[0].name, "negotiate_with_cat"); - assert_eq!(parsed.tool_calls[0].id, "call_0"); - assert_eq!( - arguments_as_json(&parsed.tool_calls[0].arguments)?, - &json!({ - "bribe": "tuna", - 
"desperation_level": 8, - "topic": "get off the keyboard", - }), - ); + const QWEN_XML_PAYLOAD: &str = "\n\ +\n\ +\n\ +Paris\n\ +\n\ +\n\ +"; + + let outcome = fixture + .model + .parse_chat_message(TOOLS_JSON, QWEN_XML_PAYLOAD, false)?; + + let ChatMessageParseOutcome::Recognized(parsed) = outcome else { + bail!("expected Recognized for Qwen XML on a Qwen-3.5 model; got Unrecognized"); + }; + assert_eq!(parsed.tool_calls.len(), 1); + assert_eq!(parsed.tool_calls[0].name, "get_weather"); + let location = match &parsed.tool_calls[0].arguments { + ToolCallArguments::ValidJson(value) => value + .get("location") + .and_then(|v| v.as_str()) + .map(str::to_owned), + ToolCallArguments::InvalidJson(raw) => { + bail!("expected ValidJson, got InvalidJson: {raw}"); + } + }; + assert_eq!(location.as_deref(), Some("Paris")); - Ok(()) - } + Ok(()) } -mod qwen35_parses_tool_call_payload { - use anyhow::Result; - use anyhow::bail; - use llama_cpp_bindings::ChatMessageParseOutcome; - use llama_cpp_bindings::ToolCallArguments; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn qwen35_parses_partial_tool_call_returns_pending_state(fixture: &LlamaFixture<'_>) -> Result<()> { const TOOLS_JSON: &str = r#"[ - { - "type": "function", - "function": { - "name": "get_weather", - "description": "Get the current weather for a location", - "parameters": { - "type": "object", - "properties": { - "location": {"type": "string", "description": "The city name"} - }, - "required": ["location"] - } + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather for a location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "The city name"} + }, + 
"required": ["location"] } } - ]"#; - - const QWEN_XML_PAYLOAD: &str = "\n\ - \n\ - \n\ - Paris\n\ - \n\ - \n\ - "; + } +]"#; const PARTIAL_QWEN_XML_PAYLOAD: &str = "\n\n) -> Result<()> { - let outcome = fixture - .model - .parse_chat_message(TOOLS_JSON, QWEN_XML_PAYLOAD, false)?; - - let ChatMessageParseOutcome::Recognized(parsed) = outcome else { - bail!("expected Recognized for Qwen XML on a Qwen-3.5 model; got Unrecognized"); - }; - assert_eq!(parsed.tool_calls.len(), 1); - assert_eq!(parsed.tool_calls[0].name, "get_weather"); - let location = match &parsed.tool_calls[0].arguments { - ToolCallArguments::ValidJson(value) => value - .get("location") - .and_then(|v| v.as_str()) - .map(str::to_owned), - ToolCallArguments::InvalidJson(raw) => { - bail!("expected ValidJson, got InvalidJson: {raw}"); - } - }; - assert_eq!(location.as_deref(), Some("Paris")); + let outcome = fixture + .model + .parse_chat_message(TOOLS_JSON, PARTIAL_QWEN_XML_PAYLOAD, true)?; - Ok(()) - } + let ChatMessageParseOutcome::Recognized(parsed) = outcome else { + bail!("expected Recognized for partial Qwen XML on a Qwen-3.5 model; got Unrecognized"); + }; + assert!(parsed.tool_calls.is_empty() || parsed.tool_calls.len() == 1); - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn qwen35_parses_partial_tool_call_returns_pending_state( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let outcome = - fixture - .model - .parse_chat_message(TOOLS_JSON, PARTIAL_QWEN_XML_PAYLOAD, true)?; - - let ChatMessageParseOutcome::Recognized(parsed) = outcome else { - bail!("expected Recognized for partial Qwen XML on a Qwen-3.5 model; got Unrecognized"); - }; - assert!(parsed.tool_calls.is_empty() || parsed.tool_calls.len() == 1); - - Ok(()) - } + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", 
"Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn qwen35_parses_multiple_tool_calls(fixture: &LlamaFixture<'_>) -> Result<()> { - let outcome = fixture - .model - .parse_chat_message(TOOLS_JSON, TWO_QWEN_XML_PAYLOADS, false)?; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn qwen35_parses_multiple_tool_calls(fixture: &LlamaFixture<'_>) -> Result<()> { + const TOOLS_JSON: &str = r#"[ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather for a location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "The city name"} + }, + "required": ["location"] + } + } + } +]"#; - let ChatMessageParseOutcome::Recognized(parsed) = outcome else { - bail!( - "expected Recognized for two Qwen XML payloads on a Qwen-3.5 model; got Unrecognized" - ); - }; - assert!( - !parsed.tool_calls.is_empty(), - "expected at least one tool call; got {:?}", - parsed.tool_calls + const TWO_QWEN_XML_PAYLOADS: &str = "\n\ +\n\ +\n\ +Paris\n\ +\n\ +\n\ +\n\ +\n\ +\n\ +\n\ +Berlin\n\ +\n\ +\n\ +"; + + let outcome = fixture + .model + .parse_chat_message(TOOLS_JSON, TWO_QWEN_XML_PAYLOADS, false)?; + + let ChatMessageParseOutcome::Recognized(parsed) = outcome else { + bail!( + "expected Recognized for two Qwen XML payloads on a Qwen-3.5 model; got Unrecognized" ); - - Ok(()) - } + }; + assert!( + !parsed.tool_calls.is_empty(), + "expected at least one tool call; got {:?}", + parsed.tool_calls + ); + + Ok(()) } -mod qwen35_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested { - use anyhow::Result; - use anyhow::bail; - use llama_cpp_bindings::ChatMessageParseOutcome; - use 
llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn qwen35_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested( + fixture: &LlamaFixture<'_>, +) -> Result<()> { const TOOLS_JSON: &str = r#"[ - { - "type": "function", - "function": { - "name": "get_weather", - "description": "Get the current weather for a location", - "parameters": { - "type": "object", - "properties": { - "location": {"type": "string", "description": "The city name"} - }, - "required": ["location"] - } + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather for a location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "The city name"} + }, + "required": ["location"] } } - ]"#; + } +]"#; const PLAIN_CONTENT: &str = "Sorry, I cannot help with that."; - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn qwen35_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let outcome = fixture - .model - .parse_chat_message(TOOLS_JSON, PLAIN_CONTENT, false)?; - - let ChatMessageParseOutcome::Recognized(parsed) = outcome else { - bail!( - "Qwen 3.5 with tools requested + plain content must produce Recognized (with empty \ - tool_calls); got Unrecognized" - ); - }; - assert!( - parsed.tool_calls.is_empty(), - "expected no tool calls; got {:?}", - parsed.tool_calls - ); + let outcome = fixture + .model + .parse_chat_message(TOOLS_JSON, PLAIN_CONTENT, false)?; - Ok(()) - } + let 
ChatMessageParseOutcome::Recognized(parsed) = outcome else { + bail!( + "Qwen 3.5 with tools requested + plain content must produce Recognized (with empty \ + tool_calls); got Unrecognized" + ); + }; + assert!( + parsed.tool_calls.is_empty(), + "expected no tool calls; got {:?}", + parsed.tool_calls + ); + + Ok(()) } -mod qwen36_chat_inference_emits_reasoning_when_template_auto_opens { - use anyhow::Result; - use anyhow::bail; - use llama_cpp_bindings::ChatMessageParseOutcome; - use llama_cpp_bindings::context::LlamaContext; - use llama_cpp_bindings::llama_batch::LlamaBatch; - use llama_cpp_bindings::model::AddBos; - use llama_cpp_bindings::model::LlamaChatMessage; - use llama_cpp_bindings::sampling::LlamaSampler; - use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 2048, - n_batch = 512, - n_ubatch = 128, - )] - fn qwen36_chat_inference_emits_reasoning_when_template_auto_opens( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - - let mut context = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let chat_template = model.chat_template(None)?; - let messages = vec![LlamaChatMessage::new( - "user".to_owned(), - "Hello! 
How are you?".to_owned(), - )?]; - let prompt = model.apply_chat_template(&chat_template, &messages, true)?; - - let mut classifier = model.sampled_token_classifier(); - let tokens = model.str_to_token(&prompt, AddBos::Always)?; - let prompt_token_count = u64::try_from(tokens.len())?; - - let mut batch = LlamaBatch::new(512, 1)?; - classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?; - - context.decode(&mut batch)?; - - let promoted = classifier.commit_prompt_tokens(); - assert_eq!(promoted, prompt_token_count); - - let mut sampler = LlamaSampler::greedy(); - let initial_position = batch.n_tokens(); - let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut context, - batch: &mut batch, - initial_position, - max_generated_tokens: 1024, - } - .run()?; - - assert!(!outcome.generated_raw.is_empty()); - assert!(outcome.observed_reasoning > 0); - assert!(outcome.observed_content > 0); - assert_eq!(outcome.observed_undeterminable, 0); - assert_eq!(outcome.observed_tool_call, 0); - - let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?; - let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else { - bail!("Qwen3.6 chat template must be recognised by the parser; got Unrecognized"); - }; - assert!(!parsed.content.is_empty()); - - let usage = classifier.into_usage(); - assert_eq!(usage.prompt_tokens, prompt_token_count); - assert_eq!(usage.reasoning_tokens, outcome.observed_reasoning); - assert_eq!(usage.undeterminable_tokens, 0); - - Ok(()) +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 2048, + n_batch = 512, + n_ubatch = 128, +)] +fn qwen36_chat_inference_emits_reasoning_when_template_auto_opens( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; + + let mut 
context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let chat_template = model.chat_template(None)?; + let messages = vec![LlamaChatMessage::new( + "user".to_owned(), + "Hello! How are you?".to_owned(), + )?]; + let prompt = model.apply_chat_template(&chat_template, &messages, true)?; + + let mut classifier = model.sampled_token_classifier(); + let tokens = model.str_to_token(&prompt, AddBos::Always)?; + let prompt_token_count = u64::try_from(tokens.len())?; + + let mut batch = LlamaBatch::new(512, 1)?; + classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?; + + context.decode(&mut batch)?; + + let promoted = classifier.commit_prompt_tokens(); + assert_eq!(promoted, prompt_token_count); + + let mut sampler = LlamaSampler::greedy(); + let initial_position = batch.n_tokens(); + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position, + max_generated_tokens: 1024, } + .run()?; + + assert!(!outcome.generated_raw.is_empty()); + assert!(outcome.observed_reasoning > 0); + assert!(outcome.observed_content > 0); + assert_eq!(outcome.observed_undeterminable, 0); + assert_eq!(outcome.observed_tool_call, 0); + + let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?; + let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else { + bail!("Qwen3.6 chat template must be recognised by the parser; got Unrecognized"); + }; + assert!(!parsed.content.is_empty()); + + let usage = classifier.into_usage(); + assert_eq!(usage.prompt_tokens, prompt_token_count); + assert_eq!(usage.reasoning_tokens, outcome.observed_reasoning); + assert_eq!(usage.undeterminable_tokens, 0); + + Ok(()) } -mod qwen36_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt { - use anyhow::Result; - use llama_cpp_bindings::context::LlamaContext; - use 
llama_cpp_bindings::llama_batch::LlamaBatch; - use llama_cpp_bindings::model::AddBos; - use llama_cpp_bindings::sampling::LlamaSampler; - use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 2048, + n_ubatch = 512, +)] +fn qwen36_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt( + fixture: &LlamaFixture<'_>, +) -> Result<()> { const MAX_GENERATED_TOKENS: i32 = 200; const QWEN36_THINKING_DISABLED_PROMPT: &str = "\ - <|im_start|>user - What is 2 + 2?<|im_end|> - <|im_start|>assistant - +<|im_start|>user +What is 2 + 2?<|im_end|> +<|im_start|>assistant + - + - "; +"; const FORBIDDEN_MARKERS: &[&str] = &["", ""]; - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 8192, - n_batch = 2048, - n_ubatch = 512, - )] - fn qwen36_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - - let mut classifier = model.sampled_token_classifier(); - let prompt_tokens = model.str_to_token(QWEN36_THINKING_DISABLED_PROMPT, AddBos::Never)?; - let prompt_token_count = u64::try_from(prompt_tokens.len())?; - - let mut batch = LlamaBatch::new(2048, 1)?; - classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; - - let mut context = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - context.decode(&mut batch)?; - - let promoted = classifier.commit_prompt_tokens(); - assert_eq!(promoted, prompt_token_count); - - let mut sampler = 
LlamaSampler::chain_simple([ - LlamaSampler::penalties(64, 1.1, 0.0, 0.0), - LlamaSampler::top_k(40), - LlamaSampler::top_p(0.9, 1), - LlamaSampler::min_p(0.05, 1), - LlamaSampler::temp(0.7), - LlamaSampler::dist(0x00C0_FFEE), - ]); - let initial_position = batch.n_tokens(); - let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut context, - batch: &mut batch, - initial_position, - max_generated_tokens: MAX_GENERATED_TOKENS, - } - .run()?; - - let usage = classifier.usage(); + let model = fixture.model; + let backend = fixture.backend; + + let mut classifier = model.sampled_token_classifier(); + let prompt_tokens = model.str_to_token(QWEN36_THINKING_DISABLED_PROMPT, AddBos::Never)?; + let prompt_token_count = u64::try_from(prompt_tokens.len())?; + + let mut batch = LlamaBatch::new(2048, 1)?; + classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; + + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + context.decode(&mut batch)?; + + let promoted = classifier.commit_prompt_tokens(); + assert_eq!(promoted, prompt_token_count); + + let mut sampler = LlamaSampler::chain_simple([ + LlamaSampler::penalties(64, 1.1, 0.0, 0.0), + LlamaSampler::top_k(40), + LlamaSampler::top_p(0.9, 1), + LlamaSampler::min_p(0.05, 1), + LlamaSampler::temp(0.7), + LlamaSampler::dist(0x00C0_FFEE), + ]); + let initial_position = batch.n_tokens(); + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position, + max_generated_tokens: MAX_GENERATED_TOKENS, + } + .run()?; - assert!(!outcome.generated_raw.is_empty()); - assert_eq!(outcome.observed_reasoning, 0); - assert_eq!(outcome.observed_undeterminable, 0); - assert_eq!(usage.reasoning_tokens, 0); - assert_eq!(usage.undeterminable_tokens, 0); - assert!(outcome.observed_content 
> 0); - assert_eq!(usage.completion_tokens(), outcome.observed_content); + let usage = classifier.usage(); - for forbidden in FORBIDDEN_MARKERS { - assert!(!outcome.content_stream.contains(forbidden)); - } + assert!(!outcome.generated_raw.is_empty()); + assert_eq!(outcome.observed_reasoning, 0); + assert_eq!(outcome.observed_undeterminable, 0); + assert_eq!(usage.reasoning_tokens, 0); + assert_eq!(usage.undeterminable_tokens, 0); + assert!(outcome.observed_content > 0); + assert_eq!(usage.completion_tokens(), outcome.observed_content); - Ok(()) + for forbidden in FORBIDDEN_MARKERS { + assert!(!outcome.content_stream.contains(forbidden)); } -} -mod qwen36_classifier_emits_reasoning { - use anyhow::Result; - use anyhow::bail; - use llama_cpp_bindings::ChatMessageParseOutcome; - use llama_cpp_bindings::context::LlamaContext; - use llama_cpp_bindings::llama_batch::LlamaBatch; - use llama_cpp_bindings::model::AddBos; - use llama_cpp_bindings::sampling::LlamaSampler; - use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; + Ok(()) +} +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 2048, + n_ubatch = 512, +)] +fn qwen36_classifier_emits_reasoning_for_thinking_enabled_prompt( + fixture: &LlamaFixture<'_>, +) -> Result<()> { const MAX_GENERATED_TOKENS: i32 = 1500; const QWEN36_THINKING_PROMPT: &str = "\ - <|im_start|>user - What is 2 + 2?<|im_end|> - <|im_start|>assistant - - "; +<|im_start|>user +What is 2 + 2?<|im_end|> +<|im_start|>assistant + +"; const FORBIDDEN_MARKERS: &[&str] = &["", ""]; - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 8192, - n_batch = 2048, - n_ubatch = 512, - )] 
- fn qwen36_classifier_emits_reasoning_for_thinking_enabled_prompt( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - - let mut classifier = model.sampled_token_classifier(); - let prompt_tokens = model.str_to_token(QWEN36_THINKING_PROMPT, AddBos::Never)?; - let prompt_token_count = u64::try_from(prompt_tokens.len())?; - - let mut batch = LlamaBatch::new(2048, 1)?; - classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; - - let mut context = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - context.decode(&mut batch)?; - - let promoted = classifier.commit_prompt_tokens(); - assert_eq!(promoted, prompt_token_count); - - let mut sampler = LlamaSampler::chain_simple([ - LlamaSampler::penalties(64, 1.1, 0.0, 0.0), - LlamaSampler::top_k(40), - LlamaSampler::top_p(0.9, 1), - LlamaSampler::min_p(0.05, 1), - LlamaSampler::temp(0.7), - LlamaSampler::dist(0x00C0_FFEE), - ]); - let initial_position = batch.n_tokens(); - let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut context, - batch: &mut batch, - initial_position, - max_generated_tokens: MAX_GENERATED_TOKENS, - } - .run()?; - - let usage = classifier.usage(); - let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, true)?; - let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else { - bail!("Qwen3.6 chat template must be recognised by the parser; got Unrecognized"); - }; - - assert!(!outcome.generated_raw.is_empty()); - assert!(outcome.observed_reasoning > 0); - assert!(usage.reasoning_tokens > 0); - assert_eq!(outcome.observed_undeterminable, 0); - assert_eq!(usage.undeterminable_tokens, 0); - assert_eq!( - usage.completion_tokens(), - outcome.observed_content + outcome.observed_reasoning, - ); - - if parsed.reasoning_content.is_empty() { - eprintln!( - "Qwen3.6 parser 
returned empty reasoning_content — relying on FORBIDDEN_MARKERS" - ); - } else { - assert_eq!(outcome.reasoning_stream, parsed.reasoning_content); - assert_eq!(outcome.content_stream, parsed.content); - } - - for forbidden in FORBIDDEN_MARKERS { - assert!(!outcome.reasoning_stream.contains(forbidden)); - assert!(!outcome.content_stream.contains(forbidden)); - } + let model = fixture.model; + let backend = fixture.backend; + + let mut classifier = model.sampled_token_classifier(); + let prompt_tokens = model.str_to_token(QWEN36_THINKING_PROMPT, AddBos::Never)?; + let prompt_token_count = u64::try_from(prompt_tokens.len())?; + + let mut batch = LlamaBatch::new(2048, 1)?; + classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; + + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + context.decode(&mut batch)?; + + let promoted = classifier.commit_prompt_tokens(); + assert_eq!(promoted, prompt_token_count); + + let mut sampler = LlamaSampler::chain_simple([ + LlamaSampler::penalties(64, 1.1, 0.0, 0.0), + LlamaSampler::top_k(40), + LlamaSampler::top_p(0.9, 1), + LlamaSampler::min_p(0.05, 1), + LlamaSampler::temp(0.7), + LlamaSampler::dist(0x00C0_FFEE), + ]); + let initial_position = batch.n_tokens(); + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position, + max_generated_tokens: MAX_GENERATED_TOKENS, + } + .run()?; + + let usage = classifier.usage(); + let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, true)?; + let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else { + bail!("Qwen3.6 chat template must be recognised by the parser; got Unrecognized"); + }; + + assert!(!outcome.generated_raw.is_empty()); + assert!(outcome.observed_reasoning > 0); + assert!(usage.reasoning_tokens > 0); + 
assert_eq!(outcome.observed_undeterminable, 0); + assert_eq!(usage.undeterminable_tokens, 0); + assert_eq!( + usage.completion_tokens(), + outcome.observed_content + outcome.observed_reasoning, + ); + + if parsed.reasoning_content.is_empty() { + eprintln!("Qwen3.6 parser returned empty reasoning_content — relying on FORBIDDEN_MARKERS"); + } else { + assert_eq!(outcome.reasoning_stream, parsed.reasoning_content); + assert_eq!(outcome.content_stream, parsed.content); + } - Ok(()) + for forbidden in FORBIDDEN_MARKERS { + assert!(!outcome.reasoning_stream.contains(forbidden)); + assert!(!outcome.content_stream.contains(forbidden)); } -} + Ok(()) +} llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/sampling_and_constrained_decoding.rs b/llama-cpp-bindings-tests/tests/sampling_and_constrained_decoding.rs index dc9395aa..8ceec4d1 100644 --- a/llama-cpp-bindings-tests/tests/sampling_and_constrained_decoding.rs +++ b/llama-cpp-bindings-tests/tests/sampling_and_constrained_decoding.rs @@ -1,2518 +1,2436 @@ +#![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" +)] + +use std::ffi::CStr; +use std::io::Write; +use std::sync::Arc; +use std::time::Duration; + +use anyhow::Context as _; +use anyhow::Result; +use llama_cpp_bindings::GrammarError; +use llama_cpp_bindings::SampledToken; +use llama_cpp_bindings::context::LlamaContext; +use llama_cpp_bindings::ggml_time_us; +use llama_cpp_bindings::json_schema_to_grammar; +use llama_cpp_bindings::llama_batch::LlamaBatch; +use llama_cpp_bindings::llguidance_sampler::create_llg_sampler; +use llama_cpp_bindings::model::AddBos; +use llama_cpp_bindings::model::LlamaChatMessage; +use llama_cpp_bindings::sampled_token_classifier::SampledTokenClassifier; +use llama_cpp_bindings::sampled_token_section::SampledTokenSection; +use llama_cpp_bindings::sampling::LlamaSampler; +use llama_cpp_bindings::streaming_markers::StreamingMarkers; +use 
llama_cpp_bindings::token::LlamaToken; +use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; use llama_cpp_test_harness::llama_tests_main; -mod model_sampling { - use anyhow::Result; - use llama_cpp_bindings::SampledToken; - use llama_cpp_bindings::context::LlamaContext; - use llama_cpp_bindings::json_schema_to_grammar; - use llama_cpp_bindings::llama_batch::LlamaBatch; - use llama_cpp_bindings::model::AddBos; - use llama_cpp_bindings::sampling::LlamaSampler; - use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 256, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 256, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 256, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 256, - n_batch = 128, - n_ubatch = 64, - )] - fn sample_returns_result_and_succeeds_with_valid_index( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let model = fixture.model; - let mut context = LlamaContext::from_model( - model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let tokens = model.str_to_token("Hello", AddBos::Always)?; - let mut 
batch = LlamaBatch::new(512, 1)?; - - batch.add_sequence(&tokens, 0, false)?; - - context.decode(&mut batch)?; - - let mut sampler = - LlamaSampler::chain_simple([LlamaSampler::temp(0.8), LlamaSampler::greedy()]); - - let result = sampler.sample(&context, batch.n_tokens() - 1); - - assert!(result.is_ok()); - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn grammar_sampler_constrains_output_to_yes_or_no(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let mut context = LlamaContext::from_model( - model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let prompt = "<|im_start|>user\nIs the sky blue? 
Answer yes or no.<|im_end|>\n<|im_start|>assistant\n\n\n\n\n"; - let tokens = model.str_to_token(prompt, AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - - batch.add_sequence(&tokens, 0, false)?; - - context.decode(&mut batch)?; - - let mut sampler = LlamaSampler::chain_simple([ - LlamaSampler::grammar(model, r"root ::= [Yy] [Ee] [Ss] | [Nn] [Oo]", "root")?, - LlamaSampler::temp(0.8), - LlamaSampler::greedy(), - ]); - - let mut classifier = model.sampled_token_classifier(); - let (raw_token, mut outcomes) = - classifier.sample(&mut sampler, &context, batch.n_tokens() - 1)?; - outcomes.extend(classifier.flush()); - - assert_eq!( - outcomes.len(), - 1, - "expected one finalised outcome after flush" - ); - let outcome = &outcomes[0]; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 256, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 256, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 256, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 256, + n_batch = 128, + n_ubatch = 64, +)] +fn sample_returns_result_and_succeeds_with_valid_index(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mut context = LlamaContext::from_model( + model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let tokens = model.str_to_token("Hello", AddBos::Always)?; + 
let mut batch = LlamaBatch::new(512, 1)?; + + batch.add_sequence(&tokens, 0, false)?; + + context.decode(&mut batch)?; + + let mut sampler = LlamaSampler::chain_simple([LlamaSampler::temp(0.8), LlamaSampler::greedy()]); + + let result = sampler.sample(&context, batch.n_tokens() - 1); + + assert!(result.is_ok()); + Ok(()) +} - let raw_as_sampled = SampledToken::Content(raw_token); - assert!( - !model.is_eog_token(&raw_as_sampled), - "Grammar sampler should not allow EOS as first token" - ); +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn grammar_sampler_constrains_output_to_yes_or_no(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mut context = LlamaContext::from_model( + model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let prompt = "<|im_start|>user\nIs the sky blue? 
Answer yes or no.<|im_end|>\n<|im_start|>assistant\n\n\n\n\n"; + let tokens = model.str_to_token(prompt, AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + + batch.add_sequence(&tokens, 0, false)?; + + context.decode(&mut batch)?; + + let mut sampler = LlamaSampler::chain_simple([ + LlamaSampler::grammar(model, r"root ::= [Yy] [Ee] [Ss] | [Nn] [Oo]", "root")?, + LlamaSampler::temp(0.8), + LlamaSampler::greedy(), + ]); + + let mut classifier = model.sampled_token_classifier(); + let (raw_token, mut outcomes) = + classifier.sample(&mut sampler, &context, batch.n_tokens() - 1)?; + outcomes.extend(classifier.flush()); + + assert_eq!( + outcomes.len(), + 1, + "expected one finalised outcome after flush" + ); + let outcome = &outcomes[0]; + + let raw_as_sampled = SampledToken::Content(raw_token); + assert!( + !model.is_eog_token(&raw_as_sampled), + "Grammar sampler should not allow EOS as first token" + ); + + let piece = &outcome.raw_piece; + let first_char = piece + .chars() + .next() + .ok_or_else(|| anyhow::anyhow!("piece should have at least one character"))? + .to_lowercase() + .next() + .ok_or_else(|| anyhow::anyhow!("lowercase iterator should yield a character"))?; + + assert!( + first_char == 'y' || first_char == 'n', + "Grammar should constrain first token to start with y/n, got: '{piece}'" + ); + assert_eq!( + classifier.usage().completion_tokens(), + 1, + "exactly one completion token sampled" + ); + + Ok(()) +} - let piece = &outcome.raw_piece; - let first_char = piece - .chars() - .next() - .ok_or_else(|| anyhow::anyhow!("piece should have at least one character"))? 
- .to_lowercase() - .next() - .ok_or_else(|| anyhow::anyhow!("lowercase iterator should yield a character"))?; - - assert!( - first_char == 'y' || first_char == 'n', - "Grammar should constrain first token to start with y/n, got: '{piece}'" - ); - assert_eq!( - classifier.usage().completion_tokens(), - 1, - "exactly one completion token sampled" - ); +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn json_schema_grammar_sampler_constrains_output_to_json(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mut context = LlamaContext::from_model( + model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let prompt = "<|im_start|>user\nWhat is 2+2? 
Respond with a JSON object.<|im_end|>\n<|im_start|>assistant\n\n\n\n\n"; + let tokens = model.str_to_token(prompt, AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + + batch.add_sequence(&tokens, 0, false)?; + + context.decode(&mut batch)?; + + let grammar_str = json_schema_to_grammar( + r#"{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}"#, + )?; + + let mut sampler = LlamaSampler::chain_simple([ + LlamaSampler::grammar(model, &grammar_str, "root")?, + LlamaSampler::temp(0.8), + LlamaSampler::greedy(), + ]); + + let mut classifier = model.sampled_token_classifier(); + let (raw_token, mut outcomes) = + classifier.sample(&mut sampler, &context, batch.n_tokens() - 1)?; + outcomes.extend(classifier.flush()); + + assert_eq!( + outcomes.len(), + 1, + "expected one finalised outcome after flush" + ); + let outcome = &outcomes[0]; + + let raw_as_sampled = SampledToken::Content(raw_token); + assert!( + !model.is_eog_token(&raw_as_sampled), + "Grammar sampler should not allow EOS as first token" + ); + + let piece = &outcome.raw_piece; + + assert!( + piece.starts_with('{'), + "JSON schema grammar should constrain first token to start with '{{', got: '{piece}'" + ); + assert_eq!( + classifier.usage().completion_tokens(), + 1, + "exactly one completion token sampled" + ); + + Ok(()) +} - Ok(()) +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx 
= 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn sample_with_grammar_produces_constrained_output_in_loop( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let model = fixture.model; + let mut context = LlamaContext::from_model( + model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let prompt = "<|im_start|>user\nIs the sky blue? yes or no<|im_end|>\n<|im_start|>assistant\n\n\n\n\n"; + let tokens = model.str_to_token(prompt, AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + + let mut classifier = model.sampled_token_classifier(); + classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?; + + context.decode(&mut batch)?; + classifier.commit_prompt_tokens(); + + let mut sampler = LlamaSampler::chain_simple([ + LlamaSampler::grammar(model, r#"root ::= "yes" | "no""#, "root")?, + LlamaSampler::temp(0.8), + LlamaSampler::greedy(), + ]); + + let initial_position = batch.n_tokens(); + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position, + max_generated_tokens: 10, } + .run()?; + + let lowercase = outcome.generated_raw.to_lowercase(); + assert!( + lowercase == "yes" || lowercase == "no", + "Grammar loop should produce 'yes' or 'no', got: '{}'", + outcome.generated_raw + ); + assert!( + outcome.eog_seen, + "loop must terminate via EOG once grammar accepts, not by exhausting the budget; outcome={outcome:?}" + ); + assert_eq!(outcome.observed_reasoning, 0); + assert_eq!(outcome.observed_undeterminable, 0); + assert_eq!(outcome.observed_tool_call, 0); + assert!(outcome.observed_content > 0); + + let usage = classifier.into_usage(); + assert_eq!(usage.completion_tokens(), 
outcome.observed_content); + assert_eq!(usage.reasoning_tokens, 0); + assert_eq!(usage.undeterminable_tokens, 0); + + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn json_schema_grammar_sampler_constrains_output_to_json( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let model = fixture.model; - let mut context = LlamaContext::from_model( - model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let prompt = "<|im_start|>user\nWhat is 2+2? 
Respond with a JSON object.<|im_end|>\n<|im_start|>assistant\n\n\n\n\n"; - let tokens = model.str_to_token(prompt, AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - - batch.add_sequence(&tokens, 0, false)?; - - context.decode(&mut batch)?; - - let grammar_str = json_schema_to_grammar( - r#"{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}"#, - )?; - - let mut sampler = LlamaSampler::chain_simple([ - LlamaSampler::grammar(model, &grammar_str, "root")?, - LlamaSampler::temp(0.8), - LlamaSampler::greedy(), - ]); - - let mut classifier = model.sampled_token_classifier(); - let (raw_token, mut outcomes) = - classifier.sample(&mut sampler, &context, batch.n_tokens() - 1)?; - outcomes.extend(classifier.flush()); - - assert_eq!( - outcomes.len(), - 1, - "expected one finalised outcome after flush" - ); - let outcome = &outcomes[0]; - +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn sample_without_grammar_produces_multiple_tokens(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mut context = LlamaContext::from_model( + model, + 
fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let prompt = + "<|im_start|>user\nSay hello<|im_end|>\n<|im_start|>assistant\n\n\n\n\n"; + let tokens = model.str_to_token(prompt, AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + + batch.add_sequence(&tokens, 0, false)?; + + context.decode(&mut batch)?; + + let mut sampler = LlamaSampler::chain_simple([LlamaSampler::temp(0.8), LlamaSampler::greedy()]); + + let mut classifier = model.sampled_token_classifier(); + let mut sampled_count: u64 = 0; + + for (position, _) in (batch.n_tokens()..).zip(0..5) { + let (raw_token, _outcomes) = classifier.sample(&mut sampler, &context, -1)?; let raw_as_sampled = SampledToken::Content(raw_token); - assert!( - !model.is_eog_token(&raw_as_sampled), - "Grammar sampler should not allow EOS as first token" - ); - - let piece = &outcome.raw_piece; - assert!( - piece.starts_with('{'), - "JSON schema grammar should constrain first token to start with '{{', got: '{piece}'" - ); - assert_eq!( - classifier.usage().completion_tokens(), - 1, - "exactly one completion token sampled" - ); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - 
use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn sample_with_grammar_produces_constrained_output_in_loop( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let model = fixture.model; - let mut context = LlamaContext::from_model( - model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let prompt = "<|im_start|>user\nIs the sky blue? yes or no<|im_end|>\n<|im_start|>assistant\n\n\n\n\n"; - let tokens = model.str_to_token(prompt, AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - - let mut classifier = model.sampled_token_classifier(); - classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?; - - context.decode(&mut batch)?; - classifier.commit_prompt_tokens(); - - let mut sampler = LlamaSampler::chain_simple([ - LlamaSampler::grammar(model, r#"root ::= "yes" | "no""#, "root")?, - LlamaSampler::temp(0.8), - LlamaSampler::greedy(), - ]); - - let initial_position = batch.n_tokens(); - let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut context, - batch: &mut batch, - initial_position, - max_generated_tokens: 10, + if model.is_eog_token(&raw_as_sampled) { + break; } - .run()?; - - let lowercase = outcome.generated_raw.to_lowercase(); - assert!( - lowercase == "yes" || lowercase == "no", - "Grammar loop should produce 'yes' or 'no', got: '{}'", - outcome.generated_raw - ); - assert!( - outcome.eog_seen, - "loop must terminate via EOG once grammar accepts, not by exhausting the budget; outcome={outcome:?}" - ); - assert_eq!(outcome.observed_reasoning, 0); - assert_eq!(outcome.observed_undeterminable, 0); - assert_eq!(outcome.observed_tool_call, 0); - assert!(outcome.observed_content > 0); - - let usage = classifier.into_usage(); - assert_eq!(usage.completion_tokens(), outcome.observed_content); - assert_eq!(usage.reasoning_tokens, 0); - assert_eq!(usage.undeterminable_tokens, 0); - - Ok(()) - } - - 
#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn sample_without_grammar_produces_multiple_tokens(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let mut context = LlamaContext::from_model( - model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let prompt = - "<|im_start|>user\nSay hello<|im_end|>\n<|im_start|>assistant\n\n\n\n\n"; - let tokens = model.str_to_token(prompt, AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; + sampled_count += 1; - batch.add_sequence(&tokens, 0, false)?; + batch.clear(); + batch.add(&raw_as_sampled, position, &[0], true)?; context.decode(&mut batch)?; - - let mut sampler = - LlamaSampler::chain_simple([LlamaSampler::temp(0.8), LlamaSampler::greedy()]); - - let mut classifier = model.sampled_token_classifier(); - let mut sampled_count: u64 = 0; - - for (position, _) in (batch.n_tokens()..).zip(0..5) { - let (raw_token, _outcomes) = classifier.sample(&mut sampler, &context, -1)?; - let raw_as_sampled = SampledToken::Content(raw_token); - - if model.is_eog_token(&raw_as_sampled) { - break; - } - - 
sampled_count += 1; - - batch.clear(); - batch.add(&raw_as_sampled, position, &[0], true)?; - - context.decode(&mut batch)?; - } - - let _ = classifier.flush(); - - assert!( - sampled_count > 0, - "Should produce at least one token without grammar" - ); - let usage = classifier.into_usage(); - assert!( - usage.completion_tokens() >= sampled_count, - "completion_tokens ({}) must include the {sampled_count} non-EOG samples", - usage.completion_tokens() - ); - - Ok(()) - } -} - -mod sampling { - #![expect( - clippy::unnecessary_wraps, - reason = "every trial returns anyhow::Result<()> to match the LlamaTestFn signature" - )] - - use anyhow::Result; - use llama_cpp_bindings::GrammarError; - use llama_cpp_bindings::context::LlamaContext; - use llama_cpp_bindings::llama_batch::LlamaBatch; - use llama_cpp_bindings::model::AddBos; - use llama_cpp_bindings::sampling::LlamaSampler; - use llama_cpp_bindings::token::LlamaToken; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - )] - fn dry_sampler_with_model(fixture: &LlamaFixture<'_>) -> Result<()> { - let breakers: Vec<&[u8]> = vec![b"\n", b"\t"]; - let _sampler = LlamaSampler::dry(fixture.model, 1.5, 2.0, 128, 2, &breakers); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - )] - fn dry_sampler_with_null_byte_in_seq_breakers_returns_error( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let breakers: Vec<&[u8]> = vec![b"hello\0world"]; - let result = LlamaSampler::dry(fixture.model, 1.5, 2.0, 128, 2, breakers); - - assert!(result.is_err()); - - Ok(()) - } - - #[llama_test( - model_source = 
HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - )] - fn grammar_returns_sampler_for_valid_grammar(fixture: &LlamaFixture<'_>) -> Result<()> { - let sampler = LlamaSampler::grammar(fixture.model, "root ::= \"hello\"", "root"); - - assert!(sampler.is_ok()); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - )] - fn grammar_lazy_returns_sampler_for_valid_grammar_with_triggers( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let trigger_words: Vec<&[u8]> = vec![b"function"]; - let sampler = LlamaSampler::grammar_lazy( - fixture.model, - "root ::= \"hello\"", - "root", - trigger_words, - &[], - ); - - assert!(sampler.is_ok()); - - Ok(()) } - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - )] - fn grammar_lazy_patterns_returns_sampler_for_valid_grammar_with_patterns( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let patterns = vec!["\\{.*".to_owned()]; - let sampler = LlamaSampler::grammar_lazy_patterns( - fixture.model, - "root ::= \"hello\"", - "root", - &patterns, - &[], - ); - - assert!(sampler.is_ok()); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - )] - fn grammar_lazy_with_root_not_found_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { - let trigger_words: Vec<&[u8]> = vec![b"function"]; - let result = LlamaSampler::grammar_lazy( - fixture.model, - "expr ::= \"hello\"", - "root", - 
trigger_words, - &[], - ); - - assert!(matches!(result, Err(GrammarError::RootNotFound))); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - )] - fn grammar_lazy_with_null_byte_in_trigger_word_returns_error( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let trigger_words: Vec<&[u8]> = vec![b"hel\0lo"]; - let result = LlamaSampler::grammar_lazy( - fixture.model, - "root ::= \"hello\"", - "root", - trigger_words, - &[], - ); - - assert!(matches!(result, Err(GrammarError::TriggerWordNullBytes(_)))); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - )] - fn grammar_lazy_patterns_with_root_not_found_returns_error( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let patterns = vec!["\\{.*".to_owned()]; - let result = LlamaSampler::grammar_lazy_patterns( - fixture.model, - "expr ::= \"hello\"", - "root", - &patterns, - &[], - ); - - assert!(matches!(result, Err(GrammarError::RootNotFound))); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - )] - fn grammar_lazy_patterns_with_null_byte_in_pattern_returns_error( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let patterns = vec!["hel\0lo".to_owned()]; - let result = LlamaSampler::grammar_lazy_patterns( - fixture.model, - "root ::= \"hello\"", - "root", - &patterns, - &[], - ); - - assert!(matches!(result, Err(GrammarError::GrammarNullBytes(_)))); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - 
n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - )] - fn grammar_lazy_patterns_with_malformed_regex_returns_invalid_trigger_pattern( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let patterns = vec!["[".to_owned()]; - let result = LlamaSampler::grammar_lazy_patterns( - fixture.model, - "root ::= \"hello\"", - "root", - &patterns, - &[], - ); - - assert!(matches!( - result, - Err(GrammarError::InvalidTriggerPattern { .. }), - )); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - )] - fn llguidance_method_creates_sampler(fixture: &LlamaFixture<'_>) -> Result<()> { - let result = LlamaSampler::llguidance(fixture.model, "regex", r"yes|no"); - - assert!(result.is_ok()); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - )] - fn logit_bias_with_empty_biases_succeeds(_fixture: &LlamaFixture<'_>) -> Result<()> { - let result = LlamaSampler::logit_bias(0, &[]); + let _ = classifier.flush(); - assert!(result.is_ok()); + assert!( + sampled_count > 0, + "Should produce at least one token without grammar" + ); + let usage = classifier.into_usage(); + assert!( + usage.completion_tokens() >= sampled_count, + "completion_tokens ({}) must include the {sampled_count} non-EOG samples", + usage.completion_tokens() + ); - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - )] - fn dry_sampler_with_root_not_found_grammar_does_not_apply( - fixture: &LlamaFixture<'_>, - ) -> 
Result<()> { - let breakers: Vec<&[u8]> = vec![b"\n"]; - let _sampler = LlamaSampler::dry(fixture.model, 1.5, 2.0, 128, 2, &breakers); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - )] - fn accept_many_iterates_over_borrowed_tokens(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut sampler = LlamaSampler::chain_simple([LlamaSampler::greedy()]); - let tokens = vec![fixture.model.token_bos(), fixture.model.token_eos()]; + Ok(()) +} - sampler.accept_many(&tokens)?; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn dry_sampler_with_model(fixture: &LlamaFixture<'_>) -> Result<()> { + let breakers: Vec<&[u8]> = vec![b"\n", b"\t"]; + let _sampler = LlamaSampler::dry(fixture.model, 1.5, 2.0, 128, 2, &breakers); + + Ok(()) +} - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn dry_sampler_with_null_byte_in_seq_breakers_returns_error( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let breakers: Vec<&[u8]> = vec![b"hello\0world"]; + let result = LlamaSampler::dry(fixture.model, 1.5, 2.0, 128, 2, breakers); + + assert!(result.is_err()); + + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - )] - fn with_tokens_returns_self_after_accepting_each_token( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let sampler = 
LlamaSampler::chain_simple([LlamaSampler::greedy()]); - let tokens = [fixture.model.token_bos(), fixture.model.token_eos()]; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn grammar_returns_sampler_for_valid_grammar(fixture: &LlamaFixture<'_>) -> Result<()> { + let sampler = LlamaSampler::grammar(fixture.model, "root ::= \"hello\"", "root"); + + assert!(sampler.is_ok()); + + Ok(()) +} - let _consumed = sampler.with_tokens(tokens.iter().copied())?; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn grammar_lazy_returns_sampler_for_valid_grammar_with_triggers( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let trigger_words: Vec<&[u8]> = vec![b"function"]; + let sampler = LlamaSampler::grammar_lazy( + fixture.model, + "root ::= \"hello\"", + "root", + trigger_words, + &[], + ); + + assert!(sampler.is_ok()); + + Ok(()) +} - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn grammar_lazy_patterns_returns_sampler_for_valid_grammar_with_patterns( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let patterns = vec!["\\{.*".to_owned()]; + let sampler = LlamaSampler::grammar_lazy_patterns( + fixture.model, + "root ::= \"hello\"", + "root", + &patterns, + &[], + ); + + assert!(sampler.is_ok()); + + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - )] - fn 
accept_consumes_a_single_token(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut sampler = LlamaSampler::chain_simple([LlamaSampler::greedy()]); +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn grammar_lazy_with_root_not_found_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let trigger_words: Vec<&[u8]> = vec![b"function"]; + let result = LlamaSampler::grammar_lazy( + fixture.model, + "expr ::= \"hello\"", + "root", + trigger_words, + &[], + ); + + assert!(matches!(result, Err(GrammarError::RootNotFound))); + + Ok(()) +} - sampler.accept(fixture.model.token_bos())?; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn grammar_lazy_with_null_byte_in_trigger_word_returns_error( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let trigger_words: Vec<&[u8]> = vec![b"hel\0lo"]; + let result = LlamaSampler::grammar_lazy( + fixture.model, + "root ::= \"hello\"", + "root", + trigger_words, + &[], + ); + + assert!(matches!(result, Err(GrammarError::TriggerWordNullBytes(_)))); + + Ok(()) +} - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn grammar_lazy_patterns_with_root_not_found_returns_error( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let patterns = vec!["\\{.*".to_owned()]; + let result = LlamaSampler::grammar_lazy_patterns( + fixture.model, + "expr ::= \"hello\"", + "root", + &patterns, + &[], + ); + + assert!(matches!(result, Err(GrammarError::RootNotFound))); + + Ok(()) +} - #[llama_test( - model_source = 
HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - )] - fn try_accept_returns_ok_for_a_valid_token(_fixture: &LlamaFixture<'_>) -> Result<()> { - let mut sampler = LlamaSampler::chain_simple([LlamaSampler::greedy()]); +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn grammar_lazy_patterns_with_null_byte_in_pattern_returns_error( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let patterns = vec!["hel\0lo".to_owned()]; + let result = LlamaSampler::grammar_lazy_patterns( + fixture.model, + "root ::= \"hello\"", + "root", + &patterns, + &[], + ); + + assert!(matches!(result, Err(GrammarError::GrammarNullBytes(_)))); + + Ok(()) +} - sampler.try_accept(LlamaToken::new(0))?; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn grammar_lazy_patterns_with_malformed_regex_returns_invalid_trigger_pattern( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let patterns = vec!["[".to_owned()]; + let result = LlamaSampler::grammar_lazy_patterns( + fixture.model, + "root ::= \"hello\"", + "root", + &patterns, + &[], + ); + + assert!(matches!( + result, + Err(GrammarError::InvalidTriggerPattern { .. 
}), + )); + + Ok(()) +} - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn llguidance_method_creates_sampler(fixture: &LlamaFixture<'_>) -> Result<()> { + let result = LlamaSampler::llguidance(fixture.model, "regex", r"yes|no"); + + assert!(result.is_ok()); + + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - )] - fn apply_runs_sampler_over_token_data_array(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let tokens = fixture.model.str_to_token("Hi", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn logit_bias_with_empty_biases_succeeds(_fixture: &LlamaFixture<'_>) -> Result<()> { + let result = LlamaSampler::logit_bias(0, &[]); + + assert!(result.is_ok()); + + Ok(()) +} - let mut data_array = context.token_data_array_ith(batch.n_tokens() - 1)?; - let sampler = LlamaSampler::greedy(); - sampler.apply(&mut data_array); +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn dry_sampler_with_root_not_found_grammar_does_not_apply( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let breakers: Vec<&[u8]> 
= vec![b"\n"]; + let _sampler = LlamaSampler::dry(fixture.model, 1.5, 2.0, 128, 2, &breakers); + + Ok(()) +} - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn accept_many_iterates_over_borrowed_tokens(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut sampler = LlamaSampler::chain_simple([LlamaSampler::greedy()]); + let tokens = vec![fixture.model.token_bos(), fixture.model.token_eos()]; + + sampler.accept_many(&tokens)?; + + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - )] - fn sample_returns_token_after_decode(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let tokens = fixture.model.str_to_token("Hello", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - let mut sampler = - LlamaSampler::chain_simple([LlamaSampler::temp(0.8), LlamaSampler::greedy()]); - let result = sampler.sample(&context, batch.n_tokens() - 1); +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn with_tokens_returns_self_after_accepting_each_token(fixture: &LlamaFixture<'_>) -> Result<()> { + let sampler = LlamaSampler::chain_simple([LlamaSampler::greedy()]); + let tokens = [fixture.model.token_bos(), fixture.model.token_eos()]; + + let _consumed = sampler.with_tokens(tokens.iter().copied())?; + + Ok(()) +} - assert!(result.is_ok()); 
+#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn accept_consumes_a_single_token(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut sampler = LlamaSampler::chain_simple([LlamaSampler::greedy()]); + + sampler.accept(fixture.model.token_bos())?; + + Ok(()) +} - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn try_accept_returns_ok_for_a_valid_token(_fixture: &LlamaFixture<'_>) -> Result<()> { + let mut sampler = LlamaSampler::chain_simple([LlamaSampler::greedy()]); + + sampler.try_accept(LlamaToken::new(0))?; + + Ok(()) } -mod text_generation { - use std::io::Write; - use std::time::Duration; - - use anyhow::Context as _; - use anyhow::Result; - use llama_cpp_bindings::context::LlamaContext; - use llama_cpp_bindings::ggml_time_us; - use llama_cpp_bindings::llama_batch::LlamaBatch; - use llama_cpp_bindings::model::AddBos; - use llama_cpp_bindings::model::LlamaChatMessage; - use llama_cpp_bindings::sampled_token::SampledToken; - use llama_cpp_bindings::sampling::LlamaSampler; - use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( 
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn raw_prompt_completion_with_timing(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - let mut ctx = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - ) - .with_context(|| "unable to create context")?; - - let prompt = "Hello my name is"; - let max_generated_tokens: i32 = 64; - - let mut classifier = model.sampled_token_classifier(); - let tokens_list = model - .str_to_token(prompt, AddBos::Always) - .with_context(|| format!("failed to tokenize {prompt}"))?; - let prompt_token_count = u64::try_from(tokens_list.len())?; - - let mut decoder = encoding_rs::UTF_8.new_decoder(); - - for token in &tokens_list { - eprint!( - "{}", - model.token_to_piece(&SampledToken::Content(*token), &mut decoder, true, None)? 
- ); - } - std::io::stderr().flush()?; - - let mut batch = LlamaBatch::new(512, 1)?; - classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens_list, 0, false)?; - - assert_eq!(classifier.pending_prompt_tokens(), prompt_token_count); - assert_eq!(classifier.usage().prompt_tokens, 0); - - ctx.decode(&mut batch) - .with_context(|| "llama_decode() failed")?; - - let promoted = classifier.commit_prompt_tokens(); - assert_eq!(promoted, prompt_token_count); - assert_eq!(classifier.usage().prompt_tokens, prompt_token_count); - - let mut sampler = - LlamaSampler::chain_simple([LlamaSampler::dist(1234), LlamaSampler::greedy()]); - let initial_position = batch.n_tokens(); - let t_main_start = ggml_time_us(); - let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut ctx, - batch: &mut batch, - initial_position, - max_generated_tokens, - } - .run()?; - let t_main_end = ggml_time_us(); - let duration = Duration::from_micros(u64::try_from(t_main_end - t_main_start)?); - let total_observed = - outcome.observed_content + outcome.observed_reasoning + outcome.observed_undeterminable; - - #[expect( - clippy::cast_precision_loss, - reason = "logged throughput tolerates f32 precision" - )] - let tokens_per_second = total_observed as f32 / duration.as_secs_f32(); - - eprintln!( - "\ndecoded {total_observed} tokens in {:.2} s, speed {tokens_per_second:.2} t/s", - duration.as_secs_f32(), - ); +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn apply_runs_sampler_over_token_data_array(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = fixture.model.str_to_token("Hi", AddBos::Always)?; + let mut batch = 
LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + + let mut data_array = context.token_data_array_ith(batch.n_tokens() - 1)?; + let sampler = LlamaSampler::greedy(); + sampler.apply(&mut data_array); + + Ok(()) +} - assert!( - !outcome.generated_raw.is_empty(), - "model should generate at least one token" - ); - assert_eq!( - outcome.observed_tool_call, 0, - "raw prompt without tool-call markers must not produce ToolCall tokens; \ - outcome={outcome:?}" - ); - assert!( - total_observed > 0, - "model must produce at least one classified token; outcome={outcome:?}" - ); +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn sample_returns_token_after_decode(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = fixture.model.str_to_token("Hello", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + let mut sampler = LlamaSampler::chain_simple([LlamaSampler::temp(0.8), LlamaSampler::greedy()]); + let result = sampler.sample(&context, batch.n_tokens() - 1); + + assert!(result.is_ok()); + + Ok(()) +} - let usage = classifier.into_usage(); - assert_eq!( - usage.prompt_tokens, prompt_token_count, - "prompt_tokens must equal the tokenizer's prompt length" - ); - assert_eq!( - usage.content_tokens, outcome.observed_content, - "content_tokens must equal observed Content variants" - ); - assert_eq!( - usage.reasoning_tokens, outcome.observed_reasoning, - "reasoning_tokens must equal observed Reasoning variants" - ); - assert_eq!( - usage.undeterminable_tokens, outcome.observed_undeterminable, - "undeterminable_tokens must equal 
observed Undeterminable variants" - ); - assert_eq!( - usage.tool_call_tokens, outcome.observed_tool_call, - "tool_call_tokens must equal observed ToolCall variants" - ); - assert_eq!( - usage.completion_tokens(), - total_observed, - "completion_tokens must equal Content + Reasoning + Undeterminable" +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn raw_prompt_completion_with_timing(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; + let mut ctx = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + ) + .with_context(|| "unable to create context")?; + + let prompt = "Hello my name is"; + let max_generated_tokens: i32 = 64; + + let mut classifier = model.sampled_token_classifier(); + let tokens_list = model + .str_to_token(prompt, AddBos::Always) + .with_context(|| format!("failed to tokenize {prompt}"))?; + let prompt_token_count = u64::try_from(tokens_list.len())?; + + let mut decoder = encoding_rs::UTF_8.new_decoder(); + + for token in &tokens_list { + eprint!( + "{}", + 
model.token_to_piece(&SampledToken::Content(*token), &mut decoder, true, None)? ); - - Ok(()) } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 2048, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 2048, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 2048, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 2048, - n_batch = 512, - n_ubatch = 128, - )] - fn chat_inference_produces_coherent_output(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - let mut context = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let chat_template = model.chat_template(None)?; - let messages = vec![LlamaChatMessage::new( - "user".to_string(), - "Hello! 
How are you?".to_string(), - )?]; - let prompt = model.apply_chat_template(&chat_template, &messages, true)?; - - let mut classifier = model.sampled_token_classifier(); - let tokens = model.str_to_token(&prompt, AddBos::Always)?; - let prompt_token_count = u64::try_from(tokens.len())?; - - let mut batch = LlamaBatch::new(512, 1)?; - classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?; - - assert_eq!(classifier.pending_prompt_tokens(), prompt_token_count); - assert_eq!(classifier.usage().prompt_tokens, 0); - - context.decode(&mut batch)?; - - let promoted = classifier.commit_prompt_tokens(); - assert_eq!(promoted, prompt_token_count); - - let mut sampler = LlamaSampler::greedy(); - let initial_position = batch.n_tokens(); - let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut context, - batch: &mut batch, - initial_position, - max_generated_tokens: 1024, - } - .run()?; - - println!(); - - assert!( - !outcome.generated_raw.is_empty(), - "model should generate at least one token" - ); - let total_observed = - outcome.observed_content + outcome.observed_reasoning + outcome.observed_undeterminable; - assert!( - total_observed > 0, - "model must produce at least one classified token; outcome={outcome:?}" - ); - assert_eq!( - outcome.observed_tool_call, 0, - "chat without tool definitions must not produce ToolCall tokens; outcome={outcome:?}" - ); - - let usage = classifier.into_usage(); - - assert_eq!( - usage.prompt_tokens, prompt_token_count, - "prompt_tokens must equal the tokenizer's prompt length" - ); - assert_eq!( - usage.content_tokens, outcome.observed_content, - "content_tokens must equal observed Content variants" - ); - assert_eq!( - usage.reasoning_tokens, outcome.observed_reasoning, - "reasoning_tokens must equal observed Reasoning variants" - ); - assert_eq!( - usage.undeterminable_tokens, outcome.observed_undeterminable, - "undeterminable_tokens must equal observed 
Undeterminable variants" - ); - assert_eq!( - usage.completion_tokens(), - total_observed, - "completion_tokens must equal Content + Reasoning + Undeterminable" - ); - assert_eq!( - usage.tool_call_tokens, outcome.observed_tool_call, - "tool_call_tokens must equal observed ToolCall variants" - ); - - Ok(()) + std::io::stderr().flush()?; + + let mut batch = LlamaBatch::new(512, 1)?; + classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens_list, 0, false)?; + + assert_eq!(classifier.pending_prompt_tokens(), prompt_token_count); + assert_eq!(classifier.usage().prompt_tokens, 0); + + ctx.decode(&mut batch) + .with_context(|| "llama_decode() failed")?; + + let promoted = classifier.commit_prompt_tokens(); + assert_eq!(promoted, prompt_token_count); + assert_eq!(classifier.usage().prompt_tokens, prompt_token_count); + + let mut sampler = + LlamaSampler::chain_simple([LlamaSampler::dist(1234), LlamaSampler::greedy()]); + let initial_position = batch.n_tokens(); + let t_main_start = ggml_time_us(); + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut ctx, + batch: &mut batch, + initial_position, + max_generated_tokens, } + .run()?; + let t_main_end = ggml_time_us(); + let duration = Duration::from_micros(u64::try_from(t_main_end - t_main_start)?); + let total_observed = + outcome.observed_content + outcome.observed_reasoning + outcome.observed_undeterminable; + + #[expect( + clippy::cast_precision_loss, + reason = "logged throughput tolerates f32 precision" + )] + let tokens_per_second = total_observed as f32 / duration.as_secs_f32(); + + eprintln!( + "\ndecoded {total_observed} tokens in {:.2} s, speed {tokens_per_second:.2} t/s", + duration.as_secs_f32(), + ); + + assert!( + !outcome.generated_raw.is_empty(), + "model should generate at least one token" + ); + assert_eq!( + outcome.observed_tool_call, 0, + "raw prompt without tool-call markers must not produce ToolCall tokens; \ + outcome={outcome:?}" 
+ ); + assert!( + total_observed > 0, + "model must produce at least one classified token; outcome={outcome:?}" + ); + + let usage = classifier.into_usage(); + assert_eq!( + usage.prompt_tokens, prompt_token_count, + "prompt_tokens must equal the tokenizer's prompt length" + ); + assert_eq!( + usage.content_tokens, outcome.observed_content, + "content_tokens must equal observed Content variants" + ); + assert_eq!( + usage.reasoning_tokens, outcome.observed_reasoning, + "reasoning_tokens must equal observed Reasoning variants" + ); + assert_eq!( + usage.undeterminable_tokens, outcome.observed_undeterminable, + "undeterminable_tokens must equal observed Undeterminable variants" + ); + assert_eq!( + usage.tool_call_tokens, outcome.observed_tool_call, + "tool_call_tokens must equal observed ToolCall variants" + ); + assert_eq!( + usage.completion_tokens(), + total_observed, + "completion_tokens must equal Content + Reasoning + Undeterminable" + ); + + Ok(()) } -mod constrained_decoding { - use std::io::Write; - - use anyhow::Result; - use llama_cpp_bindings::context::LlamaContext; - use llama_cpp_bindings::llama_batch::LlamaBatch; - use llama_cpp_bindings::model::AddBos; - use llama_cpp_bindings::sampled_token::SampledToken; - use llama_cpp_bindings::sampling::LlamaSampler; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - 
use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn json_schema_constrains_output(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - - let prompt = "The weather in Paris is sunny and 22 degrees. Extract as JSON:\n"; - - let mut ctx = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let tokens_list = model.str_to_token(prompt, AddBos::Always)?; - - let mut batch = LlamaBatch::new(512, 1)?; - let last_index = i32::try_from(tokens_list.len())? - 1; - - for (index, token) in (0_i32..).zip(&tokens_list) { - batch.add( - &SampledToken::Content(*token), - index, - &[0], - index == last_index, - )?; - } - - ctx.decode(&mut batch)?; - - let schema = r#"{ - "type": "object", - "properties": { - "city": { "type": "string" }, - "temperature": { "type": "number" } - }, - "required": ["city", "temperature"] - }"#; - - let llg_sampler = LlamaSampler::llguidance(model, "json", schema)?; - let mut sampler = LlamaSampler::chain_simple([llg_sampler, LlamaSampler::greedy()]); - - let mut n_cur = batch.n_tokens(); - let mut decoder = encoding_rs::UTF_8.new_decoder(); - let mut generated = String::new(); - - while n_cur <= 128 { - let token = SampledToken::Content(sampler.sample(&ctx, batch.n_tokens() - 1)?); - - if model.is_eog_token(&token) { - break; - } - - let output_string = model.token_to_piece(&token, &mut decoder, true, None)?; - generated.push_str(&output_string); - print!("{output_string}"); - std::io::stdout().flush()?; - - batch.clear(); - batch.add(&token, n_cur, &[0], true)?; - n_cur += 1; - ctx.decode(&mut batch)?; - } - - println!(); - - let parsed = serde_json::Deserializer::from_str(&generated) - 
.into_iter::() - .next() - .ok_or_else(|| anyhow::anyhow!("model produced no JSON value"))??; - - assert!(parsed.get("city").is_some()); - assert!(parsed.get("temperature").is_some()); - - Ok(()) +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 2048, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 2048, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 2048, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 2048, + n_batch = 512, + n_ubatch = 128, +)] +fn chat_inference_produces_coherent_output(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let chat_template = model.chat_template(None)?; + let messages = vec![LlamaChatMessage::new( + "user".to_string(), + "Hello! 
How are you?".to_string(), + )?]; + let prompt = model.apply_chat_template(&chat_template, &messages, true)?; + + let mut classifier = model.sampled_token_classifier(); + let tokens = model.str_to_token(&prompt, AddBos::Always)?; + let prompt_token_count = u64::try_from(tokens.len())?; + + let mut batch = LlamaBatch::new(512, 1)?; + classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?; + + assert_eq!(classifier.pending_prompt_tokens(), prompt_token_count); + assert_eq!(classifier.usage().prompt_tokens, 0); + + context.decode(&mut batch)?; + + let promoted = classifier.commit_prompt_tokens(); + assert_eq!(promoted, prompt_token_count); + + let mut sampler = LlamaSampler::greedy(); + let initial_position = batch.n_tokens(); + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position, + max_generated_tokens: 1024, } + .run()?; + + println!(); + + assert!( + !outcome.generated_raw.is_empty(), + "model should generate at least one token" + ); + let total_observed = + outcome.observed_content + outcome.observed_reasoning + outcome.observed_undeterminable; + assert!( + total_observed > 0, + "model must produce at least one classified token; outcome={outcome:?}" + ); + assert_eq!( + outcome.observed_tool_call, 0, + "chat without tool definitions must not produce ToolCall tokens; outcome={outcome:?}" + ); + + let usage = classifier.into_usage(); + + assert_eq!( + usage.prompt_tokens, prompt_token_count, + "prompt_tokens must equal the tokenizer's prompt length" + ); + assert_eq!( + usage.content_tokens, outcome.observed_content, + "content_tokens must equal observed Content variants" + ); + assert_eq!( + usage.reasoning_tokens, outcome.observed_reasoning, + "reasoning_tokens must equal observed Reasoning variants" + ); + assert_eq!( + usage.undeterminable_tokens, outcome.observed_undeterminable, + "undeterminable_tokens must equal observed 
Undeterminable variants" + ); + assert_eq!( + usage.completion_tokens(), + total_observed, + "completion_tokens must equal Content + Reasoning + Undeterminable" + ); + assert_eq!( + usage.tool_call_tokens, outcome.observed_tool_call, + "tool_call_tokens must equal observed ToolCall variants" + ); + + Ok(()) } -mod llguidance { - #![expect( - clippy::unnecessary_wraps, - reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" - )] - - use std::ffi::CStr; - use std::sync::Arc; - - use anyhow::Result; - use llama_cpp_bindings::context::LlamaContext; - use llama_cpp_bindings::llama_batch::LlamaBatch; - use llama_cpp_bindings::llguidance_sampler::create_llg_sampler; - use llama_cpp_bindings::model::AddBos; - use llama_cpp_bindings::sampling::LlamaSampler; - use llama_cpp_bindings::token::LlamaToken; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - - const JSON_SCHEMA: &str = - r#"{"type":"object","properties":{"answer":{"type":"string"}},"required":["answer"]}"#; - const REGEX_GRAMMAR: &str = r"yes|no"; - const LARK_GRAMMAR: &str = r#"start: "yes" | "no""#; - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = 
true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn creates_sampler_with_valid_json_schema(fixture: &LlamaFixture<'_>) -> Result<()> { - let sampler = create_llg_sampler(fixture.model, "json", JSON_SCHEMA)?; - - assert!(!sampler.sampler.is_null()); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn creates_sampler_with_valid_regex_grammar(fixture: &LlamaFixture<'_>) -> Result<()> { - let sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?; - - assert!(!sampler.sampler.is_null()); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", 
"Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn creates_sampler_with_valid_lark_grammar(fixture: &LlamaFixture<'_>) -> Result<()> { - let sampler = create_llg_sampler(fixture.model, "lark", LARK_GRAMMAR)?; - - assert!(!sampler.sampler.is_null()); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn returns_error_for_unknown_grammar_kind(fixture: &LlamaFixture<'_>) -> Result<()> { - let result = create_llg_sampler(fixture.model, "not_a_real_kind", "anything"); - - assert!(result.is_err()); - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - 
#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn returns_error_for_malformed_json_schema(fixture: &LlamaFixture<'_>) -> Result<()> { - let result = create_llg_sampler(fixture.model, "json", "{this is not valid json"); - - assert!(result.is_err()); - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn returns_error_for_malformed_regex(fixture: &LlamaFixture<'_>) -> Result<()> { - let result = create_llg_sampler(fixture.model, "regex", "[invalid"); - - assert!(result.is_err()); - 
Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn name_callback_returns_llguidance(fixture: &LlamaFixture<'_>) -> Result<()> { - let sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?; - - let name_ptr = unsafe { llama_cpp_bindings_sys::llama_sampler_name(sampler.sampler) }; - assert!(!name_ptr.is_null()); - let name = unsafe { CStr::from_ptr(name_ptr) }.to_str()?; - - assert_eq!(name, "llguidance"); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 
512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn clone_via_ffi_creates_independent_sampler(fixture: &LlamaFixture<'_>) -> Result<()> { - let sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?; - - let cloned = unsafe { llama_cpp_bindings_sys::llama_sampler_clone(sampler.sampler) }; - - assert!(!cloned.is_null()); - - unsafe { llama_cpp_bindings_sys::llama_sampler_free(cloned) }; - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn samples_token_constrained_by_grammar(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - let mut context = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, 
+ use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn json_schema_constrains_output(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; + + let prompt = "The weather in Paris is sunny and 22 degrees. Extract as JSON:\n"; + + let mut ctx = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let tokens_list = model.str_to_token(prompt, AddBos::Always)?; + + let mut batch = LlamaBatch::new(512, 1)?; + let last_index = i32::try_from(tokens_list.len())? 
- 1; + + for (index, token) in (0_i32..).zip(&tokens_list) { + batch.add( + &SampledToken::Content(*token), + index, + &[0], + index == last_index, )?; - - let prompt = "Answer yes or no:"; - let tokens = model.str_to_token(prompt, AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - - let llg_sampler = create_llg_sampler(model, "regex", REGEX_GRAMMAR)?; - let mut chain = LlamaSampler::chain_simple([llg_sampler, LlamaSampler::greedy()]); - - let token = chain.sample(&context, batch.n_tokens() - 1)?; - chain.accept(token)?; - - Ok(()) } - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn accept_invalid_token_id_does_not_panic(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?; - - let huge_token = LlamaToken(i32::MAX - 1); - let _ = sampler.accept(huge_token); + ctx.decode(&mut batch)?; - Ok(()) - } + let schema = r#"{ + "type": "object", + "properties": { + "city": { "type": "string" }, + "temperature": { "type": "number" } + }, + "required": ["city", 
"temperature"] +}"#; - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn approximate_tok_env_returns_same_arc_across_calls(fixture: &LlamaFixture<'_>) -> Result<()> { - let first = fixture.model.approximate_tok_env(); - let second = fixture.model.approximate_tok_env(); + let llg_sampler = LlamaSampler::llguidance(model, "json", schema)?; + let mut sampler = LlamaSampler::chain_simple([llg_sampler, LlamaSampler::greedy()]); - assert!(Arc::ptr_eq(&first, &second)); + let mut n_cur = batch.n_tokens(); + let mut decoder = encoding_rs::UTF_8.new_decoder(); + let mut generated = String::new(); - Ok(()) - } + while n_cur <= 128 { + let token = SampledToken::Content(sampler.sample(&ctx, batch.n_tokens() - 1)?); - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock 
= false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn approximate_tok_env_drives_consistent_grammar_constraint( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let first = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?; - let second = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?; + if model.is_eog_token(&token) { + break; + } - assert!(!first.sampler.is_null()); - assert!(!second.sampler.is_null()); + let output_string = model.token_to_piece(&token, &mut decoder, true, None)?; + generated.push_str(&output_string); + print!("{output_string}"); + std::io::stdout().flush()?; - Ok(()) + batch.clear(); + batch.add(&token, n_cur, &[0], true)?; + n_cur += 1; + ctx.decode(&mut batch)?; } - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - 
n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - fn apply_through_chain_during_sample_does_not_panic(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - let mut context = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let tokens = model.str_to_token("Answer:", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; + println!(); - let llg_sampler = create_llg_sampler(model, "regex", REGEX_GRAMMAR)?; - let mut chain = LlamaSampler::chain_simple([llg_sampler, LlamaSampler::greedy()]); - let _ = chain.sample(&context, batch.n_tokens() - 1); + let parsed = serde_json::Deserializer::from_str(&generated) + .into_iter::() + .next() + .ok_or_else(|| anyhow::anyhow!("model produced no JSON value"))??; - Ok(()) - } + assert!(parsed.get("city").is_some()); + assert!(parsed.get("temperature").is_some()); - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, - )] - 
fn reset_clears_sampler_state(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?; - let huge_token = LlamaToken(i32::MAX - 1); - let _ = sampler.accept(huge_token); - sampler.reset(); - let after = sampler.accept(LlamaToken(0)); - assert!( - after.is_ok() || after.is_err(), - "after reset, sampler.accept must return Ok or Err (not panic)" - ); - Ok(()) - } + Ok(()) } -mod sampled_token_classifier_markers { - #![expect( - clippy::unnecessary_wraps, - reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" - )] - - use anyhow::Result; - use llama_cpp_bindings::SampledToken; - use llama_cpp_bindings::llama_batch::LlamaBatch; - use llama_cpp_bindings::sampled_token_classifier::SampledTokenClassifier; - use llama_cpp_bindings::sampled_token_section::SampledTokenSection; - use llama_cpp_bindings::streaming_markers::StreamingMarkers; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn 
classifier_starts_in_pending_section_for_default_fixture( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let classifier = fixture.model.sampled_token_classifier(); - - assert_eq!(classifier.current_section(), SampledTokenSection::Pending); - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn classifier_construction_is_idempotent_across_calls( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let first = fixture.model.sampled_token_classifier(); - let second = fixture.model.sampled_token_classifier(); - - assert_eq!(first.current_section(), second.current_section()); - assert_eq!(first.usage(), second.usage()); - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - 
#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn ingest_with_no_markers_emits_undeterminable_with_visible_and_raw_piece( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let model = fixture.model; - let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default()); - - let outcomes = classifier.ingest(model.token_bos()); - - assert_eq!(outcomes.len(), 1); - let outcome = &outcomes[0]; - assert!(matches!( - outcome.sampled_token, - SampledToken::Undeterminable(_) - )); - assert_eq!(outcome.visible_piece, outcome.raw_piece); - assert_eq!(classifier.usage().undeterminable_tokens, 1); - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn 
ingest_with_no_markers_decodes_each_token_independently( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let model = fixture.model; - let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default()); - - let _ = classifier.ingest(model.token_bos()); - let _ = classifier.ingest(model.token_eos()); - - assert_eq!(classifier.usage().undeterminable_tokens, 2); - Ok(()) - } +const JSON_SCHEMA: &str = + r#"{"type":"object","properties":{"answer":{"type":"string"}},"required":["answer"]}"#; +const REGEX_GRAMMAR: &str = r"yes|no"; +const LARK_GRAMMAR: &str = r#"start: "yes" | "no""#; + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn creates_sampler_with_valid_json_schema(fixture: &LlamaFixture<'_>) -> Result<()> { + let sampler = create_llg_sampler(fixture.model, "json", JSON_SCHEMA)?; + + assert!(!sampler.sampler.is_null()); + + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - 
#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn ingest_prompt_token_with_no_markers_is_a_noop(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default()); - let usage_before = *classifier.usage(); +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn creates_sampler_with_valid_regex_grammar(fixture: &LlamaFixture<'_>) -> Result<()> { + let sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?; 
+ + assert!(!sampler.sampler.is_null()); + + Ok(()) +} - classifier.ingest_prompt_token(model.token_bos()); - classifier.ingest_prompt_tokens(&[model.token_eos(), model.token_nl()]); +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn creates_sampler_with_valid_lark_grammar(fixture: &LlamaFixture<'_>) -> Result<()> { + let sampler = create_llg_sampler(fixture.model, "lark", LARK_GRAMMAR)?; + + assert!(!sampler.sampler.is_null()); + + Ok(()) +} - assert_eq!(*classifier.usage(), usage_before); - assert_eq!(classifier.current_section(), SampledTokenSection::Pending); - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", 
"Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn returns_error_for_unknown_grammar_kind(fixture: &LlamaFixture<'_>) -> Result<()> { + let result = create_llg_sampler(fixture.model, "not_a_real_kind", "anything"); + + assert!(result.is_err()); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn feed_prompt_to_batch_increments_pending_prompt_tokens( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let model = fixture.model; - let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default()); - let mut batch = LlamaBatch::new(8, 1)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + 
n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn returns_error_for_malformed_json_schema(fixture: &LlamaFixture<'_>) -> Result<()> { + let result = create_llg_sampler(fixture.model, "json", "{this is not valid json"); + + assert!(result.is_err()); + Ok(()) +} - classifier.feed_prompt_to_batch(&mut batch, model.token_bos(), 0, &[0], false)?; - classifier.feed_prompt_to_batch(&mut batch, model.token_eos(), 1, &[0], false)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn 
returns_error_for_malformed_regex(fixture: &LlamaFixture<'_>) -> Result<()> { + let result = create_llg_sampler(fixture.model, "regex", "[invalid"); + + assert!(result.is_err()); + Ok(()) +} - assert_eq!(classifier.pending_prompt_tokens(), 2); - assert_eq!(batch.n_tokens(), 2); +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn name_callback_returns_llguidance(fixture: &LlamaFixture<'_>) -> Result<()> { + let sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?; + + let name_ptr = unsafe { llama_cpp_bindings_sys::llama_sampler_name(sampler.sampler) }; + assert!(!name_ptr.is_null()); + let name = unsafe { CStr::from_ptr(name_ptr) }.to_str()?; + + assert_eq!(name, "llguidance"); + + Ok(()) +} - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + 
use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn clone_via_ffi_creates_independent_sampler(fixture: &LlamaFixture<'_>) -> Result<()> { + let sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?; + + let cloned = unsafe { llama_cpp_bindings_sys::llama_sampler_clone(sampler.sampler) }; + + assert!(!cloned.is_null()); + + unsafe { llama_cpp_bindings_sys::llama_sampler_free(cloned) }; + + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn feed_prompt_sequence_to_batch_stages_all_tokens(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let mut classifier = SampledTokenClassifier::new(model, 
StreamingMarkers::default()); - let mut batch = LlamaBatch::new(8, 1)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn samples_token_constrained_by_grammar(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let prompt = "Answer yes or no:"; + let tokens = model.str_to_token(prompt, AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + + let llg_sampler = create_llg_sampler(model, "regex", REGEX_GRAMMAR)?; + let mut chain = LlamaSampler::chain_simple([llg_sampler, LlamaSampler::greedy()]); + + let token = chain.sample(&context, batch.n_tokens() - 1)?; + chain.accept(token)?; + + Ok(()) +} - let tokens = vec![model.token_bos(), model.token_eos(), model.token_nl()]; - classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", 
"DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn accept_invalid_token_id_does_not_panic(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?; + + let huge_token = LlamaToken(i32::MAX - 1); + let _ = sampler.accept(huge_token); + + Ok(()) +} - assert_eq!(classifier.pending_prompt_tokens(), 3); - assert_eq!(batch.n_tokens(), 3); +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 
999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn approximate_tok_env_returns_same_arc_across_calls(fixture: &LlamaFixture<'_>) -> Result<()> { + let first = fixture.model.approximate_tok_env(); + let second = fixture.model.approximate_tok_env(); + + assert!(Arc::ptr_eq(&first, &second)); + + Ok(()) +} - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn approximate_tok_env_drives_consistent_grammar_constraint( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let first = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?; + let second = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?; + + assert!(!first.sampler.is_null()); + assert!(!second.sampler.is_null()); + + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - 
n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn commit_prompt_tokens_promotes_pending_count_to_usage_and_clears( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let model = fixture.model; - let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default()); - let mut batch = LlamaBatch::new(8, 1)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn apply_through_chain_during_sample_does_not_panic(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; + let mut context = LlamaContext::from_model( + model, + backend, + 
(*fixture.context_params).into_llama_context_params(), + )?; + + let tokens = model.str_to_token("Answer:", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + + let llg_sampler = create_llg_sampler(model, "regex", REGEX_GRAMMAR)?; + let mut chain = LlamaSampler::chain_simple([llg_sampler, LlamaSampler::greedy()]); + let _ = chain.sample(&context, batch.n_tokens() - 1); + + Ok(()) +} - classifier.feed_prompt_to_batch(&mut batch, model.token_bos(), 0, &[0], false)?; - classifier.feed_prompt_to_batch(&mut batch, model.token_eos(), 1, &[0], false)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn reset_clears_sampler_state(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?; + let huge_token = LlamaToken(i32::MAX - 1); + let _ = sampler.accept(huge_token); + sampler.reset(); + let after = sampler.accept(LlamaToken(0)); + assert!( + after.is_ok() || after.is_err(), + "after reset, sampler.accept must return Ok or Err (not panic)" + ); + Ok(()) +} - let promoted 
= classifier.commit_prompt_tokens(); +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn classifier_starts_in_pending_section_for_default_fixture( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let classifier = fixture.model.sampled_token_classifier(); + + assert_eq!(classifier.current_section(), SampledTokenSection::Pending); + Ok(()) +} - assert_eq!(promoted, 2); - assert_eq!(classifier.pending_prompt_tokens(), 0); - assert_eq!(classifier.usage().prompt_tokens, 2); +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 
128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn classifier_construction_is_idempotent_across_calls(fixture: &LlamaFixture<'_>) -> Result<()> { + let first = fixture.model.sampled_token_classifier(); + let second = fixture.model.sampled_token_classifier(); + + assert_eq!(first.current_section(), second.current_section()); + assert_eq!(first.usage(), second.usage()); + Ok(()) +} - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn ingest_with_no_markers_emits_undeterminable_with_visible_and_raw_piece( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let model = fixture.model; + let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default()); + + let outcomes = classifier.ingest(model.token_bos()); + + assert_eq!(outcomes.len(), 1); + let outcome = &outcomes[0]; + assert!(matches!( + outcome.sampled_token, + SampledToken::Undeterminable(_) + )); + assert_eq!(outcome.visible_piece, 
outcome.raw_piece); + assert_eq!(classifier.usage().undeterminable_tokens, 1); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn discard_pending_prompt_tokens_clears_count_without_recording_usage( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let model = fixture.model; - let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default()); - let mut batch = LlamaBatch::new(8, 1)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + 
model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn ingest_with_no_markers_decodes_each_token_independently( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let model = fixture.model; + let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default()); + + let _ = classifier.ingest(model.token_bos()); + let _ = classifier.ingest(model.token_eos()); + + assert_eq!(classifier.usage().undeterminable_tokens, 2); + Ok(()) +} - classifier.feed_prompt_to_batch(&mut batch, model.token_bos(), 0, &[0], false)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn ingest_prompt_token_with_no_markers_is_a_noop(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default()); + let usage_before = *classifier.usage(); + + classifier.ingest_prompt_token(model.token_bos()); + classifier.ingest_prompt_tokens(&[model.token_eos(), model.token_nl()]); + + 
assert_eq!(*classifier.usage(), usage_before); + assert_eq!(classifier.current_section(), SampledTokenSection::Pending); + Ok(()) +} - let discarded = classifier.discard_pending_prompt_tokens(); +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn feed_prompt_to_batch_increments_pending_prompt_tokens(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default()); + let mut batch = LlamaBatch::new(8, 1)?; + + classifier.feed_prompt_to_batch(&mut batch, model.token_bos(), 0, &[0], false)?; + classifier.feed_prompt_to_batch(&mut batch, model.token_eos(), 1, &[0], false)?; + + assert_eq!(classifier.pending_prompt_tokens(), 2); + assert_eq!(batch.n_tokens(), 2); + + Ok(()) +} - assert_eq!(discarded, 1); - assert_eq!(classifier.pending_prompt_tokens(), 0); - assert_eq!(classifier.usage().prompt_tokens, 0); +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, 
+ n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn feed_prompt_sequence_to_batch_stages_all_tokens(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default()); + let mut batch = LlamaBatch::new(8, 1)?; + + let tokens = vec![model.token_bos(), model.token_eos(), model.token_nl()]; + classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?; + + assert_eq!(classifier.pending_prompt_tokens(), 3); + assert_eq!(batch.n_tokens(), 3); + + Ok(()) +} - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", 
"Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn commit_prompt_tokens_promotes_pending_count_to_usage_and_clears( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let model = fixture.model; + let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default()); + let mut batch = LlamaBatch::new(8, 1)?; + + classifier.feed_prompt_to_batch(&mut batch, model.token_bos(), 0, &[0], false)?; + classifier.feed_prompt_to_batch(&mut batch, model.token_eos(), 1, &[0], false)?; + + let promoted = classifier.commit_prompt_tokens(); + + assert_eq!(promoted, 2); + assert_eq!(classifier.pending_prompt_tokens(), 0); + assert_eq!(classifier.usage().prompt_tokens, 2); + + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn diagnose_tool_call_synthetic_renders_returns_a_pair_of_strings( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let (left, right) = fixture.model.diagnose_tool_call_synthetic_renders()?; - let _ = left; - let _ = right; - Ok(()) - } +#[llama_test( + model_source = 
HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn discard_pending_prompt_tokens_clears_count_without_recording_usage( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let model = fixture.model; + let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default()); + let mut batch = LlamaBatch::new(8, 1)?; + + classifier.feed_prompt_to_batch(&mut batch, model.token_bos(), 0, &[0], false)?; + + let discarded = classifier.discard_pending_prompt_tokens(); + + assert_eq!(discarded, 1); + assert_eq!(classifier.pending_prompt_tokens(), 0); + assert_eq!(classifier.usage().prompt_tokens, 0); + + Ok(()) } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", 
"Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn diagnose_tool_call_synthetic_renders_returns_a_pair_of_strings( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let (left, right) = fixture.model.diagnose_tool_call_synthetic_renders()?; + let _ = left; + let _ = right; + Ok(()) +} llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/vocabulary_and_metadata.rs b/llama-cpp-bindings-tests/tests/vocabulary_and_metadata.rs index 7b26c7ee..fc3624f9 100644 --- a/llama-cpp-bindings-tests/tests/vocabulary_and_metadata.rs +++ b/llama-cpp-bindings-tests/tests/vocabulary_and_metadata.rs @@ -1,1978 +1,1889 @@ +#![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" +)] + +use std::ffi::CString; +use std::num::NonZeroU16; +use std::pin::pin; + +use anyhow::Result; +use llama_cpp_bindings::SampledToken; +use llama_cpp_bindings::context::params::LlamaContextParams; +use llama_cpp_bindings::max_devices; +use llama_cpp_bindings::model::AddBos; +use llama_cpp_bindings::model::params::LlamaModelParams; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; use llama_cpp_test_harness::llama_tests_main; -mod model_properties { - #![expect( - clippy::unnecessary_wraps, - reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" - )] - - use anyhow::Result; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - 
use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn model_loads_with_valid_metadata(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - - assert!(model.n_vocab() > 0); - assert!(model.n_embd() > 0); - assert!(model.n_params() > 0); - assert!(model.n_ctx_train()? > 0); - - Ok(()) - } - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn n_layer_returns_positive(fixture: &LlamaFixture<'_>) 
-> Result<()> { - assert!(fixture.model.n_layer()? > 0); - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn model_loads_with_valid_metadata(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + + assert!(model.n_vocab() > 0); + assert!(model.n_embd() > 0); + assert!(model.n_params() > 0); + assert!(model.n_ctx_train()? 
> 0); + + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn n_head_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> { - assert!(fixture.model.n_head()? 
> 0); - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn n_layer_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> { + assert!(fixture.model.n_layer()? 
> 0); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn n_head_kv_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> { - assert!(fixture.model.n_head_kv()? 
> 0); - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn n_head_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> { + assert!(fixture.model.n_head()? 
> 0); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn model_size_returns_nonzero(fixture: &LlamaFixture<'_>) -> Result<()> { - assert!(fixture.model.size() > 0); - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn 
n_head_kv_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> { + assert!(fixture.model.n_head_kv()? > 0); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn is_recurrent_returns_false_for_transformer(fixture: &LlamaFixture<'_>) -> Result<()> { - assert!(!fixture.model.is_recurrent()); - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + 
n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn model_size_returns_nonzero(fixture: &LlamaFixture<'_>) -> Result<()> { + assert!(fixture.model.size() > 0); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn is_hybrid_returns_false_for_non_hybrid_default_models( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - assert!( - !fixture.model.is_hybrid(), - "DeepSeek-R1-Distill-Llama-8B and GLM-4.7-Flash are pure transformers, not hybrid; got is_hybrid=true" - ); - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn is_recurrent_returns_false_for_transformer(fixture: &LlamaFixture<'_>) -> Result<()> { + 
assert!(!fixture.model.is_recurrent()); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn is_hybrid_returns_true_for_qwen_default_models(fixture: &LlamaFixture<'_>) -> Result<()> { - assert!( - fixture.model.is_hybrid(), - "Qwen 3.5 and Qwen 3.6 default GGUFs are reported as hybrid by llama.cpp; got is_hybrid=false" - ); - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn is_hybrid_returns_false_for_non_hybrid_default_models(fixture: &LlamaFixture<'_>) -> Result<()> { + assert!( + !fixture.model.is_hybrid(), + "DeepSeek-R1-Distill-Llama-8B and GLM-4.7-Flash are pure transformers, not hybrid; got is_hybrid=true" + ); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn 
rope_type_returns_a_known_variant_for_rope_carrying_default_models( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - use llama_cpp_bindings::model::rope_type::RopeType; - let rope = fixture.model.rope_type(); - assert!( - matches!( - rope, - Some(RopeType::Norm | RopeType::NeoX | RopeType::MRope | RopeType::Vision) - ), - "rope_type must be a known variant for DeepSeek and GLM-4.7; got {rope:?}" - ); - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn is_hybrid_returns_true_for_qwen_default_models(fixture: &LlamaFixture<'_>) -> Result<()> { + assert!( + fixture.model.is_hybrid(), + "Qwen 3.5 and Qwen 3.6 default GGUFs are reported as hybrid by llama.cpp; got is_hybrid=false" + ); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn rope_type_returns_none_for_qwen_default_models(fixture: &LlamaFixture<'_>) -> Result<()> { - let rope = fixture.model.rope_type(); - assert!( - rope.is_none(), - "Qwen 3.5 and Qwen 3.6 default GGUFs do not expose a rope_type in their metadata; got {rope:?}" - ); - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + 
use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn rope_type_returns_a_known_variant_for_rope_carrying_default_models( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + use llama_cpp_bindings::model::rope_type::RopeType; + let rope = fixture.model.rope_type(); + assert!( + matches!( + rope, + Some(RopeType::Norm | RopeType::NeoX | RopeType::MRope | RopeType::Vision) + ), + "rope_type must be a known variant for DeepSeek and GLM-4.7; got {rope:?}" + ); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn vocab_type_returns_a_known_variant(fixture: &LlamaFixture<'_>) -> Result<()> { - use llama_cpp_bindings::model::vocab_type::VocabType; - let vocab = fixture.model.vocab_type()?; - assert!( - matches!(vocab, VocabType::BPE | VocabType::SPM), - "vocab_type must be a known variant; got {vocab:?}" - ); - Ok(()) - } +#[llama_test( + model_source 
= HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn rope_type_returns_none_for_qwen_default_models(fixture: &LlamaFixture<'_>) -> Result<()> { + let rope = fixture.model.rope_type(); + assert!( + rope.is_none(), + "Qwen 3.5 and Qwen 3.6 default GGUFs do not expose a rope_type in their metadata; got {rope:?}" + ); + Ok(()) } -mod model_metadata_kv { - #![expect( - clippy::unnecessary_wraps, - reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" - )] - - use anyhow::Result; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn meta_count_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> { - 
assert!(fixture.model.meta_count() > 0); - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn vocab_type_returns_a_known_variant(fixture: &LlamaFixture<'_>) -> Result<()> { + use llama_cpp_bindings::model::vocab_type::VocabType; + let vocab = fixture.model.vocab_type()?; + assert!( + matches!(vocab, VocabType::BPE | VocabType::SPM), + "vocab_type must be a known variant; got {vocab:?}" + ); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - 
model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn meta_key_by_index_returns_valid_key(fixture: &LlamaFixture<'_>) -> Result<()> { - let key = fixture.model.meta_key_by_index(0)?; - assert!(!key.is_empty()); - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn meta_count_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> { + assert!(fixture.model.meta_count() > 0); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", 
"Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn meta_val_str_by_index_returns_valid_value(fixture: &LlamaFixture<'_>) -> Result<()> { - let value = fixture.model.meta_val_str_by_index(0)?; - assert!(!value.is_empty()); - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn meta_key_by_index_returns_valid_key(fixture: &LlamaFixture<'_>) -> Result<()> { + let key = fixture.model.meta_key_by_index(0)?; + assert!(!key.is_empty()); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", 
"GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn meta_key_by_index_out_of_range_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { - let result = fixture.model.meta_key_by_index(999_999); - assert!(result.is_err()); - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn meta_val_str_by_index_returns_valid_value(fixture: &LlamaFixture<'_>) -> Result<()> { + let value = fixture.model.meta_val_str_by_index(0)?; + assert!(!value.is_empty()); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", 
"DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn meta_val_str_by_index_out_of_range_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { - let result = fixture.model.meta_val_str_by_index(999_999); - assert!(result.is_err()); - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn meta_key_by_index_out_of_range_returns_error(fixture: 
&LlamaFixture<'_>) -> Result<()> { + let result = fixture.model.meta_key_by_index(999_999); + assert!(result.is_err()); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn meta_val_str_returns_value_for_known_key(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let first_key = model.meta_key_by_index(0)?; - let value = model.meta_val_str(&first_key)?; - assert!(!value.is_empty()); - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] 
+#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn meta_val_str_by_index_out_of_range_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let result = fixture.model.meta_val_str_by_index(999_999); + assert!(result.is_err()); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn meta_val_str_with_long_value_triggers_buffer_resize( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let model = fixture.model; - let count = model.meta_count(); - - for index in 0..count { - let key = model.meta_key_by_index(index); - let value = model.meta_val_str_by_index(index); - assert!(key.is_ok()); - assert!(value.is_ok()); - } - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = 
HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn meta_val_str_returns_value_for_known_key(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let first_key = model.meta_key_by_index(0)?; + let value = model.meta_val_str(&first_key)?; + assert!(!value.is_empty()); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn meta_val_str_with_null_byte_in_key_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { - let result = fixture.model.meta_val_str("key\0with_null"); - assert!(result.is_err()); - Ok(()) 
+#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn meta_val_str_with_long_value_triggers_buffer_resize(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let count = model.meta_count(); + + for index in 0..count { + let key = model.meta_key_by_index(index); + let value = model.meta_val_str_by_index(index); + assert!(key.is_ok()); + assert!(value.is_ok()); } + Ok(()) } -mod model_params { - #![expect( - clippy::similar_names, - reason = "model_path_str and model_path_cstr are both genuinely needed; renaming would not improve clarity" - )] - - use std::ffi::CString; - use std::pin::pin; - - use anyhow::Result; - use llama_cpp_bindings::context::params::LlamaContextParams; - use llama_cpp_bindings::max_devices; - use llama_cpp_bindings::model::params::LlamaModelParams; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, 
- )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn fit_params_succeeds_with_test_model(fixture: &LlamaFixture<'_>) -> Result<()> { - let model_path_str = fixture - .model_path - .to_str() - .ok_or_else(|| anyhow::anyhow!("model path is not valid UTF-8"))?; - let model_path_cstr = CString::new(model_path_str)?; - - let mut params = pin!(LlamaModelParams::default()); - let mut context_params = LlamaContextParams::default(); - let mut margins = vec![0usize; max_devices()]; - - let result = params.as_mut().fit_params( - &model_path_cstr, - &mut context_params, - &mut margins, - 512, - llama_cpp_bindings_sys::GGML_LOG_LEVEL_NONE, - ); - - let fit = - result.map_err(|fit_error| anyhow::anyhow!("fit_params failed: {fit_error:?}"))?; - assert!(fit.n_ctx > 0); - - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + 
use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn meta_val_str_with_null_byte_in_key_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let result = fixture.model.meta_val_str("key\0with_null"); + assert!(result.is_err()); + Ok(()) } -mod model_special_tokens { - #![expect( - clippy::unnecessary_wraps, - reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" - )] - - use anyhow::Result; - use llama_cpp_bindings::SampledToken; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn special_tokens_exist(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let bos = model.token_bos(); - let eos = model.token_eos(); - - assert_ne!(bos, eos); - 
assert!(model.is_eog_token(&SampledToken::Content(eos))); - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[expect( + clippy::similar_names, + reason = "model_path_str and model_path_cstr are both genuinely needed; renaming would not improve clarity" +)] +fn fit_params_succeeds_with_test_model(fixture: &LlamaFixture<'_>) -> Result<()> { + let model_path_str = fixture + .model_path + .to_str() + .ok_or_else(|| anyhow::anyhow!("model path is not valid UTF-8"))?; + let model_path_cstr = CString::new(model_path_str)?; + + let mut params = pin!(LlamaModelParams::default()); + let mut context_params = LlamaContextParams::default(); + let mut margins = vec![0usize; max_devices()]; + + let result = params.as_mut().fit_params( + &model_path_cstr, + &mut context_params, + &mut margins, + 512, + llama_cpp_bindings_sys::GGML_LOG_LEVEL_NONE, + ); + + let fit = result.map_err(|fit_error| anyhow::anyhow!("fit_params failed: {fit_error:?}"))?; + assert!(fit.n_ctx > 0); + + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - 
n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn token_nl_returns_valid_token(fixture: &LlamaFixture<'_>) -> Result<()> { - let nl_token = fixture.model.token_nl(); - assert!(nl_token.0 >= 0); - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn special_tokens_exist(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let bos = model.token_bos(); + let eos = 
model.token_eos(); + + assert_ne!(bos, eos); + assert!(model.is_eog_token(&SampledToken::Content(eos))); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn is_eog_token_classifies_reasoning_variant(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let eos = model.token_eos(); - assert!(model.is_eog_token(&SampledToken::Reasoning(eos))); - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = 
HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn token_nl_returns_valid_token(fixture: &LlamaFixture<'_>) -> Result<()> { + let nl_token = fixture.model.token_nl(); + assert!(nl_token.0 >= 0); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn is_eog_token_classifies_tool_call_variant(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let eos = model.token_eos(); - assert!(model.is_eog_token(&SampledToken::ToolCall(eos))); - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = 
HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn is_eog_token_classifies_reasoning_variant(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let eos = model.token_eos(); + assert!(model.is_eog_token(&SampledToken::Reasoning(eos))); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn is_eog_token_classifies_undeterminable_variant(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let eos = model.token_eos(); - assert!(model.is_eog_token(&SampledToken::Undeterminable(eos))); - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 
512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn is_eog_token_classifies_tool_call_variant(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let eos = model.token_eos(); + assert!(model.is_eog_token(&SampledToken::ToolCall(eos))); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn decode_start_token_returns_valid_token(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let token = 
model.decode_start_token(); - let n_vocab = model.n_vocab(); - assert!( - token.0 == -1 || (0..n_vocab).contains(&token.0), - "decode_start_token must be either -1 (no decoder-start defined) or a valid vocab index < {n_vocab}; got {token}" - ); - assert_eq!( - token, - model.decode_start_token(), - "decode_start_token must be deterministic across calls" - ); - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn is_eog_token_classifies_undeterminable_variant(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let eos = model.token_eos(); + assert!(model.is_eog_token(&SampledToken::Undeterminable(eos))); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - 
#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn token_sep_returns_valid_token(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let token = model.token_sep(); - let n_vocab = model.n_vocab(); - assert!( - token.0 == -1 || (0..n_vocab).contains(&token.0), - "token_sep must be either -1 (no SEP token defined) or a valid vocab index < {n_vocab}; got {token}" - ); - assert_eq!( - token, - model.token_sep(), - "token_sep must be deterministic across calls" - ); - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn decode_start_token_returns_valid_token(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let token = model.decode_start_token(); + let n_vocab = 
model.n_vocab(); + assert!( + token.0 == -1 || (0..n_vocab).contains(&token.0), + "decode_start_token must be either -1 (no decoder-start defined) or a valid vocab index < {n_vocab}; got {token}" + ); + assert_eq!( + token, + model.decode_start_token(), + "decode_start_token must be deterministic across calls" + ); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn token_attr_returns_attrs_for_bos(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let bos = model.token_bos(); - let attrs = model.token_attr(bos)?; - let bit_repr = format!("{:?}", *attrs); - assert!( - !bit_repr.is_empty(), - "token_attr(bos) must produce Debug output" - ); - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, 
+ n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn token_sep_returns_valid_token(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let token = model.token_sep(); + let n_vocab = model.n_vocab(); + assert!( + token.0 == -1 || (0..n_vocab).contains(&token.0), + "token_sep must be either -1 (no SEP token defined) or a valid vocab index < {n_vocab}; got {token}" + ); + assert_eq!( + token, + model.token_sep(), + "token_sep must be deterministic across calls" + ); + Ok(()) } -mod model_str_to_token { - use anyhow::Result; - use llama_cpp_bindings::model::AddBos; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - 
n_batch = 128, - n_ubatch = 64, - )] - fn str_to_token_roundtrip(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let tokens = model.str_to_token("hello world", AddBos::Never)?; - assert!(!tokens.is_empty()); - let mut decoder = encoding_rs::UTF_8.new_decoder(); - let piece = model.token_to_piece( - &llama_cpp_bindings::SampledToken::Content(tokens[0]), - &mut decoder, - false, - None, - )?; - - assert!(!piece.is_empty()); - - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn token_attr_returns_attrs_for_bos(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let bos = model.token_bos(); + let attrs = model.token_attr(bos)?; + let bit_repr = format!("{:?}", *attrs); + assert!( + !bit_repr.is_empty(), + "token_attr(bos) must produce Debug output" + ); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = 
HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn str_to_token_grows_buffer_when_initial_estimation_too_small( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let many_short_chars = "a b c d e f g h i j k l"; - let tokens = fixture - .model - .str_to_token(many_short_chars, AddBos::Always)?; - - assert!( - tokens.len() > 8, - "expected regrow; got {} tokens", - tokens.len() - ); - - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn str_to_token_roundtrip(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = 
fixture.model; + let tokens = model.str_to_token("hello world", AddBos::Never)?; + assert!(!tokens.is_empty()); + let mut decoder = encoding_rs::UTF_8.new_decoder(); + let piece = + model.token_to_piece(&SampledToken::Content(tokens[0]), &mut decoder, false, None)?; + + assert!(!piece.is_empty()); + + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn str_to_token_with_add_bos_never(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let tokens_with_bos = model.str_to_token("hello", AddBos::Always)?; - let tokens_without_bos = model.str_to_token("hello", AddBos::Never)?; - - assert!(tokens_with_bos.len() >= tokens_without_bos.len()); - - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + 
n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn str_to_token_grows_buffer_when_initial_estimation_too_small( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let many_short_chars = "a b c d e f g h i j k l"; + let tokens = fixture + .model + .str_to_token(many_short_chars, AddBos::Always)?; + + assert!( + tokens.len() > 8, + "expected regrow; got {} tokens", + tokens.len() + ); + + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn str_to_token_with_many_tokens_triggers_buffer_resize( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - use std::fmt::Write; - - let many_numbers = (0..2000).fold(String::new(), |mut accumulator, number| { - let _ = write!(accumulator, 
"{number} "); - accumulator - }); - - let tokens = fixture.model.str_to_token(&many_numbers, AddBos::Always)?; - - assert!(tokens.len() > many_numbers.len() / 2); - - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn str_to_token_with_add_bos_never(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let tokens_with_bos = model.str_to_token("hello", AddBos::Always)?; + let tokens_without_bos = model.str_to_token("hello", AddBos::Never)?; + + assert!(tokens_with_bos.len() >= tokens_without_bos.len()); + + Ok(()) } -mod model_token_to_piece { - #![expect( - clippy::unnecessary_wraps, - reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" - )] - - use std::num::NonZeroU16; - - use anyhow::Result; - use llama_cpp_bindings::SampledToken; - use llama_cpp_bindings::model::AddBos; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - 
use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn token_to_piece_bytes_returns_bytes_for_known_token( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let model = fixture.model; - let tokens = model.str_to_token("hello", AddBos::Never)?; - let bytes = model.token_to_piece_bytes(tokens[0], 32, false, None)?; - - assert!(!bytes.is_empty()); - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn 
str_to_token_with_many_tokens_triggers_buffer_resize(fixture: &LlamaFixture<'_>) -> Result<()> { + use std::fmt::Write; + + let many_numbers = (0..2000).fold(String::new(), |mut accumulator, number| { + let _ = write!(accumulator, "{number} "); + accumulator + }); + + let tokens = fixture.model.str_to_token(&many_numbers, AddBos::Always)?; + + assert!(tokens.len() > many_numbers.len() / 2); + + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn token_to_piece_handles_large_token_requiring_buffer_resize( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let model = fixture.model; - let mut decoder = encoding_rs::UTF_8.new_decoder(); - - for (token, _) in model.tokens(true).take(200) { - let result = - model.token_to_piece(&SampledToken::Content(token), &mut decoder, true, None); - assert!(result.is_ok()); - } - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = 
HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn token_to_piece_bytes_returns_bytes_for_known_token(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let tokens = model.str_to_token("hello", AddBos::Never)?; + let bytes = model.token_to_piece_bytes(tokens[0], 32, false, None)?; + + assert!(!bytes.is_empty()); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn token_to_piece_bytes_insufficient_buffer_returns_error( - fixture: &LlamaFixture<'_>, - ) -> Result<()> { - let model = fixture.model; - let 
tokens = model.str_to_token("hello", AddBos::Never)?; - let result = model.token_to_piece_bytes(tokens[0], 1, false, None); - - assert!( - result - .unwrap_err() - .to_string() - .contains("Insufficient Buffer Space") - ); - Ok(()) +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn token_to_piece_handles_large_token_requiring_buffer_resize( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let model = fixture.model; + let mut decoder = encoding_rs::UTF_8.new_decoder(); + + for (token, _) in model.tokens(true).take(200) { + let result = model.token_to_piece(&SampledToken::Content(token), &mut decoder, true, None); + assert!(result.is_ok()); } + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] 
- #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn token_to_piece_with_lstrip(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let mut decoder = encoding_rs::UTF_8.new_decoder(); - let tokens = model.str_to_token("hello", AddBos::Never)?; - let result = model.token_to_piece( - &SampledToken::Content(tokens[0]), - &mut decoder, - false, - NonZeroU16::new(1), - ); +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn token_to_piece_bytes_insufficient_buffer_returns_error( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let model = fixture.model; + let tokens = model.str_to_token("hello", AddBos::Never)?; + let result = model.token_to_piece_bytes(tokens[0], 1, false, None); + + assert!( + result + 
.unwrap_err() + .to_string() + .contains("Insufficient Buffer Space") + ); + Ok(()) +} - assert!(result.is_ok()); - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn token_to_piece_with_lstrip(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mut decoder = encoding_rs::UTF_8.new_decoder(); + let tokens = model.str_to_token("hello", AddBos::Never)?; + let result = model.token_to_piece( + &SampledToken::Content(tokens[0]), + &mut decoder, + false, + NonZeroU16::new(1), + ); + + assert!(result.is_ok()); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - 
n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn token_to_piece_decodes_reasoning_variant(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let mut decoder = encoding_rs::UTF_8.new_decoder(); - let tokens = model.str_to_token("hi", AddBos::Never)?; - - let piece = model.token_to_piece( - &SampledToken::Reasoning(tokens[0]), - &mut decoder, - true, - None, - )?; - - assert!(!piece.is_empty()); - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn token_to_piece_decodes_reasoning_variant(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mut decoder = encoding_rs::UTF_8.new_decoder(); + let tokens = model.str_to_token("hi", AddBos::Never)?; + + let piece = model.token_to_piece( + &SampledToken::Reasoning(tokens[0]), + &mut decoder, + true, + None, + )?; + + 
assert!(!piece.is_empty()); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn token_to_piece_decodes_tool_call_variant(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let mut decoder = encoding_rs::UTF_8.new_decoder(); - let tokens = model.str_to_token("hi", AddBos::Never)?; - - let piece = - model.token_to_piece(&SampledToken::ToolCall(tokens[0]), &mut decoder, true, None)?; - - assert!(!piece.is_empty()); - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 
128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn token_to_piece_decodes_tool_call_variant(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mut decoder = encoding_rs::UTF_8.new_decoder(); + let tokens = model.str_to_token("hi", AddBos::Never)?; + + let piece = + model.token_to_piece(&SampledToken::ToolCall(tokens[0]), &mut decoder, true, None)?; + + assert!(!piece.is_empty()); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn token_to_piece_decodes_undeterminable_variant(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let mut decoder = encoding_rs::UTF_8.new_decoder(); - let tokens = model.str_to_token("hi", AddBos::Never)?; - - let piece = model.token_to_piece( - &SampledToken::Undeterminable(tokens[0]), - &mut decoder, - true, - None, - )?; - - assert!(!piece.is_empty()); - Ok(()) - } +#[llama_test( + model_source = 
HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn token_to_piece_decodes_undeterminable_variant(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mut decoder = encoding_rs::UTF_8.new_decoder(); + let tokens = model.str_to_token("hi", AddBos::Never)?; + + let piece = model.token_to_piece( + &SampledToken::Undeterminable(tokens[0]), + &mut decoder, + true, + None, + )?; + + assert!(!piece.is_empty()); + Ok(()) } -mod model_tokens_iterator { - #![expect( - clippy::unnecessary_wraps, - reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" - )] - - use anyhow::Result; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - - #[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, 
- n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn tokens_iterator_produces_valid_entries(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let mut count = 0; - - for (token, _piece_result) in model.tokens(false) { - assert!(token.0 >= 0); - count += 1; - - if count >= 100 { - break; - } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn tokens_iterator_produces_valid_entries(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mut count = 0; + + for (token, _piece_result) in model.tokens(false) { + assert!(token.0 >= 0); + count += 1; + + if count >= 100 { + break; } - - assert_eq!(count, 100); - Ok(()) } - #[llama_test( - model_source 
= HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - )] - fn n_vocab_matches_tokens_iterator_count(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let n_vocab = model.n_vocab(); - let count = model.tokens(false).count(); - - assert_eq!(count, usize::try_from(n_vocab)?); - Ok(()) - } + assert_eq!(count, 100); + Ok(()) } -mod model_helpers { - #![expect( - clippy::unnecessary_wraps, - reason = "every trial returns anyhow::Result<()> to match the LlamaTestFn signature" - )] - - use anyhow::Result; - use llama_cpp_test_harness::LlamaFixture; - use llama_cpp_test_harness::llama_test; - - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 2048, - n_batch = 512, - n_ubatch = 128 - )] - fn debug_format_includes_struct_name_and_model_field(fixture: &LlamaFixture<'_>) -> Result<()> { - let formatted = format!("{:?}", fixture.model); - - assert!(formatted.contains("LlamaModel")); - assert!(formatted.contains("model")); - - Ok(()) - } +#[llama_test( + model_source = 
HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn n_vocab_matches_tokens_iterator_count(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let n_vocab = model.n_vocab(); + let count = model.tokens(false).count(); + + assert_eq!(count, usize::try_from(n_vocab)?); + Ok(()) +} - #[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 2048, - n_batch = 512, - n_ubatch = 128 - )] - fn approximate_tok_env_is_cached_across_calls(fixture: &LlamaFixture<'_>) -> Result<()> { - let first = fixture.model.approximate_tok_env(); - let second = fixture.model.approximate_tok_env(); - - assert!(std::sync::Arc::ptr_eq(&first, &second)); - - Ok(()) - } +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 2048, + n_batch = 512, + n_ubatch = 128 +)] +fn debug_format_includes_struct_name_and_model_field(fixture: &LlamaFixture<'_>) -> Result<()> { + let formatted = format!("{:?}", fixture.model); + + 
assert!(formatted.contains("LlamaModel")); + assert!(formatted.contains("model")); + + Ok(()) } +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 2048, + n_batch = 512, + n_ubatch = 128 +)] +fn approximate_tok_env_is_cached_across_calls(fixture: &LlamaFixture<'_>) -> Result<()> { + let first = fixture.model.approximate_tok_env(); + let second = fixture.model.approximate_tok_env(); + + assert!(std::sync::Arc::ptr_eq(&first, &second)); + + Ok(()) +} llama_tests_main!(); diff --git a/llama-cpp-bindings/src/batch_add_error.rs b/llama-cpp-bindings/src/batch_add_error.rs index ea4cb154..e3ec5864 100644 --- a/llama-cpp-bindings/src/batch_add_error.rs +++ b/llama-cpp-bindings/src/batch_add_error.rs @@ -1,13 +1,9 @@ -/// Errors that can occur when adding a token to a batch. #[derive(thiserror::Error, Debug, PartialEq, Eq)] pub enum BatchAddError { - /// There was not enough space in the batch to add the token. #[error("Insufficient Space of {0}")] InsufficientSpace(usize), - /// Empty buffer is provided for [`crate::llama_batch::LlamaBatch::get_one`] #[error("Empty buffer")] EmptyBuffer, - /// An integer value exceeded the allowed range. #[error("Integer overflow: {0}")] IntegerOverflow(String), } diff --git a/llama-cpp-bindings/src/context.rs b/llama-cpp-bindings/src/context.rs index b980f831..49702de6 100644 --- a/llama-cpp-bindings/src/context.rs +++ b/llama-cpp-bindings/src/context.rs @@ -1,5 +1,3 @@ -//! Safe wrapper around `llama_context`. - use std::ffi::c_void; use std::fmt::{Debug, Formatter}; use std::num::NonZeroI32; @@ -57,11 +55,8 @@ unsafe extern "C" fn abort_callback_trampoline(data: *mut c_void) -> bool { flag.load(Ordering::Relaxed) } -/// Safe wrapper around `llama_context`. pub struct LlamaContext<'model> { - /// Raw pointer to the underlying `llama_context`. pub context: NonNull, - /// A reference to the context's model. 
pub model: &'model LlamaModel, abort_flag: Option>, initialized_logits: Vec, @@ -77,7 +72,6 @@ impl Debug for LlamaContext<'_> { } impl<'model> LlamaContext<'model> { - /// Wraps existing raw pointers into a new `LlamaContext`. #[must_use] pub const fn new( llama_model: &'model LlamaModel, @@ -93,11 +87,6 @@ impl<'model> LlamaContext<'model> { } } - /// Create a new context bound to `model`. - /// - /// `_backend` is unused in the body but serves as a compile-time witness that - /// the global llama.cpp backend has been initialised before context creation. - /// /// # Errors /// /// Returns [`LlamaContextLoadError`] when llama.cpp fails to allocate the context. @@ -143,29 +132,21 @@ impl<'model> LlamaContext<'model> { } } - /// Gets the max number of logical tokens that can be submitted to decode. Must be greater than or equal to [`Self::n_ubatch`]. #[must_use] pub fn n_batch(&self) -> u32 { unsafe { llama_cpp_bindings_sys::llama_n_batch(self.context.as_ptr()) } } - /// Gets the max number of physical tokens (hardware level) to decode in batch. Must be less than or equal to [`Self::n_batch`]. #[must_use] pub fn n_ubatch(&self) -> u32 { unsafe { llama_cpp_bindings_sys::llama_n_ubatch(self.context.as_ptr()) } } - /// Gets the size of the context. #[must_use] pub fn n_ctx(&self) -> u32 { unsafe { llama_cpp_bindings_sys::llama_n_ctx(self.context.as_ptr()) } } - /// Sets an abort flag that llama.cpp checks during computation. - /// - /// When the flag is set to `true`, any in-progress `decode()` call will - /// abort and return `DecodeError::Aborted`. The `Arc` is stored internally - /// to ensure the flag outlives the callback registration. #[expect(unsafe_code, reason = "required for FFI abort callback registration")] pub fn set_abort_flag(&mut self, flag: Arc) { let raw_ptr = Arc::as_ptr(&flag) as *mut c_void; @@ -180,7 +161,6 @@ impl<'model> LlamaContext<'model> { } } - /// Clears the abort callback so that decode calls are no longer interruptible. 
#[expect(unsafe_code, reason = "required for FFI abort callback deregistration")] pub fn clear_abort_callback(&mut self) { self.abort_flag = None; @@ -194,33 +174,20 @@ impl<'model> LlamaContext<'model> { } } - /// Waits for all pending backend operations to complete. - /// - /// Must be called before freeing the context to prevent hangs - /// during resource cleanup. #[expect(unsafe_code, reason = "required for FFI synchronization call")] pub fn synchronize(&self) { unsafe { llama_cpp_bindings_sys::llama_synchronize(self.context.as_ptr()) } } - /// Detaches the threadpool from the context. - /// - /// Must be called before freeing the context to prevent threadpool - /// workers from accessing freed resources. #[expect(unsafe_code, reason = "required for FFI threadpool detachment")] pub fn detach_threadpool(&self) { unsafe { llama_cpp_bindings_sys::llama_detach_threadpool(self.context.as_ptr()) } } - /// Marks a logit index as initialized so it can be read via - /// `get_logits_ith`. Use after external decode operations (like - /// `eval_chunks`) that bypass the Rust `decode()` method. pub fn mark_logits_initialized(&mut self, token_index: i32) { self.initialized_logits = vec![token_index]; } - /// Decodes the batch. - /// /// # Errors /// /// - `DecodeError` if the decoding failed. @@ -267,8 +234,6 @@ impl<'model> LlamaContext<'model> { } } - /// Encodes the batch. - /// /// # Errors /// /// - `EncodeError` if the encoding failed. @@ -318,13 +283,6 @@ impl<'model> LlamaContext<'model> { } } - /// Get the embeddings for the given sequence in the current context. - /// - /// # Returns - /// - /// A slice containing the embeddings for the last decoded batch. - /// The size corresponds to the `n_embd` parameter of the context's model. - /// /// # Errors /// /// - When the current context was constructed without enabling embeddings. @@ -353,13 +311,6 @@ impl<'model> LlamaContext<'model> { } } - /// Get the embeddings for the given token in the current context. 
- /// - /// # Returns - /// - /// A slice containing the embeddings for the last decoded batch of the given token. - /// The size corresponds to the `n_embd` parameter of the context's model. - /// /// # Errors /// /// - When the current context was constructed without enabling embeddings. @@ -388,12 +339,6 @@ impl<'model> LlamaContext<'model> { } } - /// Get the logits for the last token in the context. - /// - /// # Returns - /// An iterator over unsorted `LlamaTokenData` containing the - /// logits for the last token in the context. - /// /// # Errors /// Returns `LogitsError` if logits are null or `n_vocab` overflows. pub fn candidates(&self) -> Result + '_, LogitsError> { @@ -405,25 +350,12 @@ impl<'model> LlamaContext<'model> { })) } - /// Get the token data array for the last token in the context. - /// /// # Errors /// Returns `LogitsError` if logits are null or `n_vocab` overflows. pub fn token_data_array(&self) -> Result { Ok(LlamaTokenDataArray::from_iter(self.candidates()?, false)) } - /// Token logits obtained from the last call to `decode()`. - /// The logits for which `batch.logits[i] != 0` are stored contiguously - /// in the order they have appeared in the batch. - /// Rows: number of tokens for which `batch.logits[i] != 0` - /// Cols: `n_vocab` - /// - /// # Returns - /// - /// A slice containing the logits for the last decoded token. - /// The size corresponds to the `n_vocab` parameter of the context's model. - /// /// # Errors /// Returns `LogitsError` if the logits pointer is null or `n_vocab` overflows. pub fn get_logits(&self) -> Result<&[f32], LogitsError> { @@ -438,8 +370,6 @@ impl<'model> LlamaContext<'model> { Ok(unsafe { slice::from_raw_parts(data, len) }) } - /// Get the logits for the ith token in the context. - /// /// # Errors /// Returns `LogitsError` if the token is not initialized or out of range. 
pub fn candidates_ith( @@ -454,8 +384,6 @@ impl<'model> LlamaContext<'model> { })) } - /// Get the token data array for the ith token in the context. - /// /// # Errors /// Returns `LogitsError` if the token is not initialized or out of range. pub fn token_data_array_ith( @@ -468,8 +396,6 @@ impl<'model> LlamaContext<'model> { )) } - /// Get the logits for the ith token in the context. - /// /// # Errors /// Returns `LogitsError` if the token is not initialized, out of range, or `n_vocab` overflows. pub fn get_logits_ith(&self, token_index: i32) -> Result<&[f32], LogitsError> { @@ -497,19 +423,15 @@ impl<'model> LlamaContext<'model> { Ok(unsafe { slice::from_raw_parts(data, len) }) } - /// Reset the timings for the context. pub fn reset_timings(&mut self) { unsafe { llama_cpp_bindings_sys::llama_perf_context_reset(self.context.as_ptr()) } } - /// Returns the timings for the context. pub fn timings(&mut self) -> LlamaTimings { let timings = unsafe { llama_cpp_bindings_sys::llama_perf_context(self.context.as_ptr()) }; LlamaTimings { timings } } - /// Sets a lora adapter. - /// /// # Errors /// /// See [`LlamaLoraAdapterSetError`] for more information. @@ -534,11 +456,6 @@ impl<'model> LlamaContext<'model> { Ok(()) } - /// Remove all lora adapters. - /// - /// Note: The upstream API now replaces all adapters at once via - /// `llama_set_adapters_lora`. This clears all adapters from the context. - /// /// # Errors /// /// See [`LlamaLoraAdapterRemoveError`] for more information. diff --git a/llama-cpp-bindings/src/context/kv_cache.rs b/llama-cpp-bindings/src/context/kv_cache.rs index dff5e2aa..80b97a67 100644 --- a/llama-cpp-bindings/src/context/kv_cache.rs +++ b/llama-cpp-bindings/src/context/kv_cache.rs @@ -1,5 +1,3 @@ -//! 
utilities for working with the kv cache - use std::ffi::c_int; use std::num::{NonZeroU8, TryFromIntError}; use std::os::raw::c_char; @@ -9,44 +7,22 @@ use crate::context::LlamaContext; use crate::error::{KvCacheSeqAddError, KvCacheSeqDivError}; use crate::ffi_error_reader::read_and_free_cpp_error; -/// Errors that can occur when attempting to prepare values for the kv cache #[derive(Debug, Eq, PartialEq, thiserror::Error)] pub enum KvCacheConversionError { - /// Sequence id conversion to i32 failed #[error("Provided sequence id is too large for a i32")] SeqIdTooLarge(#[source] TryFromIntError), - /// Position 0 conversion to i32 failed #[error("Provided start position is too large for a i32")] P0TooLarge(#[source] TryFromIntError), - /// Position 1 conversion to i32 failed #[error("Provided end position is too large for a i32")] P1TooLarge(#[source] TryFromIntError), } impl LlamaContext<'_> { - /// Copy the cache from one sequence to another. - /// - /// # Parameters - /// - /// * `src` - The sequence id to copy the cache from. - /// * `dest` - The sequence id to copy the cache to. - /// * `size` - The size of the cache to copy. pub fn copy_cache(&mut self, src: i32, dest: i32, size: i32) { let mem = unsafe { llama_cpp_bindings_sys::llama_get_memory(self.context.as_ptr()) }; unsafe { llama_cpp_bindings_sys::llama_memory_seq_cp(mem, src, dest, 0, size) } } - /// Copy the cache from one sequence to another. - /// - /// # Returns - /// A `Result` indicating whether the operation was successful. - /// - /// # Parameters - /// * `src` - The sequence id to copy the cache from. - /// * `dest` - The sequence id to copy the cache to. - /// * `p0` - The start position of the cache to clear. If `None`, the entire cache is copied up to `p1`. - /// * `p1` - The end position of the cache to clear. If `None`, the entire cache is copied starting from `p0`. - /// /// # Errors /// If either position exceeds [`i32::MAX`]. 
pub fn copy_kv_cache_seq( @@ -67,18 +43,6 @@ impl LlamaContext<'_> { Ok(()) } - /// Clear the kv cache for the given sequence within the specified range `[p0, p1)` - /// Returns `false` only when partial sequence removals fail. Full sequence removals always succeed. - /// - /// # Returns - /// A `Result` indicating whether the operation was successful. If the sequence id or - /// either position exceeds the maximum i32 value, no removal is attempted and an `Err` is returned. - /// - /// # Parameters - /// * `src` - The sequence id to clear the cache for. If `None`, matches all sequences - /// * `p0` - The start position of the cache to clear. If `None`, the entire cache is cleared up to `p1`. - /// * `p1` - The end position of the cache to clear. If `None`, the entire cache is cleared from `p0`. - /// /// # Errors /// If the sequence id or either position exceeds [`i32::MAX`]. pub fn clear_kv_cache_seq( @@ -100,38 +64,17 @@ impl LlamaContext<'_> { Ok(unsafe { llama_cpp_bindings_sys::llama_memory_seq_rm(mem, src, p0, p1) }) } - /// Clear the KV cache, including both metadata and the underlying data buffers. 
pub fn clear_kv_cache(&mut self) { let mem = unsafe { llama_cpp_bindings_sys::llama_get_memory(self.context.as_ptr()) }; let clear_data_buffers = true; unsafe { llama_cpp_bindings_sys::llama_memory_clear(mem, clear_data_buffers) } } - /// Removes all tokens that do not belong to the specified sequence - /// - /// # Parameters - /// - /// * `seq_id` - The sequence id to keep pub fn kv_cache_seq_keep(&mut self, seq_id: i32) { let mem = unsafe { llama_cpp_bindings_sys::llama_get_memory(self.context.as_ptr()) }; unsafe { llama_cpp_bindings_sys::llama_memory_seq_keep(mem, seq_id) } } - /// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in `[p0, p1)` - /// If the KV cache is `RoPEd`, the KV data is updated accordingly: - /// - lazily on next [`LlamaContext::decode`] - /// - explicitly with [`Self::kv_cache_update`] - /// - /// # Returns - /// A `Result` indicating whether the operation was successful. - /// - /// # Parameters - /// - /// * `seq_id` - The sequence id to update - /// * `p0` - The start position of the cache to update. If `None`, the entire cache is updated up to `p1`. - /// * `p1` - The end position of the cache to update. If `None`, the entire cache is updated starting from `p0`. - /// * `delta` - The relative position to add to the tokens - /// /// # Errors /// If either position exceeds [`i32::MAX`], or the underlying memory operation reports a failure. pub fn kv_cache_seq_add( @@ -177,21 +120,6 @@ impl LlamaContext<'_> { } } - /// Integer division of the positions by factor of `d > 1` - /// If the KV cache is `RoPEd`, the KV data is updated accordingly: - /// - lazily on next [`LlamaContext::decode`] - /// - explicitly with [`Self::kv_cache_update`] - /// - /// # Returns - /// A `Result` indicating whether the operation was successful. - /// - /// # Parameters - /// - /// * `seq_id` - The sequence id to update - /// * `p0` - The start position of the cache to update. 
If `None`, the entire cache is updated up to `p1`. - /// * `p1` - The end position of the cache to update. If `None`, the entire cache is updated starting from `p0`. - /// * `d` - The factor to divide the positions by - /// /// # Errors /// If either position exceeds [`i32::MAX`], or the underlying memory operation reports a failure. pub fn kv_cache_seq_div( @@ -238,11 +166,6 @@ impl LlamaContext<'_> { } } - /// Returns the largest position present in the KV cache for the specified sequence - /// - /// # Parameters - /// - /// * `seq_id` - The sequence id to get the max position for #[must_use] pub fn kv_cache_seq_pos_max(&self, seq_id: i32) -> i32 { unsafe { diff --git a/llama-cpp-bindings/src/context/kv_cache_type.rs b/llama-cpp-bindings/src/context/kv_cache_type.rs index 661a59e1..3ffc7f11 100644 --- a/llama-cpp-bindings/src/context/kv_cache_type.rs +++ b/llama-cpp-bindings/src/context/kv_cache_type.rs @@ -1,4 +1,3 @@ -/// A rusty wrapper around `ggml_type` for KV cache types. #[expect( non_camel_case_types, reason = "variant names mirror llama.cpp's `enum ggml_type` symbol names verbatim so they can \ @@ -11,11 +10,6 @@ )] #[derive(Copy, Clone, Debug, PartialEq, Eq)] pub enum KvCacheType { - /// Represents an unknown or not-yet-mapped `ggml_type` and carries the raw value. - /// When passed through FFI, the raw value is used as-is (if llama.cpp supports it, - /// the runtime will operate with that type). - /// This variant preserves API compatibility when new `ggml_type` values are - /// introduced in the future. Unknown(llama_cpp_bindings_sys::ggml_type), F32, F16, diff --git a/llama-cpp-bindings/src/context/llama_attention_type.rs b/llama-cpp-bindings/src/context/llama_attention_type.rs index b785ffb0..79b9f66f 100644 --- a/llama-cpp-bindings/src/context/llama_attention_type.rs +++ b/llama-cpp-bindings/src/context/llama_attention_type.rs @@ -1,12 +1,8 @@ -/// A rusty wrapper around `LLAMA_ATTENTION_TYPE`. 
#[repr(i8)] #[derive(Copy, Clone, Debug, PartialEq, Eq)] pub enum LlamaAttentionType { - /// The attention type is unspecified Unspecified = -1, - /// Causal attention Causal = 0, - /// Non-causal attention NonCausal = 1, } diff --git a/llama-cpp-bindings/src/context/llama_pooling_type.rs b/llama-cpp-bindings/src/context/llama_pooling_type.rs index f0d4486b..651216f3 100644 --- a/llama-cpp-bindings/src/context/llama_pooling_type.rs +++ b/llama-cpp-bindings/src/context/llama_pooling_type.rs @@ -1,23 +1,14 @@ -/// A rusty wrapper around `LLAMA_POOLING_TYPE`. #[repr(i8)] #[derive(Copy, Clone, Debug, PartialEq, Eq)] pub enum LlamaPoolingType { - /// The pooling type is unspecified Unspecified = -1, - /// No pooling None = 0, - /// Mean pooling Mean = 1, - /// CLS pooling Cls = 2, - /// Last pooling Last = 3, - /// Rank pooling Rank = 4, } -/// Create a `LlamaPoolingType` from a `c_int` - returns `LlamaPoolingType::Unspecified` if -/// the value is not recognized. impl From for LlamaPoolingType { fn from(value: i32) -> Self { match value { @@ -31,7 +22,6 @@ impl From for LlamaPoolingType { } } -/// Create a `c_int` from a `LlamaPoolingType`. impl From for i32 { fn from(value: LlamaPoolingType) -> Self { match value { diff --git a/llama-cpp-bindings/src/context/llama_state_seq_flags.rs b/llama-cpp-bindings/src/context/llama_state_seq_flags.rs index efc66b94..cbe5ccc0 100644 --- a/llama-cpp-bindings/src/context/llama_state_seq_flags.rs +++ b/llama-cpp-bindings/src/context/llama_state_seq_flags.rs @@ -1,31 +1,21 @@ -//! Flags for extended state sequence operations on hybrid/recurrent models. - -/// Flags controlling which parts of state to save/restore for sequence operations. -/// -/// Used with the `state_seq_*_ext` methods on [`super::LlamaContext`] to enable -/// partial state operations (e.g., saving only recurrent/SSM state for hybrid models). 
#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct LlamaStateSeqFlags { flags: u32, } impl LlamaStateSeqFlags { - /// Save/restore only partial (recurrent/SSM) state, skipping attention KV cache. pub const PARTIAL_ONLY: Self = Self { flags: 1 }; - /// No flags set. #[must_use] pub const fn empty() -> Self { Self { flags: 0 } } - /// Returns the raw bit representation. #[must_use] pub const fn bits(&self) -> u32 { self.flags } - /// Returns true if `self` contains all bits in `other`. #[must_use] pub const fn contains(&self, other: Self) -> bool { (self.flags & other.flags) == other.flags diff --git a/llama-cpp-bindings/src/context/load_seq_state_error.rs b/llama-cpp-bindings/src/context/load_seq_state_error.rs index 158c8c3b..b3e27983 100644 --- a/llama-cpp-bindings/src/context/load_seq_state_error.rs +++ b/llama-cpp-bindings/src/context/load_seq_state_error.rs @@ -1,29 +1,17 @@ -//! Error type for sequence state file load operations. - use std::ffi::NulError; use std::path::PathBuf; -/// Failed to load a sequence state file. 
#[derive(Debug, Eq, PartialEq, thiserror::Error)] pub enum LoadSeqStateError { - /// llama.cpp failed to load the sequence state file #[error("Failed to load sequence state file")] FailedToLoad, - /// null byte in string #[error("null byte in string {0}")] NullError(#[from] NulError), - /// failed to convert path to str #[error("failed to convert path {0} to str")] PathToStrError(PathBuf), - /// Insufficient max length #[error("max_length is not large enough to hold {n_out} (was {max_tokens})")] - InsufficientMaxLength { - /// The length of the loaded sequence - n_out: usize, - /// The maximum length - max_tokens: usize, - }, + InsufficientMaxLength { n_out: usize, max_tokens: usize }, } diff --git a/llama-cpp-bindings/src/context/load_session_error.rs b/llama-cpp-bindings/src/context/load_session_error.rs index e317f278..be514e13 100644 --- a/llama-cpp-bindings/src/context/load_session_error.rs +++ b/llama-cpp-bindings/src/context/load_session_error.rs @@ -1,29 +1,17 @@ -//! Error type for session file load operations. - use std::ffi::NulError; use std::path::PathBuf; -/// Failed to load a session file. 
#[derive(Debug, Eq, PartialEq, thiserror::Error)] pub enum LoadSessionError { - /// llama.cpp failed to load the session file #[error("Failed to load session file")] FailedToLoad, - /// null byte in string #[error("null byte in string {0}")] NullError(#[from] NulError), - /// failed to convert path to str #[error("failed to convert path {0} to str")] PathToStrError(PathBuf), - /// Insufficient max length #[error("max_length is not large enough to hold {n_out} (was {max_tokens})")] - InsufficientMaxLength { - /// The length of the session file - n_out: usize, - /// The maximum length - max_tokens: usize, - }, + InsufficientMaxLength { n_out: usize, max_tokens: usize }, } diff --git a/llama-cpp-bindings/src/context/params.rs b/llama-cpp-bindings/src/context/params.rs index 13935e21..0b2f8348 100644 --- a/llama-cpp-bindings/src/context/params.rs +++ b/llama-cpp-bindings/src/context/params.rs @@ -1,4 +1,3 @@ -//! A safe wrapper around `llama_context_params`. use std::fmt::Debug; use std::num::NonZeroU32; @@ -7,21 +6,6 @@ pub use crate::context::llama_attention_type::LlamaAttentionType; pub use crate::context::llama_pooling_type::LlamaPoolingType; pub use crate::context::rope_scaling_type::RopeScalingType; -/// A safe wrapper around `llama_context_params`. -/// -/// Generally this should be created with [`Default::default()`] and then modified with `with_*` methods. -/// -/// # Examples -/// -/// ```rust -/// # use std::num::NonZeroU32; -/// use llama_cpp_bindings::context::params::LlamaContextParams; -/// -///let ctx_params = LlamaContextParams::default() -/// .with_n_ctx(NonZeroU32::new(2048)); -/// -/// assert_eq!(ctx_params.n_ctx(), NonZeroU32::new(2048)); -/// ``` #[derive(Debug, Clone)] #[expect( missing_docs, @@ -38,105 +22,43 @@ pub struct LlamaContextParams { pub context_params: llama_cpp_bindings_sys::llama_context_params, } -/// SAFETY: we do not currently allow setting or reading the pointers that cause this to not be automatically send or sync. 
unsafe impl Send for LlamaContextParams {} unsafe impl Sync for LlamaContextParams {} impl LlamaContextParams { - /// Set the side of the context - /// - /// # Examples - /// - /// ```rust - /// # use std::num::NonZeroU32; - /// use llama_cpp_bindings::context::params::LlamaContextParams; - /// let params = LlamaContextParams::default(); - /// let params = params.with_n_ctx(NonZeroU32::new(2048)); - /// assert_eq!(params.n_ctx(), NonZeroU32::new(2048)); - /// ``` #[must_use] pub fn with_n_ctx(mut self, n_ctx: Option) -> Self { self.context_params.n_ctx = n_ctx.map_or(0, NonZeroU32::get); self } - /// Get the size of the context. - /// - /// [`None`] if the context size is specified by the model and not the context. - /// - /// # Examples - /// - /// ```rust - /// let params = llama_cpp_bindings::context::params::LlamaContextParams::default(); - /// assert_eq!(params.n_ctx(), std::num::NonZeroU32::new(512)); #[must_use] pub const fn n_ctx(&self) -> Option { NonZeroU32::new(self.context_params.n_ctx) } - /// Set the `n_batch` - /// - /// # Examples - /// - /// ```rust - /// # use std::num::NonZeroU32; - /// use llama_cpp_bindings::context::params::LlamaContextParams; - /// let params = LlamaContextParams::default() - /// .with_n_batch(2048); - /// assert_eq!(params.n_batch(), 2048); - /// ``` #[must_use] pub const fn with_n_batch(mut self, n_batch: u32) -> Self { self.context_params.n_batch = n_batch; self } - /// Get the `n_batch` - /// - /// # Examples - /// - /// ```rust - /// use llama_cpp_bindings::context::params::LlamaContextParams; - /// let params = LlamaContextParams::default(); - /// assert_eq!(params.n_batch(), 2048); - /// ``` #[must_use] pub const fn n_batch(&self) -> u32 { self.context_params.n_batch } - /// Set the `n_ubatch` - /// - /// # Examples - /// - /// ```rust - /// # use std::num::NonZeroU32; - /// use llama_cpp_bindings::context::params::LlamaContextParams; - /// let params = LlamaContextParams::default() - /// .with_n_ubatch(512); - /// 
assert_eq!(params.n_ubatch(), 512); - /// ``` #[must_use] pub const fn with_n_ubatch(mut self, n_ubatch: u32) -> Self { self.context_params.n_ubatch = n_ubatch; self } - /// Get the `n_ubatch` - /// - /// # Examples - /// - /// ```rust - /// use llama_cpp_bindings::context::params::LlamaContextParams; - /// let params = LlamaContextParams::default(); - /// assert_eq!(params.n_ubatch(), 512); - /// ``` #[must_use] pub const fn n_ubatch(&self) -> u32 { self.context_params.n_ubatch } - /// Set the flash attention policy using llama.cpp enum #[must_use] pub const fn with_flash_attention_policy( mut self, @@ -146,232 +68,88 @@ impl LlamaContextParams { self } - /// Get the flash attention policy #[must_use] pub const fn flash_attention_policy(&self) -> llama_cpp_bindings_sys::llama_flash_attn_type { self.context_params.flash_attn_type } - /// Set the `offload_kqv` parameter to control offloading KV cache & KQV ops to GPU - /// - /// # Examples - /// - /// ```rust - /// use llama_cpp_bindings::context::params::LlamaContextParams; - /// let params = LlamaContextParams::default() - /// .with_offload_kqv(false); - /// assert_eq!(params.offload_kqv(), false); - /// ``` #[must_use] pub const fn with_offload_kqv(mut self, enabled: bool) -> Self { self.context_params.offload_kqv = enabled; self } - /// Get the `offload_kqv` parameter - /// - /// # Examples - /// - /// ```rust - /// use llama_cpp_bindings::context::params::LlamaContextParams; - /// let params = LlamaContextParams::default(); - /// assert_eq!(params.offload_kqv(), true); - /// ``` #[must_use] pub const fn offload_kqv(&self) -> bool { self.context_params.offload_kqv } - /// Set the type of rope scaling. 
- /// - /// # Examples - /// - /// ```rust - /// use llama_cpp_bindings::context::params::{LlamaContextParams, RopeScalingType}; - /// let params = LlamaContextParams::default() - /// .with_rope_scaling_type(RopeScalingType::Linear); - /// assert_eq!(params.rope_scaling_type(), RopeScalingType::Linear); - /// ``` #[must_use] pub fn with_rope_scaling_type(mut self, rope_scaling_type: RopeScalingType) -> Self { self.context_params.rope_scaling_type = i32::from(rope_scaling_type); self } - /// Get the type of rope scaling. - /// - /// # Examples - /// - /// ```rust - /// let params = llama_cpp_bindings::context::params::LlamaContextParams::default(); - /// assert_eq!(params.rope_scaling_type(), llama_cpp_bindings::context::params::RopeScalingType::Unspecified); - /// ``` #[must_use] pub fn rope_scaling_type(&self) -> RopeScalingType { RopeScalingType::from(self.context_params.rope_scaling_type) } - /// Set the rope frequency base. - /// - /// # Examples - /// - /// ```rust - /// use llama_cpp_bindings::context::params::LlamaContextParams; - /// let params = LlamaContextParams::default() - /// .with_rope_freq_base(0.5); - /// assert_eq!(params.rope_freq_base(), 0.5); - /// ``` #[must_use] pub const fn with_rope_freq_base(mut self, rope_freq_base: f32) -> Self { self.context_params.rope_freq_base = rope_freq_base; self } - /// Get the rope frequency base. - /// - /// # Examples - /// - /// ```rust - /// let params = llama_cpp_bindings::context::params::LlamaContextParams::default(); - /// assert_eq!(params.rope_freq_base(), 0.0); - /// ``` #[must_use] pub const fn rope_freq_base(&self) -> f32 { self.context_params.rope_freq_base } - /// Set the rope frequency scale. 
- /// - /// # Examples - /// - /// ```rust - /// use llama_cpp_bindings::context::params::LlamaContextParams; - /// let params = LlamaContextParams::default() - /// .with_rope_freq_scale(0.5); - /// assert_eq!(params.rope_freq_scale(), 0.5); - /// ``` #[must_use] pub const fn with_rope_freq_scale(mut self, rope_freq_scale: f32) -> Self { self.context_params.rope_freq_scale = rope_freq_scale; self } - /// Get the rope frequency scale. - /// - /// # Examples - /// - /// ```rust - /// let params = llama_cpp_bindings::context::params::LlamaContextParams::default(); - /// assert_eq!(params.rope_freq_scale(), 0.0); - /// ``` #[must_use] pub const fn rope_freq_scale(&self) -> f32 { self.context_params.rope_freq_scale } - /// Get the number of threads. - /// - /// # Examples - /// - /// ```rust - /// let params = llama_cpp_bindings::context::params::LlamaContextParams::default(); - /// assert_eq!(params.n_threads(), 4); - /// ``` #[must_use] pub const fn n_threads(&self) -> i32 { self.context_params.n_threads } - /// Get the number of threads allocated for batches. - /// - /// # Examples - /// - /// ```rust - /// let params = llama_cpp_bindings::context::params::LlamaContextParams::default(); - /// assert_eq!(params.n_threads_batch(), 4); - /// ``` #[must_use] pub const fn n_threads_batch(&self) -> i32 { self.context_params.n_threads_batch } - /// Set the number of threads. - /// - /// # Examples - /// - /// ```rust - /// use llama_cpp_bindings::context::params::LlamaContextParams; - /// let params = LlamaContextParams::default() - /// .with_n_threads(8); - /// assert_eq!(params.n_threads(), 8); - /// ``` #[must_use] pub const fn with_n_threads(mut self, n_threads: i32) -> Self { self.context_params.n_threads = n_threads; self } - /// Set the number of threads allocated for batches. 
- /// - /// # Examples - /// - /// ```rust - /// use llama_cpp_bindings::context::params::LlamaContextParams; - /// let params = LlamaContextParams::default() - /// .with_n_threads_batch(8); - /// assert_eq!(params.n_threads_batch(), 8); - /// ``` #[must_use] pub const fn with_n_threads_batch(mut self, n_threads: i32) -> Self { self.context_params.n_threads_batch = n_threads; self } - /// Check whether embeddings are enabled - /// - /// # Examples - /// - /// ```rust - /// let params = llama_cpp_bindings::context::params::LlamaContextParams::default(); - /// assert!(!params.embeddings()); - /// ``` #[must_use] pub const fn embeddings(&self) -> bool { self.context_params.embeddings } - /// Enable the use of embeddings - /// - /// # Examples - /// - /// ```rust - /// use llama_cpp_bindings::context::params::LlamaContextParams; - /// let params = LlamaContextParams::default() - /// .with_embeddings(true); - /// assert!(params.embeddings()); - /// ``` #[must_use] pub const fn with_embeddings(mut self, embedding: bool) -> Self { self.context_params.embeddings = embedding; self } - /// Set the evaluation callback. - /// - /// # Examples - /// - /// ```no_run - /// extern "C" fn cb_eval_fn( - /// t: *mut llama_cpp_bindings_sys::ggml_tensor, - /// ask: bool, - /// user_data: *mut std::ffi::c_void, - /// ) -> bool { - /// false - /// } - /// - /// use llama_cpp_bindings::context::params::LlamaContextParams; - /// let params = LlamaContextParams::default().with_cb_eval(Some(cb_eval_fn)); - /// ``` #[must_use] pub fn with_cb_eval( mut self, @@ -381,16 +159,6 @@ impl LlamaContextParams { self } - /// Set the evaluation callback user data. 
- /// - /// # Examples - /// - /// ```no_run - /// use llama_cpp_bindings::context::params::LlamaContextParams; - /// let params = LlamaContextParams::default(); - /// let user_data = std::ptr::null_mut(); - /// let params = params.with_cb_eval_user_data(user_data); - /// ``` #[must_use] pub const fn with_cb_eval_user_data( mut self, @@ -400,382 +168,171 @@ impl LlamaContextParams { self } - /// Set the type of pooling. - /// - /// # Examples - /// - /// ```rust - /// use llama_cpp_bindings::context::params::{LlamaContextParams, LlamaPoolingType}; - /// let params = LlamaContextParams::default() - /// .with_pooling_type(LlamaPoolingType::Last); - /// assert_eq!(params.pooling_type(), LlamaPoolingType::Last); - /// ``` #[must_use] pub fn with_pooling_type(mut self, pooling_type: LlamaPoolingType) -> Self { self.context_params.pooling_type = i32::from(pooling_type); self } - /// Get the type of pooling. - /// - /// # Examples - /// - /// ```rust - /// let params = llama_cpp_bindings::context::params::LlamaContextParams::default(); - /// assert_eq!(params.pooling_type(), llama_cpp_bindings::context::params::LlamaPoolingType::Unspecified); - /// ``` #[must_use] pub fn pooling_type(&self) -> LlamaPoolingType { LlamaPoolingType::from(self.context_params.pooling_type) } - /// Set whether to use full sliding window attention - /// - /// # Examples - /// - /// ```rust - /// use llama_cpp_bindings::context::params::LlamaContextParams; - /// let params = LlamaContextParams::default() - /// .with_swa_full(false); - /// assert_eq!(params.swa_full(), false); - /// ``` #[must_use] pub const fn with_swa_full(mut self, enabled: bool) -> Self { self.context_params.swa_full = enabled; self } - /// Get whether full sliding window attention is enabled - /// - /// # Examples - /// - /// ```rust - /// use llama_cpp_bindings::context::params::LlamaContextParams; - /// let params = LlamaContextParams::default(); - /// assert_eq!(params.swa_full(), true); - /// ``` #[must_use] pub const fn 
swa_full(&self) -> bool { self.context_params.swa_full } - /// Set the max number of sequences (i.e. distinct states for recurrent models) - /// - /// # Examples - /// - /// ```rust - /// use llama_cpp_bindings::context::params::LlamaContextParams; - /// let params = LlamaContextParams::default() - /// .with_n_seq_max(64); - /// assert_eq!(params.n_seq_max(), 64); - /// ``` #[must_use] pub const fn with_n_seq_max(mut self, n_seq_max: u32) -> Self { self.context_params.n_seq_max = n_seq_max; self } - /// Get the max number of sequences (i.e. distinct states for recurrent models) - /// - /// # Examples - /// - /// ```rust - /// use llama_cpp_bindings::context::params::LlamaContextParams; - /// let params = LlamaContextParams::default(); - /// assert_eq!(params.n_seq_max(), 1); - /// ``` #[must_use] pub const fn n_seq_max(&self) -> u32 { self.context_params.n_seq_max } - /// Set the KV cache data type for K - /// use `llama_cpp_bindings::context::params::{LlamaContextParams`, `KvCacheType`}; - /// let params = `LlamaContextParams::default().with_type_k(KvCacheType::Q4_0)`; - /// `assert_eq!(params.type_k()`, `KvCacheType::Q4_0`); - /// ``` #[must_use] pub fn with_type_k(mut self, type_k: KvCacheType) -> Self { self.context_params.type_k = type_k.into(); self } - /// Get the KV cache data type for K - /// - /// # Examples - /// - /// ```rust - /// let params = llama_cpp_bindings::context::params::LlamaContextParams::default(); - /// let _ = params.type_k(); - /// ``` #[must_use] pub fn type_k(&self) -> KvCacheType { KvCacheType::from(self.context_params.type_k) } - /// Set the KV cache data type for V - /// - /// # Examples - /// - /// ```rust - /// use llama_cpp_bindings::context::params::{LlamaContextParams, KvCacheType}; - /// let params = LlamaContextParams::default().with_type_v(KvCacheType::Q4_1); - /// assert_eq!(params.type_v(), KvCacheType::Q4_1); - /// ``` #[must_use] pub fn with_type_v(mut self, type_v: KvCacheType) -> Self { self.context_params.type_v = 
type_v.into(); self } - /// Get the KV cache data type for V - /// - /// # Examples - /// - /// ```rust - /// let params = llama_cpp_bindings::context::params::LlamaContextParams::default(); - /// let _ = params.type_v(); - /// ``` #[must_use] pub fn type_v(&self) -> KvCacheType { KvCacheType::from(self.context_params.type_v) } - /// Set the attention type - /// - /// # Examples - /// - /// ```rust - /// use llama_cpp_bindings::context::params::{LlamaContextParams, LlamaAttentionType}; - /// let params = LlamaContextParams::default() - /// .with_attention_type(LlamaAttentionType::NonCausal); - /// assert_eq!(params.attention_type(), LlamaAttentionType::NonCausal); - /// ``` #[must_use] pub fn with_attention_type(mut self, attention_type: LlamaAttentionType) -> Self { self.context_params.attention_type = i32::from(attention_type); self } - /// Get the attention type - /// - /// # Examples - /// - /// ```rust - /// let params = llama_cpp_bindings::context::params::LlamaContextParams::default(); - /// assert_eq!(params.attention_type(), llama_cpp_bindings::context::params::LlamaAttentionType::Unspecified); - /// ``` #[must_use] pub fn attention_type(&self) -> LlamaAttentionType { LlamaAttentionType::from(self.context_params.attention_type) } - /// Set the `YaRN` extrapolation factor - /// - /// # Examples - /// - /// ```rust - /// use llama_cpp_bindings::context::params::LlamaContextParams; - /// let params = LlamaContextParams::default() - /// .with_yarn_ext_factor(1.0); - /// assert!((params.yarn_ext_factor() - 1.0).abs() < f32::EPSILON); - /// ``` #[must_use] pub const fn with_yarn_ext_factor(mut self, yarn_ext_factor: f32) -> Self { self.context_params.yarn_ext_factor = yarn_ext_factor; self } - /// Get the `YaRN` extrapolation factor #[must_use] pub const fn yarn_ext_factor(&self) -> f32 { self.context_params.yarn_ext_factor } - /// Set the `YaRN` attention factor - /// - /// # Examples - /// - /// ```rust - /// use 
llama_cpp_bindings::context::params::LlamaContextParams; - /// let params = LlamaContextParams::default() - /// .with_yarn_attn_factor(2.0); - /// assert!((params.yarn_attn_factor() - 2.0).abs() < f32::EPSILON); - /// ``` #[must_use] pub const fn with_yarn_attn_factor(mut self, yarn_attn_factor: f32) -> Self { self.context_params.yarn_attn_factor = yarn_attn_factor; self } - /// Get the `YaRN` attention factor #[must_use] pub const fn yarn_attn_factor(&self) -> f32 { self.context_params.yarn_attn_factor } - /// Set the `YaRN` low correction dim - /// - /// # Examples - /// - /// ```rust - /// use llama_cpp_bindings::context::params::LlamaContextParams; - /// let params = LlamaContextParams::default() - /// .with_yarn_beta_fast(32.0); - /// assert!((params.yarn_beta_fast() - 32.0).abs() < f32::EPSILON); - /// ``` #[must_use] pub const fn with_yarn_beta_fast(mut self, yarn_beta_fast: f32) -> Self { self.context_params.yarn_beta_fast = yarn_beta_fast; self } - /// Get the `YaRN` low correction dim #[must_use] pub const fn yarn_beta_fast(&self) -> f32 { self.context_params.yarn_beta_fast } - /// Set the `YaRN` high correction dim - /// - /// # Examples - /// - /// ```rust - /// use llama_cpp_bindings::context::params::LlamaContextParams; - /// let params = LlamaContextParams::default() - /// .with_yarn_beta_slow(1.0); - /// assert!((params.yarn_beta_slow() - 1.0).abs() < f32::EPSILON); - /// ``` #[must_use] pub const fn with_yarn_beta_slow(mut self, yarn_beta_slow: f32) -> Self { self.context_params.yarn_beta_slow = yarn_beta_slow; self } - /// Get the `YaRN` high correction dim #[must_use] pub const fn yarn_beta_slow(&self) -> f32 { self.context_params.yarn_beta_slow } - /// Set the `YaRN` original context size - /// - /// # Examples - /// - /// ```rust - /// use llama_cpp_bindings::context::params::LlamaContextParams; - /// let params = LlamaContextParams::default() - /// .with_yarn_orig_ctx(4096); - /// assert_eq!(params.yarn_orig_ctx(), 4096); - /// ``` #[must_use] 
pub const fn with_yarn_orig_ctx(mut self, yarn_orig_ctx: u32) -> Self { self.context_params.yarn_orig_ctx = yarn_orig_ctx; self } - /// Get the `YaRN` original context size #[must_use] pub const fn yarn_orig_ctx(&self) -> u32 { self.context_params.yarn_orig_ctx } - /// Set the KV cache defragmentation threshold - /// - /// # Examples - /// - /// ```rust - /// use llama_cpp_bindings::context::params::LlamaContextParams; - /// let params = LlamaContextParams::default() - /// .with_defrag_thold(0.1); - /// assert!((params.defrag_thold() - 0.1).abs() < f32::EPSILON); - /// ``` #[must_use] pub const fn with_defrag_thold(mut self, defrag_thold: f32) -> Self { self.context_params.defrag_thold = defrag_thold; self } - /// Get the KV cache defragmentation threshold #[must_use] pub const fn defrag_thold(&self) -> f32 { self.context_params.defrag_thold } - /// Set whether performance timings are disabled - /// - /// # Examples - /// - /// ```rust - /// use llama_cpp_bindings::context::params::LlamaContextParams; - /// let params = LlamaContextParams::default() - /// .with_no_perf(true); - /// assert!(params.no_perf()); - /// ``` #[must_use] pub const fn with_no_perf(mut self, no_perf: bool) -> Self { self.context_params.no_perf = no_perf; self } - /// Get whether performance timings are disabled #[must_use] pub const fn no_perf(&self) -> bool { self.context_params.no_perf } - /// Set whether to offload ops to GPU - /// - /// # Examples - /// - /// ```rust - /// use llama_cpp_bindings::context::params::LlamaContextParams; - /// let params = LlamaContextParams::default() - /// .with_op_offload(false); - /// assert!(!params.op_offload()); - /// ``` #[must_use] pub const fn with_op_offload(mut self, op_offload: bool) -> Self { self.context_params.op_offload = op_offload; self } - /// Get whether ops are offloaded to GPU #[must_use] pub const fn op_offload(&self) -> bool { self.context_params.op_offload } - /// Set whether to use a unified KV cache buffer across input sequences - 
/// - /// # Examples - /// - /// ```rust - /// use llama_cpp_bindings::context::params::LlamaContextParams; - /// let params = LlamaContextParams::default() - /// .with_kv_unified(true); - /// assert!(params.kv_unified()); - /// ``` #[must_use] pub const fn with_kv_unified(mut self, kv_unified: bool) -> Self { self.context_params.kv_unified = kv_unified; self } - /// Get whether a unified KV cache buffer is used across input sequences #[must_use] pub const fn kv_unified(&self) -> bool { self.context_params.kv_unified } } -/// Default parameters for `LlamaContext`. (as defined in llama.cpp by `llama_context_default_params`) -/// ``` -/// # use std::num::NonZeroU32; -/// use llama_cpp_bindings::context::params::{LlamaContextParams, RopeScalingType}; -/// let params = LlamaContextParams::default(); -/// assert_eq!(params.n_ctx(), NonZeroU32::new(512), "n_ctx should be 512"); -/// assert_eq!(params.rope_scaling_type(), RopeScalingType::Unspecified); -/// ``` impl Default for LlamaContextParams { fn default() -> Self { let context_params = unsafe { llama_cpp_bindings_sys::llama_context_default_params() }; diff --git a/llama-cpp-bindings/src/context/rope_scaling_type.rs b/llama-cpp-bindings/src/context/rope_scaling_type.rs index 0bbfa831..92aca372 100644 --- a/llama-cpp-bindings/src/context/rope_scaling_type.rs +++ b/llama-cpp-bindings/src/context/rope_scaling_type.rs @@ -1,19 +1,12 @@ -/// A rusty wrapper around `rope_scaling_type`. #[repr(i8)] #[derive(Copy, Clone, Debug, PartialEq, Eq)] pub enum RopeScalingType { - /// The scaling type is unspecified Unspecified = -1, - /// No scaling None = 0, - /// Linear scaling Linear = 1, - /// Yarn scaling Yarn = 2, } -/// Create a `RopeScalingType` from a `c_int` - returns `RopeScalingType::ScalingUnspecified` if -/// the value is not recognized. impl From for RopeScalingType { fn from(value: i32) -> Self { match value { @@ -25,7 +18,6 @@ impl From for RopeScalingType { } } -/// Create a `c_int` from a `RopeScalingType`. 
impl From for i32 { fn from(value: RopeScalingType) -> Self { match value { diff --git a/llama-cpp-bindings/src/context/save_seq_state_error.rs b/llama-cpp-bindings/src/context/save_seq_state_error.rs index 129cd1cd..96410430 100644 --- a/llama-cpp-bindings/src/context/save_seq_state_error.rs +++ b/llama-cpp-bindings/src/context/save_seq_state_error.rs @@ -1,20 +1,14 @@ -//! Error type for sequence state file save operations. - use std::ffi::NulError; use std::path::PathBuf; -/// Failed to save a sequence state file. #[derive(Debug, Eq, PartialEq, thiserror::Error)] pub enum SaveSeqStateError { - /// llama.cpp failed to save the sequence state file #[error("Failed to save sequence state file")] FailedToSave, - /// null byte in string #[error("null byte in string {0}")] NullError(#[from] NulError), - /// failed to convert path to str #[error("failed to convert path {0} to str")] PathToStrError(PathBuf), } diff --git a/llama-cpp-bindings/src/context/save_session_error.rs b/llama-cpp-bindings/src/context/save_session_error.rs index 95999328..6814182e 100644 --- a/llama-cpp-bindings/src/context/save_session_error.rs +++ b/llama-cpp-bindings/src/context/save_session_error.rs @@ -1,20 +1,14 @@ -//! Error type for session file save operations. - use std::ffi::NulError; use std::path::PathBuf; -/// Failed to save a session file. #[derive(Debug, Eq, PartialEq, thiserror::Error)] pub enum SaveSessionError { - /// llama.cpp failed to save the session file #[error("Failed to save session file")] FailedToSave, - /// null byte in string #[error("null byte in string {0}")] NullError(#[from] NulError), - /// failed to convert path to str #[error("failed to convert path {0} to str")] PathToStrError(PathBuf), } diff --git a/llama-cpp-bindings/src/context/session.rs b/llama-cpp-bindings/src/context/session.rs index 3c7d211c..4a3f16ba 100644 --- a/llama-cpp-bindings/src/context/session.rs +++ b/llama-cpp-bindings/src/context/session.rs @@ -1,5 +1,3 @@ -//! 
utilities for working with session files - use crate::context::LlamaContext; use crate::context::llama_state_seq_flags::LlamaStateSeqFlags; use crate::context::load_seq_state_error::LoadSeqStateError; @@ -49,14 +47,6 @@ fn process_seq_load_result( } impl LlamaContext<'_> { - /// Save the full state to a file. - /// - /// # Parameters - /// - /// * `path_session` - The file to save to. - /// * `tokens` - The tokens to associate the state with. This should be a prefix of a sequence - /// of tokens that the context has processed, so that the relevant KV caches are already filled. - /// /// # Errors /// /// Fails if the path is not a valid utf8 or llama.cpp fails to save the state file. @@ -88,18 +78,6 @@ impl LlamaContext<'_> { } } - /// Load a state file into the current context. - /// - /// You still need to pass the returned tokens to the context for inference to work. What this - /// function buys you is that the KV caches are already filled with the relevant data. - /// - /// # Parameters - /// - /// * `path_session` - The file to load from. It must be a state file from a compatible context, - /// otherwise the function will error. - /// * `max_tokens` - The maximum token length of the loaded state. If the state was saved with a - /// longer length, the function will error. - /// /// # Errors /// /// Fails if the path is not a valid utf8 or llama.cpp fails to load the state file. @@ -134,24 +112,10 @@ impl LlamaContext<'_> { process_session_load_result(success, n_out, max_tokens, tokens) } - /// Save state for a single sequence to a file. - /// - /// This enables saving state for individual sequences, which is useful for multi-sequence - /// inference scenarios. - /// - /// # Parameters - /// - /// * `filepath` - The file to save to. - /// * `seq_id` - The sequence ID whose state to save. - /// * `tokens` - The tokens to associate with the saved state. 
- /// /// # Errors /// /// Fails if the path is not a valid utf8 or llama.cpp fails to save the sequence state file. /// - /// # Returns - /// - /// The number of bytes written on success. pub fn state_seq_save_file( &self, filepath: impl AsRef, @@ -184,24 +148,10 @@ impl LlamaContext<'_> { } } - /// Load state for a single sequence from a file. - /// - /// This enables loading state for individual sequences, which is useful for multi-sequence - /// inference scenarios. - /// - /// # Parameters - /// - /// * `filepath` - The file to load from. - /// * `dest_seq_id` - The destination sequence ID to load the state into. - /// * `max_tokens` - The maximum number of tokens to read. - /// /// # Errors /// /// Fails if the path is not a valid utf8 or llama.cpp fails to load the sequence state file. /// - /// # Returns - /// - /// A tuple of `(tokens, bytes_read)` on success. pub fn state_seq_load_file( &mut self, filepath: impl AsRef, @@ -236,19 +186,11 @@ impl LlamaContext<'_> { process_seq_load_result(bytes_read, n_out, max_tokens, tokens) } - /// Returns the maximum size in bytes of the state (rng, logits, embedding - /// and `kv_cache`) - will often be smaller after compacting tokens #[must_use] pub fn get_state_size(&self) -> usize { unsafe { llama_cpp_bindings_sys::llama_state_get_size(self.context.as_ptr()) } } - /// Copies the state to the specified destination buffer. - /// - /// Use [`get_state_size`](Self::get_state_size) to determine the required buffer size. - /// - /// Returns the number of bytes copied. - /// /// # Safety /// /// The `dest` buffer must be large enough to hold the complete state data. @@ -262,10 +204,6 @@ impl LlamaContext<'_> { } } - /// Set the state reading from the specified buffer. - /// - /// Returns the number of bytes read. 
- /// /// # Safety /// /// The `src` buffer must contain data previously obtained from [`copy_state_data`](Self::copy_state_data) @@ -281,10 +219,6 @@ impl LlamaContext<'_> { } } - /// Get the size of the state data for a specific sequence, with extended flags. - /// - /// Useful for hybrid/recurrent models where partial state (e.g., only SSM state) - /// may be saved or restored. #[must_use] pub fn state_seq_get_size_ext(&self, seq_id: i32, flags: &LlamaStateSeqFlags) -> usize { unsafe { @@ -296,13 +230,6 @@ impl LlamaContext<'_> { } } - /// Copy state data for a specific sequence into `dest`, with extended flags. - /// - /// Use [`state_seq_get_size_ext`](Self::state_seq_get_size_ext) to determine the required - /// buffer size before calling this method. - /// - /// Returns the number of bytes written. - /// /// # Safety /// /// The `dest` buffer must be large enough to hold the complete state data. @@ -323,10 +250,6 @@ impl LlamaContext<'_> { } } - /// Restore state data for a specific sequence from `src`, with extended flags. - /// - /// Returns the number of bytes read. - /// /// # Safety /// /// The `src` buffer must contain data previously obtained from diff --git a/llama-cpp-bindings/src/error.rs b/llama-cpp-bindings/src/error.rs index ba684109..436edad7 100644 --- a/llama-cpp-bindings/src/error.rs +++ b/llama-cpp-bindings/src/error.rs @@ -70,5 +70,4 @@ pub use token_to_string_error::TokenToStringError; pub use tool_call_format_failure::ToolCallFormatFailure; pub use xml_function_tags_failure::XmlFunctionTagsFailure; -/// A failable result from a llama.cpp function. 
pub type Result = std::result::Result; diff --git a/llama-cpp-bindings/src/error/apply_chat_template_error.rs b/llama-cpp-bindings/src/error/apply_chat_template_error.rs index 251dda35..363c9f38 100644 --- a/llama-cpp-bindings/src/error/apply_chat_template_error.rs +++ b/llama-cpp-bindings/src/error/apply_chat_template_error.rs @@ -1,12 +1,9 @@ use std::string::FromUtf8Error; -/// Failed to apply model chat template. #[derive(Debug, thiserror::Error)] pub enum ApplyChatTemplateError { - /// the string could not be converted to utf8. #[error("{0}")] FromUtf8Error(#[from] FromUtf8Error), - /// An integer conversion failed. #[error("Integer conversion error: {0}")] IntConversionError(#[from] std::num::TryFromIntError), } diff --git a/llama-cpp-bindings/src/error/bracketed_args_failure.rs b/llama-cpp-bindings/src/error/bracketed_args_failure.rs index 8750a9be..dcda30ae 100644 --- a/llama-cpp-bindings/src/error/bracketed_args_failure.rs +++ b/llama-cpp-bindings/src/error/bracketed_args_failure.rs @@ -1,4 +1,3 @@ -/// Failures specific to the bracketed-JSON args parser (Mistral 3 `[TOOL_CALLS]name[ARGS]{...}`). #[derive(Debug, thiserror::Error)] pub enum BracketedArgsFailure { #[error("tool call '{tool_name}' arguments are not valid JSON: {message}")] diff --git a/llama-cpp-bindings/src/error/chat_template_error.rs b/llama-cpp-bindings/src/error/chat_template_error.rs index 190b96fa..d063de05 100644 --- a/llama-cpp-bindings/src/error/chat_template_error.rs +++ b/llama-cpp-bindings/src/error/chat_template_error.rs @@ -1,17 +1,13 @@ use std::ffi::NulError; -/// There was an error while getting the chat template from a model. #[derive(Debug, Eq, PartialEq, thiserror::Error)] pub enum ChatTemplateError { - /// gguf has no chat template (by that name) #[error("chat template not found - returned null pointer")] MissingTemplate, - /// chat template contained a null byte #[error("null byte in string {0}")] NullError(#[from] NulError), - /// The chat template was not valid utf8. 
#[error(transparent)] Utf8Error(#[from] std::str::Utf8Error), } diff --git a/llama-cpp-bindings/src/error/embeddings_error.rs b/llama-cpp-bindings/src/error/embeddings_error.rs index a01bb428..9555f196 100644 --- a/llama-cpp-bindings/src/error/embeddings_error.rs +++ b/llama-cpp-bindings/src/error/embeddings_error.rs @@ -1,16 +1,11 @@ -/// When embedding related functions fail #[derive(Debug, Eq, PartialEq, thiserror::Error)] pub enum EmbeddingsError { - /// Embeddings weren't enabled in the context options #[error("Embeddings weren't enabled in the context options")] NotEnabled, - /// Logits weren't enabled for the given token #[error("Logits were not enabled for the given token")] LogitsNotEnabled, - /// The given sequence index exceeds the max sequence id #[error("Can't use sequence embeddings with a model supporting only LLAMA_POOLING_TYPE_NONE")] NonePoolType, - /// The embedding dimension does not fit into a usize. #[error("Invalid embedding dimension: {0}")] InvalidEmbeddingDimension(#[source] std::num::TryFromIntError), } diff --git a/llama-cpp-bindings/src/error/eval_multimodal_chunks_error.rs b/llama-cpp-bindings/src/error/eval_multimodal_chunks_error.rs index 146bcedb..d5d485f7 100644 --- a/llama-cpp-bindings/src/error/eval_multimodal_chunks_error.rs +++ b/llama-cpp-bindings/src/error/eval_multimodal_chunks_error.rs @@ -1,16 +1,12 @@ use crate::mtmd::MtmdEvalError; use crate::mtmd::mtmd_input_chunk_type_error::MtmdInputChunkTypeError; -/// Failed to evaluate multimodal chunks through the request classifier. #[derive(Debug, thiserror::Error)] pub enum EvalMultimodalChunksError { - /// `MtmdInputChunks::eval_chunks` returned an error. #[error("{0}")] EvalFailed(#[from] MtmdEvalError), - /// A chunk reported a type that is not known to this binding. #[error("{0}")] UnknownChunkType(#[from] MtmdInputChunkTypeError), - /// A chunk index that was within `chunks.len()` returned `None` from `chunks.get(index)`. 
#[error("chunk index {0} out of bounds during post-eval walk")] ChunkOutOfBounds(usize), } diff --git a/llama-cpp-bindings/src/error/fit_error.rs b/llama-cpp-bindings/src/error/fit_error.rs index 2d6fe6b5..fbb809c5 100644 --- a/llama-cpp-bindings/src/error/fit_error.rs +++ b/llama-cpp-bindings/src/error/fit_error.rs @@ -1,20 +1,13 @@ -/// Returned by [`crate::model::params::LlamaModelParams::fit_params`]. #[derive(Debug, Clone, PartialEq, Eq, thiserror::Error)] pub enum FitError { - /// No combination of model parameters fits the available device memory. #[error("no parameter combination fits available memory")] NoFittingMemoryLayout, - /// Parameter fitting was aborted by a hard error reported by the underlying library - /// (e.g., model file missing, backend initialization failed). #[error("parameter fitting aborted")] Aborted, - /// The fitting helper returned a status code the wrapper does not recognise. #[error("parameter fitting returned an unknown status code: {code}")] UnknownStatus { code: i32 }, - /// Wrapper could not allocate memory for an error message. #[error("not enough memory")] NotEnoughMemory, - /// Generic exception caught at the wrapper boundary, with the underlying message. #[error("{message}")] Reported { message: String }, } diff --git a/llama-cpp-bindings/src/error/json_object_failure.rs b/llama-cpp-bindings/src/error/json_object_failure.rs index b5d88570..e18868ce 100644 --- a/llama-cpp-bindings/src/error/json_object_failure.rs +++ b/llama-cpp-bindings/src/error/json_object_failure.rs @@ -1,4 +1,3 @@ -/// Failures specific to the JSON-object args parser (Qwen 3 `{"name":..., "arguments":...}`). 
#[derive(Debug, thiserror::Error)] pub enum JsonObjectFailure { #[error("tool call body has malformed JSON: {message}")] diff --git a/llama-cpp-bindings/src/error/key_value_xml_tags_failure.rs b/llama-cpp-bindings/src/error/key_value_xml_tags_failure.rs index 3c46093a..83941376 100644 --- a/llama-cpp-bindings/src/error/key_value_xml_tags_failure.rs +++ b/llama-cpp-bindings/src/error/key_value_xml_tags_failure.rs @@ -1,4 +1,3 @@ -/// Failures specific to the key-value XML-tags parser (GLM-4.7 `{name}{k}{v}...`). #[derive(Debug, thiserror::Error)] pub enum KeyValueXmlTagsFailure { #[error("tool call function tag has empty name")] diff --git a/llama-cpp-bindings/src/error/llama_lora_adapter_remove_error.rs b/llama-cpp-bindings/src/error/llama_lora_adapter_remove_error.rs index 3d536c4a..cf9be711 100644 --- a/llama-cpp-bindings/src/error/llama_lora_adapter_remove_error.rs +++ b/llama-cpp-bindings/src/error/llama_lora_adapter_remove_error.rs @@ -1,7 +1,5 @@ -/// An error that can occur when loading a model. #[derive(Debug, Eq, PartialEq, thiserror::Error)] pub enum LlamaLoraAdapterRemoveError { - /// llama.cpp returned a non-zero error code. #[error("error code from llama cpp")] ErrorResult(i32), } diff --git a/llama-cpp-bindings/src/error/llama_lora_adapter_set_error.rs b/llama-cpp-bindings/src/error/llama_lora_adapter_set_error.rs index 362f6ca1..3bca954f 100644 --- a/llama-cpp-bindings/src/error/llama_lora_adapter_set_error.rs +++ b/llama-cpp-bindings/src/error/llama_lora_adapter_set_error.rs @@ -1,7 +1,5 @@ -/// An error that can occur when loading a model. #[derive(Debug, Eq, PartialEq, thiserror::Error)] pub enum LlamaLoraAdapterSetError { - /// llama.cpp returned a non-zero error code. 
#[error("error code from llama cpp")] ErrorResult(i32), } diff --git a/llama-cpp-bindings/src/error/logits_error.rs b/llama-cpp-bindings/src/error/logits_error.rs index f6a198d2..8462a9b9 100644 --- a/llama-cpp-bindings/src/error/logits_error.rs +++ b/llama-cpp-bindings/src/error/logits_error.rs @@ -1,24 +1,13 @@ -/// When logits-related functions fail #[derive(Debug, Eq, PartialEq, thiserror::Error)] pub enum LogitsError { - /// The logits data pointer is null. #[error("logits data pointer is null")] NullLogits, - /// The requested token index has not been initialized for logits. #[error("logit for token index {0} is not initialized")] TokenNotInitialized(i32), - /// The token index exceeds the context size. #[error("token index {token_index} exceeds context size {context_size}")] - TokenIndexExceedsContext { - /// The token index that was requested. - token_index: u32, - /// The context size. - context_size: u32, - }, - /// The vocabulary size does not fit into a usize. + TokenIndexExceedsContext { token_index: u32, context_size: u32 }, #[error("n_vocab does not fit into usize: {0}")] VocabSizeOverflow(#[source] std::num::TryFromIntError), - /// The token index does not fit into a u32. 
#[error("token_index does not fit into u32: {0}")] TokenIndexOverflow(#[source] std::num::TryFromIntError), } diff --git a/llama-cpp-bindings/src/error/meta_val_error.rs b/llama-cpp-bindings/src/error/meta_val_error.rs index 30b07223..ecd86e6b 100644 --- a/llama-cpp-bindings/src/error/meta_val_error.rs +++ b/llama-cpp-bindings/src/error/meta_val_error.rs @@ -1,18 +1,14 @@ use std::ffi::NulError; use std::string::FromUtf8Error; -/// Failed fetching metadata value #[derive(Debug, Eq, PartialEq, thiserror::Error)] pub enum MetaValError { - /// The provided string contains an unexpected null-byte #[error("null byte in string {0}")] NullError(#[from] NulError), - /// The returned data contains invalid UTF8 data #[error("FromUtf8Error {0}")] FromUtf8Error(#[from] FromUtf8Error), - /// Got negative return value. This happens if the key or index queried does not exist. #[error("Negative return value. Likely due to a missing index or key. Got return value: {0}")] NegativeReturn(i32), } diff --git a/llama-cpp-bindings/src/error/model_params_error.rs b/llama-cpp-bindings/src/error/model_params_error.rs index 377596f1..8e70ebb4 100644 --- a/llama-cpp-bindings/src/error/model_params_error.rs +++ b/llama-cpp-bindings/src/error/model_params_error.rs @@ -1,18 +1,9 @@ -/// Errors that can occur when modifying model parameters. #[derive(Debug, Eq, PartialEq, thiserror::Error)] pub enum ModelParamsError { - /// The internal override vector has no available slot. #[error("No available slot in override vector")] NoAvailableSlot, - /// The first override slot is not empty. #[error("Override slot is not empty")] SlotNotEmpty, - /// A character in the key is not a valid C char. #[error("Invalid character in key: byte {byte}, {reason}")] - InvalidCharacterInKey { - /// The byte value that failed conversion. - byte: u8, - /// The reason the conversion failed. 
- reason: String, - }, + InvalidCharacterInKey { byte: u8, reason: String }, } diff --git a/llama-cpp-bindings/src/error/new_llama_chat_message_error.rs b/llama-cpp-bindings/src/error/new_llama_chat_message_error.rs index c7076486..d38337bf 100644 --- a/llama-cpp-bindings/src/error/new_llama_chat_message_error.rs +++ b/llama-cpp-bindings/src/error/new_llama_chat_message_error.rs @@ -1,9 +1,7 @@ use std::ffi::NulError; -/// Failed to apply model chat template. #[derive(Debug, thiserror::Error)] pub enum NewLlamaChatMessageError { - /// the string contained a null byte and thus could not be converted to a c string. #[error("{0}")] NulError(#[from] NulError), } diff --git a/llama-cpp-bindings/src/error/paired_quote_failure.rs b/llama-cpp-bindings/src/error/paired_quote_failure.rs index 9a2a3d85..53b50aa8 100644 --- a/llama-cpp-bindings/src/error/paired_quote_failure.rs +++ b/llama-cpp-bindings/src/error/paired_quote_failure.rs @@ -1,4 +1,3 @@ -/// Failures specific to the paired-quote args parser (Gemma 4 `<|tool_call>call:name{key:<|"|>val<|"|>}`). #[derive(Debug, thiserror::Error)] pub enum PairedQuoteFailure { #[error("empty key in tool call '{tool_name}' arguments")] diff --git a/llama-cpp-bindings/src/error/sampling_error.rs b/llama-cpp-bindings/src/error/sampling_error.rs index 7a2e7346..de13b87e 100644 --- a/llama-cpp-bindings/src/error/sampling_error.rs +++ b/llama-cpp-bindings/src/error/sampling_error.rs @@ -1,7 +1,5 @@ -/// Errors that can occur when creating a sampling configuration. 
#[derive(Debug, Eq, PartialEq, thiserror::Error)] pub enum SamplingError { - /// An integer value exceeded the allowed range #[error("Integer overflow: {0}")] IntegerOverflow(String), } diff --git a/llama-cpp-bindings/src/error/token_sampling_error.rs b/llama-cpp-bindings/src/error/token_sampling_error.rs index da1bc7f0..90b89dcc 100644 --- a/llama-cpp-bindings/src/error/token_sampling_error.rs +++ b/llama-cpp-bindings/src/error/token_sampling_error.rs @@ -1,7 +1,5 @@ -/// Failed to sample a token from the data array. #[derive(Debug, Eq, PartialEq, thiserror::Error)] pub enum TokenSamplingError { - /// The sampler did not select any token. #[error("No token was selected by the sampler")] NoTokenSelected, } diff --git a/llama-cpp-bindings/src/error/token_to_string_error.rs b/llama-cpp-bindings/src/error/token_to_string_error.rs index 0fb0eb89..af3ea657 100644 --- a/llama-cpp-bindings/src/error/token_to_string_error.rs +++ b/llama-cpp-bindings/src/error/token_to_string_error.rs @@ -1,20 +1,15 @@ use std::os::raw::c_int; use std::string::FromUtf8Error; -/// An error that can occur when converting a token to a string. #[derive(Debug, thiserror::Error, Clone)] #[non_exhaustive] pub enum TokenToStringError { - /// the token type was unknown #[error("Unknown Token Type")] UnknownTokenType, - /// There was insufficient buffer space to convert the token to a string. #[error("Insufficient Buffer Space {0}")] InsufficientBufferSpace(c_int), - /// The token was not valid utf8. #[error("FromUtf8Error {0}")] FromUtf8Error(#[from] FromUtf8Error), - /// An integer conversion failed. 
#[error("Integer conversion error: {0}")] IntConversionError(#[from] std::num::TryFromIntError), } diff --git a/llama-cpp-bindings/src/error/tool_call_format_failure.rs b/llama-cpp-bindings/src/error/tool_call_format_failure.rs index ca1bd3d7..e188f81b 100644 --- a/llama-cpp-bindings/src/error/tool_call_format_failure.rs +++ b/llama-cpp-bindings/src/error/tool_call_format_failure.rs @@ -4,7 +4,6 @@ use crate::error::key_value_xml_tags_failure::KeyValueXmlTagsFailure; use crate::error::paired_quote_failure::PairedQuoteFailure; use crate::error::xml_function_tags_failure::XmlFunctionTagsFailure; -/// Top-level failure for the wrapper-side template-override parsers (one variant per supported shape). #[derive(Debug, thiserror::Error)] pub enum ToolCallFormatFailure { #[error("bracketed-args fallback parser: {0}")] diff --git a/llama-cpp-bindings/src/error/xml_function_tags_failure.rs b/llama-cpp-bindings/src/error/xml_function_tags_failure.rs index 49180c00..bdff9936 100644 --- a/llama-cpp-bindings/src/error/xml_function_tags_failure.rs +++ b/llama-cpp-bindings/src/error/xml_function_tags_failure.rs @@ -1,4 +1,3 @@ -/// Failures specific to the XML function-tags parser (Qwen 3.5+ `val`). #[derive(Debug, thiserror::Error)] pub enum XmlFunctionTagsFailure { #[error("tool call function tag has empty name")] diff --git a/llama-cpp-bindings/src/ffi_error_reader.rs b/llama-cpp-bindings/src/ffi_error_reader.rs index 59313c27..77fa0359 100644 --- a/llama-cpp-bindings/src/ffi_error_reader.rs +++ b/llama-cpp-bindings/src/ffi_error_reader.rs @@ -1,7 +1,5 @@ use std::ffi::{CStr, c_char}; -/// Reads a C error string, converts to Rust `String`, and frees the C memory. 
-/// /// # Safety /// /// `error_ptr` must be either null or a valid pointer to a null-terminated diff --git a/llama-cpp-bindings/src/ffi_status_is_ok.rs b/llama-cpp-bindings/src/ffi_status_is_ok.rs index f847162a..7127c5c2 100644 --- a/llama-cpp-bindings/src/ffi_status_is_ok.rs +++ b/llama-cpp-bindings/src/ffi_status_is_ok.rs @@ -1,4 +1,3 @@ -/// Returns true if the given status indicates success. #[must_use] pub const fn status_is_ok(status: llama_cpp_bindings_sys::llama_rs_status) -> bool { status == llama_cpp_bindings_sys::LLAMA_RS_STATUS_OK diff --git a/llama-cpp-bindings/src/ffi_status_to_i32.rs b/llama-cpp-bindings/src/ffi_status_to_i32.rs index a181d57c..faf7e39d 100644 --- a/llama-cpp-bindings/src/ffi_status_to_i32.rs +++ b/llama-cpp-bindings/src/ffi_status_to_i32.rs @@ -1,4 +1,3 @@ -/// Converts a status code to its underlying `i32` representation. #[must_use] pub const fn status_to_i32(status: llama_cpp_bindings_sys::llama_rs_status) -> i32 { status diff --git a/llama-cpp-bindings/src/ggml_time_us.rs b/llama-cpp-bindings/src/ggml_time_us.rs index 4db4b490..4d9db374 100644 --- a/llama-cpp-bindings/src/ggml_time_us.rs +++ b/llama-cpp-bindings/src/ggml_time_us.rs @@ -1,20 +1,3 @@ -/// Get the time in microseconds according to ggml. -/// -/// ``` -/// # use std::time::Duration; -/// # use llama_cpp_bindings::llama_backend::LlamaBackend; -/// let backend = LlamaBackend::init().unwrap(); -/// use llama_cpp_bindings::ggml_time_us; -/// -/// let start = ggml_time_us(); -/// -/// std::thread::sleep(Duration::from_micros(10)); -/// -/// let end = ggml_time_us(); -/// -/// let elapsed = end - start; -/// -/// assert!(elapsed >= 10) #[must_use] pub fn ggml_time_us() -> i64 { unsafe { llama_cpp_bindings_sys::ggml_time_us() } diff --git a/llama-cpp-bindings/src/gguf_context.rs b/llama-cpp-bindings/src/gguf_context.rs index 7ef7114c..d51e2667 100644 --- a/llama-cpp-bindings/src/gguf_context.rs +++ b/llama-cpp-bindings/src/gguf_context.rs @@ -1,7 +1,3 @@ -//! 
Safe wrapper around `gguf_context` for reading GGUF file metadata. -//! -//! Provides metadata-only access to GGUF files without loading tensor data. - use std::ffi::{CStr, CString}; use std::path::Path; use std::ptr::NonNull; @@ -9,18 +5,12 @@ use std::ptr::NonNull; use crate::gguf_context_error::GgufContextError; use crate::gguf_type::GgufType; -/// A safe wrapper around `gguf_context`. -/// -/// Opens a GGUF file in metadata-only mode (`no_alloc = true`), allowing -/// inspection of key-value pairs and tensor metadata without loading tensor data. #[derive(Debug)] pub struct GgufContext { context: NonNull, } impl GgufContext { - /// Open a GGUF file and parse its metadata header. - /// /// # Errors /// /// Returns [`GgufContextError::InitFailed`] if the file cannot be opened or parsed. @@ -46,14 +36,11 @@ impl GgufContext { Ok(Self { context }) } - /// Returns the number of key-value pairs in the GGUF file. #[must_use] pub fn n_kv(&self) -> i64 { unsafe { llama_cpp_bindings_sys::gguf_get_n_kv(self.context.as_ptr()) } } - /// Find the index of a key by name. - /// /// # Errors /// /// Returns [`GgufContextError::KeyNotFound`] if the key does not exist. @@ -72,8 +59,6 @@ impl GgufContext { Ok(index) } - /// Returns the key name at the given index. - /// /// # Safety considerations /// /// The caller must ensure `key_id` is in range `[0, n_kv())`. @@ -92,8 +77,6 @@ impl GgufContext { Ok(c_str.to_str()?) } - /// Returns the value type of the key-value pair at the given index. - /// /// # Safety considerations /// /// The caller must ensure `key_id` is in range `[0, n_kv())`. @@ -105,8 +88,6 @@ impl GgufContext { GgufType::from_raw(raw) } - /// Returns the u32 value at the given key index. - /// /// # Safety considerations /// /// The caller must ensure the key at `key_id` has type [`GgufType::Uint32`]. 
@@ -115,8 +96,6 @@ impl GgufContext { unsafe { llama_cpp_bindings_sys::gguf_get_val_u32(self.context.as_ptr(), key_id) } } - /// Returns the i32 value at the given key index. - /// /// # Safety considerations /// /// The caller must ensure the key at `key_id` has type [`GgufType::Int32`]. @@ -125,8 +104,6 @@ impl GgufContext { unsafe { llama_cpp_bindings_sys::gguf_get_val_i32(self.context.as_ptr(), key_id) } } - /// Returns the u64 value at the given key index. - /// /// # Safety considerations /// /// The caller must ensure the key at `key_id` has type [`GgufType::Uint64`]. @@ -135,8 +112,6 @@ impl GgufContext { unsafe { llama_cpp_bindings_sys::gguf_get_val_u64(self.context.as_ptr(), key_id) } } - /// Returns the string value at the given key index. - /// /// # Safety considerations /// /// The caller must ensure the key at `key_id` has type [`GgufType::String`]. @@ -155,7 +130,6 @@ impl GgufContext { Ok(c_str.to_str()?) } - /// Returns the number of tensors in the GGUF file. #[must_use] pub fn n_tensors(&self) -> i64 { unsafe { llama_cpp_bindings_sys::gguf_get_n_tensors(self.context.as_ptr()) } diff --git a/llama-cpp-bindings/src/gguf_context_error.rs b/llama-cpp-bindings/src/gguf_context_error.rs index ba1aa0dc..69523c9d 100644 --- a/llama-cpp-bindings/src/gguf_context_error.rs +++ b/llama-cpp-bindings/src/gguf_context_error.rs @@ -1,31 +1,20 @@ -//! Error types for GGUF context operations. - use std::ffi::NulError; use std::path::PathBuf; -/// Errors that can occur when working with GGUF contexts. 
#[derive(Debug, thiserror::Error)] pub enum GgufContextError { - /// Failed to initialize GGUF context from file #[error("Failed to initialize GGUF context from file: {0}")] InitFailed(PathBuf), - /// Key not found in GGUF metadata #[error("Key not found in GGUF context: {key}")] - KeyNotFound { - /// The key that was not found - key: String, - }, + KeyNotFound { key: String }, - /// Null byte in string #[error("null byte in string: {0}")] NulError(#[from] NulError), - /// Path cannot be converted to UTF-8 #[error("failed to convert path {0} to str")] PathToStrError(PathBuf), - /// Value is not valid UTF-8 #[error("GGUF value is not valid UTF-8: {0}")] Utf8Error(#[from] std::str::Utf8Error), } diff --git a/llama-cpp-bindings/src/gguf_type.rs b/llama-cpp-bindings/src/gguf_type.rs index 33de25cd..e59451e1 100644 --- a/llama-cpp-bindings/src/gguf_type.rs +++ b/llama-cpp-bindings/src/gguf_type.rs @@ -1,39 +1,22 @@ -//! GGUF value types. - -/// The type of a value stored in a GGUF key-value pair. #[repr(u32)] #[derive(Copy, Clone, Debug, PartialEq, Eq)] pub enum GgufType { - /// 8-bit unsigned integer Uint8 = 0, - /// 8-bit signed integer Int8 = 1, - /// 16-bit unsigned integer Uint16 = 2, - /// 16-bit signed integer Int16 = 3, - /// 32-bit unsigned integer Uint32 = 4, - /// 32-bit signed integer Int32 = 5, - /// 32-bit floating point Float32 = 6, - /// Boolean Bool = 7, - /// String String = 8, - /// Array Array = 9, - /// 64-bit unsigned integer Uint64 = 10, - /// 64-bit signed integer Int64 = 11, - /// 64-bit floating point Float64 = 12, } impl GgufType { - /// Converts from the raw `gguf_type` value. Returns None for unknown types. #[must_use] pub const fn from_raw(value: llama_cpp_bindings_sys::gguf_type) -> Option { match value { @@ -54,7 +37,6 @@ impl GgufType { } } - /// Converts to the raw `gguf_type` value. 
#[must_use] pub const fn to_raw(self) -> llama_cpp_bindings_sys::gguf_type { self as llama_cpp_bindings_sys::gguf_type diff --git a/llama-cpp-bindings/src/ingest_outcome.rs b/llama-cpp-bindings/src/ingest_outcome.rs index abf3a44b..56bc0efb 100644 --- a/llama-cpp-bindings/src/ingest_outcome.rs +++ b/llama-cpp-bindings/src/ingest_outcome.rs @@ -3,12 +3,6 @@ use crate::sampled_token::SampledToken; #[derive(Clone, Debug)] pub struct IngestOutcome { pub sampled_token: SampledToken, - /// Empty when the token is part of a recognised marker boundary; otherwise - /// the decoded UTF-8 piece. Callers should stream `visible_piece` and skip - /// emission when it is empty. pub visible_piece: String, - /// Always the decoded UTF-8 piece, even for marker-boundary tokens. Useful - /// for accumulating the full raw model output (e.g. for downstream parser - /// cross-checks) without losing marker bytes. pub raw_piece: String, } diff --git a/llama-cpp-bindings/src/ingest_prompt_chunk.rs b/llama-cpp-bindings/src/ingest_prompt_chunk.rs index c17b0993..c83ff230 100644 --- a/llama-cpp-bindings/src/ingest_prompt_chunk.rs +++ b/llama-cpp-bindings/src/ingest_prompt_chunk.rs @@ -3,17 +3,6 @@ use crate::mtmd::MtmdInputChunkType; use crate::mtmd::MtmdInputChunkTypeError; use crate::sampled_token_classifier::SampledTokenClassifier; -/// Dispatches a single multimodal chunk into the classifier: -/// - Text chunks bump `prompt_tokens` and replay every text token through the -/// marker state machine, so prompt-end markers like `` reach the -/// classifier and the section transitions before generation begins. -/// - Image / Audio chunks bump only their own usage counters; they have no -/// text token IDs to replay. -/// -/// This is the single canonical per-chunk ingest path for the multimodal -/// driver. Any future per-chunk invariant (e.g. cached prefix replay) lives -/// here so it cannot diverge between consumers. 
-/// /// # Errors /// Returns [`MtmdInputChunkTypeError`] when the chunk reports a type unknown /// to this binding. Counters are not updated on error. diff --git a/llama-cpp-bindings/src/invalid_numa_strategy.rs b/llama-cpp-bindings/src/invalid_numa_strategy.rs index 2d00b029..9f80058f 100644 --- a/llama-cpp-bindings/src/invalid_numa_strategy.rs +++ b/llama-cpp-bindings/src/invalid_numa_strategy.rs @@ -1,6 +1,2 @@ -/// An invalid numa strategy was provided. #[derive(Debug, Eq, PartialEq, Copy, Clone)] -pub struct InvalidNumaStrategy( - /// The invalid numa strategy that was provided. - pub llama_cpp_bindings_sys::ggml_numa_strategy, -); +pub struct InvalidNumaStrategy(pub llama_cpp_bindings_sys::ggml_numa_strategy); diff --git a/llama-cpp-bindings/src/lib.rs b/llama-cpp-bindings/src/lib.rs index 9bed927b..9d3fc7e1 100644 --- a/llama-cpp-bindings/src/lib.rs +++ b/llama-cpp-bindings/src/lib.rs @@ -1,14 +1,3 @@ -//! Bindings to the llama.cpp library. -//! -//! As llama.cpp is a very fast moving target, this crate does not attempt to create a stable API -//! with all the rust idioms. Instead it provided safe wrappers around nearly direct bindings to -//! llama.cpp. This makes it easier to keep up with the changes in llama.cpp, but does mean that -//! the API is not as nice as it could be. -//! -//! # Feature Flags -//! -//! - `cuda` enables CUDA gpu support. - pub mod batch_add_error; pub mod chat_message_parse_outcome; pub mod context; diff --git a/llama-cpp-bindings/src/llama_backend.rs b/llama-cpp-bindings/src/llama_backend.rs index 30d83cf0..e6c8f4ee 100644 --- a/llama-cpp-bindings/src/llama_backend.rs +++ b/llama-cpp-bindings/src/llama_backend.rs @@ -1,22 +1,15 @@ -//! 
Representation of an initialized llama backend - use crate::LlamaCppError; use crate::llama_backend_numa_strategy::NumaStrategy; use llama_cpp_bindings_sys::ggml_log_level; use std::sync::atomic::AtomicBool; use std::sync::atomic::Ordering::SeqCst; -/// Representation of an initialized llama backend. -/// -/// This is required as a parameter for most llama functions as the backend must be initialized -/// before any llama functions are called. This type is proof of initialization. #[derive(Eq, PartialEq, Debug)] pub struct LlamaBackend {} static LLAMA_BACKEND_INITIALIZED: AtomicBool = AtomicBool::new(false); impl LlamaBackend { - /// Mark the llama backend as initialized fn mark_init() -> crate::Result<()> { match LLAMA_BACKEND_INITIALIZED.compare_exchange(false, true, SeqCst, SeqCst) { Ok(_was_uninitialized) => Ok(()), @@ -24,25 +17,6 @@ impl LlamaBackend { } } - /// Initialize the llama backend (without numa). - /// - /// # Examples - /// - /// ``` - ///# use llama_cpp_bindings::llama_backend::LlamaBackend; - ///# use llama_cpp_bindings::LlamaCppError; - ///# use std::error::Error; - /// - ///# fn main() -> Result<(), Box> { - /// - /// - /// let backend = LlamaBackend::init()?; - /// // the llama backend can only be initialized once - /// assert!(matches!(LlamaBackend::init(), Err(LlamaCppError::BackendAlreadyInitialized))); - /// - ///# Ok(()) - ///# } - /// ``` /// # Errors /// Returns an error if the backend was already initialized. pub fn init() -> crate::Result { @@ -51,19 +25,6 @@ impl LlamaBackend { Ok(Self {}) } - /// Initialize the llama backend (with numa). 
- /// ``` - ///# use llama_cpp_bindings::llama_backend::LlamaBackend; - ///# use std::error::Error; - ///# use llama_cpp_bindings::llama_backend_numa_strategy::NumaStrategy; - /// - ///# fn main() -> Result<(), Box> { - /// - /// let llama_backend = LlamaBackend::init_numa(NumaStrategy::Mirror)?; - /// - ///# Ok(()) - ///# } - /// ``` /// # Errors /// Returns an error if the backend was already initialized. pub fn init_numa(strategy: NumaStrategy) -> crate::Result { @@ -76,25 +37,21 @@ impl LlamaBackend { Ok(Self {}) } - /// Was the code built for a GPU backend & is a supported one available. #[must_use] pub fn supports_gpu_offload(&self) -> bool { unsafe { llama_cpp_bindings_sys::llama_supports_gpu_offload() } } - /// Does this platform support loading the model via mmap. #[must_use] pub fn supports_mmap(&self) -> bool { unsafe { llama_cpp_bindings_sys::llama_supports_mmap() } } - /// Does this platform support locking the model in RAM. #[must_use] pub fn supports_mlock(&self) -> bool { unsafe { llama_cpp_bindings_sys::llama_supports_mlock() } } - /// Change the output of llama.cpp's logging to be voided instead of pushed to `stderr`. pub fn void_logs(&mut self) { unsafe { llama_cpp_bindings_sys::llama_log_set(Some(void_log), std::ptr::null_mut()); @@ -109,21 +66,6 @@ const unsafe extern "C" fn void_log( ) { } -/// Drops the llama backend. 
-/// ``` -/// -///# use llama_cpp_bindings::llama_backend::LlamaBackend; -///# use std::error::Error; -/// -///# fn main() -> Result<(), Box> { -/// let backend = LlamaBackend::init()?; -/// drop(backend); -/// // can be initialized again after being dropped -/// let backend = LlamaBackend::init()?; -///# Ok(()) -///# } -/// -/// ``` impl Drop for LlamaBackend { fn drop(&mut self) { LLAMA_BACKEND_INITIALIZED.store(false, SeqCst); diff --git a/llama-cpp-bindings/src/llama_backend_device.rs b/llama-cpp-bindings/src/llama_backend_device.rs index b5851efb..aa7ce51f 100644 --- a/llama-cpp-bindings/src/llama_backend_device.rs +++ b/llama-cpp-bindings/src/llama_backend_device.rs @@ -4,26 +4,14 @@ use crate::llama_backend_device_type::device_type_from_raw; pub use crate::llama_backend_device_type::LlamaBackendDeviceType; -/// A ggml backend device -/// -/// The index is can be used from `LlamaModelParams::with_devices` to select specific devices. #[derive(Debug, Clone)] pub struct LlamaBackendDevice { - /// The index of the device - /// - /// The index is can be used from `LlamaModelParams::with_devices` to select specific devices. pub index: usize, - /// The name of the device (e.g. "Vulkan0") pub name: String, - /// A description of the device (e.g. "NVIDIA `GeForce` RTX 3080") pub description: String, - /// The backend of the device (e.g. 
"Vulkan", "CUDA", "CPU") pub backend: String, - /// Total memory of the device in bytes pub memory_total: usize, - /// Free memory of the device in bytes pub memory_free: usize, - /// Device type pub device_type: LlamaBackendDeviceType, } @@ -37,7 +25,6 @@ fn cstr_to_string(ptr: *const c_char) -> String { } } -/// List ggml backend devices #[must_use] pub fn list_llama_ggml_backend_devices() -> Vec { let mut devices = Vec::new(); diff --git a/llama-cpp-bindings/src/llama_backend_device_type.rs b/llama-cpp-bindings/src/llama_backend_device_type.rs index fd22c8fd..5f1885cd 100644 --- a/llama-cpp-bindings/src/llama_backend_device_type.rs +++ b/llama-cpp-bindings/src/llama_backend_device_type.rs @@ -1,15 +1,9 @@ -/// Backend device type #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum LlamaBackendDeviceType { - /// CPU device Cpu, - /// ACCEL device Accelerator, - /// GPU device Gpu, - /// iGPU device IntegratedGpu, - /// Unknown device type Unknown, } diff --git a/llama-cpp-bindings/src/llama_backend_numa_strategy.rs b/llama-cpp-bindings/src/llama_backend_numa_strategy.rs index 2be150fa..fccef05a 100644 --- a/llama-cpp-bindings/src/llama_backend_numa_strategy.rs +++ b/llama-cpp-bindings/src/llama_backend_numa_strategy.rs @@ -1,17 +1,11 @@ use crate::invalid_numa_strategy::InvalidNumaStrategy; -/// NUMA (Non-Uniform Memory Access) thread affinity strategy for llama.cpp. #[derive(Debug, Eq, PartialEq, Copy, Clone)] pub enum NumaStrategy { - /// NUMA-aware scheduling is disabled. Threads are not pinned to specific nodes. Disabled, - /// Distributes threads across NUMA nodes in a round-robin fashion. Distribute, - /// Pins all threads to the current NUMA node to avoid cross-node memory access. Isolate, - /// Respects the CPU affinity mask set externally by the `numactl` command. Numactl, - /// Mirrors memory across NUMA nodes. Currently a no-op in llama.cpp. 
Mirror, } diff --git a/llama-cpp-bindings/src/llama_batch.rs b/llama-cpp-bindings/src/llama_batch.rs index b6b8b189..cc6e93ee 100644 --- a/llama-cpp-bindings/src/llama_batch.rs +++ b/llama-cpp-bindings/src/llama_batch.rs @@ -1,5 +1,3 @@ -//! Safe wrapper around `llama_batch`. - use crate::batch_add_error::BatchAddError; use crate::sampled_token::SampledToken; use crate::token::LlamaToken; @@ -53,32 +51,20 @@ fn checked_usize_as_llama_pos(value: usize, description: &str) -> Result { - /// The number of tokens the batch was allocated with. they are safe to write to - but not necessarily read from as they are not necessarily initialized allocated: usize, - /// The logits that are initialized. Used by [`LlamaContext`] to ensure that only initialized logits are accessed. pub initialized_logits: Vec, - /// The underlying `llama_batch` from the C API. pub llama_batch: llama_batch, phantom: PhantomData<&'tokens [LlamaToken]>, } impl<'tokens> LlamaBatch<'tokens> { - /// Clear the batch. This does not free the memory associated with the batch, but it does reset - /// the number of tokens to 0. pub fn clear(&mut self) { self.llama_batch.n_tokens = 0; self.initialized_logits.clear(); } - /// add a token to the batch for sequences `seq_ids` at position `pos`. If `logits` is true, the - /// token will be initialized and can be read from after the next decode. - /// /// # Errors /// /// Returns an error if there is insufficient space in the buffer or if integer conversions fail. @@ -126,11 +112,6 @@ impl<'tokens> LlamaBatch<'tokens> { Ok(()) } - /// Add a sequence of tokens to the batch for the given sequence id. If `logits_all` is true, the - /// tokens will be initialized and can be read from after the next decode. - /// - /// Either way the last token in the sequence will have its logits set to `true`. - /// /// # Errors /// /// Returns an error if there is insufficient space in the buffer or if integer conversions fail. 
@@ -154,13 +135,6 @@ impl<'tokens> LlamaBatch<'tokens> { Ok(()) } - /// Create a new `LlamaBatch` that can contain up to `n_tokens` tokens. - /// - /// # Arguments - /// - /// - `n_tokens`: the maximum number of tokens that can be added to the batch - /// - `n_seq_max`: the maximum number of sequences that can be added to the batch (generally 1 unless you know what you are doing) - /// /// # Errors /// /// Returns an error if `n_tokens` exceeds `i32::MAX`. @@ -176,11 +150,6 @@ impl<'tokens> LlamaBatch<'tokens> { }) } - /// ``llama_batch_get_one`` - /// Return batch for single sequence of tokens - /// - /// NOTE: this is a helper function to facilitate transition to the new batch API - /// /// # Errors /// /// Returns an error if the provided token buffer is empty or if integer conversions fail. @@ -210,7 +179,6 @@ impl<'tokens> LlamaBatch<'tokens> { }) } - /// Returns the number of tokens in the batch. #[must_use] pub const fn n_tokens(&self) -> i32 { self.llama_batch.n_tokens @@ -218,17 +186,6 @@ impl<'tokens> LlamaBatch<'tokens> { } impl Drop for LlamaBatch<'_> { - /// Drops the `LlamaBatch`. - /// - /// ``` - /// # use llama_cpp_bindings::llama_batch::LlamaBatch; - /// # use std::error::Error; - /// # fn main() -> Result<(), Box> { - /// let batch = LlamaBatch::new(512, 1)?; - /// // frees the memory associated with the batch. (allocated by llama.cpp) - /// drop(batch); - /// # Ok(()) - /// # } fn drop(&mut self) { unsafe { if self.allocated > 0 { diff --git a/llama-cpp-bindings/src/llama_time_us.rs b/llama-cpp-bindings/src/llama_time_us.rs index ee1c707e..63d43ad8 100644 --- a/llama-cpp-bindings/src/llama_time_us.rs +++ b/llama-cpp-bindings/src/llama_time_us.rs @@ -1,12 +1,3 @@ -/// Get the time (in microseconds) according to llama.cpp. 
-/// -/// ``` -/// # use llama_cpp_bindings::llama_time_us; -/// # use llama_cpp_bindings::llama_backend::LlamaBackend; -/// let backend = LlamaBackend::init().unwrap(); -/// let time = llama_time_us(); -/// assert!(time > 0); -/// ``` #[must_use] pub fn llama_time_us() -> i64 { unsafe { llama_cpp_bindings_sys::llama_time_us() } diff --git a/llama-cpp-bindings/src/llama_token_attr.rs b/llama-cpp-bindings/src/llama_token_attr.rs index fb9de83c..9af9fb98 100644 --- a/llama-cpp-bindings/src/llama_token_attr.rs +++ b/llama-cpp-bindings/src/llama_token_attr.rs @@ -1,28 +1,17 @@ use enumflags2::bitflags; -/// A rust flavored equivalent of `llama_token_type`. #[derive(Eq, PartialEq, Debug, Clone, Copy)] #[bitflags] #[repr(u32)] pub enum LlamaTokenAttr { - /// Unknown token attribute. Unknown = llama_cpp_bindings_sys::LLAMA_TOKEN_ATTR_UNKNOWN as _, - /// Unused token attribute. Unused = llama_cpp_bindings_sys::LLAMA_TOKEN_ATTR_UNUSED as _, - /// Normal text token. Normal = llama_cpp_bindings_sys::LLAMA_TOKEN_ATTR_NORMAL as _, - /// Control token (e.g. BOS, EOS). Control = llama_cpp_bindings_sys::LLAMA_TOKEN_ATTR_CONTROL as _, - /// User-defined token. UserDefined = llama_cpp_bindings_sys::LLAMA_TOKEN_ATTR_USER_DEFINED as _, - /// Byte-level fallback token. Byte = llama_cpp_bindings_sys::LLAMA_TOKEN_ATTR_BYTE as _, - /// Token with normalized text. Normalized = llama_cpp_bindings_sys::LLAMA_TOKEN_ATTR_NORMALIZED as _, - /// Token with left-stripped whitespace. LStrip = llama_cpp_bindings_sys::LLAMA_TOKEN_ATTR_LSTRIP as _, - /// Token with right-stripped whitespace. RStrip = llama_cpp_bindings_sys::LLAMA_TOKEN_ATTR_RSTRIP as _, - /// Token representing a single word. 
SingleWord = llama_cpp_bindings_sys::LLAMA_TOKEN_ATTR_SINGLE_WORD as _, } diff --git a/llama-cpp-bindings/src/llama_token_attrs.rs b/llama-cpp-bindings/src/llama_token_attrs.rs index 872aeb4e..d5ecd6de 100644 --- a/llama-cpp-bindings/src/llama_token_attrs.rs +++ b/llama-cpp-bindings/src/llama_token_attrs.rs @@ -15,7 +15,6 @@ const fn llama_token_type_to_u32(value: llama_cpp_bindings_sys::llama_token_type value } -/// A set of [`LlamaTokenAttr`] flags. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct LlamaTokenAttrs(pub BitFlags); diff --git a/llama-cpp-bindings/src/llama_token_attrs_from_int_error.rs b/llama-cpp-bindings/src/llama_token_attrs_from_int_error.rs index df1ad6c2..f294339d 100644 --- a/llama-cpp-bindings/src/llama_token_attrs_from_int_error.rs +++ b/llama-cpp-bindings/src/llama_token_attrs_from_int_error.rs @@ -1,9 +1,5 @@ -/// Returned by [`crate::llama_token_attrs::LlamaTokenAttrs::try_from`] when the -/// integer bit pattern contains bits not defined by -/// [`crate::llama_token_attr::LlamaTokenAttr`]. #[derive(thiserror::Error, Debug, Eq, PartialEq)] pub enum LlamaTokenAttrsFromIntError { - /// The value is not a valid `llama_token_type`. #[error("Unknown Value {0}")] UnknownValue(std::ffi::c_uint), } diff --git a/llama-cpp-bindings/src/llguidance_sampler.rs b/llama-cpp-bindings/src/llguidance_sampler.rs index 76a987a9..c57dfe55 100644 --- a/llama-cpp-bindings/src/llguidance_sampler.rs +++ b/llama-cpp-bindings/src/llguidance_sampler.rs @@ -1,8 +1,3 @@ -//! Pure Rust llguidance sampler for constrained decoding. -//! -//! Implements a custom `llama_sampler` using the `llguidance` and `toktrie` Rust crates -//! to enforce grammar constraints (JSON schema, regex, Lark, etc.) during token sampling. - use std::ffi::c_void; use std::sync::Arc; @@ -13,7 +8,6 @@ use crate::GrammarError; use crate::model::LlamaModel; use crate::sampling::LlamaSampler; -/// Internal state for the llguidance sampler. 
struct LlgContext { matcher: Matcher, tok_env: Arc, @@ -113,8 +107,6 @@ static mut LLG_SAMPLER_I: llama_cpp_bindings_sys::llama_sampler_i = backend_set_input: None, }; -/// Create an llguidance-based constrained decoding sampler. -/// /// # Errors /// /// Returns `GrammarError` if the parser factory, grammar, or parser cannot be created. diff --git a/llama-cpp-bindings/src/load_backends.rs b/llama-cpp-bindings/src/load_backends.rs index 22393a5a..54634f1a 100644 --- a/llama-cpp-bindings/src/load_backends.rs +++ b/llama-cpp-bindings/src/load_backends.rs @@ -3,17 +3,8 @@ use std::path::Path; use crate::load_backends_error::LoadBackendsError; use crate::load_backends_from_path::load_backends_from_path; -/// Compile-time path to the built GGML backend modules directory. -/// -/// Populated by `llama-cpp-bindings/build.rs` from the `DEP_LLAMA_BACKENDS_DIR` cargo metadata -/// emitted by `llama-cpp-bindings-sys` when built with the `dynamic-backends` feature. `None` -/// when the metadata is missing (e.g. when this crate is built outside the cargo workspace). pub const BACKENDS_DIR: Option<&str> = option_env!("GGML_BACKENDS_DIR"); -/// Load GGML backend modules from the compile-time default directory ([`BACKENDS_DIR`]). -/// -/// This is a no-op when `BACKENDS_DIR` is `None`. -/// /// # Errors /// /// Returns [`LoadBackendsError::PathNotUtf8`] when `BACKENDS_DIR` cannot be converted to UTF-8 diff --git a/llama-cpp-bindings/src/load_backends_error.rs b/llama-cpp-bindings/src/load_backends_error.rs index b3d628c3..3f7c5e6a 100644 --- a/llama-cpp-bindings/src/load_backends_error.rs +++ b/llama-cpp-bindings/src/load_backends_error.rs @@ -1,13 +1,10 @@ use std::ffi::NulError; use std::path::PathBuf; -/// Error returned when loading GGML backend modules from a path. #[derive(Debug, thiserror::Error)] pub enum LoadBackendsError { - /// The provided path could not be converted to UTF-8. 
#[error("backend directory path is not valid UTF-8: {0}")] PathNotUtf8(PathBuf), - /// The provided path contained an interior null byte. #[error("backend directory path contains a null byte: {0}")] PathNullByte(#[from] NulError), } diff --git a/llama-cpp-bindings/src/load_backends_from_path.rs b/llama-cpp-bindings/src/load_backends_from_path.rs index 7af9cce4..c2434e99 100644 --- a/llama-cpp-bindings/src/load_backends_from_path.rs +++ b/llama-cpp-bindings/src/load_backends_from_path.rs @@ -3,11 +3,6 @@ use std::path::Path; use crate::load_backends_error::LoadBackendsError; -/// Load GGML backend modules from the given directory. -/// -/// Call this before [`crate::llama_backend::LlamaBackend::init`] to enable runtime hardware -/// selection (Vulkan, CPU-AVX512, CPU-AVX2, etc.) when built with the `dynamic-backends` feature. -/// /// # Errors /// /// Returns [`LoadBackendsError::PathNotUtf8`] when `path` cannot be converted to UTF-8 and diff --git a/llama-cpp-bindings/src/log_options.rs b/llama-cpp-bindings/src/log_options.rs index ca6eacca..6192d0ff 100644 --- a/llama-cpp-bindings/src/log_options.rs +++ b/llama-cpp-bindings/src/log_options.rs @@ -1,4 +1,3 @@ -/// Options to configure how llama.cpp logs are intercepted. #[derive(Default, Debug, Clone)] pub struct LogOptions { pub disabled: bool, @@ -6,8 +5,6 @@ pub struct LogOptions { } impl LogOptions { - /// If enabled, logs are dispatched through the `log` crate. If disabled, all logs are - /// suppressed. Default is for logs to be dispatched. #[must_use] pub const fn with_logs_enabled(mut self, enabled: bool) -> Self { self.disabled = !enabled; @@ -15,10 +12,6 @@ impl LogOptions { self } - /// When enabled, llama.cpp and ggml INFO logs are dispatched at DEBUG level. WARN and - /// ERROR logs retain their original severity. This suppresses verbose informational output - /// under a typical INFO-level logger while keeping important diagnostics visible. - /// All demoted logs remain available via `RUST_LOG=debug`. 
#[must_use] pub const fn with_demote_info_to_debug(mut self, demote: bool) -> Self { self.demote_info_to_debug = demote; diff --git a/llama-cpp-bindings/src/max_devices.rs b/llama-cpp-bindings/src/max_devices.rs index e5d12b4a..014eeaee 100644 --- a/llama-cpp-bindings/src/max_devices.rs +++ b/llama-cpp-bindings/src/max_devices.rs @@ -1,10 +1,3 @@ -/// Get the max number of devices according to llama.cpp (this is generally cuda devices). -/// -/// ``` -/// # use llama_cpp_bindings::max_devices; -/// let max_devices = max_devices(); -/// assert!(max_devices >= 0); -/// ``` #[must_use] pub fn max_devices() -> usize { unsafe { llama_cpp_bindings_sys::llama_max_devices() } diff --git a/llama-cpp-bindings/src/mlock_supported.rs b/llama-cpp-bindings/src/mlock_supported.rs index 2899a2a2..96bf1728 100644 --- a/llama-cpp-bindings/src/mlock_supported.rs +++ b/llama-cpp-bindings/src/mlock_supported.rs @@ -1,12 +1,3 @@ -/// Is memory locking supported according to llama.cpp. -/// -/// ``` -/// # use llama_cpp_bindings::mlock_supported; -/// let mlock_supported = mlock_supported(); -/// if mlock_supported { -/// println!("mlock_supported!"); -/// } -/// ``` #[must_use] pub fn mlock_supported() -> bool { unsafe { llama_cpp_bindings_sys::llama_supports_mlock() } diff --git a/llama-cpp-bindings/src/mmap_supported.rs b/llama-cpp-bindings/src/mmap_supported.rs index b00d62c8..47ccbfe7 100644 --- a/llama-cpp-bindings/src/mmap_supported.rs +++ b/llama-cpp-bindings/src/mmap_supported.rs @@ -1,12 +1,3 @@ -/// Is memory mapping supported according to llama.cpp. 
-/// -/// ``` -/// # use llama_cpp_bindings::mmap_supported; -/// let mmap_supported = mmap_supported(); -/// if mmap_supported { -/// println!("mmap_supported!"); -/// } -/// ``` #[must_use] pub fn mmap_supported() -> bool { unsafe { llama_cpp_bindings_sys::llama_supports_mmap() } diff --git a/llama-cpp-bindings/src/model.rs b/llama-cpp-bindings/src/model.rs index d55ee679..8c33486d 100644 --- a/llama-cpp-bindings/src/model.rs +++ b/llama-cpp-bindings/src/model.rs @@ -1,5 +1,3 @@ -//! A safe wrapper around `llama_model`. - pub mod add_bos; pub mod llama_chat_message; pub mod llama_chat_template; @@ -78,9 +76,7 @@ fn cstring_with_validated_len(str: &str) -> Result<(CString, c_int), StringToTok Ok((c_string, len)) } -/// A safe wrapper around `llama_model`. pub struct LlamaModel { - /// Raw pointer to the underlying `llama_model`. pub model: NonNull, tok_env: OnceLock>, } @@ -98,14 +94,11 @@ unsafe impl Send for LlamaModel {} unsafe impl Sync for LlamaModel {} impl LlamaModel { - /// Returns a raw pointer to the model's vocabulary. #[must_use] pub fn vocab_ptr(&self) -> *const llama_cpp_bindings_sys::llama_vocab { unsafe { llama_cpp_bindings_sys::llama_model_get_vocab(self.model.as_ptr()) } } - /// Get the number of tokens the model was trained on. - /// /// # Errors /// /// Returns an error if the value returned by llama.cpp does not fit into a `u32`. @@ -115,7 +108,6 @@ impl LlamaModel { u32::try_from(n_ctx_train) } - /// Get all tokens in the model. pub fn tokens( &self, decode_special: bool, @@ -136,28 +128,24 @@ impl LlamaModel { }) } - /// Get the beginning of stream token. #[must_use] pub fn token_bos(&self) -> LlamaToken { let token = unsafe { llama_cpp_bindings_sys::llama_token_bos(self.vocab_ptr()) }; LlamaToken(token) } - /// Get the end of stream token. #[must_use] pub fn token_eos(&self) -> LlamaToken { let token = unsafe { llama_cpp_bindings_sys::llama_token_eos(self.vocab_ptr()) }; LlamaToken(token) } - /// Get the newline token. 
#[must_use] pub fn token_nl(&self) -> LlamaToken { let token = unsafe { llama_cpp_bindings_sys::llama_token_nl(self.vocab_ptr()) }; LlamaToken(token) } - /// Check if a token represents the end of generation (end of turn, end of sequence, etc.) #[must_use] pub fn is_eog_token(&self, token: &SampledToken) -> bool { let (SampledToken::Content(LlamaToken(id)) @@ -168,7 +156,6 @@ impl LlamaModel { unsafe { llama_cpp_bindings_sys::llama_token_is_eog(self.vocab_ptr(), id) } } - /// Get the decoder start token. #[must_use] pub fn decode_start_token(&self) -> LlamaToken { let token = @@ -176,15 +163,12 @@ impl LlamaModel { LlamaToken(token) } - /// Get the separator token (SEP). #[must_use] pub fn token_sep(&self) -> LlamaToken { let token = unsafe { llama_cpp_bindings_sys::llama_vocab_sep(self.vocab_ptr()) }; LlamaToken(token) } - /// Convert a string to a Vector of tokens. - /// /// # Errors /// /// - if [`str`] contains a null byte @@ -194,14 +178,6 @@ impl LlamaModel { /// ```no_run /// use llama_cpp_bindings::model::LlamaModel; /// - /// # fn main() -> Result<(), Box> { - /// use std::path::Path; - /// use llama_cpp_bindings::model::AddBos; - /// let backend = llama_cpp_bindings::llama_backend::LlamaBackend::init()?; - /// let model = LlamaModel::load_from_file(&backend, Path::new("path/to/model"), &Default::default())?; - /// let tokens = model.str_to_token("Hello, World!", AddBos::Always)?; - /// # Ok(()) - /// # } pub fn str_to_token( &self, str: &str, @@ -253,8 +229,6 @@ impl LlamaModel { Ok(buffer) } - /// Get the type of a token. - /// /// # Errors /// /// Returns an error if the token type is not known to this library. @@ -268,16 +242,6 @@ impl LlamaModel { LlamaTokenAttrs::try_from(token_type) } - /// Convert a token to a string using the underlying llama.cpp `llama_token_to_piece` function. 
- /// - /// This is the new default function for token decoding and provides direct access to - /// the llama.cpp token decoding functionality without any special logic or filtering. - /// - /// Decoding raw string requires using an decoder, tokens from language models may not always map - /// to full characters depending on the encoding so stateful decoding is required, otherwise partial strings may be lost! - /// Invalid characters are mapped to REPLACEMENT CHARACTER making the method safe to use even if the model inherently produces - /// garbage. - /// /// # Errors /// /// - if the token type is unknown @@ -310,12 +274,6 @@ impl LlamaModel { Ok(output_piece) } - /// Raw token decoding to bytes, use if you want to handle the decoding model output yourself - /// - /// Convert a token to bytes using the underlying llama.cpp `llama_token_to_piece` function. This is mostly - /// a thin wrapper around `llama_token_to_piece` function, that handles rust <-> c type conversions while - /// letting the caller handle errors. For a safer interface returning rust strings directly use `token_to_piece` instead! - /// /// # Errors /// /// - if the token type is unknown @@ -356,17 +314,11 @@ impl LlamaModel { } } - /// The number of tokens the model was trained on. - /// - /// This returns a `c_int` for maximum compatibility. Most of the time it can be cast to an i32 - /// without issue. #[must_use] pub fn n_vocab(&self) -> i32 { unsafe { llama_cpp_bindings_sys::llama_n_vocab(self.vocab_ptr()) } } - /// The type of vocab the model was trained on. - /// /// # Errors /// /// Returns an error if llama.cpp emits a vocab type that is not known to this library. @@ -376,33 +328,26 @@ impl LlamaModel { VocabType::try_from(vocab_type) } - /// This returns a `c_int` for maximum compatibility. Most of the time it can be cast to an i32 - /// without issue. 
#[must_use] pub fn n_embd(&self) -> c_int { unsafe { llama_cpp_bindings_sys::llama_n_embd(self.model.as_ptr()) } } - /// Returns the total size of all the tensors in the model in bytes. #[must_use] pub fn size(&self) -> u64 { unsafe { llama_cpp_bindings_sys::llama_model_size(self.model.as_ptr()) } } - /// Returns the number of parameters in the model. #[must_use] pub fn n_params(&self) -> u64 { unsafe { llama_cpp_bindings_sys::llama_model_n_params(self.model.as_ptr()) } } - /// Returns whether the model is a recurrent network (Mamba, RWKV, etc) #[must_use] pub fn is_recurrent(&self) -> bool { unsafe { llama_cpp_bindings_sys::llama_model_is_recurrent(self.model.as_ptr()) } } - /// Returns the number of layers within the model. - /// /// # Errors /// /// Returns an error if the layer count returned by llama.cpp does not fit into a `u32`. @@ -410,8 +355,6 @@ impl LlamaModel { u32::try_from(unsafe { llama_cpp_bindings_sys::llama_model_n_layer(self.model.as_ptr()) }) } - /// Returns the number of attention heads within the model. - /// /// # Errors /// /// Returns an error if the head count returned by llama.cpp does not fit into a `u32`. @@ -419,8 +362,6 @@ impl LlamaModel { u32::try_from(unsafe { llama_cpp_bindings_sys::llama_model_n_head(self.model.as_ptr()) }) } - /// Returns the number of KV attention heads. - /// /// # Errors /// /// Returns an error if the KV head count returned by llama.cpp does not fit into a `u32`. @@ -428,16 +369,11 @@ impl LlamaModel { u32::try_from(unsafe { llama_cpp_bindings_sys::llama_model_n_head_kv(self.model.as_ptr()) }) } - /// Returns whether the model is a hybrid network (Jamba, Granite, Qwen3xx, etc.) - /// - /// Hybrid models have both attention layers and recurrent/SSM layers. 
#[must_use] pub fn is_hybrid(&self) -> bool { unsafe { llama_cpp_bindings_sys::llama_model_is_hybrid(self.model.as_ptr()) } } - /// Get metadata value as a string by key name - /// /// # Errors /// Returns an error if the key is not found or the value is not valid UTF-8. pub fn meta_val_str(&self, key: &str) -> Result { @@ -457,14 +393,11 @@ impl LlamaModel { ) } - /// Get the number of metadata key/value pairs #[must_use] pub fn meta_count(&self) -> i32 { unsafe { llama_cpp_bindings_sys::llama_model_meta_count(self.model.as_ptr()) } } - /// Get metadata key name by index - /// /// # Errors /// Returns an error if the index is out of range or the key is not valid UTF-8. pub fn meta_key_by_index(&self, index: i32) -> Result { @@ -481,8 +414,6 @@ impl LlamaModel { ) } - /// Get metadata value as a string by index - /// /// # Errors /// Returns an error if the index is out of range or the value is not valid UTF-8. pub fn meta_val_str_by_index(&self, index: i32) -> Result { @@ -499,7 +430,6 @@ impl LlamaModel { ) } - /// Returns the rope type of the model. #[must_use] pub fn rope_type(&self) -> Option { let raw = unsafe { llama_cpp_bindings_sys::llama_model_rope_type(self.model.as_ptr()) }; @@ -507,15 +437,6 @@ impl LlamaModel { rope_type::rope_type_from_raw(raw) } - /// Get chat template from model by name. If the name parameter is None, the default chat template will be returned. - /// - /// You supply this into [`Self::apply_chat_template`] to get back a string with the appropriate template - /// substitution applied to convert a list of messages into a prompt the LLM can use to complete - /// the chat. - /// - /// You could also use an external jinja parser, like [minijinja](https://github.com/mitsuhiko/minijinja), - /// to parse jinja templates not supported by the llama.cpp template engine. - /// /// # Errors /// /// * If the model has no chat template by that name @@ -546,8 +467,6 @@ impl LlamaModel { } } - /// Loads a model from a file. 
- /// /// # Errors /// /// See [`LlamaModelLoadError`] for more information. @@ -610,8 +529,6 @@ impl LlamaModel { } } - /// Initializes a lora adapter from a file. - /// /// # Errors /// /// See [`LlamaLoraAdapterInitError`] for more information. @@ -643,21 +560,6 @@ impl LlamaModel { }) } - /// Apply the models chat template to some messages. - /// See - /// - /// Unlike the llama.cpp `apply_chat_template` which just randomly uses the `ChatML` template when given - /// a null pointer for the template, this requires an explicit template to be specified. If you want to - /// use "chatml", then just do `LlamaChatTemplate::new("chatml")` or any other model name or template - /// string. - /// - /// Use [`Self::chat_template`] to retrieve the template baked into the model (this is the preferred - /// mechanism as using the wrong chat template can result in really unexpected responses from the LLM). - /// - /// You probably want to set `add_ass` to true so that the generated template string ends with a the - /// opening tag of the assistant. If you fail to leave a hanging chat tag, the model will likely generate - /// one into the output and the output may also have unexpected output aside from that. - /// /// # Errors /// There are many ways this can fail. See [`ApplyChatTemplateError`] for more information. pub fn apply_chat_template( @@ -720,17 +622,6 @@ impl LlamaModel { truncated_buffer_to_string(buff, final_size) } - /// Build a streaming [`SampledTokenClassifier`] for this model. - /// - /// At construction the bindings detect reasoning markers (via the - /// autoparser, with a chunked-thinking fallback for templates that consume - /// thoughts via content blocks), tool-call markers, and the trailing - /// generation-prompt slice. The classifier then runs a state machine over - /// the decoded token stream — no per-model branches. 
- /// - /// If the model has no usable chat template the classifier is built in a - /// blind mode that classifies every token as - /// [`SampledToken::Undeterminable`]. pub fn sampled_token_classifier(&self) -> SampledTokenClassifier<'_> { let markers = match self.streaming_markers() { Ok(markers) => markers, @@ -745,12 +636,6 @@ impl LlamaModel { SampledTokenClassifier::new(self, markers) } - /// Detect reasoning / tool-call markers (as token-ID sequences) and the - /// trailing generation-prompt slice for this model's chat template. The - /// returned `StreamingMarkers` carry tokenised markers — never raw strings - /// — so the classifier matches by `LlamaToken` equality rather than text - /// scanning. - /// /// # Errors /// Returns [`MarkerDetectionError`] when any underlying FFI call fails. pub fn streaming_markers(&self) -> Result { @@ -781,9 +666,6 @@ impl LlamaModel { }) } - /// When the autoparser-driven FFI returned no tool-call markers, consult the - /// per-template override registry so wrapper-known templates (Gemma 4, - /// Mistral 3, ...) still drive the classifier. fn resolve_tool_call_marker_strings( &self, autoparser_open: Option, @@ -828,11 +710,6 @@ impl LlamaModel { } } - /// Returns the rich tool-call marker bundle (open / separator / close / - /// optional value-quote pair) for this model's chat template, sourced from - /// the wrapper's per-template override registry. Returns `None` when no - /// registered override matches — callers in that case fall back to - /// llama.cpp's autoparser via [`Self::parse_chat_message`]. #[must_use] pub fn tool_call_markers(&self) -> Option { let template = match self.chat_template(None) { @@ -873,27 +750,6 @@ impl LlamaModel { } } - /// Parse the assistant's output text into structured content, reasoning, - /// and tool calls. - /// - /// Two passes, in order: - /// 1. Duck-type the wrapper-side parsers across every known shape - /// (Qwen XML, GLM key-value, Gemma paired-quote, Mistral bracketed-JSON). 
- /// First match wins. The shapes are ordered so that more restrictive - /// shapes run first, which keeps the duck-type pass safe for inputs - /// that share an open marker but differ in inner structure. - /// 2. Delegate to llama.cpp's `common_chat_parse`. If it succeeds the - /// result is `Recognized`; if it throws `ParseException` the result is - /// `Unrecognized` with the raw input plus the FFI's diagnostic, so the - /// caller can pass the unstructured tokens to the client. - /// - /// Empty tool-call `id` fields are filled with `call_{index}` before - /// returning, so callers always see well-formed identifiers. - /// - /// `tools_json` is a JSON-array string of OpenAI-style tool definitions - /// (use `"[]"` when no tools are in scope). `is_partial` switches between - /// mid-stream (lenient) and final (strict) parses for the FFI step. - /// /// # Errors /// /// Returns [`ParseChatMessageError`] when `tools_json` is not valid JSON, @@ -1029,11 +885,6 @@ impl LlamaModel { } } - /// Render the model's chat template with the autoparser's synthetic - /// no-tools and with-tools inputs. Returns `(output_no_tools, - /// output_with_tools)`. Either side can be empty when the template throws - /// during rendering. Useful for debugging tool-call marker detection. - /// /// # Errors /// /// Returns [`MarkerDetectionError`] when the C++ analyzer throws or the FFI @@ -1049,10 +900,6 @@ impl LlamaModel { } impl LlamaModel { - /// Returns a process-cached, approximate token environment built from this model's vocabulary. - /// - /// The first call iterates the full vocabulary and constructs the trie; subsequent calls - /// return the cached `Arc` without further FFI work. 
pub fn approximate_tok_env(&self) -> Arc { Arc::clone(self.tok_env.get_or_init(|| build_approximate_tok_env(self))) } diff --git a/llama-cpp-bindings/src/model/add_bos.rs b/llama-cpp-bindings/src/model/add_bos.rs index 1d38814a..ab257829 100644 --- a/llama-cpp-bindings/src/model/add_bos.rs +++ b/llama-cpp-bindings/src/model/add_bos.rs @@ -1,8 +1,5 @@ -/// How to determine if we should prepend a bos token to tokens #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum AddBos { - /// Add the beginning of stream token to the start of the string. Always, - /// Do not add the beginning of stream token to the start of the string. Never, } diff --git a/llama-cpp-bindings/src/model/llama_chat_message.rs b/llama-cpp-bindings/src/model/llama_chat_message.rs index 7920f750..51e1f086 100644 --- a/llama-cpp-bindings/src/model/llama_chat_message.rs +++ b/llama-cpp-bindings/src/model/llama_chat_message.rs @@ -2,7 +2,6 @@ use std::ffi::CString; use crate::NewLlamaChatMessageError; -/// A Safe wrapper around `llama_chat_message` #[derive(Debug, Eq, PartialEq, Clone)] pub struct LlamaChatMessage { pub role: CString, @@ -10,8 +9,6 @@ pub struct LlamaChatMessage { } impl LlamaChatMessage { - /// Create a new `LlamaChatMessage` - /// /// # Errors /// If either of ``role`` or ``content`` contain null bytes. pub fn new(role: String, content: String) -> Result { diff --git a/llama-cpp-bindings/src/model/llama_chat_template.rs b/llama-cpp-bindings/src/model/llama_chat_template.rs index 54e4118a..3e8f86d0 100644 --- a/llama-cpp-bindings/src/model/llama_chat_template.rs +++ b/llama-cpp-bindings/src/model/llama_chat_template.rs @@ -1,40 +1,27 @@ use std::ffi::{CStr, CString}; use std::str::Utf8Error; -/// A performance-friendly wrapper around [`super::LlamaModel::chat_template`]. -/// -/// This is fed into [`super::LlamaModel::apply_chat_template`] to convert a list of messages into -/// an LLM prompt. 
Internally the template is stored as a `CString` to avoid round-trip conversions -/// within the FFI. #[derive(Eq, PartialEq, Clone, PartialOrd, Ord, Hash)] pub struct LlamaChatTemplate(pub CString); impl LlamaChatTemplate { - /// Create a new template from a string. This can either be the name of a llama.cpp [chat template](https://github.com/ggerganov/llama.cpp/blob/8a8c4ceb6050bd9392609114ca56ae6d26f5b8f5/src/llama-chat.cpp#L27-L61) - /// like "chatml" or "llama3" or an actual Jinja template for llama.cpp to interpret. - /// /// # Errors /// Returns an error if the template string contains null bytes. pub fn new(template: &str) -> Result { Ok(Self(CString::new(template)?)) } - /// Accesses the template as a c string reference. #[must_use] pub fn as_c_str(&self) -> &CStr { &self.0 } - /// Attempts to convert the `CString` into a Rust str reference. - /// /// # Errors /// Returns an error if the template is not valid UTF-8. pub fn to_str(&self) -> Result<&str, Utf8Error> { self.0.to_str() } - /// Convenience method to create an owned String. - /// /// # Errors /// Returns an error if the template is not valid UTF-8. pub fn to_string(&self) -> Result { diff --git a/llama-cpp-bindings/src/model/llama_lora_adapter.rs b/llama-cpp-bindings/src/model/llama_lora_adapter.rs index a0d754a1..a209a278 100644 --- a/llama-cpp-bindings/src/model/llama_lora_adapter.rs +++ b/llama-cpp-bindings/src/model/llama_lora_adapter.rs @@ -1,9 +1,7 @@ use std::ptr::NonNull; -/// A safe wrapper around `llama_lora_adapter`. #[derive(Debug)] #[repr(transparent)] pub struct LlamaLoraAdapter { - /// Raw pointer to the underlying `llama_adapter_lora`. 
pub lora_adapter: NonNull, } diff --git a/llama-cpp-bindings/src/model/llama_split_mode_parse_error.rs b/llama-cpp-bindings/src/model/llama_split_mode_parse_error.rs index ed644534..46c246eb 100644 --- a/llama-cpp-bindings/src/model/llama_split_mode_parse_error.rs +++ b/llama-cpp-bindings/src/model/llama_split_mode_parse_error.rs @@ -1,8 +1,5 @@ -/// An error that occurs when unknown split mode is encountered. #[derive(Debug, Clone, PartialEq, Eq)] pub struct LlamaSplitModeParseError { - /// The value that could not be parsed as a split mode. pub value: i32, - /// Additional context about why the parse failed. pub context: String, } diff --git a/llama-cpp-bindings/src/model/params.rs b/llama-cpp-bindings/src/model/params.rs index 3b5bb2aa..58813490 100644 --- a/llama-cpp-bindings/src/model/params.rs +++ b/llama-cpp-bindings/src/model/params.rs @@ -1,5 +1,3 @@ -//! A safe wrapper around `llama_model_params`. - use crate::LlamaCppError; use crate::context::params::LlamaContextParams; use crate::error::{FitError, ModelParamsError}; @@ -18,15 +16,9 @@ pub mod kv_overrides; pub mod param_override_value; pub mod unknown_kv_override_tag; -/// The maximum number of devices supported. -/// -/// The real maximum number of devices is the lesser one of this value and the value returned by -/// `llama_cpp_bindings::max_devices()`. pub const LLAMA_CPP_MAX_DEVICES: usize = 16; -/// A safe wrapper around `llama_model_params`. pub struct LlamaModelParams { - /// The underlying `llama_model_params` from the C API. 
pub params: llama_cpp_bindings_sys::llama_model_params, kv_overrides: Vec, buft_overrides: Vec, @@ -50,47 +42,15 @@ impl Debug for LlamaModelParams { } impl LlamaModelParams { - /// See [`KvOverrides`] - /// - /// # Examples - /// - /// ```rust - /// # use llama_cpp_bindings::model::params::LlamaModelParams; - /// let params = Box::pin(LlamaModelParams::default()); - /// let kv_overrides = params.kv_overrides(); - /// let count = kv_overrides.into_iter().count(); - /// assert_eq!(count, 0); - /// ``` #[must_use] pub const fn kv_overrides(&self) -> KvOverrides<'_> { KvOverrides::new(self) } - /// Appends a key-value override to the model parameters. It must be pinned as this creates a self-referential struct. - /// /// # Errors /// Returns [`ModelParamsError`] if the internal override vector has no available slot, /// the slot is not empty, or the key contains invalid characters. /// - /// # Examples - /// - /// ```rust - /// # use std::ffi::{CStr, CString}; - /// use std::pin::pin; - /// # use llama_cpp_bindings::model::params::LlamaModelParams; - /// # use llama_cpp_bindings::model::params::param_override_value::ParamOverrideValue; - /// let mut params = pin!(LlamaModelParams::default()); - /// let key = CString::new("key").expect("CString::new failed"); - /// params.as_mut().append_kv_override(&key, ParamOverrideValue::Int(50)).unwrap(); - /// - /// let kv_overrides = params.kv_overrides().into_iter().collect::>(); - /// assert_eq!(kv_overrides.len(), 1); - /// - /// let (k, v) = &kv_overrides[0]; - /// assert_eq!(v, &ParamOverrideValue::Int(50)); - /// - /// assert_eq!(k.to_bytes(), b"key", "expected key to be 'key', was {:?}", k); - /// ``` pub fn append_kv_override( mut self: Pin<&mut Self>, key: &CStr, @@ -122,10 +82,6 @@ impl LlamaModelParams { Ok(()) } - /// Pushes the trailing zero-tag sentinel onto `kv_overrides` and refreshes - /// `params.kv_overrides`. 
The cached pointer is nulled before [`Vec::push`] - /// so that a relocation-induced panic never leaves a dangling pointer in - /// `params`. fn push_kv_override_terminator(mut self: Pin<&mut Self>) { self.params.kv_overrides = null(); @@ -143,8 +99,6 @@ impl LlamaModelParams { } impl LlamaModelParams { - /// Adds buffer type overrides to move all mixture-of-experts layers to CPU. - /// /// # Errors /// Returns [`ModelParamsError`] if the internal override vector has no available slot, /// the slot is not empty, or the key contains invalid characters. @@ -152,9 +106,6 @@ impl LlamaModelParams { self.add_cpu_buft_override(c"\\.ffn_(up|down|gate)_(ch|)exps") } - /// Appends a buffer type override to the model parameters, to move layers matching pattern to CPU. - /// It must be pinned as this creates a self-referential struct. - /// /// # Errors /// Returns [`ModelParamsError`] if the internal override vector has no available slot, /// the slot is not empty, or the key contains invalid characters. @@ -188,10 +139,6 @@ impl LlamaModelParams { Ok(()) } - /// Pushes the trailing null-pattern sentinel onto `buft_overrides` and - /// refreshes `params.tensor_buft_overrides`. The cached pointer is nulled - /// before [`Vec::push`] so that a relocation-induced panic never leaves a - /// dangling pointer in `params`. fn push_buft_override_terminator(mut self: Pin<&mut Self>) { self.params.tensor_buft_overrides = null(); @@ -206,45 +153,37 @@ impl LlamaModelParams { } impl LlamaModelParams { - /// Get the number of layers to offload to the GPU. 
#[must_use] pub const fn n_gpu_layers(&self) -> i32 { self.params.n_gpu_layers } - /// The GPU that is used for scratch and small tensors #[must_use] pub const fn main_gpu(&self) -> i32 { self.params.main_gpu } - /// only load the vocabulary, no weights #[must_use] pub const fn vocab_only(&self) -> bool { self.params.vocab_only } - /// use mmap if possible #[must_use] pub const fn use_mmap(&self) -> bool { self.params.use_mmap } - /// force system to keep model in RAM #[must_use] pub const fn use_mlock(&self) -> bool { self.params.use_mlock } - /// get the split mode - /// /// # Errors /// Returns `LlamaSplitModeParseError` if the unknown split mode is encountered. pub fn split_mode(&self) -> Result { LlamaSplitMode::try_from(self.params.split_mode) } - /// get the devices #[must_use] pub fn devices(&self) -> Vec { let mut backend_devices = Vec::new(); @@ -270,13 +209,6 @@ impl LlamaModelParams { devices } - /// sets the number of gpu layers to offload to the GPU. - /// ``` - /// # use llama_cpp_bindings::model::params::LlamaModelParams; - /// let params = LlamaModelParams::default(); - /// let params = params.with_n_gpu_layers(1); - /// assert_eq!(params.n_gpu_layers(), 1); - /// ``` #[must_use] pub fn with_n_gpu_layers(mut self, n_gpu_layers: u32) -> Self { let n_gpu_layers = i32::try_from(n_gpu_layers).unwrap_or(i32::MAX); @@ -284,54 +216,29 @@ impl LlamaModelParams { self } - /// sets the main GPU - /// - /// To enable this option, you must set `split_mode` to `LlamaSplitMode::None` to enable single GPU mode. 
#[must_use] pub const fn with_main_gpu(mut self, main_gpu: i32) -> Self { self.params.main_gpu = main_gpu; self } - /// sets `vocab_only` #[must_use] pub const fn with_vocab_only(mut self, vocab_only: bool) -> Self { self.params.vocab_only = vocab_only; self } - /// sets `use_mmap` - /// - /// # Examples - /// - /// ```rust - /// # use llama_cpp_bindings::model::params::LlamaModelParams; - /// let params = LlamaModelParams::default().with_use_mmap(false); - /// assert!(!params.use_mmap()); - /// ``` #[must_use] pub const fn with_use_mmap(mut self, use_mmap: bool) -> Self { self.params.use_mmap = use_mmap; self } - /// Get `no_alloc` #[must_use] pub const fn no_alloc(&self) -> bool { self.params.no_alloc } - /// Set `no_alloc`. When enabled, tensor data is not allocated. - /// Incompatible with `use_mmap`, so enabling this also disables mmap. - /// - /// # Examples - /// - /// ```rust - /// # use llama_cpp_bindings::model::params::LlamaModelParams; - /// let params = LlamaModelParams::default().with_no_alloc(true); - /// assert!(params.no_alloc()); - /// assert!(!params.use_mmap()); - /// ``` #[must_use] pub const fn with_no_alloc(mut self, no_alloc: bool) -> Self { self.params.no_alloc = no_alloc; @@ -341,28 +248,18 @@ impl LlamaModelParams { self } - /// sets `use_mlock` #[must_use] pub const fn with_use_mlock(mut self, use_mlock: bool) -> Self { self.params.use_mlock = use_mlock; self } - /// sets `split_mode` #[must_use] pub fn with_split_mode(mut self, split_mode: LlamaSplitMode) -> Self { self.params.split_mode = split_mode.into(); self } - /// sets `devices` - /// - /// The devices are specified as indices that correspond to the ggml backend device indices. - /// - /// The maximum number of devices is 16. - /// - /// You don't need to specify CPU or ACCEL devices. - /// /// # Errors /// Returns `LlamaCppError::BackendDeviceNotFound` if any device index is invalid. 
pub fn with_devices(mut self, devices: &[usize]) -> Result { @@ -387,38 +284,6 @@ impl LlamaModelParams { } impl LlamaModelParams { - /// Automatically fit model and context parameters to available device memory. - /// - /// Wraps llama.cpp's `common_fit_params`. Given a model path, available per-device memory - /// margins, and a minimum context size, it fills in `n_gpu_layers`, `tensor_split`, and - /// `tensor_buft_overrides` to fit the model to the available VRAM, and may reduce - /// `cparams.n_ctx` if needed. On success the model and context params are updated in place. - /// - /// # Requirements - /// - /// Per the C API docstring, only parameters that still hold their default value are - /// modified. In practice this means: - /// - `n_gpu_layers` must be at its default (`-1`). Do not call - /// [`with_n_gpu_layers`](Self::with_n_gpu_layers) before this. - /// - No `tensor_buft_overrides` may be set. Do not call - /// [`add_cpu_buft_override`](Self::add_cpu_buft_override) or - /// [`add_cpu_moe_override`](Self::add_cpu_moe_override) before this. - /// - `cparams.n_ctx` is only auto-selected if it is `0`; otherwise it is left alone. - /// - /// # Arguments - /// - /// - `model_path` — path to the GGUF model file as a C string. - /// - `context_params` — context parameters; `n_ctx` may be modified (see above). - /// - `margins` — memory margin per device in bytes. Must have at least - /// `crate::max_devices()` elements. - /// - `n_ctx_min` — minimum context size to preserve when reducing memory usage. - /// - `log_level` — minimum log level for fitting output; lower levels go to the debug log. - /// - /// # Thread safety - /// - /// This function is **not** thread safe: the underlying C call mutates the global - /// llama logger state. - /// /// # Errors /// /// Returns one of the [`FitError`] variants matching the vendored wrapper's status code. @@ -499,19 +364,6 @@ impl LlamaModelParams { } } -/// Default parameters for `LlamaModel`. 
(as defined in llama.cpp by `llama_model_default_params`) -/// ``` -/// # use llama_cpp_bindings::model::params::LlamaModelParams; -/// use llama_cpp_bindings::model::split_mode::LlamaSplitMode; -/// let params = LlamaModelParams::default(); -/// assert_eq!(params.n_gpu_layers(), -1, "n_gpu_layers should be -1"); -/// assert_eq!(params.main_gpu(), 0, "main_gpu should be 0"); -/// assert_eq!(params.vocab_only(), false, "vocab_only should be false"); -/// assert_eq!(params.use_mmap(), true, "use_mmap should be true"); -/// assert_eq!(params.use_mlock(), false, "use_mlock should be false"); -/// assert_eq!(params.split_mode(), Ok(LlamaSplitMode::Layer), "split_mode should be LAYER"); -/// assert_eq!(params.devices().len(), 0, "devices should be empty"); -/// ``` impl Default for LlamaModelParams { fn default() -> Self { let default_params = unsafe { llama_cpp_bindings_sys::llama_model_default_params() }; diff --git a/llama-cpp-bindings/src/model/params/fit_result.rs b/llama-cpp-bindings/src/model/params/fit_result.rs index 2f89978b..655a1069 100644 --- a/llama-cpp-bindings/src/model/params/fit_result.rs +++ b/llama-cpp-bindings/src/model/params/fit_result.rs @@ -1,6 +1,4 @@ -/// Result of [`crate::model::params::LlamaModelParams::fit_params`]. #[derive(Debug, Clone, Copy, Eq, PartialEq)] pub struct FitResult { - /// The context size after fitting (may have been reduced from the requested value). pub n_ctx: u32, } diff --git a/llama-cpp-bindings/src/model/params/kv_override_value_iterator.rs b/llama-cpp-bindings/src/model/params/kv_override_value_iterator.rs index 6073673d..8bcdb737 100644 --- a/llama-cpp-bindings/src/model/params/kv_override_value_iterator.rs +++ b/llama-cpp-bindings/src/model/params/kv_override_value_iterator.rs @@ -4,7 +4,6 @@ use std::fmt::Debug; use crate::model::params::LlamaModelParams; use crate::model::params::param_override_value::ParamOverrideValue; -/// An iterator over the key-value overrides for a model. 
#[derive(Debug)] pub struct KvOverrideValueIterator<'model_params> { model_params: &'model_params LlamaModelParams, @@ -33,8 +32,6 @@ impl Iterator for KvOverrideValueIterator<'_> { loop { // SAFETY: llama.cpp guarantees the last element contains an empty key. - // We've checked the previous one in the last iteration, the next one - // should be valid or 0 (and thus safe to deref). let current = unsafe { *overrides.add(self.current) }; if current.key[0] == 0 { diff --git a/llama-cpp-bindings/src/model/params/kv_overrides.rs b/llama-cpp-bindings/src/model/params/kv_overrides.rs index d3f46c28..618fd9cd 100644 --- a/llama-cpp-bindings/src/model/params/kv_overrides.rs +++ b/llama-cpp-bindings/src/model/params/kv_overrides.rs @@ -1,18 +1,14 @@ -//! Key-value overrides for a model. - use std::fmt::Debug; use crate::model::params::LlamaModelParams; use crate::model::params::kv_override_value_iterator::KvOverrideValueIterator; -/// A struct implementing [`IntoIterator`] over the key-value overrides for a model. #[derive(Debug)] pub struct KvOverrides<'model_params> { model_params: &'model_params LlamaModelParams, } impl KvOverrides<'_> { - /// Creates a new `KvOverrides` view over the given model parameters. #[must_use] pub const fn new(model_params: &LlamaModelParams) -> KvOverrides<'_> { KvOverrides { model_params } diff --git a/llama-cpp-bindings/src/model/params/param_override_value.rs b/llama-cpp-bindings/src/model/params/param_override_value.rs index b20e12af..041371a5 100644 --- a/llama-cpp-bindings/src/model/params/param_override_value.rs +++ b/llama-cpp-bindings/src/model/params/param_override_value.rs @@ -1,20 +1,14 @@ use crate::model::params::unknown_kv_override_tag::UnknownKvOverrideTag; -/// An override value for a model parameter. 
#[derive(Debug, Clone, Copy, PartialEq)] pub enum ParamOverrideValue { - /// A boolean value Bool(bool), - /// A float value Float(f64), - /// A integer value Int(i64), - /// A string value Str([std::os::raw::c_char; 128]), } impl ParamOverrideValue { - /// Returns the FFI tag corresponding to this override value variant. #[must_use] pub const fn tag(&self) -> llama_cpp_bindings_sys::llama_model_kv_override_type { match self { @@ -25,7 +19,6 @@ impl ParamOverrideValue { } } - /// Returns the FFI union value for this override. #[must_use] pub const fn value(&self) -> llama_cpp_bindings_sys::llama_model_kv_override__bindgen_ty_1 { match self { diff --git a/llama-cpp-bindings/src/model/params/unknown_kv_override_tag.rs b/llama-cpp-bindings/src/model/params/unknown_kv_override_tag.rs index 67978bde..da7988d0 100644 --- a/llama-cpp-bindings/src/model/params/unknown_kv_override_tag.rs +++ b/llama-cpp-bindings/src/model/params/unknown_kv_override_tag.rs @@ -1,4 +1,3 @@ -/// Unknown KV override tag from the FFI layer. #[derive(Debug, thiserror::Error)] #[error("unknown KV override tag: {0}")] pub struct UnknownKvOverrideTag(pub llama_cpp_bindings_sys::llama_model_kv_override_type); diff --git a/llama-cpp-bindings/src/model/rope_type.rs b/llama-cpp-bindings/src/model/rope_type.rs index 35ddaa9c..2dce0526 100644 --- a/llama-cpp-bindings/src/model/rope_type.rs +++ b/llama-cpp-bindings/src/model/rope_type.rs @@ -1,18 +1,11 @@ -/// The Rope type that's used within the model. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum RopeType { - /// Standard rotary positional encoding. Norm, - /// GPT-NeoX style rotary positional encoding. NeoX, - /// Multi-dimensional rotary positional encoding. MRope, - /// Vision model rotary positional encoding. Vision, } -/// Converts a raw llama.cpp rope type constant to a `RopeType`. -/// Returns `None` for unknown or "none" rope types. 
#[must_use] pub const fn rope_type_from_raw(raw: i32) -> Option { match raw { diff --git a/llama-cpp-bindings/src/model/split_mode.rs b/llama-cpp-bindings/src/model/split_mode.rs index 170c5596..d9328a1b 100644 --- a/llama-cpp-bindings/src/model/split_mode.rs +++ b/llama-cpp-bindings/src/model/split_mode.rs @@ -1,16 +1,12 @@ use crate::model::llama_split_mode_parse_error::LlamaSplitModeParseError; -/// A rusty wrapper around `llama_split_mode`. #[repr(i8)] -#[derive(Copy, Clone, Debug, PartialEq, Eq)] +#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)] pub enum LlamaSplitMode { - /// Single GPU None = LLAMA_SPLIT_MODE_NONE, - /// Split layers and KV across GPUs + #[default] Layer = LLAMA_SPLIT_MODE_LAYER, - /// Split layers and KV across GPUs, use tensor parallelism if supported Row = LLAMA_SPLIT_MODE_ROW, - /// Experimental tensor parallelism across GPUs Tensor = LLAMA_SPLIT_MODE_TENSOR, } @@ -35,8 +31,6 @@ const LLAMA_SPLIT_MODE_ROW: i8 = llama_cpp_bindings_sys::LLAMA_SPLIT_MODE_ROW as )] const LLAMA_SPLIT_MODE_TENSOR: i8 = llama_cpp_bindings_sys::LLAMA_SPLIT_MODE_TENSOR as i8; -/// Create a `LlamaSplitMode` from a `i32`. -/// /// # Errors /// Returns `LlamaSplitModeParseError` if the value does not correspond to a valid `LlamaSplitMode`. impl TryFrom for LlamaSplitMode { @@ -63,8 +57,6 @@ impl TryFrom for LlamaSplitMode { } } -/// Create a `LlamaSplitMode` from a `u32`. -/// /// # Errors /// Returns `LlamaSplitModeParseError` if the value does not correspond to a valid `LlamaSplitMode`. impl TryFrom for LlamaSplitMode { @@ -92,7 +84,6 @@ impl TryFrom for LlamaSplitMode { } } -/// Create a `i32` from a `LlamaSplitMode`. impl From for i32 { fn from(value: LlamaSplitMode) -> Self { match value { @@ -104,7 +95,6 @@ impl From for i32 { } } -/// Create a `u32` from a `LlamaSplitMode`. impl From for u32 { fn from(value: LlamaSplitMode) -> Self { match value { @@ -116,13 +106,6 @@ impl From for u32 { } } -/// The default split mode is `Layer` in llama.cpp. 
-impl Default for LlamaSplitMode { - fn default() -> Self { - Self::Layer - } -} - #[cfg(test)] mod tests { use super::{ diff --git a/llama-cpp-bindings/src/model/vocab_type.rs b/llama-cpp-bindings/src/model/vocab_type.rs index 4c790755..14e15132 100644 --- a/llama-cpp-bindings/src/model/vocab_type.rs +++ b/llama-cpp-bindings/src/model/vocab_type.rs @@ -1,12 +1,9 @@ use crate::model::vocab_type_from_int_error::VocabTypeFromIntError; -/// a rusty equivalent of `llama_vocab_type` #[repr(u32)] #[derive(Debug, Eq, Copy, Clone, PartialEq)] pub enum VocabType { - /// Byte Pair Encoding BPE = llama_cpp_bindings_sys::LLAMA_VOCAB_TYPE_BPE as _, - /// Sentence Piece Tokenizer SPM = llama_cpp_bindings_sys::LLAMA_VOCAB_TYPE_SPM as _, } diff --git a/llama-cpp-bindings/src/model/vocab_type_from_int_error.rs b/llama-cpp-bindings/src/model/vocab_type_from_int_error.rs index 3e7bcf8e..7dd3694e 100644 --- a/llama-cpp-bindings/src/model/vocab_type_from_int_error.rs +++ b/llama-cpp-bindings/src/model/vocab_type_from_int_error.rs @@ -1,8 +1,5 @@ -/// Returned by [`crate::model::vocab_type::VocabType::try_from`] when the -/// integer value does not match a known `llama_vocab_type` discriminant. #[derive(thiserror::Error, Debug, Eq, PartialEq)] pub enum VocabTypeFromIntError { - /// The value is not a valid `llama_vocab_type`. Contains the int value that was invalid. #[error("Unknown Value {0}")] UnknownValue(llama_cpp_bindings_sys::llama_vocab_type), } diff --git a/llama-cpp-bindings/src/mtmd.rs b/llama-cpp-bindings/src/mtmd.rs index 7d87980a..393c255a 100644 --- a/llama-cpp-bindings/src/mtmd.rs +++ b/llama-cpp-bindings/src/mtmd.rs @@ -1,11 +1,3 @@ -//! Safe wrapper around multimodal (MTMD) functionality in llama.cpp. -//! -//! This module provides Rust bindings for llama.cpp's multimodal support, -//! allowing processing of text, image, and audio inputs through a unified interface. -//! -//! # Warning -//! This API is experimental and subject to breaking changes. 
- pub mod image_chunk_batch_size_mismatch; pub mod mtmd_bitmap; pub mod mtmd_bitmap_error; diff --git a/llama-cpp-bindings/src/mtmd/image_chunk_batch_size_mismatch.rs b/llama-cpp-bindings/src/mtmd/image_chunk_batch_size_mismatch.rs index dfac7f12..3763791b 100644 --- a/llama-cpp-bindings/src/mtmd/image_chunk_batch_size_mismatch.rs +++ b/llama-cpp-bindings/src/mtmd/image_chunk_batch_size_mismatch.rs @@ -1,10 +1,3 @@ -/// Carried by [`super::mtmd_eval_error::MtmdEvalError::ImageChunkExceedsBatchSize`]. -/// -/// `n_batch` is the per-decode batch budget enforced by `cparams.n_batch` in -/// llama.cpp; `image_tokens` is the number of tokens this image chunk would -/// hand to `llama_decode`. When `image_tokens > n_batch` the C-side -/// `GGML_ASSERT(n_tokens_all <= cparams.n_batch)` would abort the process — -/// the binding refuses the call instead. #[derive(Debug)] pub struct ImageChunkBatchSizeMismatch { pub image_tokens: u32, diff --git a/llama-cpp-bindings/src/mtmd/mtmd_bitmap.rs b/llama-cpp-bindings/src/mtmd/mtmd_bitmap.rs index 14ab3664..63dc0299 100644 --- a/llama-cpp-bindings/src/mtmd/mtmd_bitmap.rs +++ b/llama-cpp-bindings/src/mtmd/mtmd_bitmap.rs @@ -20,14 +20,8 @@ fn cstr_ptr_to_optional_string(ptr: *const c_char) -> Option { } } -/// Safe wrapper around `mtmd_bitmap`. -/// -/// Represents bitmap data for images or audio that can be processed -/// by the multimodal system. For images, data is stored in RGB format. -/// For audio, data is stored as PCM F32 samples. #[derive(Debug, Clone)] pub struct MtmdBitmap { - /// Raw pointer to the underlying `mtmd_bitmap`. pub bitmap: NonNull, } @@ -35,25 +29,11 @@ unsafe impl Send for MtmdBitmap {} unsafe impl Sync for MtmdBitmap {} impl MtmdBitmap { - /// Create a bitmap from image data in RGB format. 
- /// /// # Errors /// /// * `InvalidDataSize` - Data length doesn't match `nx * ny * 3` /// * `NullResult` - Underlying C function returned null /// - /// # Examples - /// - /// ``` - /// use llama_cpp_bindings::mtmd::MtmdBitmap; - /// - /// // Create a 2x2 red image - /// let red_pixel = [255, 0, 0]; // RGB values for red - /// let image_data = red_pixel.repeat(4); // 2x2 = 4 pixels - /// - /// let bitmap = MtmdBitmap::from_image_data(2, 2, &image_data); - /// assert!(bitmap.is_ok()); - /// ``` pub fn from_image_data(nx: u32, ny: u32, data: &[u8]) -> Result { if nx < 2 || ny < 2 { return Err(MtmdBitmapError::ImageDimensionsTooSmall(nx, ny)); @@ -70,25 +50,10 @@ impl MtmdBitmap { Ok(Self { bitmap }) } - /// Create a bitmap from audio data in PCM F32 format. - /// /// # Errors /// /// * `NullResult` - Underlying C function returned null /// - /// # Examples - /// - /// ``` - /// use llama_cpp_bindings::mtmd::MtmdBitmap; - /// - /// // Create a simple sine wave audio sample - /// let audio_data: Vec = (0..100) - /// .map(|sample_index| (sample_index as f32 * 0.1).sin()) - /// .collect(); - /// - /// let bitmap = MtmdBitmap::from_audio_data(&audio_data); - /// // Note: This will likely fail without proper MTMD context setup - /// ``` pub fn from_audio_data(data: &[f32]) -> Result { let bitmap = unsafe { llama_cpp_bindings_sys::mtmd_bitmap_init_from_audio(data.len(), data.as_ptr()) @@ -99,12 +64,6 @@ impl MtmdBitmap { Ok(Self { bitmap }) } - /// Create a bitmap from a file. - /// - /// Supported formats: - /// - Images: formats supported by `stb_image` (jpg, png, bmp, gif, etc.) - /// - Audio: formats supported by miniaudio (wav, mp3, flac) - /// /// # Errors /// /// Returns an [`MtmdBitmapError`] variant matching the wrapper's status code. @@ -149,12 +108,6 @@ impl MtmdBitmap { } } - /// Create a bitmap from a buffer containing file data. - /// - /// Supported formats: - /// - Images: formats supported by `stb_image` (jpg, png, bmp, gif, etc.) 
- /// - Audio: formats supported by miniaudio (wav, mp3, flac) - /// /// # Errors /// /// * `NullResult` - Buffer could not be processed @@ -172,22 +125,16 @@ impl MtmdBitmap { Ok(Self { bitmap }) } - /// Get bitmap width in pixels. #[must_use] pub fn nx(&self) -> u32 { unsafe { llama_cpp_bindings_sys::mtmd_bitmap_get_nx(self.bitmap.as_ptr()) } } - /// Get bitmap height in pixels. #[must_use] pub fn ny(&self) -> u32 { unsafe { llama_cpp_bindings_sys::mtmd_bitmap_get_ny(self.bitmap.as_ptr()) } } - /// Get bitmap data as a byte slice. - /// - /// For images: RGB format with length `nx * ny * 3` - /// For audio: PCM F32 format with length `n_samples * 4` #[must_use] pub fn data(&self) -> &[u8] { let ptr = unsafe { llama_cpp_bindings_sys::mtmd_bitmap_get_data(self.bitmap.as_ptr()) }; @@ -195,13 +142,11 @@ impl MtmdBitmap { unsafe { slice::from_raw_parts(ptr, len) } } - /// Check if this bitmap contains audio data (vs image data). #[must_use] pub fn is_audio(&self) -> bool { unsafe { llama_cpp_bindings_sys::mtmd_bitmap_is_audio(self.bitmap.as_ptr()) } } - /// Get the bitmap's optional ID string. #[must_use] pub fn id(&self) -> Option { let ptr = unsafe { llama_cpp_bindings_sys::mtmd_bitmap_get_id(self.bitmap.as_ptr()) }; @@ -209,22 +154,10 @@ impl MtmdBitmap { cstr_ptr_to_optional_string(ptr) } - /// Set the bitmap's ID string. - /// /// # Errors /// /// Returns an error if the ID string contains null bytes. 
/// - /// # Examples - /// - /// ```no_run - /// # use llama_cpp_bindings::mtmd::MtmdBitmap; - /// # fn example(bitmap: &MtmdBitmap) -> Result<(), Box> { - /// bitmap.set_id("image_001")?; - /// assert_eq!(bitmap.id(), Some("image_001".to_string())); - /// # Ok(()) - /// # } - /// ``` pub fn set_id(&self, id: &str) -> Result<(), std::ffi::NulError> { let id_cstr = CString::new(id)?; unsafe { diff --git a/llama-cpp-bindings/src/mtmd/mtmd_context.rs b/llama-cpp-bindings/src/mtmd/mtmd_context.rs index 21ab2c11..28d4091e 100644 --- a/llama-cpp-bindings/src/mtmd/mtmd_context.rs +++ b/llama-cpp-bindings/src/mtmd/mtmd_context.rs @@ -67,13 +67,8 @@ fn map_encode_chunk_status( } } -/// Safe wrapper around `mtmd_context`. -/// -/// This represents an initialized multimodal context that can process -/// text, images, and audio through llama.cpp's multimodal interface. #[derive(Debug)] pub struct MtmdContext { - /// Raw pointer to the underlying `mtmd_context`. pub context: NonNull, } @@ -81,8 +76,6 @@ unsafe impl Send for MtmdContext {} unsafe impl Sync for MtmdContext {} impl MtmdContext { - /// Initialize MTMD context from a multimodal projection file. - /// /// # Errors /// /// Returns an [`MtmdInitError`] variant matching the wrapper's status code. @@ -132,8 +125,6 @@ impl MtmdContext { } } - /// Check whether non-causal attention mask is needed before `llama_decode` - /// for the given input chunk. #[must_use] pub fn decode_use_non_causal(&self, chunk: &MtmdInputChunk) -> bool { unsafe { @@ -144,26 +135,21 @@ impl MtmdContext { } } - /// Check whether the current model uses M-RoPE for `llama_decode`. #[must_use] pub fn decode_use_mrope(&self) -> bool { unsafe { llama_cpp_bindings_sys::mtmd_decode_use_mrope(self.context.as_ptr()) } } - /// Check whether the current model supports vision input. 
#[must_use] pub fn support_vision(&self) -> bool { unsafe { llama_cpp_bindings_sys::mtmd_support_vision(self.context.as_ptr()) } } - /// Check whether the current model supports audio input. #[must_use] pub fn support_audio(&self) -> bool { unsafe { llama_cpp_bindings_sys::mtmd_support_audio(self.context.as_ptr()) } } - /// Get audio sample rate in Hz (e.g., 16000 for Whisper). - /// Returns None if audio is not supported. #[must_use] pub fn get_audio_sample_rate(&self) -> Option { let rate = @@ -171,12 +157,6 @@ impl MtmdContext { (rate > 0).then_some(rate.unsigned_abs()) } - /// Tokenize input text and bitmaps into chunks. - /// - /// The input text must contain media markers (default: `<__media__>`) that will be - /// replaced with the corresponding bitmap data from the `bitmaps` array. - /// The number of bitmaps must equal the number of markers in the text. - /// /// # Errors /// /// Returns an [`MtmdTokenizeError`] variant matching the wrapper's status code. @@ -217,8 +197,6 @@ impl MtmdContext { Ok(chunks) } - /// Encode a chunk for image/audio processing. - /// /// # Errors /// /// Returns an [`MtmdEncodeError`] variant matching the wrapper's status code. 
diff --git a/llama-cpp-bindings/src/mtmd/mtmd_context_params.rs b/llama-cpp-bindings/src/mtmd/mtmd_context_params.rs index ec6fe674..b850580b 100644 --- a/llama-cpp-bindings/src/mtmd/mtmd_context_params.rs +++ b/llama-cpp-bindings/src/mtmd/mtmd_context_params.rs @@ -1,29 +1,10 @@ use std::ffi::{CStr, CString}; -/// Configuration parameters for MTMD context -/// -/// # Examples -/// -/// ``` -/// use llama_cpp_bindings::mtmd::{MtmdContextParams, mtmd_default_marker}; -/// use std::ffi::CString; -/// -/// let params = MtmdContextParams { -/// use_gpu: false, -/// print_timings: true, -/// n_threads: 4, -/// media_marker: CString::new(mtmd_default_marker()).unwrap(), -/// }; -/// ``` #[derive(Debug, Clone)] pub struct MtmdContextParams { - /// Whether to use GPU acceleration pub use_gpu: bool, - /// Whether to print timing information pub print_timings: bool, - /// Number of threads to use for processing pub n_threads: i32, - /// Media marker string used to identify media positions in text pub media_marker: CString, } diff --git a/llama-cpp-bindings/src/mtmd/mtmd_default_marker.rs b/llama-cpp-bindings/src/mtmd/mtmd_default_marker.rs index 2d559b5e..5209e6f2 100644 --- a/llama-cpp-bindings/src/mtmd/mtmd_default_marker.rs +++ b/llama-cpp-bindings/src/mtmd/mtmd_default_marker.rs @@ -1,22 +1,5 @@ use std::ffi::CStr; -/// Get the default media marker string. -/// -/// Returns the default marker used to identify media positions in text -/// (typically `"<__media__>"`). This marker should be used in your input text -/// to indicate where media content should be inserted. 
-/// -/// # Examples -/// -/// ``` -/// use llama_cpp_bindings::mtmd::mtmd_default_marker; -/// -/// let marker = mtmd_default_marker(); -/// assert!(!marker.is_empty()); -/// -/// let text = format!("Describe this image: {}", marker); -/// assert!(text.contains(marker)); -/// ``` #[must_use] pub fn mtmd_default_marker() -> &'static str { unsafe { diff --git a/llama-cpp-bindings/src/mtmd/mtmd_input_chunk.rs b/llama-cpp-bindings/src/mtmd/mtmd_input_chunk.rs index 88d1358c..f10a5bca 100644 --- a/llama-cpp-bindings/src/mtmd/mtmd_input_chunk.rs +++ b/llama-cpp-bindings/src/mtmd/mtmd_input_chunk.rs @@ -34,21 +34,13 @@ const unsafe fn tokens_from_raw_ptr<'chunk>( } } -/// Safe wrapper around `mtmd_input_chunk`. -/// -/// Represents a single chunk of input data, which can be either text tokens, -/// image tokens, or audio tokens. The chunk type determines what kind of -/// data and operations are available. #[derive(Debug)] pub struct MtmdInputChunk { - /// Raw pointer to the underlying `mtmd_input_chunk`. pub chunk: NonNull, pub owned: bool, } impl MtmdInputChunk { - /// Get the type of this chunk - /// /// # Errors /// Returns an error if the chunk type is unknown. pub fn chunk_type(&self) -> Result { @@ -57,9 +49,6 @@ impl MtmdInputChunk { MtmdInputChunkType::try_from(chunk_type) } - /// Get text tokens from this chunk. - /// - /// Only valid for text chunks. Returns `None` for image or audio chunks. #[must_use] pub fn text_tokens(&self) -> Option<&[LlamaToken]> { if self.chunk_type() != Ok(MtmdInputChunkType::Text) { @@ -77,21 +66,16 @@ impl MtmdInputChunk { unsafe { tokens_from_raw_ptr(tokens_ptr, n_tokens) } } - /// Get the number of tokens in this chunk #[must_use] pub fn n_tokens(&self) -> usize { unsafe { llama_cpp_bindings_sys::mtmd_input_chunk_get_n_tokens(self.chunk.as_ptr()) } } - /// Get the number of positions in this chunk. 
#[must_use] pub fn n_positions(&self) -> i32 { unsafe { llama_cpp_bindings_sys::mtmd_input_chunk_get_n_pos(self.chunk.as_ptr()) } } - /// Get chunk ID if available. - /// - /// Returns `None` for text chunks, may return an ID for image/audio chunks. #[must_use] pub fn id(&self) -> Option { let ptr = unsafe { llama_cpp_bindings_sys::mtmd_input_chunk_get_id(self.chunk.as_ptr()) }; @@ -105,8 +89,6 @@ impl MtmdInputChunk { } } - /// Create a copy of this chunk that you own. - /// /// # Errors /// /// Returns `MtmdInputChunkError::ChunkOperationFailed` if copying fails. @@ -117,19 +99,6 @@ impl MtmdInputChunk { Ok(Self { chunk, owned: true }) } - /// Evaluate this single chunk through the multimodal helper. - /// - /// Mirrors `MtmdInputChunks::eval_chunks` but for one chunk at a time, so - /// callers can interleave per-chunk decode with per-chunk bookkeeping - /// (token counting, marker state-machine replay) inside one loop instead - /// of running the helper-level all-chunks eval and a separate ingest pass. - /// - /// Image chunks are decoded as one `llama_decode` call inside the helper, - /// so their token count must fit in `n_batch`. When it would not, the - /// binding refuses the call up front because the C-side - /// `GGML_ASSERT(n_tokens_all <= cparams.n_batch)` would otherwise abort - /// the process. 
- /// /// # Errors /// /// Returns [`MtmdEvalError::ImageChunkExceedsBatchSize`] when this is an diff --git a/llama-cpp-bindings/src/mtmd/mtmd_input_chunk_type.rs b/llama-cpp-bindings/src/mtmd/mtmd_input_chunk_type.rs index ef628b89..5392d85e 100644 --- a/llama-cpp-bindings/src/mtmd/mtmd_input_chunk_type.rs +++ b/llama-cpp-bindings/src/mtmd/mtmd_input_chunk_type.rs @@ -1,29 +1,10 @@ use crate::mtmd::mtmd_input_chunk_type_error::MtmdInputChunkTypeError; -/// Input chunk types for multimodal data -/// -/// # Examples -/// -/// ``` -/// use llama_cpp_bindings::mtmd::MtmdInputChunkType; -/// -/// let text_chunk = MtmdInputChunkType::Text; -/// let image_chunk = MtmdInputChunkType::Image; -/// let audio_chunk = MtmdInputChunkType::Audio; -/// -/// assert_eq!(text_chunk, MtmdInputChunkType::Text); -/// let converted: MtmdInputChunkType = llama_cpp_bindings_sys::MTMD_INPUT_CHUNK_TYPE_TEXT.try_into().unwrap(); -/// assert_eq!(text_chunk, converted); -/// assert_ne!(text_chunk, image_chunk); -/// ``` #[derive(Debug, Clone, Copy, PartialEq, Eq)] #[repr(u32)] pub enum MtmdInputChunkType { - /// Text input chunk Text = llama_cpp_bindings_sys::MTMD_INPUT_CHUNK_TYPE_TEXT as _, - /// Image input chunk Image = llama_cpp_bindings_sys::MTMD_INPUT_CHUNK_TYPE_IMAGE as _, - /// Audio input chunk Audio = llama_cpp_bindings_sys::MTMD_INPUT_CHUNK_TYPE_AUDIO as _, } diff --git a/llama-cpp-bindings/src/mtmd/mtmd_input_chunk_type_error.rs b/llama-cpp-bindings/src/mtmd/mtmd_input_chunk_type_error.rs index ae3ca7e8..0bc0a6c7 100644 --- a/llama-cpp-bindings/src/mtmd/mtmd_input_chunk_type_error.rs +++ b/llama-cpp-bindings/src/mtmd/mtmd_input_chunk_type_error.rs @@ -1,4 +1,3 @@ -/// Error when converting from an unknown MTMD input chunk type value. 
#[derive(Debug, PartialEq, Eq, thiserror::Error)] #[error("Unknown MTMD input chunk type: {0}")] pub struct MtmdInputChunkTypeError(pub llama_cpp_bindings_sys::mtmd_input_chunk_type); diff --git a/llama-cpp-bindings/src/mtmd/mtmd_input_chunks.rs b/llama-cpp-bindings/src/mtmd/mtmd_input_chunks.rs index 9ac2705b..f592c42c 100644 --- a/llama-cpp-bindings/src/mtmd/mtmd_input_chunks.rs +++ b/llama-cpp-bindings/src/mtmd/mtmd_input_chunks.rs @@ -15,34 +15,17 @@ const fn check_eval_result(result: i32) -> Result<(), MtmdEvalError> { } } -/// Safe wrapper around `mtmd_input_chunks`. -/// -/// This is a collection of input chunks created from tokenizing text and media. -/// The chunks represent the tokenized input that can be processed by the model, -/// with text chunks containing tokens and media chunks containing embeddings. #[derive(Debug)] pub struct MtmdInputChunks { - /// Raw pointer to the underlying `mtmd_input_chunks`. pub chunks: NonNull, } impl MtmdInputChunks { - /// Create a new empty input chunks collection. - /// /// # Errors /// /// Returns `MtmdInputChunksError::ChunksCreationFailed` if the underlying llama.cpp function /// returns null. 
/// - /// # Examples - /// - /// ``` - /// use llama_cpp_bindings::mtmd::MtmdInputChunks; - /// - /// let chunks = MtmdInputChunks::new().unwrap(); - /// assert_eq!(chunks.len(), 0); - /// assert!(chunks.is_empty()); - /// ``` pub fn new() -> Result { let chunks = unsafe { llama_cpp_bindings_sys::mtmd_input_chunks_init() }; let chunks = NonNull::new(chunks).ok_or(MtmdInputChunksError::ChunksCreationFailed)?; @@ -50,19 +33,16 @@ impl MtmdInputChunks { Ok(Self { chunks }) } - /// Get the number of chunks #[must_use] pub fn len(&self) -> usize { unsafe { llama_cpp_bindings_sys::mtmd_input_chunks_size(self.chunks.as_ptr()) } } - /// Check if chunks collection is empty #[must_use] pub fn is_empty(&self) -> bool { self.len() == 0 } - /// Get a chunk by index #[must_use] pub fn get(&self, index: usize) -> Option { if index >= self.len() { @@ -78,20 +58,16 @@ impl MtmdInputChunks { }) } - /// Get total number of tokens across all chunks. #[must_use] pub fn total_tokens(&self) -> usize { unsafe { llama_cpp_bindings_sys::mtmd_helper_get_n_tokens(self.chunks.as_ptr()) } } - /// Get total position count across all chunks. #[must_use] pub fn total_positions(&self) -> i32 { unsafe { llama_cpp_bindings_sys::mtmd_helper_get_n_pos(self.chunks.as_ptr()) } } - /// Evaluate chunks using the multimodal context and LLAMA context. - /// /// # Errors /// /// Returns `MtmdEvalError::EvalFailure` if any encoding or decoding operation fails. @@ -113,11 +89,6 @@ impl MtmdInputChunks { }); } - // mtmd_helper_eval_chunks overwrites `*new_n_past` at the end of its - // chunk loop (mtmd-helper.cpp:413), so any seed would be fine — but - // we mirror the per-chunk wrapper's `start_position` / `final_position` - // shape here for parity, keeping the read-only input and write-only - // output strictly separated. 
let mut final_position: llama_cpp_bindings_sys::llama_pos = start_position; let result = unsafe { diff --git a/llama-cpp-bindings/src/mtmd/mtmd_input_text.rs b/llama-cpp-bindings/src/mtmd/mtmd_input_text.rs index db61b6ec..4f99a8f6 100644 --- a/llama-cpp-bindings/src/mtmd/mtmd_input_text.rs +++ b/llama-cpp-bindings/src/mtmd/mtmd_input_text.rs @@ -1,22 +1,6 @@ -/// Text input configuration -/// -/// # Examples -/// -/// ``` -/// use llama_cpp_bindings::mtmd::MtmdInputText; -/// -/// let input = MtmdInputText { -/// text: "Describe this image.".to_string(), -/// add_special: true, -/// parse_special: true, -/// }; -/// ``` #[derive(Debug, Clone)] pub struct MtmdInputText { - /// The input text string pub text: String, - /// Whether to add special tokens pub add_special: bool, - /// Whether to parse special tokens pub parse_special: bool, } diff --git a/llama-cpp-bindings/src/resolved_tool_call_markers.rs b/llama-cpp-bindings/src/resolved_tool_call_markers.rs index ced6510c..3b2e6a3c 100644 --- a/llama-cpp-bindings/src/resolved_tool_call_markers.rs +++ b/llama-cpp-bindings/src/resolved_tool_call_markers.rs @@ -1,9 +1,3 @@ -/// Effective tool-call marker strings resolved from either the autoparser -/// output or the per-template override registry. -/// -/// Each side is independently optional because the autoparser may report only -/// one of the two strings, and the override registry may not match the -/// template at all. #[derive(Clone, Debug, Eq, PartialEq)] pub struct ResolvedToolCallMarkers { pub open: Option, diff --git a/llama-cpp-bindings/src/sampled_token_classifier.rs b/llama-cpp-bindings/src/sampled_token_classifier.rs index aae24fc3..26fa65eb 100644 --- a/llama-cpp-bindings/src/sampled_token_classifier.rs +++ b/llama-cpp-bindings/src/sampled_token_classifier.rs @@ -70,15 +70,6 @@ impl<'model> SampledTokenClassifier<'model> { } } - /// Ingest one sampled token. 
Returns the outcomes that have finalised this - /// turn — typically a single outcome, occasionally zero (the classifier is - /// holding back tokens that may yet form a marker), or several when a - /// buffered marker prefix diverges and the held-back tokens flush. - /// - /// Each [`IngestOutcome`] carries both the [`SampledToken`] variant for - /// classification and the decoded `visible_piece` for streaming. Marker - /// boundaries get an empty `visible_piece` so their text never reaches - /// user-visible streams. pub fn ingest(&mut self, token: LlamaToken) -> Vec { if !self.markers.has_any() { self.usage.record_undeterminable_token(); @@ -120,15 +111,6 @@ impl<'model> SampledTokenClassifier<'model> { ) } - /// Replay one prompt token through the marker state machine so that the - /// section at end-of-prompt reflects the chat template's rendered tail - /// (e.g. for Qwen3.5/3.6 with `enable_thinking=false` the prompt ends with - /// a closed empty `...` block, leaving the section in - /// `Content`; with `enable_thinking=true` it ends inside an open ``, - /// leaving the section in `Reasoning`). - /// - /// Prompt tokens never produce [`IngestOutcome`]s and never increment usage - /// counters — they are not generated content. pub fn ingest_prompt_token(&mut self, token: LlamaToken) { if !self.markers.has_any() { return; @@ -156,9 +138,6 @@ impl<'model> SampledTokenClassifier<'model> { } } - /// Drain every still-buffered token. Call once at end of generation (EOG) - /// to make sure no decoded text is silently dropped. After `flush()` the - /// classifier behaves as if freshly constructed in terms of buffer state. pub fn flush(&mut self) -> Vec { self.probe_mode = ProbeMode::Idle; let mut outcomes = Vec::with_capacity(self.pending.len()); diff --git a/llama-cpp-bindings/src/sampling.rs b/llama-cpp-bindings/src/sampling.rs index ac1bfb5c..7be49c06 100644 --- a/llama-cpp-bindings/src/sampling.rs +++ b/llama-cpp-bindings/src/sampling.rs @@ -1,5 +1,3 @@ -//! 
Safe wrapper around `llama_sampler`. - use std::borrow::Borrow; use std::ffi::{CString, c_char}; use std::fmt::{Debug, Formatter}; @@ -41,9 +39,7 @@ fn checked_usize_as_i32_sampling(value: usize) -> Result { }) } -/// A safe wrapper around `llama_sampler`. pub struct LlamaSampler { - /// Raw pointer to the underlying `llama_sampler`. pub sampler: *mut llama_cpp_bindings_sys::llama_sampler, } @@ -54,8 +50,6 @@ impl Debug for LlamaSampler { } impl LlamaSampler { - /// Sample and accept a token from the idx-th output of the last evaluation. - /// /// # Errors /// /// Returns [`SampleError`] if the C++ sampler throws an exception or if the index is invalid. @@ -86,23 +80,16 @@ impl LlamaSampler { } } - /// Applies this sampler to a [`LlamaTokenDataArray`]. pub fn apply(&self, data_array: &mut LlamaTokenDataArray) { data_array.apply_sampler(self); } - /// Accepts a token from the sampler, possibly updating the internal state of certain samplers - /// (e.g. grammar, repetition, etc.) - /// /// # Errors /// Returns [`SamplerAcceptError`] if the underlying sampler rejects the token. pub fn accept(&mut self, token: LlamaToken) -> Result<(), SamplerAcceptError> { self.try_accept(token) } - /// Accepts several tokens from the sampler or context, possibly updating the internal state of - /// certain samplers (e.g. grammar, repetition, etc.) - /// /// # Errors /// Returns [`SamplerAcceptError`] if the underlying sampler rejects any token. pub fn accept_many( @@ -116,9 +103,6 @@ impl LlamaSampler { Ok(()) } - /// Accepts several tokens from the sampler or context, possibly updating the internal state of - /// certain samplers (e.g. grammar, repetition, etc.) - /// /// # Errors /// Returns [`SamplerAcceptError`] if the underlying sampler rejects any token. pub fn with_tokens( @@ -130,8 +114,6 @@ impl LlamaSampler { Ok(self) } - /// Try accepting a token from the sampler. Returns an error if the sampler throws. 
- /// /// # Errors /// Returns an error if the underlying sampler rejects the token. pub fn try_accept(&mut self, token: LlamaToken) -> Result<(), SamplerAcceptError> { @@ -148,32 +130,17 @@ impl LlamaSampler { check_sampler_accept_status(status, error_ptr) } - /// Resets the internal state of the sampler. - /// - /// This can be useful when you want to start fresh with a sampler without creating a new instance. pub fn reset(&mut self) { unsafe { llama_cpp_bindings_sys::llama_sampler_reset(self.sampler); } } - /// Gets the random seed used by this sampler. - /// - /// Returns: - /// - For random samplers (dist, mirostat, `mirostat_v2)`: returns their current seed - /// - For sampler chains: returns the first non-default seed found in reverse order - /// - For all other samplers: returns 0xFFFFFFFF #[must_use] pub fn get_seed(&self) -> u32 { unsafe { llama_cpp_bindings_sys::llama_sampler_get_seed(self.sampler) } } - /// Combines a list of samplers into a single sampler that applies each component sampler one - /// after another. - /// - /// If you are using a chain to select a token, the chain should always end with one of - /// [`LlamaSampler::greedy`], [`LlamaSampler::dist`], [`LlamaSampler::mirostat`], and - /// [`LlamaSampler::mirostat_v2`]. #[must_use] pub fn chain(samplers: impl IntoIterator, no_perf: bool) -> Self { unsafe { @@ -190,74 +157,17 @@ impl LlamaSampler { } } - /// Same as [`Self::chain`] with `no_perf = false`. 
- /// - /// # Example - /// ```rust - /// use llama_cpp_bindings::token::{ - /// LlamaToken, - /// data::LlamaTokenData, - /// data_array::LlamaTokenDataArray - /// }; - /// use llama_cpp_bindings::sampling::LlamaSampler; - /// use llama_cpp_bindings::llama_backend::LlamaBackend; - /// let backend = LlamaBackend::init().unwrap(); - /// - /// let mut data_array = LlamaTokenDataArray::new(vec![ - /// LlamaTokenData::new(LlamaToken(0), 0., 0.), - /// LlamaTokenData::new(LlamaToken(1), 1., 0.), - /// LlamaTokenData::new(LlamaToken(2), 2., 0.), - /// ], false); - /// - /// data_array.apply_sampler(&mut LlamaSampler::chain_simple([ - /// LlamaSampler::temp(0.5), - /// LlamaSampler::greedy(), - /// ])); - /// - /// assert_eq!(data_array.data[0].logit(), 0.); - /// assert_eq!(data_array.data[1].logit(), 2.); - /// assert_eq!(data_array.data[2].logit(), 4.); - /// - /// assert_eq!(data_array.data.len(), 3); - /// assert_eq!(data_array.selected_token(), Some(LlamaToken(2))); - /// ``` #[must_use] pub fn chain_simple(samplers: impl IntoIterator) -> Self { Self::chain(samplers, false) } - /// Updates the logits `l_i' = l_i/t`. 
When `t <= 0.0`, the maximum logit is kept at its original - /// value, the rest are set to -inf - /// - /// # Example: - /// ```rust - /// use llama_cpp_bindings::token::{ - /// LlamaToken, - /// data::LlamaTokenData, - /// data_array::LlamaTokenDataArray - /// }; - /// use llama_cpp_bindings::sampling::LlamaSampler; - /// - /// let mut data_array = LlamaTokenDataArray::new(vec![ - /// LlamaTokenData::new(LlamaToken(0), 0., 0.), - /// LlamaTokenData::new(LlamaToken(1), 1., 0.), - /// LlamaTokenData::new(LlamaToken(2), 2., 0.), - /// ], false); - /// - /// data_array.apply_sampler(&mut LlamaSampler::temp(0.5)); - /// - /// assert_eq!(data_array.data[0].logit(), 0.); - /// assert_eq!(data_array.data[1].logit(), 2.); - /// assert_eq!(data_array.data[2].logit(), 4.); - /// ``` #[must_use] pub fn temp(t: f32) -> Self { let sampler = unsafe { llama_cpp_bindings_sys::llama_sampler_init_temp(t) }; Self { sampler } } - /// Dynamic temperature implementation (a.k.a. entropy) described in the paper - /// . 
#[must_use] pub fn temp_ext(t: f32, delta: f32, exponent: f32) -> Self { let sampler = @@ -265,91 +175,36 @@ impl LlamaSampler { Self { sampler } } - /// Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" - /// - /// - /// # Example: - /// ```rust - /// use llama_cpp_bindings::token::{ - /// LlamaToken, - /// data::LlamaTokenData, - /// data_array::LlamaTokenDataArray - /// }; - /// use llama_cpp_bindings::sampling::LlamaSampler; - /// - /// let mut data_array = LlamaTokenDataArray::new(vec![ - /// LlamaTokenData::new(LlamaToken(0), 0., 0.), - /// LlamaTokenData::new(LlamaToken(1), 1., 0.), - /// LlamaTokenData::new(LlamaToken(2), 2., 0.), - /// LlamaTokenData::new(LlamaToken(3), 3., 0.), - /// ], false); - /// - /// data_array.apply_sampler(&mut LlamaSampler::top_k(2)); - /// - /// assert_eq!(data_array.data.len(), 2); - /// assert_eq!(data_array.data[0].id(), LlamaToken(3)); - /// assert_eq!(data_array.data[1].id(), LlamaToken(2)); - /// ``` #[must_use] pub fn top_k(k: i32) -> Self { let sampler = unsafe { llama_cpp_bindings_sys::llama_sampler_init_top_k(k) }; Self { sampler } } - /// Top-nσ sampling as described in academic paper "Top-nσ: Not All Logits Are You Need" - /// - /// - /// This method filters logits by selecting only those within *n* standard deviations of the mean. 
- /// - /// # Parameters - /// - `n`: Number of standard deviations from the mean to include in sampling - /// - /// # Example - /// ```rust - /// use llama_cpp_bindings::sampling::LlamaSampler; - /// use llama_cpp_bindings::token::{ - /// LlamaToken, - /// data::LlamaTokenData, - /// data_array::LlamaTokenDataArray - /// }; - /// - /// let mut data_array = LlamaTokenDataArray::new(vec![ - /// LlamaTokenData::new(LlamaToken(0), 0.0, 0.0), - /// LlamaTokenData::new(LlamaToken(1), 1.0, 0.0), - /// LlamaTokenData::new(LlamaToken(2), 2.0, 0.0), - /// ], false); - /// - /// data_array.apply_sampler(&mut LlamaSampler::top_n_sigma(2.0)); - /// ``` #[must_use] pub fn top_n_sigma(n: f32) -> Self { let sampler = unsafe { llama_cpp_bindings_sys::llama_sampler_init_top_n_sigma(n) }; Self { sampler } } - /// Locally Typical Sampling implementation described in the paper . #[must_use] pub fn typical(p: f32, min_keep: usize) -> Self { let sampler = unsafe { llama_cpp_bindings_sys::llama_sampler_init_typical(p, min_keep) }; Self { sampler } } - /// Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" - /// #[must_use] pub fn top_p(p: f32, min_keep: usize) -> Self { let sampler = unsafe { llama_cpp_bindings_sys::llama_sampler_init_top_p(p, min_keep) }; Self { sampler } } - /// Minimum P sampling as described in #[must_use] pub fn min_p(p: f32, min_keep: usize) -> Self { let sampler = unsafe { llama_cpp_bindings_sys::llama_sampler_init_min_p(p, min_keep) }; Self { sampler } } - /// XTC sampler as described in #[must_use] pub fn xtc(p: f32, t: f32, min_keep: usize, seed: u32) -> Self { let sampler = @@ -357,8 +212,6 @@ impl LlamaSampler { Self { sampler } } - /// Grammar sampler - /// /// # Errors /// Returns an error if the grammar is invalid or the sampler cannot be initialized. 
pub fn grammar( @@ -401,10 +254,6 @@ impl LlamaSampler { } } - /// Lazy grammar sampler, introduced in - /// - /// This sampler enforces grammar rules only when specific trigger words or tokens are encountered. - /// /// # Errors /// Returns an error if the grammar or trigger words are invalid. pub fn grammar_lazy( @@ -457,12 +306,6 @@ impl LlamaSampler { } } - /// Lazy grammar sampler using regex trigger patterns. - /// - /// Trigger patterns are regular expressions matched from the start of the - /// generation output. The grammar sampler will be fed content starting from - /// the first match group. - /// /// # Errors /// Returns an error if the grammar or trigger patterns are invalid. pub fn grammar_lazy_patterns( @@ -519,11 +362,6 @@ impl LlamaSampler { } } - /// `LLGuidance` sampler for constrained decoding. - /// - /// Uses the `llguidance` and `toktrie` Rust crates to enforce grammar constraints - /// during token sampling. Supports JSON schema, regex, Lark, and other grammar types. - /// /// # Errors /// /// Returns [`GrammarError`] if the grammar is invalid or the sampler cannot be initialized. @@ -567,10 +405,6 @@ impl LlamaSampler { .collect() } - /// DRY sampler, designed by p-e-w, as described in: - /// , porting Koboldcpp - /// implementation authored by pi6am: - /// /// # Errors /// Returns an error if any string in `seq_breakers` contains null bytes. pub fn dry( @@ -612,13 +446,6 @@ impl LlamaSampler { Ok(Self { sampler }) } - /// Penalizes tokens for being present in the context. - /// - /// Parameters: - /// - ``penalty_last_n``: last n tokens to penalize (0 = disable penalty, -1 = context size) - /// - ``penalty_repeat``: 1.0 = disabled - /// - ``penalty_freq``: 0.0 = disabled - /// - ``penalty_present``: 0.0 = disabled #[must_use] pub fn penalties( penalty_last_n: i32, @@ -637,21 +464,6 @@ impl LlamaSampler { Self { sampler } } - /// Mirostat 1.0 algorithm described in the paper . Uses tokens instead of words. 
- /// - /// # Parameters: - /// - ``n_vocab``: [`LlamaModel::n_vocab`] - /// - ``seed``: Seed to initialize random generation with. - /// - ``tau``: The target cross-entropy (or surprise) value you want to achieve for the - /// generated text. A higher value corresponds to more surprising or less predictable text, - /// while a lower value corresponds to less surprising or more predictable text. - /// - ``eta``: The learning rate used to update `mu` based on the error between the target and - /// observed surprisal of the sampled word. A larger learning rate will cause `mu` to be - /// updated more quickly, while a smaller learning rate will result in slower updates. - /// - ``m``: The number of tokens considered in the estimation of `s_hat`. This is an arbitrary - /// value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. - /// In the paper, they use `m = 100`, but you can experiment with different values to see how - /// it affects the performance of the algorithm. #[must_use] pub fn mirostat(n_vocab: i32, seed: u32, tau: f32, eta: f32, m: i32) -> Self { let sampler = unsafe { @@ -660,16 +472,6 @@ impl LlamaSampler { Self { sampler } } - /// Mirostat 2.0 algorithm described in the paper . Uses tokens instead of words. - /// - /// # Parameters: - /// - ``seed``: Seed to initialize random generation with. - /// - ``tau``: The target cross-entropy (or surprise) value you want to achieve for the - /// generated text. A higher value corresponds to more surprising or less predictable text, - /// while a lower value corresponds to less surprising or more predictable text. - /// - ``eta``: The learning rate used to update `mu` based on the error between the target and - /// observed surprisal of the sampled word. A larger learning rate will cause `mu` to be - /// updated more quickly, while a smaller learning rate will result in slower updates. 
#[must_use] pub fn mirostat_v2(seed: u32, tau: f32, eta: f32) -> Self { let sampler = @@ -677,62 +479,21 @@ impl LlamaSampler { Self { sampler } } - /// Selects a token at random based on each token's probabilities #[must_use] pub fn dist(seed: u32) -> Self { let sampler = unsafe { llama_cpp_bindings_sys::llama_sampler_init_dist(seed) }; Self { sampler } } - /// Selects the most likely token - /// - /// # Example: - /// ```rust - /// use llama_cpp_bindings::token::{ - /// LlamaToken, - /// data::LlamaTokenData, - /// data_array::LlamaTokenDataArray - /// }; - /// use llama_cpp_bindings::sampling::LlamaSampler; - /// - /// let mut data_array = LlamaTokenDataArray::new(vec![ - /// LlamaTokenData::new(LlamaToken(0), 0., 0.), - /// LlamaTokenData::new(LlamaToken(1), 1., 0.), - /// ], false); - /// - /// data_array.apply_sampler(&mut LlamaSampler::greedy()); - /// - /// assert_eq!(data_array.data.len(), 2); - /// assert_eq!(data_array.selected_token(), Some(LlamaToken(1))); - /// ``` #[must_use] pub fn greedy() -> Self { let sampler = unsafe { llama_cpp_bindings_sys::llama_sampler_init_greedy() }; Self { sampler } } - /// Creates a sampler that applies bias values to specific tokens during sampling. - /// - /// # Parameters - /// - ``n_vocab``: [`LlamaModel::n_vocab`] - /// - ``biases``: Slice of [`LlamaLogitBias`] values specifying token-bias pairs - /// /// # Errors /// Returns [`SamplingError::IntegerOverflow`] if `biases.len()` exceeds `i32::MAX`. 
/// - /// # Example - /// ```rust - /// use llama_cpp_bindings::token::{LlamaToken, logit_bias::LlamaLogitBias}; - /// use llama_cpp_bindings::sampling::LlamaSampler; - /// - /// let biases = vec![ - /// LlamaLogitBias::new(LlamaToken(1), 1.5), // Increase probability of token 1 - /// LlamaLogitBias::new(LlamaToken(2), -1.0), // Decrease probability of token 2 - /// ]; - /// - /// // Assuming vocab_size of 32000 - /// let sampler = LlamaSampler::logit_bias(32000, &biases).unwrap(); - /// ``` pub fn logit_bias(n_vocab: i32, biases: &[LlamaLogitBias]) -> Result { let bias_count = checked_usize_as_i32_sampling(biases.len())?; let data = biases diff --git a/llama-cpp-bindings/src/streaming_json_probe.rs b/llama-cpp-bindings/src/streaming_json_probe.rs index 3560be7b..9e17bd9a 100644 --- a/llama-cpp-bindings/src/streaming_json_probe.rs +++ b/llama-cpp-bindings/src/streaming_json_probe.rs @@ -449,8 +449,6 @@ mod tests { #[test] fn syntactically_malformed_object_is_failed() { - // Input starts with `{` (passes the cheap prefix check) but cannot parse — the syntax - // error path classifies as `Category::Syntax`, surfacing the `Failed` arm. assert_eq!( JsonProbeOutcome::validate_prefix("{,}"), JsonProbeOutcome::Failed, diff --git a/llama-cpp-bindings/src/streaming_markers.rs b/llama-cpp-bindings/src/streaming_markers.rs index 9eaaddf2..e34636f7 100644 --- a/llama-cpp-bindings/src/streaming_markers.rs +++ b/llama-cpp-bindings/src/streaming_markers.rs @@ -8,11 +8,6 @@ pub enum MarkerKind { ToolCallClose, } -/// Tokenized marker sequences (token IDs, not strings). -/// -/// Each marker is a `Vec` of length `>= 1`; absent markers are -/// `None`. Sequence matching at every `ingest()` is by token-ID equality, -/// never by substring scanning of decoded text. 
#[derive(Clone, Debug, Default, Eq, PartialEq)] pub struct StreamingMarkers { pub reasoning_open: Option>, diff --git a/llama-cpp-bindings/src/timing.rs b/llama-cpp-bindings/src/timing.rs index 5c07eab8..e0ea3482 100644 --- a/llama-cpp-bindings/src/timing.rs +++ b/llama-cpp-bindings/src/timing.rs @@ -1,23 +1,11 @@ -//! Safe wrapper around `llama_timings`. use std::fmt::{Debug, Display, Formatter}; -/// A wrapper around `llama_timings`. #[derive(Clone, Copy, Debug)] pub struct LlamaTimings { - /// The underlying `llama_perf_context_data` from the C API. pub timings: llama_cpp_bindings_sys::llama_perf_context_data, } impl LlamaTimings { - /// Create a new `LlamaTimings`. - /// ``` - /// # use llama_cpp_bindings::timing::LlamaTimings; - /// let timings = LlamaTimings::new(1.0, 2.0, 3.0, 4.0, 5, 6, 1); - /// let timings_str = "load time = 2.00 ms - /// prompt eval time = 3.00 ms / 5 tokens (0.60 ms per token, 1666.67 tokens per second) - /// eval time = 4.00 ms / 6 runs (0.67 ms per token, 1500.00 tokens per second)\n"; - /// assert_eq!(timings_str, format!("{}", timings)); - /// ``` #[must_use] pub const fn new( t_start_ms: f64, @@ -41,68 +29,56 @@ impl LlamaTimings { } } - /// Get the start time in milliseconds. #[must_use] pub const fn t_start_ms(&self) -> f64 { self.timings.t_start_ms } - /// Get the load time in milliseconds. #[must_use] pub const fn t_load_ms(&self) -> f64 { self.timings.t_load_ms } - /// Get the prompt evaluation time in milliseconds. #[must_use] pub const fn t_p_eval_ms(&self) -> f64 { self.timings.t_p_eval_ms } - /// Get the evaluation time in milliseconds. #[must_use] pub const fn t_eval_ms(&self) -> f64 { self.timings.t_eval_ms } - /// Get the number of prompt evaluations. #[must_use] pub const fn n_p_eval(&self) -> i32 { self.timings.n_p_eval } - /// Get the number of evaluations. #[must_use] pub const fn n_eval(&self) -> i32 { self.timings.n_eval } - /// Set the start time in milliseconds. 
pub const fn set_t_start_ms(&mut self, t_start_ms: f64) { self.timings.t_start_ms = t_start_ms; } - /// Set the load time in milliseconds. pub const fn set_t_load_ms(&mut self, t_load_ms: f64) { self.timings.t_load_ms = t_load_ms; } - /// Set the prompt evaluation time in milliseconds. pub const fn set_t_p_eval_ms(&mut self, t_p_eval_ms: f64) { self.timings.t_p_eval_ms = t_p_eval_ms; } - /// Set the evaluation time in milliseconds. pub const fn set_t_eval_ms(&mut self, t_eval_ms: f64) { self.timings.t_eval_ms = t_eval_ms; } - /// Set the number of prompt evaluations. pub const fn set_n_p_eval(&mut self, n_p_eval: i32) { self.timings.n_p_eval = n_p_eval; } - /// Set the number of evaluations. pub const fn set_n_eval(&mut self, n_eval: i32) { self.timings.n_eval = n_eval; } diff --git a/llama-cpp-bindings/src/token.rs b/llama-cpp-bindings/src/token.rs index 5249baa9..4b87459e 100644 --- a/llama-cpp-bindings/src/token.rs +++ b/llama-cpp-bindings/src/token.rs @@ -1,5 +1,3 @@ -//! Safe wrappers around `llama_token_data` and `llama_token_data_array`. - use std::fmt::Debug; use std::fmt::Display; @@ -7,7 +5,6 @@ pub mod data; pub mod data_array; pub mod logit_bias; -/// A safe wrapper for `llama_token`. #[repr(transparent)] #[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)] pub struct LlamaToken(pub llama_cpp_bindings_sys::llama_token); @@ -19,13 +16,6 @@ impl Display for LlamaToken { } impl LlamaToken { - /// Create a new `LlamaToken` from a i32. - /// - /// ``` - /// # use llama_cpp_bindings::token::LlamaToken; - /// let token = LlamaToken::new(0); - /// assert_eq!(token, LlamaToken(0)); - /// ``` #[must_use] pub const fn new(token_id: i32) -> Self { Self(token_id) diff --git a/llama-cpp-bindings/src/token/data.rs b/llama-cpp-bindings/src/token/data.rs index 7f75203f..ce4b2eaf 100644 --- a/llama-cpp-bindings/src/token/data.rs +++ b/llama-cpp-bindings/src/token/data.rs @@ -1,10 +1,5 @@ -//! Safe wrapper around `llama_token_data`. 
use crate::token::LlamaToken; -/// A transparent wrapper around `llama_token_data`. -/// -/// Do not rely on `repr(transparent)` for this type. It should be considered an implementation -/// detail and may change across minor versions. #[derive(Clone, Copy, Debug, PartialEq)] #[repr(transparent)] pub struct LlamaTokenData { @@ -12,92 +7,35 @@ pub struct LlamaTokenData { } impl LlamaTokenData { - /// Create a new token data from a token, logit, and probability. - /// ``` - /// # use llama_cpp_bindings::token::LlamaToken; - /// # use llama_cpp_bindings::token::data::LlamaTokenData; - /// let token = LlamaToken::new(1); - /// let token_data = LlamaTokenData::new(token, 1.0, 1.0); #[must_use] pub const fn new(LlamaToken(id): LlamaToken, logit: f32, p: f32) -> Self { Self { data: llama_cpp_bindings_sys::llama_token_data { id, logit, p }, } } - /// Get the token's id - /// ``` - /// # use llama_cpp_bindings::token::LlamaToken; - /// # use llama_cpp_bindings::token::data::LlamaTokenData; - /// let token = LlamaToken::new(1); - /// let token_data = LlamaTokenData::new(token, 1.0, 1.0); - /// assert_eq!(token_data.id(), token); - /// ``` #[must_use] pub const fn id(&self) -> LlamaToken { LlamaToken(self.data.id) } - /// Get the token's logit - /// ``` - /// # use llama_cpp_bindings::token::LlamaToken; - /// # use llama_cpp_bindings::token::data::LlamaTokenData; - /// let token = LlamaToken::new(1); - /// let token_data = LlamaTokenData::new(token, 1.0, 1.0); - /// assert_eq!(token_data.logit(), 1.0); - /// ``` #[must_use] pub const fn logit(&self) -> f32 { self.data.logit } - /// Get the token's probability - /// ``` - /// # use llama_cpp_bindings::token::LlamaToken; - /// # use llama_cpp_bindings::token::data::LlamaTokenData; - /// let token = LlamaToken::new(1); - /// let token_data = LlamaTokenData::new(token, 1.0, 1.0); - /// assert_eq!(token_data.p(), 1.0); - /// ``` #[must_use] pub const fn p(&self) -> f32 { self.data.p } - /// Set the token's id - /// ``` - /// # use 
llama_cpp_bindings::token::LlamaToken; - /// # use llama_cpp_bindings::token::data::LlamaTokenData; - /// let token = LlamaToken::new(1); - /// let mut token_data = LlamaTokenData::new(token, 1.0, 1.0); - /// token_data.set_id(LlamaToken::new(2)); - /// assert_eq!(token_data.id(), LlamaToken::new(2)); - /// ``` pub const fn set_id(&mut self, id: LlamaToken) { self.data.id = id.0; } - /// Set the token's logit - /// ``` - /// # use llama_cpp_bindings::token::LlamaToken; - /// # use llama_cpp_bindings::token::data::LlamaTokenData; - /// let token = LlamaToken::new(1); - /// let mut token_data = LlamaTokenData::new(token, 1.0, 1.0); - /// token_data.set_logit(2.0); - /// assert_eq!(token_data.logit(), 2.0); - /// ``` pub const fn set_logit(&mut self, logit: f32) { self.data.logit = logit; } - /// Set the token's probability - /// ``` - /// # use llama_cpp_bindings::token::LlamaToken; - /// # use llama_cpp_bindings::token::data::LlamaTokenData; - /// let token = LlamaToken::new(1); - /// let mut token_data = LlamaTokenData::new(token, 1.0, 1.0); - /// token_data.set_p(2.0); - /// assert_eq!(token_data.p(), 2.0); - /// ``` pub const fn set_p(&mut self, p: f32) { self.data.p = p; } diff --git a/llama-cpp-bindings/src/token/data_array.rs b/llama-cpp-bindings/src/token/data_array.rs index 40933d7a..3e9f901d 100644 --- a/llama-cpp-bindings/src/token/data_array.rs +++ b/llama-cpp-bindings/src/token/data_array.rs @@ -1,4 +1,3 @@ -//! an rusty equivalent of `llama_token_data_array`. use std::ptr; use crate::error::TokenSamplingError; @@ -7,31 +6,14 @@ use crate::token::data::LlamaTokenData; use super::LlamaToken; -/// a safe wrapper around `llama_token_data_array`. #[derive(Debug, Clone, PartialEq)] pub struct LlamaTokenDataArray { - /// the underlying data pub data: Vec, - /// the index of the selected token in ``data`` pub selected: Option, - /// is the data sorted? 
pub sorted: bool, } impl LlamaTokenDataArray { - /// Create a new `LlamaTokenDataArray` from a vector and whether or not the data is sorted. - /// - /// ``` - /// # use llama_cpp_bindings::token::data::LlamaTokenData; - /// # use llama_cpp_bindings::token::data_array::LlamaTokenDataArray; - /// # use llama_cpp_bindings::token::LlamaToken; - /// let array = LlamaTokenDataArray::new(vec![ - /// LlamaTokenData::new(LlamaToken(0), 0.0, 0.0), - /// LlamaTokenData::new(LlamaToken(1), 0.1, 0.1) - /// ], false); - /// assert_eq!(array.data.len(), 2); - /// assert_eq!(array.sorted, false); - /// ``` #[must_use] pub const fn new(data: Vec, sorted: bool) -> Self { Self { @@ -41,17 +23,6 @@ impl LlamaTokenDataArray { } } - /// Create a new `LlamaTokenDataArray` from an iterator and whether or not the data is sorted. - /// ``` - /// # use llama_cpp_bindings::token::data::LlamaTokenData; - /// # use llama_cpp_bindings::token::data_array::LlamaTokenDataArray; - /// # use llama_cpp_bindings::token::LlamaToken; - /// let array = LlamaTokenDataArray::from_iter([ - /// LlamaTokenData::new(LlamaToken(0), 0.0, 0.0), - /// LlamaTokenData::new(LlamaToken(1), 0.1, 0.1) - /// ], false); - /// assert_eq!(array.data.len(), 2); - /// assert_eq!(array.sorted, false); pub fn from_iter(data: TIterator, sorted: bool) -> Self where TIterator: IntoIterator, @@ -59,7 +30,6 @@ impl LlamaTokenDataArray { Self::new(data.into_iter().collect(), sorted) } - /// Returns the current selected token, if one exists. #[must_use] pub fn selected_token(&self) -> Option { self.data.get(self.selected?).map(LlamaTokenData::id) @@ -67,8 +37,6 @@ impl LlamaTokenDataArray { } impl LlamaTokenDataArray { - /// Modify the underlying data as a `llama_token_data_array`. and reconstruct the `LlamaTokenDataArray`. - /// /// # Panics /// /// Panics if some of the safety conditions are not met. 
(we cannot check all of them at @@ -125,8 +93,6 @@ impl LlamaTokenDataArray { result } - /// Modifies the data array by applying a sampler to it. - /// /// # Panics /// /// Panics if the vendored sampler throws a C++ exception. `llama_sampler_apply` is @@ -149,15 +115,12 @@ impl LlamaTokenDataArray { } } - /// Modifies the data array by applying a sampler to it #[must_use] pub fn with_sampler(mut self, sampler: &mut LlamaSampler) -> Self { self.apply_sampler(sampler); self } - /// Randomly selects a token from the candidates based on their probabilities. - /// /// # Errors /// Returns [`TokenSamplingError::NoTokenSelected`] if the sampler fails to select a token. pub fn sample_token(&mut self, seed: u32) -> Result { @@ -166,8 +129,6 @@ impl LlamaTokenDataArray { .ok_or(TokenSamplingError::NoTokenSelected) } - /// Selects the token with the highest probability. - /// /// # Errors /// Returns [`TokenSamplingError::NoTokenSelected`] if the sampler fails to select a token. pub fn sample_token_greedy(&mut self) -> Result { diff --git a/llama-cpp-bindings/src/token/logit_bias.rs b/llama-cpp-bindings/src/token/logit_bias.rs index 52d91522..6d5a1502 100644 --- a/llama-cpp-bindings/src/token/logit_bias.rs +++ b/llama-cpp-bindings/src/token/logit_bias.rs @@ -1,13 +1,5 @@ -//! Safe wrapper around `llama_logit_bias`. use crate::token::LlamaToken; -/// A transparent wrapper around `llama_logit_bias`. -/// -/// Represents a bias to be applied to a specific token during text generation. -/// The bias modifies the likelihood of the token being selected. -/// -/// Do not rely on `repr(transparent)` for this type. It should be considered an implementation -/// detail and may change across minor versions. #[derive(Clone, Copy, Debug, PartialEq)] #[repr(transparent)] pub struct LlamaLogitBias { @@ -15,14 +7,6 @@ pub struct LlamaLogitBias { } impl LlamaLogitBias { - /// Creates a new logit bias for a specific token with the given bias value. 
- /// - /// # Examples - /// ``` - /// # use llama_cpp_bindings::token::{LlamaToken, logit_bias::LlamaLogitBias}; - /// let token = LlamaToken::new(1); - /// let bias = LlamaLogitBias::new(token, 1.5); - /// ``` #[must_use] pub const fn new(LlamaToken(token): LlamaToken, bias: f32) -> Self { Self { @@ -30,59 +14,20 @@ impl LlamaLogitBias { } } - /// Gets the token this bias applies to. - /// - /// # Examples - /// ``` - /// # use llama_cpp_bindings::token::{LlamaToken, logit_bias::LlamaLogitBias}; - /// let token = LlamaToken::new(1); - /// let bias = LlamaLogitBias::new(token, 1.5); - /// assert_eq!(bias.token(), token); - /// ``` #[must_use] pub const fn token(&self) -> LlamaToken { LlamaToken(self.logit_bias.token) } - /// Gets the bias value. - /// - /// # Examples - /// ``` - /// # use llama_cpp_bindings::token::{LlamaToken, logit_bias::LlamaLogitBias}; - /// let token = LlamaToken::new(1); - /// let bias = LlamaLogitBias::new(token, 1.5); - /// assert_eq!(bias.bias(), 1.5); - /// ``` #[must_use] pub const fn bias(&self) -> f32 { self.logit_bias.bias } - /// Sets the token this bias applies to. - /// - /// # Examples - /// ``` - /// # use llama_cpp_bindings::token::{LlamaToken, logit_bias::LlamaLogitBias}; - /// let token = LlamaToken::new(1); - /// let mut bias = LlamaLogitBias::new(token, 1.5); - /// let new_token = LlamaToken::new(2); - /// bias.set_token(new_token); - /// assert_eq!(bias.token(), new_token); - /// ``` pub const fn set_token(&mut self, token: LlamaToken) { self.logit_bias.token = token.0; } - /// Sets the bias value. 
- /// - /// # Examples - /// ``` - /// # use llama_cpp_bindings::token::{LlamaToken, logit_bias::LlamaLogitBias}; - /// let token = LlamaToken::new(1); - /// let mut bias = LlamaLogitBias::new(token, 1.5); - /// bias.set_bias(2.0); - /// assert_eq!(bias.bias(), 2.0); - /// ``` pub const fn set_bias(&mut self, bias: f32) { self.logit_bias.bias = bias; } diff --git a/llama-cpp-bindings/src/tool_call_format/bracketed_args.rs b/llama-cpp-bindings/src/tool_call_format/bracketed_args.rs index 2ed0cd89..b27878fb 100644 --- a/llama-cpp-bindings/src/tool_call_format/bracketed_args.rs +++ b/llama-cpp-bindings/src/tool_call_format/bracketed_args.rs @@ -208,9 +208,6 @@ mod tests { #[test] fn rejects_truncated_json_arguments_with_unterminated_failure() { - // serde_json's iterator returns None when the deserializer has no token to start from. - // Constructing such an input requires whitespace-only input after the separator — the - // iterator finds nothing parseable and yields None, surfacing the Unterminated arm. let failure = parse( "[TOOL_CALLS]get_weather[ARGS] ", &mistral3_markers(), @@ -226,8 +223,6 @@ mod tests { #[test] fn returns_empty_vec_for_separator_with_only_whitespace_name() { - // `get_weather` is replaced with whitespace before the separator, so `name.trim()` is - // empty and the parser returns `ParseStep::Done` — covers the empty-name early return. let parsed = parse( "[TOOL_CALLS] [ARGS]{\"x\":1}", &mistral3_markers(), @@ -240,8 +235,6 @@ mod tests { #[test] fn returns_empty_vec_when_shape_has_empty_separator() { - // When `name_args_separator` is empty, `parse` short-circuits to `Vec::new()` — - // covers the early-return guard. 
let mut shape = mistral3_shape(); shape.name_args_separator.clear(); let parsed = parse( diff --git a/llama-cpp-bindings/src/tool_call_format/json_object.rs b/llama-cpp-bindings/src/tool_call_format/json_object.rs index c9038152..af9f58ea 100644 --- a/llama-cpp-bindings/src/tool_call_format/json_object.rs +++ b/llama-cpp-bindings/src/tool_call_format/json_object.rs @@ -187,9 +187,6 @@ mod tests { #[test] fn returns_empty_when_object_is_not_a_tool_call_shape() { - // The body opens with `{` (so try_parse_one_object enters the JSON path) but the parsed - // value is a top-level non-object — the early `let Value::Object(map) = value else - // { return Ok(None) };` arm fires. let parsed = parse("{ \"foo\": 1 }", &qwen3_shape()).expect("must parse"); assert!(parsed.is_empty()); diff --git a/llama-cpp-log-decoder/src/lib.rs b/llama-cpp-log-decoder/src/lib.rs index b7a96a37..369e4837 100644 --- a/llama-cpp-log-decoder/src/lib.rs +++ b/llama-cpp-log-decoder/src/lib.rs @@ -1,13 +1,3 @@ -//! Decoder for the llama.cpp / ggml log callback stream. -//! -//! The C side delivers log lines in fragments: a missing trailing newline -//! signals that more fragments will follow at `GGML_LOG_LEVEL_CONT`. This -//! crate is a pure `&mut self` transducer — feed `(level, text)` pairs, get -//! complete [`LogLine`]s back when the trailing newline arrives. No globals, -//! no atomics, no FFI, no logger. -//! -//! [`LogLine`]: log_line::LogLine - pub mod decode_anomaly; pub mod decode_output; pub mod decode_result; diff --git a/llama-cpp-test-harness-macros/src/lib.rs b/llama-cpp-test-harness-macros/src/lib.rs index b36048fc..4021ea43 100644 --- a/llama-cpp-test-harness-macros/src/lib.rs +++ b/llama-cpp-test-harness-macros/src/lib.rs @@ -1,9 +1,3 @@ -//! Procedural macros for `llama-cpp-test-harness`. -//! -//! Provides the `#[llama_test(...)]` attribute that declaratively binds a test function to a -//! specific GGUF model and inference parameter set. 
The macro emits the original function plus -//! an `inventory::submit!` block that registers the test with the harness runtime. - mod expand; mod parsed_args; mod parsed_context_params; @@ -22,9 +16,6 @@ fn dispatch(attribute: TokenStream2, item: TokenStream2) -> TokenStream2 { } } -/// Registers a function as a llama-cpp test with explicit model + inference parameters. -/// -/// See the `llama-cpp-test-harness` crate for the full attribute schema and usage. #[proc_macro_attribute] pub fn llama_test(attribute: TokenStream, item: TokenStream) -> TokenStream { dispatch(attribute.into(), item.into()).into() diff --git a/llama-cpp-test-harness-macros/src/parsed_args.rs b/llama-cpp-test-harness-macros/src/parsed_args.rs index 795261f3..c5b50788 100644 --- a/llama-cpp-test-harness-macros/src/parsed_args.rs +++ b/llama-cpp-test-harness-macros/src/parsed_args.rs @@ -869,8 +869,6 @@ mod tests { #[test] fn unparseable_attribute_token_stream_is_rejected() { - // `Punctuated::parse_terminated` rejects input that can't be split into Meta items by - // commas; passing a stray symbol surfaces that `?` Err arm in `ParsedArgs::parse`. 
let result = parse("@&^!"); assert!( diff --git a/llama-cpp-bindings-tests/fixtures/ggml-vocab-bert-bge.gguf b/llama-cpp-test-harness/fixtures/ggml-vocab-bert-bge.gguf similarity index 100% rename from llama-cpp-bindings-tests/fixtures/ggml-vocab-bert-bge.gguf rename to llama-cpp-test-harness/fixtures/ggml-vocab-bert-bge.gguf diff --git a/llama-cpp-bindings-tests/fixtures/llamas.jpg b/llama-cpp-test-harness/fixtures/llamas.jpg similarity index 100% rename from llama-cpp-bindings-tests/fixtures/llamas.jpg rename to llama-cpp-test-harness/fixtures/llamas.jpg diff --git a/llama-cpp-test-harness/src/download_model.rs b/llama-cpp-test-harness/src/download_model.rs index 3ffd5a5b..e7cf1aa2 100644 --- a/llama-cpp-test-harness/src/download_model.rs +++ b/llama-cpp-test-harness/src/download_model.rs @@ -2,8 +2,6 @@ use std::path::PathBuf; use anyhow::Result; -/// Downloads a single file from a Hugging Face repo via `hf-hub`'s sync API. -/// /// # Errors /// /// Returns an error if the HF client cannot be built or the file cannot be downloaded diff --git a/llama-cpp-test-harness/src/execution_plan.rs b/llama-cpp-test-harness/src/execution_plan.rs index 927c87a8..669b7524 100644 --- a/llama-cpp-test-harness/src/execution_plan.rs +++ b/llama-cpp-test-harness/src/execution_plan.rs @@ -1,18 +1,3 @@ -//! Deterministic execution plan for the test harness. -//! -//! [`ExecutionPlan::from_registrations`] takes the registrations collected from `inventory` and -//! groups them into [`ExecutionPhase`]s by [`crate::LoadKey`]. The result is a sorted list of -//! phases — each phase corresponds to exactly one model-load cycle (load → run trials → drop). -//! -//! # Invariants -//! -//! - For every distinct [`crate::LoadKey`] the planner produces exactly one -//! [`ExecutionPhase`]; the same key never produces two phases. -//! - Phases are sorted by [`crate::LoadKey`] (lexicographic order on the full key tuple). -//! - Registrations inside a phase are sorted by their `name`. -//! 
- [`crate::ContextParams`] differences within registrations sharing a key do **not** split a -//! phase — the model loads once and each trial constructs its own `LlamaContext`. - use std::collections::BTreeMap; use std::sync::Arc; diff --git a/llama-cpp-bindings-tests/src/test_model.rs b/llama-cpp-test-harness/src/fixtures_dir.rs similarity index 72% rename from llama-cpp-bindings-tests/src/test_model.rs rename to llama-cpp-test-harness/src/fixtures_dir.rs index 22082498..55f44c60 100644 --- a/llama-cpp-bindings-tests/src/test_model.rs +++ b/llama-cpp-test-harness/src/fixtures_dir.rs @@ -1,8 +1,5 @@ -//! Path helper for image and audio fixtures used by multimodal integration tests. - use std::path::PathBuf; -/// Returns the absolute path to the test fixtures directory. #[must_use] pub fn fixtures_dir() -> PathBuf { PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("fixtures") diff --git a/llama-cpp-test-harness/src/lib.rs b/llama-cpp-test-harness/src/lib.rs index 8f112b9f..656513fc 100644 --- a/llama-cpp-test-harness/src/lib.rs +++ b/llama-cpp-test-harness/src/lib.rs @@ -1,16 +1,8 @@ -//! Declarative, deterministic, phase-batched integration-test harness for `llama-cpp-bindings`. -//! -//! Tests tag their functions with `#[llama_test(model_source = HuggingFace("…", "…"), …)]` -//! (or `model_source = LocalPath("…")` for a local GGUF). The harness groups tests with -//! identical [`LoadKey`]s into [`ExecutionPhase`]s, loads each phase's model exactly once, and -//! runs every test in the phase sequentially against the shared [`LlamaFixture`]. -//! -//! See the workspace README and `tests/` directory for usage examples. 
- pub mod context_params; pub mod download_model; pub mod execution_phase; pub mod execution_plan; +pub mod fixtures_dir; pub mod harness_arguments_error; pub mod llama_fixture; pub mod llama_test_fn; diff --git a/llama-cpp-test-harness/src/llama_fixture.rs b/llama-cpp-test-harness/src/llama_fixture.rs index 04ae60b4..f427f8aa 100644 --- a/llama-cpp-test-harness/src/llama_fixture.rs +++ b/llama-cpp-test-harness/src/llama_fixture.rs @@ -1,5 +1,7 @@ use std::path::Path; +use anyhow::Result; +use llama_cpp_bindings::context::LlamaContext; use llama_cpp_bindings::llama_backend::LlamaBackend; use llama_cpp_bindings::model::LlamaModel; use llama_cpp_bindings::mtmd::MtmdContext; @@ -13,3 +15,15 @@ pub struct LlamaFixture<'fixture> { pub mtmd_context: Option<&'fixture MtmdContext>, pub model_path: &'fixture Path, } + +impl LlamaFixture<'_> { + /// # Errors + /// Forwards [`LlamaContext::from_model`] errors verbatim. + pub fn build_context(&self) -> Result> { + Ok(LlamaContext::from_model( + self.model, + self.backend, + (*self.context_params).into_llama_context_params(), + )?) + } +} diff --git a/llama-cpp-test-harness/src/llama_tests_main_macro.rs b/llama-cpp-test-harness/src/llama_tests_main_macro.rs index fc047cfc..18ed6dab 100644 --- a/llama-cpp-test-harness/src/llama_tests_main_macro.rs +++ b/llama-cpp-test-harness/src/llama_tests_main_macro.rs @@ -1,6 +1,3 @@ -/// Generates a `fn main() -> ExitCode` that dispatches via the harness. -/// -/// Place once at module scope in a test binary that uses `#[llama_test(...)]`. #[macro_export] macro_rules! llama_tests_main { () => { diff --git a/llama-cpp-test-harness/src/load_key.rs b/llama-cpp-test-harness/src/load_key.rs index af34b972..5fad7200 100644 --- a/llama-cpp-test-harness/src/load_key.rs +++ b/llama-cpp-test-harness/src/load_key.rs @@ -1,22 +1,3 @@ -//! Identity of one model-load operation. -//! -//! Two registrations with different [`LoadKey`]s require separate model loads. Two registrations -//! 
with identical [`LoadKey`]s share one load — even if every other attribute (such as -//! [`crate::ContextParams`]) differs. -//! -//! # What forces a model reload -//! -//! Only the fields of [`LoadKey`]: the model source ([`crate::ModelSource`]), the mmproj source -//! (optional [`crate::MmprojSource`]), and the [`crate::ModelLoadParams`] (`n_gpu_layers`, -//! `use_mmap`, `use_mlock`). -//! -//! # What is runtime-flexible -//! -//! Every `LlamaContextParams` setter (`n_ctx`, `n_batch`, `n_ubatch`, `n_seq_max`, -//! `n_threads_batch`, `embeddings`, and the further setters not yet surfaced in the attribute -//! schema). The harness builds a fresh `LlamaContext` per trial from `fixture.context_params`, -//! so differences here never reload the model. - use std::sync::Arc; use anyhow::Result; @@ -38,9 +19,6 @@ pub struct LoadKey { } impl LoadKey { - /// Downloads (or resolves) the model and optional mmproj, loads them, and returns the live - /// [`PhaseState`] that the harness keeps alive for the duration of the phase. - /// /// # Errors /// /// Returns an error if any of: source resolution fails, loading the model into llama.cpp @@ -139,12 +117,7 @@ mod tests { assert_ne!(baseline(), other); } - // The next three tests exercise the three error-propagation paths inside - // `load_phase_state` — model load failure, mmproj download failure, and mmproj load failure. - // Each constructs a LoadKey whose resolution succeeds (so the path is computed) but whose - // subsequent load step deliberately fails, then asserts the appropriate `Err` propagates. // - // They share BACKEND_INIT_GATE because `LlamaBackend::init` is once-per-process. use std::sync::Arc; @@ -152,9 +125,6 @@ mod tests { use crate::test_backend_gate::BACKEND_INIT_GATE; - /// Path to the workspace `Cargo.toml`, which exists at test time but isn't a valid GGUF and - /// isn't a valid mmproj — perfect for exercising the `load_from_file` / `init_from_file` - /// error arms in `load_phase_state`. 
const NON_GGUF_PATH: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/Cargo.toml"); #[test] diff --git a/llama-cpp-test-harness/src/mmproj_source.rs b/llama-cpp-test-harness/src/mmproj_source.rs index e33fa0c4..6fb3b7fb 100644 --- a/llama-cpp-test-harness/src/mmproj_source.rs +++ b/llama-cpp-test-harness/src/mmproj_source.rs @@ -1,9 +1,3 @@ -//! Identity of the mmproj GGUF file the harness optionally loads for a phase. -//! -//! Same shape and semantics as [`crate::ModelSource`], but for the multimodal projection file. -//! Independent of the model's source — a test may mix any combination (HF model + local mmproj, -//! local model + HF mmproj, both local, both HF). - use std::path::PathBuf; use anyhow::Result; @@ -20,8 +14,6 @@ pub enum MmprojSource { } impl MmprojSource { - /// Resolves the source to an on-disk path. - /// /// # Errors /// /// Returns an error if the HF download fails. `LocalPath` is infallible here — file diff --git a/llama-cpp-test-harness/src/model_source.rs b/llama-cpp-test-harness/src/model_source.rs index c29d9205..b3dd347d 100644 --- a/llama-cpp-test-harness/src/model_source.rs +++ b/llama-cpp-test-harness/src/model_source.rs @@ -1,14 +1,3 @@ -//! Identity of the GGUF file the harness loads for a phase. -//! -//! Two variants, mutually exclusive by construction: -//! - [`ModelSource::HuggingFace`] — pull via `hf-hub` (cached); the on-disk path is wherever the -//! cache resolves to. -//! - [`ModelSource::LocalPath`] — use the file at the given absolute path verbatim; no download, -//! no cache. -//! -//! Mutual exclusion is enforced at compile time by the enum's variant set. There is no string -//! heuristic anywhere — the proc-macro dispatches on syntactic path identifiers. - use std::path::PathBuf; use anyhow::Result; @@ -25,8 +14,6 @@ pub enum ModelSource { } impl ModelSource { - /// Resolves the source to an on-disk path. - /// /// # Errors /// /// Returns an error if the HF download fails. 
`LocalPath` is infallible here — file diff --git a/llama-cpp-test-harness/src/no_op.rs b/llama-cpp-test-harness/src/no_op.rs index 7672de54..95c62c54 100644 --- a/llama-cpp-test-harness/src/no_op.rs +++ b/llama-cpp-test-harness/src/no_op.rs @@ -1,11 +1,5 @@ use crate::llama_fixture::LlamaFixture; -/// No-op test function with the [`crate::LlamaTestFn`] signature. Always returns `Ok(())`. -/// -/// Useful as a placeholder for [`crate::LlamaTestRegistration`] in unit tests that exercise -/// grouping/sorting logic without needing real trial bodies. Also covered by a self-test -/// trial so the function shows up in coverage. -/// /// # Errors /// /// Never; always returns `Ok(())`. The `Result` return type matches `LlamaTestFn`. diff --git a/llama-cpp-test-harness/src/parse_harness_arguments.rs b/llama-cpp-test-harness/src/parse_harness_arguments.rs index b4b3ce72..176f3df5 100644 --- a/llama-cpp-test-harness/src/parse_harness_arguments.rs +++ b/llama-cpp-test-harness/src/parse_harness_arguments.rs @@ -12,11 +12,6 @@ fn validate(mut arguments: Arguments) -> Result = std::sync::Mutex::new(()); diff --git a/llama-cpp-test-harness/tests/harness_self_test.rs b/llama-cpp-test-harness/tests/harness_self_test.rs index eea30660..db17915d 100644 --- a/llama-cpp-test-harness/tests/harness_self_test.rs +++ b/llama-cpp-test-harness/tests/harness_self_test.rs @@ -12,10 +12,6 @@ use llama_cpp_test_harness::llama_test; use llama_cpp_test_harness::no_op; use llama_cpp_test_harness::run_to_conclusions; -// Phase A: small Qwen text model, three trials sharing the exact same attribute tuple. -// Two of these pass, one bails — exercising both branches of trial-body dispatch on the same -// loaded model. 
- #[llama_test( model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), n_gpu_layers = 999, @@ -59,8 +55,6 @@ fn phase_a_intentionally_failing_trial(_fixture: &LlamaFixture<'_>) -> Result<() bail!("intentional failure to exercise the trial-failure dispatch path"); } -// Phase B: distinct model (smaller embedding GGUF). Two trials share this key. - #[llama_test( model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), n_gpu_layers = 999, @@ -89,13 +83,7 @@ fn phase_b_second_passing_trial(fixture: &LlamaFixture<'_>) -> Result<()> { Ok(()) } -// Phase C: intentionally invalid HF repo. The phase-setup path fails to download the model, -// which routes the trial through `failing_trials` (one failed trial per registration). // -// The trial function is shared with an additional Phase A registration so that the function -// itself is exercised at least once (Phase A's setup succeeds and dispatches into the body). -// Phase C's setup fails before reaching the body, but the registration still exercises the -// `failing_trials` path in `ExecutionPhase::run`. #[llama_test( model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), @@ -136,16 +124,10 @@ fn phase_b_second_passing_trial(fixture: &LlamaFixture<'_>) -> Result<()> { mmproj_source = LocalPath("/nonexistent/llama-cpp-test-harness/no-such-mmproj.gguf"), )] fn shared_setup_failure_and_phase_a_trial(fixture: &LlamaFixture<'_>) -> Result<()> { - // Phase A reaches the body and verifies the fixture is wired up; the failure phases - // (Phase C model download, mmproj download, mmproj load) never reach it. assert!(fixture.model_path.exists()); Ok(()) } -// Phase D: same text model as Phase A but with mmproj — exercises the multimodal-load path -// in LoadKey::load_phase_state. Distinct LoadKey (mmproj_file differs) → distinct phase + -// distinct model load. 
- #[llama_test( model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), n_gpu_layers = 999,