diff --git a/llama-cpp-bindings-build/src/android_ndk.rs b/llama-cpp-bindings-build/src/android_ndk.rs
index 5c6c193f..0d0123c1 100644
--- a/llama-cpp-bindings-build/src/android_ndk.rs
+++ b/llama-cpp-bindings-build/src/android_ndk.rs
@@ -27,8 +27,6 @@ pub enum AndroidNdkDetectionError {
     UnsupportedAndroidTarget { target_triple: String },
 }
 
-/// Consolidated Android NDK configuration, computed once and shared between
-/// bindgen and `CMake` configuration steps.
 #[derive(Debug)]
 pub struct AndroidNdk {
     pub ndk_path: String,
diff --git a/llama-cpp-bindings-build/src/cmake_config.rs b/llama-cpp-bindings-build/src/cmake_config.rs
index 90b608d4..7d306df2 100644
--- a/llama-cpp-bindings-build/src/cmake_config.rs
+++ b/llama-cpp-bindings-build/src/cmake_config.rs
@@ -217,9 +217,6 @@ fn configure_platform_specific(
     }
 }
 
-/// Work around a cmake-rs bug where debug Rust builds under MSVC strip
-/// optimization flags from Release-profile C/C++ builds.
-/// See: <https://github.com/rust-lang/cmake-rs/issues/240>
 fn configure_msvc_release_workaround(config: &mut Config, profile: &str) {
     let is_release_profile = matches!(profile, "Release" | "RelWithDebInfo" | "MinSizeRel");
 
@@ -269,14 +266,6 @@ fn configure_android_cmake(config: &mut Config, ndk: &AndroidNdk, _target_triple
     println!("cargo:rustc-link-lib=android");
 }
 
-/// macOS BSD ar (from cctools) does not accept GNU ar's `-D` (deterministic)
-/// flag. cmake's default archive recipe is `<CMAKE_AR> qcD …`, which produces
-/// `illegal option -- D` warnings during every static-library link.
-///
-/// We override the archive command for every language used by llama.cpp's
-/// build — C, C++, Objective-C and Objective-C++ (the latter two appear once
-/// `GGML_METAL=ON` enables the Metal backend). Plain `qc` keeps the
-/// quick-create semantics; `<CMAKE_RANLIB>` still runs as ARCHIVE_FINISH.
 fn override_archive_commands_for_apple_ar(config: &mut Config) {
     for language in ["C", "CXX", "OBJC", "OBJCXX"] {
         config.define(
diff --git a/llama-cpp-bindings-build/src/lib.rs b/llama-cpp-bindings-build/src/lib.rs
index f9336583..48809244 100644
--- a/llama-cpp-bindings-build/src/lib.rs
+++ b/llama-cpp-bindings-build/src/lib.rs
@@ -1,5 +1,3 @@
-//! Build system for llama-cpp-bindings-sys FFI bindings to llama.cpp.
-
 mod android_ndk;
 mod bindgen_config;
 mod cmake_config;
@@ -30,7 +28,6 @@ macro_rules! debug_log {
     };
 }
 
-/// Shared state passed between build phases.
 #[derive(Debug)]
 pub struct BuildContext {
     pub out_dir: PathBuf,
@@ -124,9 +121,6 @@ fn set_cmake_parallelism() {
     }
 }
 
-/// Main entry point for the llama.cpp build system.
-///
-/// Call this from `build.rs` in `llama-cpp-bindings-sys`.
 pub fn build() {
     let context = BuildContext::detect();
 
diff --git a/llama-cpp-bindings-build/src/stable_cmake_build_dir.rs b/llama-cpp-bindings-build/src/stable_cmake_build_dir.rs
index 79186be5..87877e80 100644
--- a/llama-cpp-bindings-build/src/stable_cmake_build_dir.rs
+++ b/llama-cpp-bindings-build/src/stable_cmake_build_dir.rs
@@ -17,13 +17,6 @@ const CMAKE_AFFECTING_FEATURES: &[(&str, bool)] = &[
     ("static-stdcxx", cfg!(feature = "static-stdcxx")),
 ];
 
-/// Compute a stable, persistent cmake build directory under the workspace
-/// `target/` tree, keyed only by inputs that materially change cmake compile
-/// commands. Toggling features that don't affect cmake (e.g. `mtmd`, `llguidance`)
-/// returns the same path, allowing cmake's incremental build (and ccache) to
-/// reuse all prior artifacts — including `nvcc`-built CUDA kernels.
-///
-/// `LLAMA_CMAKE_BUILD_DIR_OVERRIDE` overrides the path entirely when set.
 pub fn stable_cmake_build_dir(
     target_dir: &Path,
     target_triple: &str,
diff --git a/llama-cpp-bindings-sys/src/lib.rs b/llama-cpp-bindings-sys/src/lib.rs
index e3dbbeba..6b429eb4 100644
--- a/llama-cpp-bindings-sys/src/lib.rs
+++ b/llama-cpp-bindings-sys/src/lib.rs
@@ -1,5 +1,3 @@
-//! See [llama-cpp-bindings](https://crates.io/crates/llama-cpp-bindings) for a documented and safe API.
-
 #![expect(
     non_camel_case_types,
     reason = "bindgen emits C struct and enum names verbatim and they don't follow Rust naming"
diff --git a/llama-cpp-bindings-tests/src/classify_sample_loop.rs b/llama-cpp-bindings-tests/src/classify_sample_loop.rs
index d5b070c4..8240c74f 100644
--- a/llama-cpp-bindings-tests/src/classify_sample_loop.rs
+++ b/llama-cpp-bindings-tests/src/classify_sample_loop.rs
@@ -7,14 +7,6 @@ use llama_cpp_bindings::sampled_token::SampledToken;
 use llama_cpp_bindings::sampled_token_classifier::SampledTokenClassifier;
 use llama_cpp_bindings::sampling::LlamaSampler;
 
-/// Drives a classifier through the full sample/decode/flush loop.
-///
-/// Suppresses EOG outcomes (so `generated_raw` and the per-section streams
-/// never contain end-of-generation marker text) and captures per-section
-/// counts. Tests that need to exercise classifier behaviour during real
-/// inference should construct one of these and call
-/// [`ClassifySampleLoop::run`] instead of re-implementing the loop. The
-/// strict per-test assertions then run on [`ClassifySampleLoopOutcome`].
 pub struct ClassifySampleLoop<'borrow, 'model, 'tokens> {
     pub model: &'model LlamaModel,
     pub classifier: &'borrow mut SampledTokenClassifier<'model>,
@@ -59,10 +51,6 @@ impl ClassifySampleLoop<'_, '_, '_> {
                 } else {
                     outcome.generated_raw.push_str(&ingest_outcome.raw_piece);
                 }
-                // Counters always include EOG so they match the classifier's
-                // internal usage counters (which include every sampled token).
-                // EOG text is suppressed from `generated_raw` and the per-section
-                // streams so callers can assert exact textual equality.
                 record_outcome(ingest_outcome, &mut outcome, is_eog);
             }
 
@@ -115,3 +103,32 @@ fn record_outcome(ingest: &IngestOutcome, outcome: &mut ClassifySampleLoopOutcom
         }
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use llama_cpp_bindings::ingest_outcome::IngestOutcome;
+    use llama_cpp_bindings::sampled_token::SampledToken;
+    use llama_cpp_bindings::token::LlamaToken;
+
+    use super::ClassifySampleLoopOutcome;
+    use super::record_outcome;
+
+    // Feeds `record_outcome` a single `ToolCall` token (with `false` as its third argument)
+    // and checks that only `observed_tool_call` is incremented; the other counters stay at 0.
+    #[test]
+    fn record_outcome_tool_call_token() {
+        let ingest = IngestOutcome {
+            sampled_token: SampledToken::ToolCall(LlamaToken(42)),
+            visible_piece: String::new(),
+            raw_piece: String::new(),
+        };
+        let mut outcome = ClassifySampleLoopOutcome::default();
+
+        record_outcome(&ingest, &mut outcome, false);
+
+        assert_eq!(outcome.observed_tool_call, 1);
+        assert_eq!(outcome.observed_content, 0);
+        assert_eq!(outcome.observed_reasoning, 0);
+        assert_eq!(outcome.observed_undeterminable, 0);
+    }
+}
diff --git a/llama-cpp-bindings-tests/src/lib.rs b/llama-cpp-bindings-tests/src/lib.rs
index 00686c59..b48fe749 100644
--- a/llama-cpp-bindings-tests/src/lib.rs
+++ b/llama-cpp-bindings-tests/src/lib.rs
@@ -1,8 +1,2 @@
-//! Integration test fixtures for `llama-cpp-bindings`.
-//!
-//! This crate hosts test-only helpers used by the integration tests in `tests/`:
-//! [`classify_sample_loop`] for sampling-loop drivers and [`test_model::fixtures_dir`]
-//! for locating image fixtures.
-
 pub mod classify_sample_loop;
-pub mod test_model;
+pub mod prime_kv_cache;
diff --git a/llama-cpp-bindings-tests/src/prime_kv_cache.rs b/llama-cpp-bindings-tests/src/prime_kv_cache.rs
new file mode 100644
index 00000000..570cf77c
--- /dev/null
+++ b/llama-cpp-bindings-tests/src/prime_kv_cache.rs
@@ -0,0 +1,24 @@
+use anyhow::Result;
+use llama_cpp_bindings::context::LlamaContext;
+use llama_cpp_bindings::llama_batch::LlamaBatch;
+use llama_cpp_bindings::model::AddBos;
+use llama_cpp_test_harness::LlamaFixture;
+
+/// Tokenizes the fixed prompt `"Hello world"` with the fixture's model and decodes it into
+/// `context` as a single batch.
+///
+/// NOTE(review): judging by the name, the decode is presumably what populates the context's
+/// KV cache for the calling test — confirm against [`LlamaContext::decode`].
+///
+/// # Errors
+/// Forwards tokenization, batch construction, and [`LlamaContext::decode`] errors verbatim.
+pub fn prime_kv_cache(fixture: &LlamaFixture<'_>, context: &mut LlamaContext<'_>) -> Result<()> {
+    let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
+    // NOTE(review): 512 and 1 look like a token capacity and a sequence limit — confirm
+    // against `LlamaBatch::new`.
+    let mut batch = LlamaBatch::new(512, 1)?;
+    batch.add_sequence(&tokens, 0, false)?;
+    context.decode(&mut batch)?;
+    Ok(())
+}
diff --git a/llama-cpp-bindings-tests/tests/chat_template_and_message_parsing.rs b/llama-cpp-bindings-tests/tests/chat_template_and_message_parsing.rs
index a7e18245..a6bf7ce3 100644
--- a/llama-cpp-bindings-tests/tests/chat_template_and_message_parsing.rs
+++ b/llama-cpp-bindings-tests/tests/chat_template_and_message_parsing.rs
@@ -1,567 +1,551 @@
-use llama_cpp_test_harness::llama_tests_main;
-
-mod model_chat_template {
-    #![expect(
-        clippy::unnecessary_wraps,
-        reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-    )]
+#![expect(
+    clippy::unnecessary_wraps,
+    reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
+)]
 
-    use anyhow::Result;
-    use llama_cpp_bindings::ChatTemplateError;
-    use llama_cpp_bindings::model::LlamaChatMessage;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn chat_template_returns_non_empty(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let template = fixture.model.chat_template(None);
-        assert!(template.is_ok());
-        Ok(())
-    }
+use anyhow::Result;
+use anyhow::bail;
+use llama_cpp_bindings::ChatMessageParseOutcome;
+use llama_cpp_bindings::ChatTemplateError;
+use llama_cpp_bindings::model::LlamaChatMessage;
+use llama_cpp_test_harness::LlamaFixture;
+use llama_cpp_test_harness::llama_test;
+use llama_cpp_test_harness::llama_tests_main;
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn apply_chat_template_produces_prompt(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let model = fixture.model;
-        let template = model.chat_template(None)?;
-        let message = LlamaChatMessage::new("user".to_string(), "hello".to_string())?;
-        let prompt = model.apply_chat_template(&template, &[message], true);
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn chat_template_returns_non_empty(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let template = fixture.model.chat_template(None);
+    assert!(template.is_ok());
+    Ok(())
+}
 
-        assert!(prompt.is_ok());
-        assert!(!prompt?.is_empty());
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn apply_chat_template_produces_prompt(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let model = fixture.model;
+    let template = model.chat_template(None)?;
+    let message = LlamaChatMessage::new("user".to_string(), "hello".to_string())?;
+    let prompt = model.apply_chat_template(&template, &[message], true);
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn apply_chat_template_buffer_resize_with_long_messages(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let model = fixture.model;
-        let template = model.chat_template(None)?;
-        let long_content = "a".repeat(2000);
-        let message = LlamaChatMessage::new("user".to_string(), long_content)?;
-        let prompt = model.apply_chat_template(&template, &[message], true);
+    assert!(prompt.is_ok());
+    assert!(!prompt?.is_empty());
+    Ok(())
+}
 
-        assert!(prompt.is_ok());
-        assert!(!prompt?.is_empty());
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn apply_chat_template_buffer_resize_with_long_messages(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let model = fixture.model;
+    let template = model.chat_template(None)?;
+    let long_content = "a".repeat(2000);
+    let message = LlamaChatMessage::new("user".to_string(), long_content)?;
+    let prompt = model.apply_chat_template(&template, &[message], true);
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn chat_template_with_nonexistent_name_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let result = fixture
-            .model
-            .chat_template(Some("nonexistent_template_name_xyz"));
-        assert_eq!(result.unwrap_err(), ChatTemplateError::MissingTemplate);
-        Ok(())
-    }
+    assert!(prompt.is_ok());
+    assert!(!prompt?.is_empty());
+    Ok(())
 }
 
-mod parse_chat_message {
-    #![expect(
-        clippy::unnecessary_wraps,
-        reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-    )]
-
-    use anyhow::Result;
-    use anyhow::bail;
-    use llama_cpp_bindings::ChatMessageParseOutcome;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn chat_template_with_nonexistent_name_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let result = fixture
+        .model
+        .chat_template(Some("nonexistent_template_name_xyz"));
+    assert_eq!(result.unwrap_err(), ChatTemplateError::MissingTemplate);
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn parses_pure_content_response(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let outcome = fixture
-            .model
-            .parse_chat_message("[]", "hello world", false)?;
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn parses_pure_content_response(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let outcome = fixture
+        .model
+        .parse_chat_message("[]", "hello world", false)?;
 
-        let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
-            bail!("expected Recognized for plain content; got Unrecognized");
-        };
-        assert!(parsed.tool_calls.is_empty());
-        assert!(!parsed.is_empty());
-        assert!(parsed.content.contains("hello world"));
+    let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
+        bail!("expected Recognized for plain content; got Unrecognized");
+    };
+    assert!(parsed.tool_calls.is_empty());
+    assert!(!parsed.is_empty());
+    assert!(parsed.content.contains("hello world"));
 
-        Ok(())
-    }
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn parses_reasoning_section_into_reasoning_content(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let input = "<think>step one, step two</think>\n\nactual response";
-        let outcome = fixture.model.parse_chat_message("[]", input, false)?;
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn parses_reasoning_section_into_reasoning_content(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let input = "<think>step one, step two</think>\n\nactual response";
+    let outcome = fixture.model.parse_chat_message("[]", input, false)?;
 
-        let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
-            bail!("expected Recognized for reasoning section; got Unrecognized");
-        };
-        assert!(
-            parsed.reasoning_content.contains("step") || parsed.content.contains("step"),
-            "neither content nor reasoning contains 'step'; content={:?} reasoning={:?}",
-            parsed.content,
-            parsed.reasoning_content
-        );
+    let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
+        bail!("expected Recognized for reasoning section; got Unrecognized");
+    };
+    assert!(
+        parsed.reasoning_content.contains("step") || parsed.content.contains("step"),
+        "neither content nor reasoning contains 'step'; content={:?} reasoning={:?}",
+        parsed.content,
+        parsed.reasoning_content
+    );
 
-        Ok(())
-    }
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn parses_empty_input_yields_empty_message(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let outcome = fixture.model.parse_chat_message("[]", "", false)?;
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn parses_empty_input_yields_empty_message(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let outcome = fixture.model.parse_chat_message("[]", "", false)?;
 
-        let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
-            bail!("expected Recognized for empty input; got Unrecognized");
-        };
-        assert!(parsed.tool_calls.is_empty());
+    let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
+        bail!("expected Recognized for empty input; got Unrecognized");
+    };
+    assert!(parsed.tool_calls.is_empty());
 
-        Ok(())
-    }
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn parses_malformed_tools_json_returns_tools_json_invalid_error(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let result = fixture
-            .model
-            .parse_chat_message("not_a_json[}", "hello", false);
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn parses_malformed_tools_json_returns_tools_json_invalid_error(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    let result = fixture
+        .model
+        .parse_chat_message("not_a_json[}", "hello", false);
 
-        assert!(matches!(
-            result,
-            Err(llama_cpp_bindings::ParseChatMessageError::ToolsJsonInvalid(
-                _
-            ))
-        ));
-        Ok(())
-    }
+    assert!(matches!(
+        result,
+        Err(llama_cpp_bindings::ParseChatMessageError::ToolsJsonInvalid(
+            _
+        ))
+    ));
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn parses_non_array_tools_json_returns_tools_json_not_array_error(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let result = fixture
-            .model
-            .parse_chat_message("{\"foo\": 1}", "hello", false);
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn parses_non_array_tools_json_returns_tools_json_not_array_error(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    let result = fixture
+        .model
+        .parse_chat_message("{\"foo\": 1}", "hello", false);
 
-        assert!(matches!(
-            result,
-            Err(llama_cpp_bindings::ParseChatMessageError::ToolsJsonNotArray)
-        ));
-        Ok(())
-    }
+    assert!(matches!(
+        result,
+        Err(llama_cpp_bindings::ParseChatMessageError::ToolsJsonNotArray)
+    ));
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn parses_with_tools_null_byte_returns_tools_json_invalid_error(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let result = fixture
-            .model
-            .parse_chat_message("[]\0extra", "hello", false);
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn parses_with_tools_null_byte_returns_tools_json_invalid_error(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    let result = fixture
+        .model
+        .parse_chat_message("[]\0extra", "hello", false);
 
-        assert!(matches!(
-            result,
-            Err(llama_cpp_bindings::ParseChatMessageError::ToolsJsonInvalid(
-                _
-            ))
-        ));
-        Ok(())
-    }
+    assert!(matches!(
+        result,
+        Err(llama_cpp_bindings::ParseChatMessageError::ToolsJsonInvalid(
+            _
+        ))
+    ));
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn parses_with_input_null_byte_returns_tools_serialization_error(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let result = fixture
-            .model
-            .parse_chat_message("[]", "hello\0world", false);
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn parses_with_input_null_byte_returns_tools_serialization_error(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    let result = fixture
+        .model
+        .parse_chat_message("[]", "hello\0world", false);
 
-        assert!(matches!(
-            result,
-            Err(llama_cpp_bindings::ParseChatMessageError::ToolsSerialization(_))
-        ));
-        Ok(())
-    }
+    assert!(matches!(
+        result,
+        Err(llama_cpp_bindings::ParseChatMessageError::ToolsSerialization(_))
+    ));
+    Ok(())
 }
 
 llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/embedding_and_encoder.rs b/llama-cpp-bindings-tests/tests/embedding_and_encoder.rs
index cebd47c1..2d5e5823 100644
--- a/llama-cpp-bindings-tests/tests/embedding_and_encoder.rs
+++ b/llama-cpp-bindings-tests/tests/embedding_and_encoder.rs
@@ -1,707 +1,630 @@
+#![expect(
+    clippy::unnecessary_wraps,
+    reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
+)]
+
+use std::num::NonZeroU8;
+use std::time::Duration;
+
+use anyhow::Context;
+use anyhow::Result;
+use anyhow::bail;
+use llama_cpp_bindings::context::LlamaContext;
+use llama_cpp_bindings::ggml_time_us;
+use llama_cpp_bindings::llama_batch::LlamaBatch;
+use llama_cpp_bindings::model::AddBos;
+use llama_cpp_bindings_tests::prime_kv_cache::prime_kv_cache;
+use llama_cpp_test_harness::LlamaFixture;
+use llama_cpp_test_harness::llama_test;
 use llama_cpp_test_harness::llama_tests_main;
 
-mod embeddings {
-    use std::time::Duration;
+fn normalize(input: &[f32]) -> Vec<f32> {
+    let magnitude = input
+        .iter()
+        .fold(0.0, |accumulator, &value| value.mul_add(value, accumulator))
+        .sqrt();
 
-    use anyhow::{Context, Result};
-    use llama_cpp_bindings::context::LlamaContext;
-    use llama_cpp_bindings::ggml_time_us;
-    use llama_cpp_bindings::llama_batch::LlamaBatch;
-    use llama_cpp_bindings::model::AddBos;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
+    input.iter().map(|&value| value / magnitude).collect()
+}
 
-    fn normalize(input: &[f32]) -> Vec<f32> {
-        let magnitude = input
-            .iter()
-            .fold(0.0, |accumulator, &value| value.mul_add(value, accumulator))
-            .sqrt();
+fn cosine_similarity(vec_a: &[f32], vec_b: &[f32]) -> f32 {
+    vec_a
+        .iter()
+        .zip(vec_b.iter())
+        .map(|(left, right)| left * right)
+        .sum::<f32>()
+}
 
-        input.iter().map(|&value| value / magnitude).collect()
+#[llama_test(
+    model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+    n_threads_batch = 8,
+    embeddings = true,
+)]
+fn embedding_generation_produces_vectors(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let model = fixture.model;
+
+    let mut ctx = LlamaContext::from_model(
+        model,
+        fixture.backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )
+    .with_context(|| "unable to create context")?;
+
+    let prompt = "Hello my name is";
+    let tokens = model
+        .str_to_token(prompt, AddBos::Always)
+        .with_context(|| format!("failed to tokenize {prompt}"))?;
+    let prompt_token_count = u64::try_from(tokens.len())?;
+
+    let n_ctx = usize::try_from(ctx.n_ctx())?;
+    assert!(tokens.len() <= n_ctx, "prompt exceeds context window size");
+
+    let t_main_start = ggml_time_us();
+
+    let mut classifier = model.sampled_token_classifier();
+    let mut batch = LlamaBatch::new(n_ctx, 1)?;
+    classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?;
+
+    assert_eq!(classifier.pending_prompt_tokens(), prompt_token_count);
+    assert_eq!(classifier.usage().prompt_tokens, 0);
+
+    ctx.clear_kv_cache();
+    ctx.decode(&mut batch)
+        .with_context(|| "llama_decode() failed")?;
+
+    let promoted = classifier.commit_prompt_tokens();
+    assert_eq!(promoted, prompt_token_count);
+
+    let embedding = ctx
+        .embeddings_seq_ith(0)
+        .with_context(|| "failed to get embeddings")?;
+    let normalized = normalize(embedding);
+
+    let t_main_end = ggml_time_us();
+    let duration = Duration::from_micros(u64::try_from(t_main_end - t_main_start)?);
+
+    eprintln!(
+        "created embedding with {} dimensions in {:.2} s",
+        normalized.len(),
+        duration.as_secs_f32()
+    );
+
+    assert!(
+        !normalized.is_empty(),
+        "embedding should have at least one dimension"
+    );
+
+    let magnitude: f32 = normalized
+        .iter()
+        .map(|value| value * value)
+        .sum::<f32>()
+        .sqrt();
+    assert!(
+        (magnitude - 1.0).abs() < 0.01,
+        "normalized embedding magnitude should be approximately 1.0, got {magnitude}"
+    );
+
+    let usage = classifier.into_usage();
+    assert_eq!(usage.prompt_tokens, prompt_token_count);
+    assert_eq!(usage.completion_tokens(), 0);
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+    n_seq_max = 2,
+    n_threads_batch = 8,
+    embeddings = true,
+)]
+fn reranking_produces_scores(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let model = fixture.model;
+
+    let query = "What is machine learning?";
+    let documents = [
+        "Machine learning is a subset of artificial intelligence.",
+        "The weather today is sunny and warm.",
+    ];
+
+    let document_count = documents.len();
+    assert_eq!(
+        u32::try_from(document_count)?,
+        fixture.context_params.n_seq_max,
+        "attribute n_seq_max must match the document count this trial expects",
+    );
+
+    let mut ctx = LlamaContext::from_model(
+        model,
+        fixture.backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )
+    .with_context(|| "unable to create context")?;
+
+    let prompt_lines: Vec<String> = documents
+        .iter()
+        .map(|document| format!("{query}</s><s>{document}"))
+        .collect();
+
+    let tokens_lines_list = prompt_lines
+        .iter()
+        .map(|line| model.str_to_token(line, AddBos::Always))
+        .collect::<std::result::Result<Vec<_>, _>>()
+        .with_context(|| "failed to tokenize prompts")?;
+
+    let n_ctx = usize::try_from(ctx.n_ctx())?;
+
+    if tokens_lines_list.iter().any(|tokens| n_ctx < tokens.len()) {
+        bail!("one of the provided prompts exceeds the size of the context window");
     }
 
-    #[llama_test(
-        model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-        n_threads_batch = 8,
-        embeddings = true,
-    )]
-    fn embedding_generation_produces_vectors(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let model = fixture.model;
+    let mut classifier = model.sampled_token_classifier();
+    let mut batch = LlamaBatch::new(2048, i32::try_from(document_count)?)?;
+    let t_main_start = ggml_time_us();
 
-        let mut ctx = LlamaContext::from_model(
-            model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )
-        .with_context(|| "unable to create context")?;
+    for (sequence_index, tokens) in tokens_lines_list.iter().enumerate() {
+        classifier.feed_prompt_sequence_to_batch(
+            &mut batch,
+            tokens,
+            i32::try_from(sequence_index)?,
+            false,
+        )?;
+    }
 
-        let prompt = "Hello my name is";
-        let tokens = model
-            .str_to_token(prompt, AddBos::Always)
-            .with_context(|| format!("failed to tokenize {prompt}"))?;
-        let prompt_token_count = u64::try_from(tokens.len())?;
+    let total_tokens: usize = tokens_lines_list.iter().map(Vec::len).sum();
+    let total_token_count = u64::try_from(total_tokens)?;
 
-        let n_ctx = usize::try_from(ctx.n_ctx())?;
-        assert!(tokens.len() <= n_ctx, "prompt exceeds context window size");
+    assert_eq!(classifier.pending_prompt_tokens(), total_token_count);
+    assert_eq!(classifier.usage().prompt_tokens, 0);
 
-        let t_main_start = ggml_time_us();
+    ctx.clear_kv_cache();
+    ctx.decode(&mut batch)
+        .with_context(|| "llama_decode() failed")?;
 
-        let mut classifier = model.sampled_token_classifier();
-        let mut batch = LlamaBatch::new(n_ctx, 1)?;
-        classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?;
+    let promoted = classifier.commit_prompt_tokens();
+    assert_eq!(promoted, total_token_count);
 
-        assert_eq!(classifier.pending_prompt_tokens(), prompt_token_count);
-        assert_eq!(classifier.usage().prompt_tokens, 0);
+    let mut embeddings = Vec::with_capacity(document_count);
 
-        ctx.clear_kv_cache();
-        ctx.decode(&mut batch)
-            .with_context(|| "llama_decode() failed")?;
+    for sequence_index in 0..document_count {
+        let raw_embedding = ctx
+            .embeddings_seq_ith(i32::try_from(sequence_index)?)
+            .with_context(|| "failed to get sequence embeddings")?;
+        embeddings.push(normalize(raw_embedding));
+    }
 
-        let promoted = classifier.commit_prompt_tokens();
-        assert_eq!(promoted, prompt_token_count);
+    let t_main_end = ggml_time_us();
+    let duration = Duration::from_micros(u64::try_from(t_main_end - t_main_start)?);
 
-        let embedding = ctx
-            .embeddings_seq_ith(0)
-            .with_context(|| "failed to get embeddings")?;
-        let normalized = normalize(embedding);
+    #[expect(
+        clippy::cast_precision_loss,
+        reason = "logged throughput tolerates f32 precision"
+    )]
+    let tokens_per_second = total_tokens as f32 / duration.as_secs_f32();
 
-        let t_main_end = ggml_time_us();
-        let duration = Duration::from_micros(u64::try_from(t_main_end - t_main_start)?);
+    eprintln!(
+        "created embeddings for {total_tokens} tokens in {:.2} s, speed {tokens_per_second:.2} t/s",
+        duration.as_secs_f32(),
+    );
 
-        eprintln!(
-            "created embedding with {} dimensions in {:.2} s",
-            normalized.len(),
-            duration.as_secs_f32()
-        );
+    assert_eq!(
+        embeddings.len(),
+        document_count,
+        "should produce one embedding per document"
+    );
 
+    for (index, embedding) in embeddings.iter().enumerate() {
         assert!(
-            !normalized.is_empty(),
-            "embedding should have at least one dimension"
+            !embedding.is_empty(),
+            "embedding {index} should not be empty"
         );
+    }
 
-        let magnitude: f32 = normalized
-            .iter()
-            .map(|value| value * value)
-            .sum::<f32>()
-            .sqrt();
-        assert!(
-            (magnitude - 1.0).abs() < 0.01,
-            "normalized embedding magnitude should be approximately 1.0, got {magnitude}"
-        );
+    let similarity = cosine_similarity(&embeddings[0], &embeddings[1]);
+    eprintln!("cosine similarity between document embeddings: {similarity:.4}");
 
-        let usage = classifier.into_usage();
-        assert_eq!(usage.prompt_tokens, prompt_token_count);
-        assert_eq!(usage.completion_tokens(), 0);
+    assert!(
+        similarity.is_finite(),
+        "cosine similarity should be a finite number"
+    );
 
-        Ok(())
-    }
-}
+    let usage = classifier.into_usage();
+    assert_eq!(usage.prompt_tokens, total_token_count);
+    assert_eq!(usage.completion_tokens(), 0);
 
-mod reranker {
-    use std::time::Duration;
+    Ok(())
+}
 
-    use anyhow::{Context, Result, bail};
-    use llama_cpp_bindings::context::LlamaContext;
-    use llama_cpp_bindings::ggml_time_us;
-    use llama_cpp_bindings::llama_batch::LlamaBatch;
-    use llama_cpp_bindings::model::AddBos;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
+#[llama_test(
+    model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+    embeddings = true,
+)]
+fn decode_with_embeddings_enabled(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let mut context = LlamaContext::from_model(
+        fixture.model,
+        fixture.backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+    let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+    let mut batch = LlamaBatch::new(512, 1)?;
+    batch.add_sequence(&tokens, 0, false)?;
+
+    let result = context.decode(&mut batch);
+
+    assert!(result.is_ok());
+
+    Ok(())
+}
 
-    fn normalize(input: &[f32]) -> Vec<f32> {
-        let magnitude = input
-            .iter()
-            .fold(0.0, |accumulator, &value| value.mul_add(value, accumulator))
-            .sqrt();
+#[llama_test(
+    model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+    embeddings = true,
+)]
+fn embeddings_seq_ith_returns_valid_embeddings(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let mut context = LlamaContext::from_model(
+        fixture.model,
+        fixture.backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+    let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+    let mut batch = LlamaBatch::new(512, 1)?;
+    batch.add_sequence(&tokens, 0, false)?;
+    context.decode(&mut batch)?;
+
+    let embeddings = context.embeddings_seq_ith(0)?;
+
+    assert_eq!(embeddings.len(), usize::try_from(fixture.model.n_embd())?);
+
+    Ok(())
+}
 
-        input.iter().map(|&value| value / magnitude).collect()
+#[llama_test(
+    model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+    n_seq_max = 4,
+    embeddings = true,
+)]
+fn multi_sequence_embeddings_returns_one_embedding_per_sequence(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    let mut context = LlamaContext::from_model(
+        fixture.model,
+        fixture.backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+
+    let inputs = [
+        "alpha is here",
+        "beta runs fast",
+        "gamma waits",
+        "delta jumps",
+    ];
+    let mut batch = LlamaBatch::new(64, 4)?;
+
+    for (sequence_index, text) in inputs.iter().enumerate() {
+        let tokens = fixture.model.str_to_token(text, AddBos::Always)?;
+        let sequence_id = i32::try_from(sequence_index)?;
+
+        batch.add_sequence(&tokens, sequence_id, true)?;
     }
 
-    fn cosine_similarity(vec_a: &[f32], vec_b: &[f32]) -> f32 {
-        vec_a
-            .iter()
-            .zip(vec_b.iter())
-            .map(|(left, right)| left * right)
-            .sum::<f32>()
-    }
+    context.decode(&mut batch)?;
 
-    #[llama_test(
-        model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-        n_seq_max = 2,
-        n_threads_batch = 8,
-        embeddings = true,
-    )]
-    fn reranking_produces_scores(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let model = fixture.model;
+    let n_embd = usize::try_from(fixture.model.n_embd())?;
+    let mut collected: Vec<Vec<f32>> = Vec::with_capacity(inputs.len());
 
-        let query = "What is machine learning?";
-        let documents = [
-            "Machine learning is a subset of artificial intelligence.",
-            "The weather today is sunny and warm.",
-        ];
+    for sequence_index in 0..inputs.len() {
+        let sequence_id = i32::try_from(sequence_index)?;
+        let embedding = context.embeddings_seq_ith(sequence_id)?;
 
-        let document_count = documents.len();
         assert_eq!(
-            u32::try_from(document_count)?,
-            fixture.context_params.n_seq_max,
-            "attribute n_seq_max must match the document count this trial expects",
+            embedding.len(),
+            n_embd,
+            "sequence {sequence_index} embedding length mismatch"
         );
 
-        let mut ctx = LlamaContext::from_model(
-            model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )
-        .with_context(|| "unable to create context")?;
-
-        let prompt_lines: Vec<String> = documents
-            .iter()
-            .map(|document| format!("{query}</s><s>{document}"))
-            .collect();
-
-        let tokens_lines_list = prompt_lines
-            .iter()
-            .map(|line| model.str_to_token(line, AddBos::Always))
-            .collect::<std::result::Result<Vec<_>, _>>()
-            .with_context(|| "failed to tokenize prompts")?;
-
-        let n_ctx = usize::try_from(ctx.n_ctx())?;
-
-        if tokens_lines_list.iter().any(|tokens| n_ctx < tokens.len()) {
-            bail!("one of the provided prompts exceeds the size of the context window");
-        }
-
-        let mut classifier = model.sampled_token_classifier();
-        let mut batch = LlamaBatch::new(2048, i32::try_from(document_count)?)?;
-        let t_main_start = ggml_time_us();
-
-        for (sequence_index, tokens) in tokens_lines_list.iter().enumerate() {
-            classifier.feed_prompt_sequence_to_batch(
-                &mut batch,
-                tokens,
-                i32::try_from(sequence_index)?,
-                false,
-            )?;
-        }
-
-        let total_tokens: usize = tokens_lines_list.iter().map(Vec::len).sum();
-        let total_token_count = u64::try_from(total_tokens)?;
-
-        assert_eq!(classifier.pending_prompt_tokens(), total_token_count);
-        assert_eq!(classifier.usage().prompt_tokens, 0);
-
-        ctx.clear_kv_cache();
-        ctx.decode(&mut batch)
-            .with_context(|| "llama_decode() failed")?;
-
-        let promoted = classifier.commit_prompt_tokens();
-        assert_eq!(promoted, total_token_count);
-
-        let mut embeddings = Vec::with_capacity(document_count);
-
-        for sequence_index in 0..document_count {
-            let raw_embedding = ctx
-                .embeddings_seq_ith(i32::try_from(sequence_index)?)
-                .with_context(|| "failed to get sequence embeddings")?;
-            embeddings.push(normalize(raw_embedding));
-        }
-
-        let t_main_end = ggml_time_us();
-        let duration = Duration::from_micros(u64::try_from(t_main_end - t_main_start)?);
-
-        #[expect(
-            clippy::cast_precision_loss,
-            reason = "logged throughput tolerates f32 precision"
-        )]
-        let tokens_per_second = total_tokens as f32 / duration.as_secs_f32();
-
-        eprintln!(
-            "created embeddings for {total_tokens} tokens in {:.2} s, speed {tokens_per_second:.2} t/s",
-            duration.as_secs_f32(),
-        );
-
-        assert_eq!(
-            embeddings.len(),
-            document_count,
-            "should produce one embedding per document"
-        );
+        collected.push(embedding.to_vec());
+    }
 
-        for (index, embedding) in embeddings.iter().enumerate() {
-            assert!(
-                !embedding.is_empty(),
-                "embedding {index} should not be empty"
+    for (left_index, left) in collected.iter().enumerate() {
+        for (right_index, right) in collected.iter().enumerate().skip(left_index + 1) {
+            assert_ne!(
+                left, right,
+                "embedding for sequence {left_index} must differ from sequence {right_index}",
             );
         }
-
-        let similarity = cosine_similarity(&embeddings[0], &embeddings[1]);
-        eprintln!("cosine similarity between document embeddings: {similarity:.4}");
-
-        assert!(
-            similarity.is_finite(),
-            "cosine similarity should be a finite number"
-        );
-
-        let usage = classifier.into_usage();
-        assert_eq!(usage.prompt_tokens, total_token_count);
-        assert_eq!(usage.completion_tokens(), 0);
-
-        Ok(())
     }
-}
-
-mod context_embedding_and_encoder {
-
-    use anyhow::Result;
-
-    use llama_cpp_bindings::context::LlamaContext;
-    use llama_cpp_bindings::llama_batch::LlamaBatch;
-    use llama_cpp_bindings::model::AddBos;
-
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
-    // =========================================================================================
-    // Group A: default Qwen model, embeddings=false. Most context tests fall here.
-    // =========================================================================================
-
-    #[llama_test(
-        model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-        embeddings = true,
-    )]
-    fn decode_with_embeddings_enabled(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = LlamaContext::from_model(
-            fixture.model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-        let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
-        let mut batch = LlamaBatch::new(512, 1)?;
-        batch.add_sequence(&tokens, 0, false)?;
-
-        let result = context.decode(&mut batch);
-
-        assert!(result.is_ok());
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-        embeddings = true,
-    )]
-    fn embeddings_seq_ith_returns_valid_embeddings(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = LlamaContext::from_model(
-            fixture.model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-        let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
-        let mut batch = LlamaBatch::new(512, 1)?;
-        batch.add_sequence(&tokens, 0, false)?;
-        context.decode(&mut batch)?;
-
-        let embeddings = context.embeddings_seq_ith(0)?;
-
-        assert_eq!(embeddings.len(), usize::try_from(fixture.model.n_embd())?);
 
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-        n_seq_max = 4,
-        embeddings = true,
-    )]
-    fn multi_sequence_embeddings_returns_one_embedding_per_sequence(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let mut context = LlamaContext::from_model(
-            fixture.model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-
-        let inputs = [
-            "alpha is here",
-            "beta runs fast",
-            "gamma waits",
-            "delta jumps",
-        ];
-        let mut batch = LlamaBatch::new(64, 4)?;
+    Ok(())
+}
 
-        for (sequence_index, text) in inputs.iter().enumerate() {
+#[llama_test(
+    model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+    n_seq_max = 4,
+    embeddings = true,
+)]
+fn embeddings_returns_distinct_values_when_reused_batch_has_extra_capacity(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    let mut context = LlamaContext::from_model(
+        fixture.model,
+        fixture.backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+
+    let iterations = [
+        [
+            "This is the first document with enough content to contribute meaningfully to the batch size calculation",
+            "This is the second document that should be processed in a potentially different batch from the first",
+        ],
+        [
+            "This is the third document adding more content to ensure the total exceeds the configured chunk limit",
+            "This is the fourth document which should demonstrate that batching distributes across agent requests",
+        ],
+    ];
+
+    let n_embd = usize::try_from(fixture.model.n_embd())?;
+    let mut batch = LlamaBatch::new(64, 4)?;
+    let mut collected: Vec<Vec<f32>> = Vec::new();
+
+    for iteration_inputs in iterations {
+        for (sequence_index, text) in iteration_inputs.iter().enumerate() {
             let tokens = fixture.model.str_to_token(text, AddBos::Always)?;
             let sequence_id = i32::try_from(sequence_index)?;
 
             batch.add_sequence(&tokens, sequence_id, true)?;
         }
 
+        context.clear_kv_cache();
         context.decode(&mut batch)?;
 
-        let n_embd = usize::try_from(fixture.model.n_embd())?;
-        let mut collected: Vec<Vec<f32>> = Vec::with_capacity(inputs.len());
-
-        for sequence_index in 0..inputs.len() {
+        for sequence_index in 0..iteration_inputs.len() {
             let sequence_id = i32::try_from(sequence_index)?;
             let embedding = context.embeddings_seq_ith(sequence_id)?;
 
             assert_eq!(
                 embedding.len(),
                 n_embd,
-                "sequence {sequence_index} embedding length mismatch"
+                "iteration sequence {sequence_index} embedding length mismatch"
             );
 
             collected.push(embedding.to_vec());
         }
 
-        for (left_index, left) in collected.iter().enumerate() {
-            for (right_index, right) in collected.iter().enumerate().skip(left_index + 1) {
-                assert_ne!(
-                    left, right,
-                    "embedding for sequence {left_index} must differ from sequence {right_index}",
-                );
-            }
-        }
-
-        Ok(())
+        batch.clear();
     }
 
-    #[llama_test(
-        model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-        n_seq_max = 4,
-        embeddings = true,
-    )]
-    fn embeddings_returns_distinct_values_when_reused_batch_has_extra_capacity(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let mut context = LlamaContext::from_model(
-            fixture.model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-
-        let iterations = [
-            [
-                "This is the first document with enough content to contribute meaningfully to the batch size calculation",
-                "This is the second document that should be processed in a potentially different batch from the first",
-            ],
-            [
-                "This is the third document adding more content to ensure the total exceeds the configured chunk limit",
-                "This is the fourth document which should demonstrate that batching distributes across agent requests",
-            ],
-        ];
-
-        let n_embd = usize::try_from(fixture.model.n_embd())?;
-        let mut batch = LlamaBatch::new(64, 4)?;
-        let mut collected: Vec<Vec<f32>> = Vec::new();
-
-        for iteration_inputs in iterations {
-            for (sequence_index, text) in iteration_inputs.iter().enumerate() {
-                let tokens = fixture.model.str_to_token(text, AddBos::Always)?;
-                let sequence_id = i32::try_from(sequence_index)?;
-
-                batch.add_sequence(&tokens, sequence_id, true)?;
-            }
-
-            context.clear_kv_cache();
-            context.decode(&mut batch)?;
-
-            for sequence_index in 0..iteration_inputs.len() {
-                let sequence_id = i32::try_from(sequence_index)?;
-                let embedding = context.embeddings_seq_ith(sequence_id)?;
-
-                assert_eq!(
-                    embedding.len(),
-                    n_embd,
-                    "iteration sequence {sequence_index} embedding length mismatch"
-                );
-
-                collected.push(embedding.to_vec());
-            }
-
-            batch.clear();
-        }
-
-        assert_eq!(
-            collected.len(),
-            iterations.iter().flatten().count(),
-            "expected one embedding per input across every iteration"
-        );
-
-        for (left_index, left) in collected.iter().enumerate() {
-            for (right_index, right) in collected.iter().enumerate().skip(left_index + 1) {
-                assert_ne!(
-                    left, right,
-                    "embedding {left_index} must differ from embedding {right_index} across reused-batch iterations",
-                );
-            }
+    assert_eq!(
+        collected.len(),
+        iterations.iter().flatten().count(),
+        "expected one embedding per input across every iteration"
+    );
+
+    for (left_index, left) in collected.iter().enumerate() {
+        for (right_index, right) in collected.iter().enumerate().skip(left_index + 1) {
+            assert_ne!(
+                left, right,
+                "embedding {left_index} must differ from embedding {right_index} across reused-batch iterations",
+            );
         }
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-        embeddings = true,
-    )]
-    fn embeddings_ith_returns_valid_embeddings(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = LlamaContext::from_model(
-            fixture.model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-        let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
-        let last_index = i32::try_from(tokens.len() - 1)?;
-        let mut batch = LlamaBatch::new(512, 1)?;
-        batch.add_sequence(&tokens, 0, false)?;
-        context.decode(&mut batch)?;
-
-        let embeddings = context.embeddings_ith(last_index)?;
-
-        assert_eq!(embeddings.len(), usize::try_from(fixture.model.n_embd())?);
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-        embeddings = true,
-    )]
-    fn embeddings_ith_returns_null_embedding_error_for_non_embedding_token(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let context = LlamaContext::from_model(
-            fixture.model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-
-        let result = context.embeddings_ith(999);
-
-        assert!(result.is_err());
-
-        Ok(())
     }
 
-    #[llama_test(
-        model_source = HuggingFace("Xiaojian9992024/t5-small-GGUF", "t5-small.bf16.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-        embeddings = true,
-    )]
-    fn encode_succeeds_with_encoder_model(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = LlamaContext::from_model(
-            fixture.model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-        let tokens = fixture.model.str_to_token("hello", AddBos::Never)?;
-        let mut batch = LlamaBatch::new(512, 1)?;
-        batch.add_sequence(&tokens, 0, false)?;
-
-        let result = context.encode(&mut batch);
-
-        assert!(result.is_ok());
-
-        Ok(())
-    }
+    Ok(())
 }
 
-mod context_kv_cache_embedding {
-    use std::num::NonZeroU8;
-
-    use anyhow::Result;
-    use llama_cpp_bindings::context::LlamaContext;
-
-    use llama_cpp_bindings::llama_batch::LlamaBatch;
-    use llama_cpp_bindings::model::AddBos;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
-    fn build_context<'context>(
-        fixture: &'context LlamaFixture<'_>,
-    ) -> Result<LlamaContext<'context>> {
-        Ok(LlamaContext::from_model(
-            fixture.model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?)
-    }
-
-    fn decode_hello_world(
-        fixture: &LlamaFixture<'_>,
-        context: &mut LlamaContext<'_>,
-    ) -> Result<()> {
-        let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
-        let mut batch = LlamaBatch::new(512, 1)?;
-        batch.add_sequence(&tokens, 0, false)?;
-        context.decode(&mut batch)?;
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn kv_cache_seq_add_succeeds_on_embedding_model(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = build_context(fixture)?;
-
-        decode_hello_world(fixture, &mut context)?;
-
-        let result = context.kv_cache_seq_add(0, Some(0), None, 1);
+#[llama_test(
+    model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+    embeddings = true,
+)]
+fn embeddings_ith_returns_valid_embeddings(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let mut context = LlamaContext::from_model(
+        fixture.model,
+        fixture.backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+    let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+    let last_index = i32::try_from(tokens.len() - 1)?;
+    let mut batch = LlamaBatch::new(512, 1)?;
+    batch.add_sequence(&tokens, 0, false)?;
+    context.decode(&mut batch)?;
+
+    let embeddings = context.embeddings_ith(last_index)?;
+
+    assert_eq!(embeddings.len(), usize::try_from(fixture.model.n_embd())?);
+
+    Ok(())
+}
 
-        assert!(result.is_ok());
+#[llama_test(
+    model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+    embeddings = true,
+)]
+fn embeddings_ith_returns_null_embedding_error_for_non_embedding_token(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    let context = LlamaContext::from_model(
+        fixture.model,
+        fixture.backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+
+    let result = context.embeddings_ith(999);
+
+    assert!(result.is_err());
+
+    Ok(())
+}
 
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("Xiaojian9992024/t5-small-GGUF", "t5-small.bf16.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+    embeddings = true,
+)]
+fn encode_succeeds_with_encoder_model(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let mut context = LlamaContext::from_model(
+        fixture.model,
+        fixture.backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+    let tokens = fixture.model.str_to_token("hello", AddBos::Never)?;
+    let mut batch = LlamaBatch::new(512, 1)?;
+    batch.add_sequence(&tokens, 0, false)?;
+
+    let result = context.encode(&mut batch);
+
+    assert!(result.is_ok());
+
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn kv_cache_seq_div_succeeds_on_embedding_model(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = build_context(fixture)?;
+#[llama_test(
+    model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn kv_cache_seq_add_succeeds_on_embedding_model(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let mut context = fixture.build_context()?;
 
-        decode_hello_world(fixture, &mut context)?;
+    prime_kv_cache(fixture, &mut context)?;
 
-        let divisor = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?;
-        let result = context.kv_cache_seq_div(0, Some(0), None, divisor);
+    let result = context.kv_cache_seq_add(0, Some(0), None, 1);
 
-        assert!(result.is_ok());
+    assert!(result.is_ok());
 
-        Ok(())
-    }
+    Ok(())
 }
 
-mod model_helpers_embedding {
-    #![expect(
-        clippy::unnecessary_wraps,
-        reason = "every trial returns anyhow::Result<()> to match the LlamaTestFn signature"
-    )]
+#[llama_test(
+    model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn kv_cache_seq_div_succeeds_on_embedding_model(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let mut context = fixture.build_context()?;
 
-    use anyhow::Result;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
-    #[llama_test(
-        model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 2048,
-        n_batch = 512,
-        n_ubatch = 128
-    )]
-    fn embedding_model_tool_call_markers_call_does_not_panic(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let _markers = fixture.model.tool_call_markers();
+    prime_kv_cache(fixture, &mut context)?;
 
-        Ok(())
-    }
+    let divisor = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?;
+    let result = context.kv_cache_seq_div(0, Some(0), None, divisor);
 
-    #[llama_test(
-        model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 2048,
-        n_batch = 512,
-        n_ubatch = 128
-    )]
-    fn embedding_model_streaming_markers_returns_ok_for_a_model_without_tool_calls(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let _markers = fixture.model.streaming_markers()?;
+    assert!(result.is_ok());
 
-        Ok(())
-    }
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 2048,
-        n_batch = 512,
-        n_ubatch = 128
-    )]
-    fn approximate_tok_env_falls_back_to_eos_when_eot_unavailable(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let env = fixture.model.approximate_tok_env();
-        let env_again = fixture.model.approximate_tok_env();
+#[llama_test(
+    model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 2048,
+    n_batch = 512,
+    n_ubatch = 128
+)]
+fn embedding_model_tool_call_markers_call_does_not_panic(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let _markers = fixture.model.tool_call_markers();
+
+    Ok(())
+}
 
-        assert!(
-            std::sync::Arc::ptr_eq(&env, &env_again),
-            "approximate_tok_env must return the same cached Arc for any model, including \
-             the embedding model which lacks an EOT token (forcing the fallback-to-EOS path)"
-        );
+#[llama_test(
+    model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 2048,
+    n_batch = 512,
+    n_ubatch = 128
+)]
+fn embedding_model_streaming_markers_returns_ok_for_a_model_without_tool_calls(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    let _markers = fixture.model.streaming_markers()?;
+
+    Ok(())
+}
 
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 2048,
+    n_batch = 512,
+    n_ubatch = 128
+)]
+fn approximate_tok_env_falls_back_to_eos_when_eot_unavailable(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    let env = fixture.model.approximate_tok_env();
+    let env_again = fixture.model.approximate_tok_env();
+
+    assert!(
+        std::sync::Arc::ptr_eq(&env, &env_again),
+        "approximate_tok_env must return the same cached Arc for any model, including \
+         the embedding model which lacks an EOT token (forcing the fallback-to-EOS path)"
+    );
+
+    Ok(())
 }
 
 llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/kv_cache_and_session.rs b/llama-cpp-bindings-tests/tests/kv_cache_and_session.rs
index de316e42..fa20f3a7 100644
--- a/llama-cpp-bindings-tests/tests/kv_cache_and_session.rs
+++ b/llama-cpp-bindings-tests/tests/kv_cache_and_session.rs
@@ -1,2836 +1,2752 @@
+#![expect(
+    clippy::unnecessary_wraps,
+    reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
+)]
+
+use std::num::NonZeroU8;
+use std::ptr::NonNull;
+use std::sync::Arc;
+use std::sync::atomic::AtomicBool;
+
+use anyhow::Result;
+use llama_cpp_bindings::DecodeError;
+use llama_cpp_bindings::LogitsError;
+use llama_cpp_bindings::context::LlamaContext;
+use llama_cpp_bindings::context::kv_cache::KvCacheConversionError;
+use llama_cpp_bindings::error::KvCacheSeqAddError;
+use llama_cpp_bindings::error::KvCacheSeqDivError;
+use llama_cpp_bindings::llama_batch::LlamaBatch;
+use llama_cpp_bindings::model::AddBos;
+use llama_cpp_bindings::model::LlamaLoraAdapter;
+use llama_cpp_bindings_tests::prime_kv_cache::prime_kv_cache;
+use llama_cpp_test_harness::LlamaFixture;
+use llama_cpp_test_harness::llama_test;
 use llama_cpp_test_harness::llama_tests_main;
 
-mod model_context_creation {
-    #![expect(
-        clippy::unnecessary_wraps,
-        reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-    )]
-
-    use anyhow::Result;
-    use llama_cpp_bindings::context::LlamaContext;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 256,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 256,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 256,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 256,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn new_context_returns_valid_context(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let context = LlamaContext::from_model(
-            fixture.model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-
-        assert!(context.n_ctx() > 0);
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 4294967295,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 4294967295,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 4294967295,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 4294967295,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn new_context_with_huge_ctx_returns_null_error(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let result = LlamaContext::from_model(
-            fixture.model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        );
-
-        assert!(result.is_err());
-        Ok(())
-    }
-}
-
-mod context {
-    use std::ptr::NonNull;
-    use std::sync::Arc;
-    use std::sync::atomic::AtomicBool;
-
-    use anyhow::Result;
-    use llama_cpp_bindings::DecodeError;
-    use llama_cpp_bindings::LogitsError;
-    use llama_cpp_bindings::context::LlamaContext;
-    use llama_cpp_bindings::llama_batch::LlamaBatch;
-    use llama_cpp_bindings::model::AddBos;
-    use llama_cpp_bindings::model::LlamaLoraAdapter;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
-    // =========================================================================================
-    // Group A: default Qwen model, embeddings=false. Most context tests fall here.
-    // =========================================================================================
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn context_creation_and_properties(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let context = LlamaContext::from_model(
-            fixture.model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-
-        assert!(context.n_ctx() > 0);
-        assert!(context.n_batch() > 0);
-        assert!(context.n_ubatch() > 0);
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn decode_and_get_logits(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = LlamaContext::from_model(
-            fixture.model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-        let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
-        let mut batch = LlamaBatch::new(512, 1)?;
-        batch.add_sequence(&tokens, 0, false)?;
-
-        let decode_result = context.decode(&mut batch);
-        assert!(decode_result.is_ok());
-
-        let logits = context.get_logits()?;
-        assert!(!logits.is_empty());
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn timings_work(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = LlamaContext::from_model(
-            fixture.model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-
-        context.reset_timings();
-        let timings = context.timings();
-        assert!(timings.t_start_ms() >= 0.0);
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn token_data_array_has_entries_after_decode(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = LlamaContext::from_model(
-            fixture.model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-        let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
-        let mut batch = LlamaBatch::new(512, 1)?;
-        batch.add_sequence(&tokens, 0, false)?;
-        context.decode(&mut batch)?;
-
-        let token_data_array = context.token_data_array()?;
-
-        assert!(!token_data_array.data.is_empty());
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn get_logits_ith_returns_valid_slice(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = LlamaContext::from_model(
-            fixture.model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-        let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
-        let last_index = i32::try_from(tokens.len() - 1)?;
-        let mut batch = LlamaBatch::new(512, 1)?;
-        batch.add_sequence(&tokens, 0, false)?;
-        context.decode(&mut batch)?;
-
-        let logits = context.get_logits_ith(last_index)?;
-
-        assert_eq!(logits.len(), usize::try_from(fixture.model.n_vocab())?);
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn token_data_array_ith_returns_valid_data(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = LlamaContext::from_model(
-            fixture.model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-        let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
-        let last_index = i32::try_from(tokens.len() - 1)?;
-        let mut batch = LlamaBatch::new(512, 1)?;
-        batch.add_sequence(&tokens, 0, false)?;
-        context.decode(&mut batch)?;
-
-        let token_data_array = context.token_data_array_ith(last_index)?;
-
-        assert_eq!(
-            token_data_array.data.len(),
-            usize::try_from(fixture.model.n_vocab())?
-        );
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn embeddings_ith_returns_error_when_embeddings_disabled(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let context = LlamaContext::from_model(
-            fixture.model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-
-        let result = context.embeddings_ith(0);
-
-        assert!(result.is_err());
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn embeddings_seq_ith_returns_error_when_embeddings_disabled(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let context = LlamaContext::from_model(
-            fixture.model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-
-        let result = context.embeddings_seq_ith(0);
-
-        assert!(result.is_err());
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn candidates_returns_n_vocab_entries(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = LlamaContext::from_model(
-            fixture.model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-        let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
-        let mut batch = LlamaBatch::new(512, 1)?;
-        batch.add_sequence(&tokens, 0, false)?;
-        context.decode(&mut batch)?;
-
-        let count = context.candidates()?.count();
-
-        assert_eq!(count, usize::try_from(fixture.model.n_vocab())?);
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn debug_format_contains_struct_name(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let context = LlamaContext::from_model(
-            fixture.model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-        let debug_output = format!("{context:?}");
-
-        assert!(debug_output.contains("LlamaContext"));
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn candidates_ith_returns_n_vocab_entries(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = LlamaContext::from_model(
-            fixture.model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-        let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
-        let last_index = i32::try_from(tokens.len() - 1)?;
-        let mut batch = LlamaBatch::new(512, 1)?;
-        batch.add_sequence(&tokens, 0, false)?;
-        context.decode(&mut batch)?;
-
-        let count = context.candidates_ith(last_index)?.count();
-
-        assert_eq!(count, usize::try_from(fixture.model.n_vocab())?);
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn lora_adapter_remove_succeeds_with_no_adapters(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let context = LlamaContext::from_model(
-            fixture.model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-        let mut adapter = LlamaLoraAdapter {
-            lora_adapter: NonNull::dangling(),
-        };
-
-        let result = context.lora_adapter_remove(&mut adapter);
-
-        assert!(result.is_ok());
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn encode_on_non_encoder_model_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = LlamaContext::from_model(
-            fixture.model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-        let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
-        let mut batch = LlamaBatch::new(512, 1)?;
-        batch.add_sequence(&tokens, 0, false)?;
-
-        let result = context.encode(&mut batch);
-
-        assert!(result.is_err());
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn lora_adapter_set_with_dangling_pointer_succeeds_or_errors(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let context = LlamaContext::from_model(
-            fixture.model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-        let mut adapter = LlamaLoraAdapter {
-            lora_adapter: NonNull::dangling(),
-        };
-
-        let result = context.lora_adapter_set(&mut adapter, 1.0);
-
-        assert!(result.is_ok());
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-        embeddings = true,
-    )]
-    fn embeddings_seq_ith_returns_null_embedding_error_for_invalid_seq(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let mut context = LlamaContext::from_model(
-            fixture.model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-        let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
-        let mut batch = LlamaBatch::new(512, 1)?;
-        batch.add_sequence(&tokens, 0, false)?;
-        context.decode(&mut batch)?;
-
-        let result = context.embeddings_seq_ith(999);
-
-        assert!(result.is_err());
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn decode_empty_batch_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = LlamaContext::from_model(
-            fixture.model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-        let mut batch = LlamaBatch::new(512, 1)?;
-
-        let result = context.decode(&mut batch);
-
-        assert!(result.is_err());
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn set_abort_flag_aborts_decode(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = LlamaContext::from_model(
-            fixture.model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-        let abort_flag = Arc::new(AtomicBool::new(true));
-        context.set_abort_flag(abort_flag);
-
-        let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
-        let mut batch = LlamaBatch::new(512, 1)?;
-        batch.add_sequence(&tokens, 0, false)?;
-
-        let result = context.decode(&mut batch);
-
-        assert_eq!(result, Err(DecodeError::Aborted));
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn set_abort_flag_false_allows_decode(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = LlamaContext::from_model(
-            fixture.model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-        let abort_flag = Arc::new(AtomicBool::new(false));
-        context.set_abort_flag(abort_flag);
-
-        let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
-        let mut batch = LlamaBatch::new(512, 1)?;
-        batch.add_sequence(&tokens, 0, false)?;
-
-        let result = context.decode(&mut batch);
-
-        assert!(result.is_ok());
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn clear_abort_callback_allows_decode_with_flag_true(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = LlamaContext::from_model(
-            fixture.model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-        let abort_flag = Arc::new(AtomicBool::new(true));
-        context.set_abort_flag(abort_flag);
-        context.clear_abort_callback();
-
-        let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
-        let mut batch = LlamaBatch::new(512, 1)?;
-        batch.add_sequence(&tokens, 0, false)?;
-
-        let result = context.decode(&mut batch);
-
-        assert!(result.is_ok());
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn synchronize_completes_without_panic(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let context = LlamaContext::from_model(
-            fixture.model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-
-        context.synchronize();
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn detach_threadpool_completes_without_panic(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let context = LlamaContext::from_model(
-            fixture.model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-
-        context.detach_threadpool();
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn get_logits_ith_returns_token_not_initialized_for_unknown_index(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let context = LlamaContext::from_model(
-            fixture.model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-
-        let result = context.get_logits_ith(7);
-
-        assert!(matches!(result, Err(LogitsError::TokenNotInitialized(7))));
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 64,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn get_logits_ith_returns_token_index_exceeds_context_for_huge_index(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let mut context = LlamaContext::from_model(
-            fixture.model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-
-        let huge_index = i32::try_from(context.n_ctx())?;
-        context.mark_logits_initialized(huge_index);
-        let result = context.get_logits_ith(huge_index);
-
-        assert!(matches!(
-            result,
-            Err(LogitsError::TokenIndexExceedsContext { .. })
-        ));
-
-        Ok(())
-    }
-}
-
-mod context_kv_cache {
-    use std::num::NonZeroU8;
-
-    use anyhow::Result;
-    use llama_cpp_bindings::context::LlamaContext;
-    use llama_cpp_bindings::context::kv_cache::KvCacheConversionError;
-    use llama_cpp_bindings::error::KvCacheSeqAddError;
-    use llama_cpp_bindings::error::KvCacheSeqDivError;
-    use llama_cpp_bindings::llama_batch::LlamaBatch;
-    use llama_cpp_bindings::model::AddBos;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
-    fn build_context<'context>(
-        fixture: &'context LlamaFixture<'_>,
-    ) -> Result<LlamaContext<'context>> {
-        Ok(LlamaContext::from_model(
-            fixture.model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?)
-    }
-
-    fn decode_hello_world(
-        fixture: &LlamaFixture<'_>,
-        context: &mut LlamaContext<'_>,
-    ) -> Result<()> {
-        let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
-        let mut batch = LlamaBatch::new(512, 1)?;
-        batch.add_sequence(&tokens, 0, false)?;
-        context.decode(&mut batch)?;
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn clear_kv_cache_resets_positions(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = build_context(fixture)?;
-
-        decode_hello_world(fixture, &mut context)?;
-
-        context.clear_kv_cache();
-        assert_eq!(context.kv_cache_seq_pos_max(0), -1);
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn kv_cache_seq_pos_max_is_non_negative_after_decode(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = build_context(fixture)?;
-
-        decode_hello_world(fixture, &mut context)?;
-
-        assert!(context.kv_cache_seq_pos_max(0) >= 0);
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn clear_kv_cache_seq_with_range(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = build_context(fixture)?;
-
-        decode_hello_world(fixture, &mut context)?;
-
-        let result = context.clear_kv_cache_seq(Some(0), Some(0), Some(1));
-        assert!(result.is_ok());
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn copy_kv_cache_seq_succeeds(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = build_context(fixture)?;
-
-        decode_hello_world(fixture, &mut context)?;
-
-        let result = context.copy_kv_cache_seq(0, 1, None, None);
-        assert!(result.is_ok());
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn copy_cache_executes_without_crash(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = build_context(fixture)?;
-
-        decode_hello_world(fixture, &mut context)?;
-
-        let pos_max = context.kv_cache_seq_pos_max(0);
-        context.copy_cache(0, 1, pos_max + 1);
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn kv_cache_seq_add_returns_error_for_mrope_model(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = build_context(fixture)?;
-
-        decode_hello_world(fixture, &mut context)?;
-
-        let result = context.kv_cache_seq_add(0, Some(0), None, 1);
-
-        assert!(matches!(
-            result.unwrap_err(),
-            KvCacheSeqAddError::IncompatibleRopeType,
-        ));
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn kv_cache_seq_div_returns_error_for_mrope_model(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = build_context(fixture)?;
-
-        decode_hello_world(fixture, &mut context)?;
-
-        let divisor = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?;
-        let result = context.kv_cache_seq_div(0, Some(0), None, divisor);
-
-        assert!(matches!(
-            result.unwrap_err(),
-            KvCacheSeqDivError::IncompatibleRopeType,
-        ));
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn kv_cache_seq_keep_retains_specified_sequence(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = build_context(fixture)?;
-
-        decode_hello_world(fixture, &mut context)?;
-
-        context.kv_cache_seq_keep(0);
-
-        assert!(context.kv_cache_seq_pos_max(0) >= 0);
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn copy_kv_cache_seq_with_explicit_range(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = build_context(fixture)?;
-
-        decode_hello_world(fixture, &mut context)?;
-
-        let result = context.copy_kv_cache_seq(0, 2, Some(0), Some(1));
-
-        assert!(result.is_ok());
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn kv_cache_seq_pos_max_returns_negative_one_for_unused_seq(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let context = build_context(fixture)?;
-
-        let result = context.kv_cache_seq_pos_max(999);
-
-        assert_eq!(result, -1);
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn copy_kv_cache_seq_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = build_context(fixture)?;
-
-        let result = context.copy_kv_cache_seq(0, 1, Some(u32::MAX), None);
-
-        assert!(matches!(
-            result.unwrap_err(),
-            KvCacheConversionError::P0TooLarge(_),
-        ));
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn copy_kv_cache_seq_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = build_context(fixture)?;
-
-        let result = context.copy_kv_cache_seq(0, 1, Some(0), Some(u32::MAX));
-
-        assert!(matches!(
-            result.unwrap_err(),
-            KvCacheConversionError::P1TooLarge(_),
-        ));
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn clear_kv_cache_seq_rejects_src_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = build_context(fixture)?;
-
-        let result = context.clear_kv_cache_seq(Some(u32::MAX), None, None);
-
-        assert!(matches!(
-            result.unwrap_err(),
-            KvCacheConversionError::SeqIdTooLarge(_),
-        ));
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn clear_kv_cache_seq_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = build_context(fixture)?;
-
-        let result = context.clear_kv_cache_seq(Some(0), Some(u32::MAX), None);
-
-        assert!(matches!(
-            result.unwrap_err(),
-            KvCacheConversionError::P0TooLarge(_),
-        ));
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn clear_kv_cache_seq_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = build_context(fixture)?;
-
-        let result = context.clear_kv_cache_seq(Some(0), Some(0), Some(u32::MAX));
-
-        assert!(matches!(
-            result.unwrap_err(),
-            KvCacheConversionError::P1TooLarge(_),
-        ));
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn kv_cache_seq_add_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = build_context(fixture)?;
-
-        let result = context.kv_cache_seq_add(0, Some(u32::MAX), None, 1);
-
-        assert!(matches!(
-            result.unwrap_err(),
-            KvCacheSeqAddError::P0TooLarge(_),
-        ));
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn kv_cache_seq_add_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = build_context(fixture)?;
-
-        let result = context.kv_cache_seq_add(0, Some(0), Some(u32::MAX), 1);
-
-        assert!(matches!(
-            result.unwrap_err(),
-            KvCacheSeqAddError::P1TooLarge(_),
-        ));
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn kv_cache_seq_div_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = build_context(fixture)?;
-
-        let divisor = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?;
-        let result = context.kv_cache_seq_div(0, Some(u32::MAX), None, divisor);
-
-        assert!(matches!(
-            result.unwrap_err(),
-            KvCacheSeqDivError::P0TooLarge(_),
-        ));
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn kv_cache_seq_div_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = build_context(fixture)?;
-
-        let divisor = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?;
-        let result = context.kv_cache_seq_div(0, Some(0), Some(u32::MAX), divisor);
-
-        assert!(matches!(
-            result.unwrap_err(),
-            KvCacheSeqDivError::P1TooLarge(_),
-        ));
-
-        Ok(())
-    }
-}
-
-mod context_session {
-    use anyhow::Result;
-    use llama_cpp_bindings::context::LlamaContext;
-    use llama_cpp_bindings::llama_batch::LlamaBatch;
-    use llama_cpp_bindings::model::AddBos;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
-    fn build_context<'context>(
-        fixture: &'context LlamaFixture<'_>,
-    ) -> Result<LlamaContext<'context>> {
-        Ok(LlamaContext::from_model(
-            fixture.model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?)
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn save_and_load_session_file(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = build_context(fixture)?;
-
-        let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
-        let mut batch = LlamaBatch::new(512, 1)?;
-        batch.add_sequence(&tokens, 0, false)?;
-        context.decode(&mut batch)?;
-
-        let session_path = std::env::temp_dir().join("llama_test_session.bin");
-        context.state_save_file(&session_path, &tokens)?;
-
-        let loaded_tokens = context.state_load_file(&session_path, 512)?;
-        assert_eq!(loaded_tokens, tokens);
-
-        std::fs::remove_file(&session_path)?;
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn get_state_size_is_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let context = build_context(fixture)?;
-
-        assert!(context.get_state_size() > 0);
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn state_seq_save_and_load_file_roundtrip(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = build_context(fixture)?;
-
-        let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
-        let mut batch = LlamaBatch::new(512, 1)?;
-        batch.add_sequence(&tokens, 0, false)?;
-        context.decode(&mut batch)?;
-
-        let session_path = std::env::temp_dir().join("llama_test_seq_state.bin");
-        let bytes_written = context.state_seq_save_file(&session_path, 0, &tokens)?;
-        assert!(bytes_written > 0);
-
-        let (loaded_tokens, bytes_read) = context.state_seq_load_file(&session_path, 0, 512)?;
-        assert_eq!(loaded_tokens, tokens);
-        assert!(bytes_read > 0);
-
-        std::fs::remove_file(&session_path)?;
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn copy_state_data_and_set_state_data_roundtrip(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = build_context(fixture)?;
-
-        let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
-        let mut batch = LlamaBatch::new(512, 1)?;
-        batch.add_sequence(&tokens, 0, false)?;
-        context.decode(&mut batch)?;
-
-        let state_size = context.get_state_size();
-        let mut state_data = vec![0u8; state_size];
-        let bytes_copied = unsafe { context.copy_state_data(&mut state_data) };
-        assert!(bytes_copied > 0);
-
-        let bytes_read = unsafe { context.set_state_data(&state_data) };
-        assert!(bytes_read > 0);
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn state_load_file_with_nonexistent_file_returns_error(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let mut context = build_context(fixture)?;
-
-        let result = context.state_load_file("/nonexistent/session.bin", 512);
-
-        assert!(result.is_err());
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn state_seq_load_file_with_nonexistent_file_returns_error(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let mut context = build_context(fixture)?;
-
-        let result = context.state_seq_load_file("/nonexistent/seq_state.bin", 0, 512);
-
-        assert!(result.is_err());
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn state_save_file_to_invalid_directory_returns_failed_to_save(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let context = build_context(fixture)?;
-
-        let result = context.state_save_file("/nonexistent_dir/session.bin", &[]);
-
-        assert!(result.is_err());
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn state_seq_save_file_to_invalid_directory_returns_failed_to_save(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let context = build_context(fixture)?;
-
-        let result = context.state_seq_save_file("/nonexistent_dir/seq_state.bin", 0, &[]);
-
-        assert!(result.is_err());
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn state_load_file_with_zero_max_tokens_returns_error(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let mut context = build_context(fixture)?;
-
-        let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
-        let mut batch = LlamaBatch::new(512, 1)?;
-        batch.add_sequence(&tokens, 0, false)?;
-        context.decode(&mut batch)?;
-
-        let session_path = std::env::temp_dir().join("llama_test_session_zero_max.bin");
-        context.state_save_file(&session_path, &tokens)?;
-
-        let result = context.state_load_file(&session_path, 0);
-
-        assert!(result.is_err());
-        let _ = std::fs::remove_file(&session_path);
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn state_seq_load_file_with_zero_max_tokens_returns_error(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let mut context = build_context(fixture)?;
-
-        let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
-        let mut batch = LlamaBatch::new(512, 1)?;
-        batch.add_sequence(&tokens, 0, false)?;
-        context.decode(&mut batch)?;
-
-        let session_path = std::env::temp_dir().join("llama_test_seq_state_zero_max.bin");
-        context.state_seq_save_file(&session_path, 0, &tokens)?;
-
-        let result = context.state_seq_load_file(&session_path, 0, 0);
-
-        assert!(result.is_err());
-        let _ = std::fs::remove_file(&session_path);
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn state_load_file_with_insufficient_max_tokens_returns_length_error(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let mut context = build_context(fixture)?;
-
-        let tokens = fixture.model.str_to_token(
-            "Hello world this is a longer string for more tokens",
-            AddBos::Always,
-        )?;
-        let mut batch = LlamaBatch::new(512, 1)?;
-        batch.add_sequence(&tokens, 0, false)?;
-        context.decode(&mut batch)?;
-
-        let session_path = std::env::temp_dir().join("llama_test_session_insuf.bin");
-        context.state_save_file(&session_path, &tokens)?;
-
-        let result = context.state_load_file(&session_path, 1);
-
-        assert!(result.is_err());
-        let _ = std::fs::remove_file(&session_path);
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn state_seq_load_file_with_insufficient_max_tokens_returns_length_error(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let mut context = build_context(fixture)?;
-
-        let tokens = fixture.model.str_to_token(
-            "Hello world this is a longer string for more tokens",
-            AddBos::Always,
-        )?;
-        let mut batch = LlamaBatch::new(512, 1)?;
-        batch.add_sequence(&tokens, 0, false)?;
-        context.decode(&mut batch)?;
-
-        let session_path = std::env::temp_dir().join("llama_test_seq_state_insuf.bin");
-        context.state_seq_save_file(&session_path, 0, &tokens)?;
-
-        let result = context.state_seq_load_file(&session_path, 0, 1);
-
-        assert!(result.is_err());
-        let _ = std::fs::remove_file(&session_path);
-
-        Ok(())
-    }
-
-    #[cfg(unix)]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn state_save_file_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
-        use std::ffi::OsStr;
-        use std::os::unix::ffi::OsStrExt;
-
-        let context = build_context(fixture)?;
-
-        let non_utf8_path = std::path::Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.bin"));
-        let result = context.state_save_file(non_utf8_path, &[]);
-
-        assert!(result.is_err());
-
-        Ok(())
-    }
-
-    #[cfg(unix)]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn state_load_file_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
-        use std::ffi::OsStr;
-        use std::os::unix::ffi::OsStrExt;
-
-        let mut context = build_context(fixture)?;
-
-        let non_utf8_path = std::path::Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.bin"));
-        let result = context.state_load_file(non_utf8_path, 512);
-
-        assert!(result.is_err());
-
-        Ok(())
-    }
-
-    #[cfg(unix)]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn state_seq_save_file_with_non_utf8_path_returns_error(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        use std::ffi::OsStr;
-        use std::os::unix::ffi::OsStrExt;
-
-        let context = build_context(fixture)?;
-
-        let non_utf8_path = std::path::Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.bin"));
-        let result = context.state_seq_save_file(non_utf8_path, 0, &[]);
-
-        assert!(result.is_err());
-
-        Ok(())
-    }
-
-    #[cfg(unix)]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn state_seq_load_file_with_non_utf8_path_returns_error(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        use std::ffi::OsStr;
-        use std::os::unix::ffi::OsStrExt;
-
-        let mut context = build_context(fixture)?;
-
-        let non_utf8_path = std::path::Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.bin"));
-        let result = context.state_seq_load_file(non_utf8_path, 0, 512);
-
-        assert!(result.is_err());
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn state_save_file_with_null_byte_in_path_returns_error(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let context = build_context(fixture)?;
-
-        let path_with_null = std::path::Path::new("/tmp/foo\0bar.bin");
-        let result = context.state_save_file(path_with_null, &[]);
-
-        assert!(result.is_err());
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn state_load_file_with_null_byte_in_path_returns_error(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let mut context = build_context(fixture)?;
-
-        let path_with_null = std::path::Path::new("/tmp/foo\0bar.bin");
-        let result = context.state_load_file(path_with_null, 512);
-
-        assert!(result.is_err());
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn state_seq_save_file_with_null_byte_in_path_returns_error(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let context = build_context(fixture)?;
-
-        let path_with_null = std::path::Path::new("/tmp/foo\0bar.bin");
-        let result = context.state_seq_save_file(path_with_null, 0, &[]);
-
-        assert!(result.is_err());
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn state_seq_load_file_with_null_byte_in_path_returns_error(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let mut context = build_context(fixture)?;
-
-        let path_with_null = std::path::Path::new("/tmp/foo\0bar.bin");
-        let result = context.state_seq_load_file(path_with_null, 0, 512);
-
-        assert!(result.is_err());
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn state_seq_get_size_ext_returns_size_for_decoded_sequence(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        use llama_cpp_bindings::context::llama_state_seq_flags::LlamaStateSeqFlags;
-
-        let mut context = build_context(fixture)?;
-
-        let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
-        let mut batch = LlamaBatch::new(512, 1)?;
-        batch.add_sequence(&tokens, 0, false)?;
-        context.decode(&mut batch)?;
-
-        let flags = LlamaStateSeqFlags::empty();
-        let size = context.state_seq_get_size_ext(0, &flags);
-
-        assert!(size > 0);
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn state_seq_get_data_ext_and_set_data_ext_round_trip(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        use llama_cpp_bindings::context::llama_state_seq_flags::LlamaStateSeqFlags;
-
-        let mut context = build_context(fixture)?;
-
-        let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
-        let mut batch = LlamaBatch::new(512, 1)?;
-        batch.add_sequence(&tokens, 0, false)?;
-        context.decode(&mut batch)?;
-
-        let flags = LlamaStateSeqFlags::empty();
-        let size = context.state_seq_get_size_ext(0, &flags);
-        let mut buffer = vec![0u8; size];
-        let bytes_written = unsafe { context.state_seq_get_data_ext(&mut buffer, 0, &flags) };
-
-        assert!(bytes_written > 0);
-
-        let bytes_read = unsafe { context.state_seq_set_data_ext(&buffer, 0, &flags) };
-
-        assert!(bytes_read > 0);
-
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 256,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 256,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 256,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 256,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn new_context_returns_valid_context(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let context = LlamaContext::from_model(
+        fixture.model,
+        fixture.backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+
+    assert!(context.n_ctx() > 0);
+    Ok(())
 }
 
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 4294967295,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 4294967295,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 4294967295,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 4294967295,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn new_context_with_huge_ctx_returns_null_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let result = fixture.build_context();
+
+    assert!(result.is_err());
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn context_creation_and_properties(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let context = LlamaContext::from_model(
+        fixture.model,
+        fixture.backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+
+    assert!(context.n_ctx() > 0);
+    assert!(context.n_batch() > 0);
+    assert!(context.n_ubatch() > 0);
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn decode_and_get_logits(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let mut context = LlamaContext::from_model(
+        fixture.model,
+        fixture.backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+    let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+    let mut batch = LlamaBatch::new(512, 1)?;
+    batch.add_sequence(&tokens, 0, false)?;
+
+    let decode_result = context.decode(&mut batch);
+    assert!(decode_result.is_ok());
+
+    let logits = context.get_logits()?;
+    assert!(!logits.is_empty());
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn timings_work(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let mut context = LlamaContext::from_model(
+        fixture.model,
+        fixture.backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+
+    context.reset_timings();
+    let timings = context.timings();
+    assert!(timings.t_start_ms() >= 0.0);
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn token_data_array_has_entries_after_decode(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let mut context = LlamaContext::from_model(
+        fixture.model,
+        fixture.backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+    let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+    let mut batch = LlamaBatch::new(512, 1)?;
+    batch.add_sequence(&tokens, 0, false)?;
+    context.decode(&mut batch)?;
+
+    let token_data_array = context.token_data_array()?;
+
+    assert!(!token_data_array.data.is_empty());
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn get_logits_ith_returns_valid_slice(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let mut context = LlamaContext::from_model(
+        fixture.model,
+        fixture.backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+    let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+    let last_index = i32::try_from(tokens.len() - 1)?;
+    let mut batch = LlamaBatch::new(512, 1)?;
+    batch.add_sequence(&tokens, 0, false)?;
+    context.decode(&mut batch)?;
+
+    let logits = context.get_logits_ith(last_index)?;
+
+    assert_eq!(logits.len(), usize::try_from(fixture.model.n_vocab())?);
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn token_data_array_ith_returns_valid_data(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let mut context = LlamaContext::from_model(
+        fixture.model,
+        fixture.backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+    let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+    let last_index = i32::try_from(tokens.len() - 1)?;
+    let mut batch = LlamaBatch::new(512, 1)?;
+    batch.add_sequence(&tokens, 0, false)?;
+    context.decode(&mut batch)?;
+
+    let token_data_array = context.token_data_array_ith(last_index)?;
+
+    assert_eq!(
+        token_data_array.data.len(),
+        usize::try_from(fixture.model.n_vocab())?
+    );
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn embeddings_ith_returns_error_when_embeddings_disabled(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let context = LlamaContext::from_model(
+        fixture.model,
+        fixture.backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+
+    let result = context.embeddings_ith(0);
+
+    assert!(result.is_err());
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn embeddings_seq_ith_returns_error_when_embeddings_disabled(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    let context = LlamaContext::from_model(
+        fixture.model,
+        fixture.backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+
+    let result = context.embeddings_seq_ith(0);
+
+    assert!(result.is_err());
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn candidates_returns_n_vocab_entries(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let mut context = LlamaContext::from_model(
+        fixture.model,
+        fixture.backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+    let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+    let mut batch = LlamaBatch::new(512, 1)?;
+    batch.add_sequence(&tokens, 0, false)?;
+    context.decode(&mut batch)?;
+
+    let count = context.candidates()?.count();
+
+    assert_eq!(count, usize::try_from(fixture.model.n_vocab())?);
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn debug_format_contains_struct_name(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let context = LlamaContext::from_model(
+        fixture.model,
+        fixture.backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+    let debug_output = format!("{context:?}");
+
+    assert!(debug_output.contains("LlamaContext"));
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn candidates_ith_returns_n_vocab_entries(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let mut context = LlamaContext::from_model(
+        fixture.model,
+        fixture.backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+    let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+    let last_index = i32::try_from(tokens.len() - 1)?;
+    let mut batch = LlamaBatch::new(512, 1)?;
+    batch.add_sequence(&tokens, 0, false)?;
+    context.decode(&mut batch)?;
+
+    let count = context.candidates_ith(last_index)?.count();
+
+    assert_eq!(count, usize::try_from(fixture.model.n_vocab())?);
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn lora_adapter_remove_succeeds_with_no_adapters(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let context = LlamaContext::from_model(
+        fixture.model,
+        fixture.backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+    let mut adapter = LlamaLoraAdapter {
+        lora_adapter: NonNull::dangling(),
+    };
+
+    let result = context.lora_adapter_remove(&mut adapter);
+
+    assert!(result.is_ok());
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn encode_on_non_encoder_model_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let mut context = LlamaContext::from_model(
+        fixture.model,
+        fixture.backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+    let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+    let mut batch = LlamaBatch::new(512, 1)?;
+    batch.add_sequence(&tokens, 0, false)?;
+
+    let result = context.encode(&mut batch);
+
+    assert!(result.is_err());
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn lora_adapter_set_with_dangling_pointer_succeeds_or_errors(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    let context = LlamaContext::from_model(
+        fixture.model,
+        fixture.backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+    let mut adapter = LlamaLoraAdapter {
+        lora_adapter: NonNull::dangling(),
+    };
+
+    let result = context.lora_adapter_set(&mut adapter, 1.0);
+
+    assert!(result.is_ok());
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+    embeddings = true,
+)]
+fn embeddings_seq_ith_returns_null_embedding_error_for_invalid_seq(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    let mut context = LlamaContext::from_model(
+        fixture.model,
+        fixture.backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+    let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+    let mut batch = LlamaBatch::new(512, 1)?;
+    batch.add_sequence(&tokens, 0, false)?;
+    context.decode(&mut batch)?;
+
+    let result = context.embeddings_seq_ith(999);
+
+    assert!(result.is_err());
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn decode_empty_batch_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let mut context = LlamaContext::from_model(
+        fixture.model,
+        fixture.backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+    let mut batch = LlamaBatch::new(512, 1)?;
+
+    let result = context.decode(&mut batch);
+
+    assert!(result.is_err());
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn set_abort_flag_aborts_decode(fixture: &LlamaFixture<'_>) -> Result<()> {
+    // The abort flag is installed already set to `true`, so the decode below
+    // is expected to come back as `DecodeError::Aborted`.
+    let params = (*fixture.context_params).into_llama_context_params();
+    let mut ctx = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+    ctx.set_abort_flag(Arc::new(AtomicBool::new(true)));
+
+    let prompt_tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+    let mut prompt_batch = LlamaBatch::new(512, 1)?;
+    prompt_batch.add_sequence(&prompt_tokens, 0, false)?;
+
+    assert_eq!(ctx.decode(&mut prompt_batch), Err(DecodeError::Aborted));
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn set_abort_flag_false_allows_decode(fixture: &LlamaFixture<'_>) -> Result<()> {
+    // An abort flag is installed but left `false`; decoding a short prompt is
+    // then expected to succeed.
+    let params = (*fixture.context_params).into_llama_context_params();
+    let mut ctx = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+    ctx.set_abort_flag(Arc::new(AtomicBool::new(false)));
+
+    let prompt_tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+    let mut prompt_batch = LlamaBatch::new(512, 1)?;
+    prompt_batch.add_sequence(&prompt_tokens, 0, false)?;
+
+    assert!(ctx.decode(&mut prompt_batch).is_ok());
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn clear_abort_callback_allows_decode_with_flag_true(fixture: &LlamaFixture<'_>) -> Result<()> {
+    // Installs an abort flag that is already `true`, then clears the abort
+    // callback again before decoding. The decode is expected to succeed.
+    let params = (*fixture.context_params).into_llama_context_params();
+    let mut ctx = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+    ctx.set_abort_flag(Arc::new(AtomicBool::new(true)));
+    ctx.clear_abort_callback();
+
+    let prompt_tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+    let mut prompt_batch = LlamaBatch::new(512, 1)?;
+    prompt_batch.add_sequence(&prompt_tokens, 0, false)?;
+
+    assert!(ctx.decode(&mut prompt_batch).is_ok());
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn synchronize_completes_without_panic(fixture: &LlamaFixture<'_>) -> Result<()> {
+    // Smoke test: `synchronize` is called on a fresh context and the test
+    // passes as long as the call returns.
+    let params = (*fixture.context_params).into_llama_context_params();
+    let ctx = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+    ctx.synchronize();
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn detach_threadpool_completes_without_panic(fixture: &LlamaFixture<'_>) -> Result<()> {
+    // Smoke test: `detach_threadpool` is called on a fresh context and the
+    // test passes as long as the call returns.
+    let params = (*fixture.context_params).into_llama_context_params();
+    let ctx = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+    ctx.detach_threadpool();
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn get_logits_ith_returns_token_not_initialized_for_unknown_index(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    // Nothing has been decoded on this context, so asking for the logits of
+    // index 7 is expected to yield `LogitsError::TokenNotInitialized(7)`.
+    let params = (*fixture.context_params).into_llama_context_params();
+    let ctx = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+    assert!(matches!(
+        ctx.get_logits_ith(7),
+        Err(LogitsError::TokenNotInitialized(7))
+    ));
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 64,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn get_logits_ith_returns_token_index_exceeds_context_for_huge_index(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    // Uses the context size itself as the token index and expects
+    // `LogitsError::TokenIndexExceedsContext`.
+    let params = (*fixture.context_params).into_llama_context_params();
+    let mut ctx = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+    let out_of_range_index = i32::try_from(ctx.n_ctx())?;
+    // NOTE(review): the index is marked as initialized first, presumably so the
+    // lookup is not rejected as "not initialized" before the range check runs.
+    ctx.mark_logits_initialized(out_of_range_index);
+
+    assert!(matches!(
+        ctx.get_logits_ith(out_of_range_index),
+        Err(LogitsError::TokenIndexExceedsContext { .. })
+    ));
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn clear_kv_cache_resets_positions(fixture: &LlamaFixture<'_>) -> Result<()> {
+    // After `prime_kv_cache` (presumably filling sequence 0 — see that helper)
+    // the whole cache is cleared; sequence 0 must then report a maximum
+    // position of -1.
+    let mut ctx = fixture.build_context()?;
+    prime_kv_cache(fixture, &mut ctx)?;
+
+    ctx.clear_kv_cache();
+
+    let highest_position = ctx.kv_cache_seq_pos_max(0);
+    assert_eq!(highest_position, -1);
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn kv_cache_seq_pos_max_is_non_negative_after_decode(fixture: &LlamaFixture<'_>) -> Result<()> {
+    // Once `prime_kv_cache` has run, the maximum position reported for
+    // sequence 0 must be zero or greater.
+    let mut ctx = fixture.build_context()?;
+    prime_kv_cache(fixture, &mut ctx)?;
+
+    let highest_position = ctx.kv_cache_seq_pos_max(0);
+    assert!(highest_position >= 0);
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn clear_kv_cache_seq_with_range(fixture: &LlamaFixture<'_>) -> Result<()> {
+    // Clears sequence 0 with explicit range arguments `Some(0)` / `Some(1)`
+    // after priming the cache; only the success of the call is asserted.
+    let mut ctx = fixture.build_context()?;
+    prime_kv_cache(fixture, &mut ctx)?;
+
+    assert!(ctx.clear_kv_cache_seq(Some(0), Some(0), Some(1)).is_ok());
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn copy_kv_cache_seq_succeeds(fixture: &LlamaFixture<'_>) -> Result<()> {
+    // Copies sequence 0 into sequence 1 without range bounds (`None`, `None`)
+    // after priming the cache; only the success of the call is asserted.
+    let mut ctx = fixture.build_context()?;
+    prime_kv_cache(fixture, &mut ctx)?;
+
+    assert!(ctx.copy_kv_cache_seq(0, 1, None, None).is_ok());
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn copy_cache_executes_without_crash(fixture: &LlamaFixture<'_>) -> Result<()> {
+    // Smoke test: `copy_cache` is invoked from sequence 0 to sequence 1 with
+    // "max position of sequence 0, plus one" as its third argument. Nothing
+    // is asserted beyond the call returning.
+    let mut ctx = fixture.build_context()?;
+    prime_kv_cache(fixture, &mut ctx)?;
+
+    let one_past_highest = ctx.kv_cache_seq_pos_max(0) + 1;
+    ctx.copy_cache(0, 1, one_past_highest);
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn kv_cache_seq_add_returns_error_for_mrope_model(fixture: &LlamaFixture<'_>) -> Result<()> {
+    // Shifting the positions of sequence 0 by 1 is expected to be refused
+    // with `KvCacheSeqAddError::IncompatibleRopeType` for the models this
+    // test is registered against.
+    let mut ctx = fixture.build_context()?;
+    prime_kv_cache(fixture, &mut ctx)?;
+
+    let error = ctx.kv_cache_seq_add(0, Some(0), None, 1).unwrap_err();
+
+    assert!(matches!(error, KvCacheSeqAddError::IncompatibleRopeType));
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn kv_cache_seq_div_returns_error_for_mrope_model(fixture: &LlamaFixture<'_>) -> Result<()> {
+    // Dividing the positions of sequence 0 by 2 is expected to be refused
+    // with `KvCacheSeqDivError::IncompatibleRopeType` for the models this
+    // test is registered against.
+    let mut ctx = fixture.build_context()?;
+    prime_kv_cache(fixture, &mut ctx)?;
+
+    let by_two = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?;
+    let error = ctx.kv_cache_seq_div(0, Some(0), None, by_two).unwrap_err();
+
+    assert!(matches!(error, KvCacheSeqDivError::IncompatibleRopeType));
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn kv_cache_seq_keep_retains_specified_sequence(fixture: &LlamaFixture<'_>) -> Result<()> {
+    // After `kv_cache_seq_keep(0)`, sequence 0 must still report a
+    // non-negative maximum position. Other sequences are not inspected.
+    let mut ctx = fixture.build_context()?;
+    prime_kv_cache(fixture, &mut ctx)?;
+
+    ctx.kv_cache_seq_keep(0);
+
+    let highest_position = ctx.kv_cache_seq_pos_max(0);
+    assert!(highest_position >= 0);
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn copy_kv_cache_seq_with_explicit_range(fixture: &LlamaFixture<'_>) -> Result<()> {
+    // Copies sequence 0 into sequence 2 with explicit bounds `Some(0)` /
+    // `Some(1)`; only the success of the call is asserted.
+    let mut ctx = fixture.build_context()?;
+    prime_kv_cache(fixture, &mut ctx)?;
+
+    assert!(ctx.copy_kv_cache_seq(0, 2, Some(0), Some(1)).is_ok());
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn kv_cache_seq_pos_max_returns_negative_one_for_unused_seq(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    // Nothing is decoded on this context, so querying sequence 999 must
+    // report -1.
+    let ctx = fixture.build_context()?;
+
+    assert_eq!(ctx.kv_cache_seq_pos_max(999), -1);
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn copy_kv_cache_seq_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
+    // `u32::MAX` as the start position does not fit in an `i32`; the call
+    // must fail with `KvCacheConversionError::P0TooLarge`.
+    let mut ctx = fixture.build_context()?;
+
+    let error = ctx
+        .copy_kv_cache_seq(0, 1, Some(u32::MAX), None)
+        .unwrap_err();
+
+    assert!(matches!(error, KvCacheConversionError::P0TooLarge(_)));
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn copy_kv_cache_seq_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
+    // `u32::MAX` as the end position does not fit in an `i32`; the call must
+    // fail with `KvCacheConversionError::P1TooLarge`.
+    let mut ctx = fixture.build_context()?;
+
+    let error = ctx
+        .copy_kv_cache_seq(0, 1, Some(0), Some(u32::MAX))
+        .unwrap_err();
+
+    assert!(matches!(error, KvCacheConversionError::P1TooLarge(_)));
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn clear_kv_cache_seq_rejects_src_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
+    // `u32::MAX` as the sequence id does not fit in an `i32`; the call must
+    // fail with `KvCacheConversionError::SeqIdTooLarge`.
+    let mut ctx = fixture.build_context()?;
+
+    let error = ctx
+        .clear_kv_cache_seq(Some(u32::MAX), None, None)
+        .unwrap_err();
+
+    assert!(matches!(error, KvCacheConversionError::SeqIdTooLarge(_)));
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn clear_kv_cache_seq_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
+    // `u32::MAX` as the start position does not fit in an `i32`; the call
+    // must fail with `KvCacheConversionError::P0TooLarge`.
+    let mut ctx = fixture.build_context()?;
+
+    let error = ctx
+        .clear_kv_cache_seq(Some(0), Some(u32::MAX), None)
+        .unwrap_err();
+
+    assert!(matches!(error, KvCacheConversionError::P0TooLarge(_)));
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn clear_kv_cache_seq_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
+    // `u32::MAX` as the end position does not fit in an `i32`; the call must
+    // fail with `KvCacheConversionError::P1TooLarge`.
+    let mut ctx = fixture.build_context()?;
+
+    let error = ctx
+        .clear_kv_cache_seq(Some(0), Some(0), Some(u32::MAX))
+        .unwrap_err();
+
+    assert!(matches!(error, KvCacheConversionError::P1TooLarge(_)));
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn kv_cache_seq_add_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
+    // `u32::MAX` as the start position does not fit in an `i32`; the call
+    // must fail with `KvCacheSeqAddError::P0TooLarge`.
+    let mut ctx = fixture.build_context()?;
+
+    let error = ctx
+        .kv_cache_seq_add(0, Some(u32::MAX), None, 1)
+        .unwrap_err();
+
+    assert!(matches!(error, KvCacheSeqAddError::P0TooLarge(_)));
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn kv_cache_seq_add_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
+    // `u32::MAX` as the end position does not fit in an `i32`; the call must
+    // fail with `KvCacheSeqAddError::P1TooLarge`.
+    let mut ctx = fixture.build_context()?;
+
+    let error = ctx
+        .kv_cache_seq_add(0, Some(0), Some(u32::MAX), 1)
+        .unwrap_err();
+
+    assert!(matches!(error, KvCacheSeqAddError::P1TooLarge(_)));
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn kv_cache_seq_div_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
+    // `u32::MAX` as the start position does not fit in an `i32`; the call
+    // must fail with `KvCacheSeqDivError::P0TooLarge`.
+    let mut ctx = fixture.build_context()?;
+
+    let by_two = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?;
+    let error = ctx
+        .kv_cache_seq_div(0, Some(u32::MAX), None, by_two)
+        .unwrap_err();
+
+    assert!(matches!(error, KvCacheSeqDivError::P0TooLarge(_)));
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn kv_cache_seq_div_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
+    // `u32::MAX` as the end position does not fit in an `i32`; the call must
+    // fail with `KvCacheSeqDivError::P1TooLarge`.
+    let mut ctx = fixture.build_context()?;
+
+    let by_two = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?;
+    let error = ctx
+        .kv_cache_seq_div(0, Some(0), Some(u32::MAX), by_two)
+        .unwrap_err();
+
+    assert!(matches!(error, KvCacheSeqDivError::P1TooLarge(_)));
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn save_and_load_session_file(fixture: &LlamaFixture<'_>) -> Result<()> {
+    // Round-trips the context state through a session file: decode a short
+    // prompt, save the state together with the prompt tokens, load it back,
+    // and check that the tokens read from disk equal the ones written.
+    let mut context = fixture.build_context()?;
+
+    let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
+    let mut batch = LlamaBatch::new(512, 1)?;
+    batch.add_sequence(&tokens, 0, false)?;
+    context.decode(&mut batch)?;
+
+    // The file name embeds the process id and the thread id so that the
+    // model variants of this test, or several test processes sharing one
+    // temp directory, cannot overwrite or delete each other's session file.
+    let session_path = std::env::temp_dir().join(format!(
+        "llama_test_session_{}_{:?}.bin",
+        std::process::id(),
+        std::thread::current().id(),
+    ));
+    context.state_save_file(&session_path, &tokens)?;
+
+    // Remove the file before inspecting the outcome, so that a failed load or
+    // a failed assertion does not leave it behind in the temp directory.
+    let load_result = context.state_load_file(&session_path, 512);
+    let cleanup_result = std::fs::remove_file(&session_path);
+
+    // A load error is reported in preference to a cleanup error.
+    let loaded_tokens = load_result?;
+    cleanup_result?;
+    assert_eq!(loaded_tokens, tokens);
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn get_state_size_is_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
+    // A freshly built context must report a state size greater than zero.
+    let ctx = fixture.build_context()?;
+
+    let state_size = ctx.get_state_size();
+    assert!(state_size > 0);
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn state_seq_save_and_load_file_roundtrip(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let mut context = fixture.build_context()?;
+
+    let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
+    let mut batch = LlamaBatch::new(512, 1)?;
+    batch.add_sequence(&tokens, 0, false)?;
+    context.decode(&mut batch)?;
+
+    let session_path = std::env::temp_dir().join("llama_test_seq_state.bin");
+    let bytes_written = context.state_seq_save_file(&session_path, 0, &tokens)?;
+    // Load, then delete the temp file before any `?` or assertion can bail out and leak it.
+    let load_result = context.state_seq_load_file(&session_path, 0, 512);
+    let cleanup_result = std::fs::remove_file(&session_path);
+    let (loaded_tokens, bytes_read) = load_result?;
+    cleanup_result?;
+    assert!(bytes_written > 0);
+    assert_eq!(loaded_tokens, tokens);
+    assert!(bytes_read > 0);
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn copy_state_data_and_set_state_data_roundtrip(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let mut ctx = fixture.build_context()?;
+
+    let prompt_tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
+    let mut prompt_batch = LlamaBatch::new(512, 1)?;
+    prompt_batch.add_sequence(&prompt_tokens, 0, false)?;
+    ctx.decode(&mut prompt_batch)?;
+
+    // SAFETY(review): the buffer is allocated with `get_state_size()` bytes right here;
+    // presumably that meets the size contract of `copy_state_data` — confirm in its docs.
+    let mut snapshot = vec![0u8; ctx.get_state_size()];
+    let copied_len = unsafe { ctx.copy_state_data(&mut snapshot) };
+    assert!(copied_len > 0);
+
+    let restored_len = unsafe { ctx.set_state_data(&snapshot) };
+    assert!(restored_len > 0);
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn state_load_file_with_nonexistent_file_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+    // Loading a session from a path that does not exist is expected to yield `Err`.
+    let mut ctx = fixture.build_context()?;
+    let missing_session = "/nonexistent/session.bin";
+
+    let load_outcome = ctx.state_load_file(missing_session, 512);
+    assert!(load_outcome.is_err());
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn state_seq_load_file_with_nonexistent_file_returns_error(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    // Loading sequence state from a path that does not exist is expected to yield `Err`.
+    let mut ctx = fixture.build_context()?;
+    let missing_seq_state = "/nonexistent/seq_state.bin";
+
+    let load_outcome = ctx.state_seq_load_file(missing_seq_state, 0, 512);
+    assert!(load_outcome.is_err());
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn state_save_file_to_invalid_directory_returns_failed_to_save(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    // Saving into a directory that does not exist is expected to yield `Err`.
+    let ctx = fixture.build_context()?;
+    let unwritable_target = "/nonexistent_dir/session.bin";
+
+    let save_outcome = ctx.state_save_file(unwritable_target, &[]);
+    assert!(save_outcome.is_err());
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn state_seq_save_file_to_invalid_directory_returns_failed_to_save(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    // Saving sequence state into a directory that does not exist is expected to yield `Err`.
+    let ctx = fixture.build_context()?;
+    let unwritable_target = "/nonexistent_dir/seq_state.bin";
+
+    let save_outcome = ctx.state_seq_save_file(unwritable_target, 0, &[]);
+    assert!(save_outcome.is_err());
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn state_load_file_with_zero_max_tokens_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let mut context = fixture.build_context()?;
+
+    let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
+    let mut batch = LlamaBatch::new(512, 1)?;
+    batch.add_sequence(&tokens, 0, false)?;
+    context.decode(&mut batch)?;
+
+    let session_path = std::env::temp_dir().join("llama_test_session_zero_max.bin");
+    context.state_save_file(&session_path, &tokens)?;
+
+    let result = context.state_load_file(&session_path, 0);
+    // Best-effort cleanup runs first so a failing assertion cannot leak the temp file.
+    let _ = std::fs::remove_file(&session_path);
+    assert!(result.is_err());
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn state_seq_load_file_with_zero_max_tokens_returns_error(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    let mut context = fixture.build_context()?;
+
+    let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
+    let mut batch = LlamaBatch::new(512, 1)?;
+    batch.add_sequence(&tokens, 0, false)?;
+    context.decode(&mut batch)?;
+
+    let session_path = std::env::temp_dir().join("llama_test_seq_state_zero_max.bin");
+    context.state_seq_save_file(&session_path, 0, &tokens)?;
+
+    let result = context.state_seq_load_file(&session_path, 0, 0);
+    // Best-effort cleanup runs first so a failing assertion cannot leak the temp file.
+    let _ = std::fs::remove_file(&session_path);
+    assert!(result.is_err());
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn state_load_file_with_insufficient_max_tokens_returns_length_error(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    let mut context = fixture.build_context()?;
+
+    let tokens = fixture.model.str_to_token(
+        "Hello world this is a longer string for more tokens",
+        AddBos::Always,
+    )?;
+    let mut batch = LlamaBatch::new(512, 1)?;
+    batch.add_sequence(&tokens, 0, false)?;
+    context.decode(&mut batch)?;
+
+    let session_path = std::env::temp_dir().join("llama_test_session_insuf.bin");
+    context.state_save_file(&session_path, &tokens)?;
+
+    let result = context.state_load_file(&session_path, 1);
+    // Best-effort cleanup runs first so a failing assertion cannot leak the temp file.
+    let _ = std::fs::remove_file(&session_path);
+    assert!(result.is_err());
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn state_seq_load_file_with_insufficient_max_tokens_returns_length_error(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    let mut context = fixture.build_context()?;
+
+    let tokens = fixture.model.str_to_token(
+        "Hello world this is a longer string for more tokens",
+        AddBos::Always,
+    )?;
+    let mut batch = LlamaBatch::new(512, 1)?;
+    batch.add_sequence(&tokens, 0, false)?;
+    context.decode(&mut batch)?;
+
+    let session_path = std::env::temp_dir().join("llama_test_seq_state_insuf.bin");
+    context.state_seq_save_file(&session_path, 0, &tokens)?;
+
+    let result = context.state_seq_load_file(&session_path, 0, 1);
+    // Best-effort cleanup runs first so a failing assertion cannot leak the temp file.
+    let _ = std::fs::remove_file(&session_path);
+    assert!(result.is_err());
+
+    Ok(())
+}
+
+#[cfg(unix)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn state_save_file_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+    use std::ffi::OsStr;
+    use std::os::unix::ffi::OsStrExt;
+
+    // 0xFF and 0xFE never occur in valid UTF-8, so this path cannot be viewed as `&str`.
+    let invalid_utf8_name = OsStr::from_bytes(b"/tmp/\xff\xfe.bin");
+    let invalid_utf8_path = std::path::Path::new(invalid_utf8_name);
+
+    let ctx = fixture.build_context()?;
+    let save_outcome = ctx.state_save_file(invalid_utf8_path, &[]);
+    assert!(save_outcome.is_err());
+    Ok(())
+}
+
+#[cfg(unix)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn state_load_file_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+    use std::ffi::OsStr;
+    use std::os::unix::ffi::OsStrExt;
+
+    // 0xFF and 0xFE never occur in valid UTF-8, so this path cannot be viewed as `&str`.
+    let invalid_utf8_name = OsStr::from_bytes(b"/tmp/\xff\xfe.bin");
+    let invalid_utf8_path = std::path::Path::new(invalid_utf8_name);
+
+    let mut ctx = fixture.build_context()?;
+    let load_outcome = ctx.state_load_file(invalid_utf8_path, 512);
+    assert!(load_outcome.is_err());
+    Ok(())
+}
+
+#[cfg(unix)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn state_seq_save_file_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+    use std::ffi::OsStr;
+    use std::os::unix::ffi::OsStrExt;
+
+    // 0xFF and 0xFE never occur in valid UTF-8, so this path cannot be viewed as `&str`.
+    let invalid_utf8_name = OsStr::from_bytes(b"/tmp/\xff\xfe.bin");
+    let invalid_utf8_path = std::path::Path::new(invalid_utf8_name);
+
+    let ctx = fixture.build_context()?;
+    let save_outcome = ctx.state_seq_save_file(invalid_utf8_path, 0, &[]);
+    assert!(save_outcome.is_err());
+    Ok(())
+}
+
+#[cfg(unix)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn state_seq_load_file_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+    use std::ffi::OsStr;
+    use std::os::unix::ffi::OsStrExt;
+
+    // 0xFF and 0xFE never occur in valid UTF-8, so this path cannot be viewed as `&str`.
+    let invalid_utf8_name = OsStr::from_bytes(b"/tmp/\xff\xfe.bin");
+    let invalid_utf8_path = std::path::Path::new(invalid_utf8_name);
+
+    let mut ctx = fixture.build_context()?;
+    let load_outcome = ctx.state_seq_load_file(invalid_utf8_path, 0, 512);
+    assert!(load_outcome.is_err());
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn state_save_file_with_null_byte_in_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+    // A path with an interior NUL byte cannot be represented as a C string.
+    let nul_in_path = std::path::Path::new("/tmp/foo\0bar.bin");
+
+    let ctx = fixture.build_context()?;
+    let save_outcome = ctx.state_save_file(nul_in_path, &[]);
+
+    assert!(save_outcome.is_err());
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn state_load_file_with_null_byte_in_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+    // A path with an interior NUL byte cannot be represented as a C string.
+    let nul_in_path = std::path::Path::new("/tmp/foo\0bar.bin");
+
+    let mut ctx = fixture.build_context()?;
+    let load_outcome = ctx.state_load_file(nul_in_path, 512);
+
+    assert!(load_outcome.is_err());
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn state_seq_save_file_with_null_byte_in_path_returns_error(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    // A path with an interior NUL byte cannot be represented as a C string.
+    let nul_in_path = std::path::Path::new("/tmp/foo\0bar.bin");
+
+    let ctx = fixture.build_context()?;
+    let save_outcome = ctx.state_seq_save_file(nul_in_path, 0, &[]);
+
+    assert!(save_outcome.is_err());
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn state_seq_load_file_with_null_byte_in_path_returns_error(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    // A path with an interior NUL byte cannot be represented as a C string.
+    let nul_in_path = std::path::Path::new("/tmp/foo\0bar.bin");
+
+    let mut ctx = fixture.build_context()?;
+    let load_outcome = ctx.state_seq_load_file(nul_in_path, 0, 512);
+
+    assert!(load_outcome.is_err());
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn state_seq_get_size_ext_returns_size_for_decoded_sequence(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    use llama_cpp_bindings::context::llama_state_seq_flags::LlamaStateSeqFlags;
+
+    let mut ctx = fixture.build_context()?;
+    let prompt_tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
+    let mut prompt_batch = LlamaBatch::new(512, 1)?;
+    prompt_batch.add_sequence(&prompt_tokens, 0, false)?;
+    ctx.decode(&mut prompt_batch)?;
+
+    // Sequence 0 was decoded just above, so its reported size is expected to be non-zero.
+    let no_flags = LlamaStateSeqFlags::empty();
+    let seq_state_size = ctx.state_seq_get_size_ext(0, &no_flags);
+
+    assert!(seq_state_size > 0);
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn state_seq_get_data_ext_and_set_data_ext_round_trip(fixture: &LlamaFixture<'_>) -> Result<()> {
+    use llama_cpp_bindings::context::llama_state_seq_flags::LlamaStateSeqFlags;
+
+    let mut ctx = fixture.build_context()?;
+
+    let prompt_tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
+    let mut prompt_batch = LlamaBatch::new(512, 1)?;
+    prompt_batch.add_sequence(&prompt_tokens, 0, false)?;
+    ctx.decode(&mut prompt_batch)?;
+
+    let no_flags = LlamaStateSeqFlags::empty();
+
+    // SAFETY(review): the buffer is sized by `state_seq_get_size_ext` for the same sequence
+    // and flags; presumably that meets the contract of the calls below — confirm in docs.
+    let mut seq_snapshot = vec![0u8; ctx.state_seq_get_size_ext(0, &no_flags)];
+    let exported_len = unsafe { ctx.state_seq_get_data_ext(&mut seq_snapshot, 0, &no_flags) };
+    assert!(exported_len > 0);
+
+    let imported_len = unsafe { ctx.state_seq_set_data_ext(&seq_snapshot, 0, &no_flags) };
+    assert!(imported_len > 0);
+
+    Ok(())
+}
 llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/multimodal_vision.rs b/llama-cpp-bindings-tests/tests/multimodal_vision.rs
index 7e596be6..a137bb99 100644
--- a/llama-cpp-bindings-tests/tests/multimodal_vision.rs
+++ b/llama-cpp-bindings-tests/tests/multimodal_vision.rs
@@ -1,1099 +1,1005 @@
+#![expect(
+    clippy::unnecessary_wraps,
+    reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
+)]
+
+use anyhow::Context;
+use anyhow::Result;
+use llama_cpp_bindings::SampledToken;
+use llama_cpp_bindings::SampledTokenClassifier;
+use llama_cpp_bindings::TokenUsage;
+use llama_cpp_bindings::context::LlamaContext;
+use llama_cpp_bindings::ingest_prompt_chunk::ingest_prompt_chunk;
+use llama_cpp_bindings::llama_batch::LlamaBatch;
+use llama_cpp_bindings::model::LlamaChatMessage;
+use llama_cpp_bindings::model::LlamaModel;
+use llama_cpp_bindings::mtmd::MtmdBitmap;
+use llama_cpp_bindings::mtmd::MtmdContext;
+use llama_cpp_bindings::mtmd::MtmdContextParams;
+use llama_cpp_bindings::mtmd::MtmdEvalError;
+use llama_cpp_bindings::mtmd::MtmdInputChunkType;
+use llama_cpp_bindings::mtmd::MtmdInputChunks;
+use llama_cpp_bindings::mtmd::MtmdInputText;
+use llama_cpp_bindings::mtmd::mtmd_default_marker;
+use llama_cpp_bindings::sampling::LlamaSampler;
+use llama_cpp_bindings_sys::llama_pos;
+use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+use llama_cpp_test_harness::LlamaFixture;
+use llama_cpp_test_harness::fixtures_dir::fixtures_dir;
+use llama_cpp_test_harness::llama_test;
 use llama_cpp_test_harness::llama_tests_main;
 
-mod mtmd_bitmap {
-    #![expect(
-        clippy::unnecessary_wraps,
-        reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-    )]
-
-    use anyhow::Result;
-    use llama_cpp_bindings::mtmd::MtmdBitmap;
-    use llama_cpp_bindings_tests::test_model;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-    )]
-    fn from_buffer_creates_bitmap_from_image_bytes(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mtmd_ctx = fixture
-            .mtmd_context
-            .expect("mmproj_file declared in attribute");
-
-        let fixtures = test_model::fixtures_dir();
-        let image_path = fixtures.join("llamas.jpg");
-        let image_bytes = std::fs::read(&image_path)?;
-        let bitmap = MtmdBitmap::from_buffer(mtmd_ctx, &image_bytes)?;
-
-        assert!(bitmap.nx() > 0);
-        assert!(bitmap.ny() > 0);
-        assert!(!bitmap.is_audio());
-
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+)]
+// Reads the `llamas.jpg` fixture into memory and builds a bitmap from the raw bytes.
+fn from_buffer_creates_bitmap_from_image_bytes(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let mtmd_ctx = fixture
+        .mtmd_context
+        .expect("mmproj_file declared in attribute");
+
+    let encoded_image = std::fs::read(fixtures_dir().join("llamas.jpg"))?;
+    let bitmap = MtmdBitmap::from_buffer(mtmd_ctx, &encoded_image)?;
+
+    // `nx` and `ny` must both be positive, and the bitmap must not be flagged as audio.
+    assert!(bitmap.nx() > 0);
+    assert!(bitmap.ny() > 0);
+    assert!(!bitmap.is_audio());
+
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-    )]
-    fn from_file_with_null_byte_in_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mtmd_ctx = fixture
-            .mtmd_context
-            .expect("mmproj_file declared in attribute");
-        let result = MtmdBitmap::from_file(mtmd_ctx, "path\0null");
-
-        assert!(result.is_err());
-
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+)]
+// A path containing an interior NUL byte must make `MtmdBitmap::from_file` return an error.
+fn from_file_with_null_byte_in_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let mtmd_ctx = fixture
+        .mtmd_context
+        .expect("mmproj_file declared in attribute");
+    let path_with_nul = "path\0null";
+
+    assert!(MtmdBitmap::from_file(mtmd_ctx, path_with_nul).is_err());
+    Ok(())
 }
 
-mod mtmd_chunk_operations {
-    use anyhow::Result;
-    use llama_cpp_bindings::mtmd::MtmdBitmap;
-    use llama_cpp_bindings::mtmd::MtmdInputChunkType;
-    use llama_cpp_bindings::mtmd::MtmdInputText;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-    )]
-    fn copy_creates_owned_duplicate(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mtmd_ctx = fixture
-            .mtmd_context
-            .expect("mmproj_file declared in attribute");
-        let image_data = vec![128u8; 64 * 64 * 3];
-        let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?;
-        let input_text = MtmdInputText {
-            text: "Hello <__media__>".to_string(),
-            add_special: true,
-            parse_special: true,
-        };
-        let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
-        let first_chunk = chunks
-            .get(0)
-            .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?;
-        let copied = first_chunk.copy()?;
-
-        assert!(copied.owned);
-        assert_eq!(copied.n_tokens(), first_chunk.n_tokens());
-
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+)]
+// Copying the first tokenized chunk must give an owned chunk with the same token count.
+fn copy_creates_owned_duplicate(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let mtmd_ctx = fixture
+        .mtmd_context
+        .expect("mmproj_file declared in attribute");
+    // Synthetic input: a 64 * 64 * 3 byte buffer of 128s handed over as a 64 by 64 image.
+    let pixels = vec![128u8; 64 * 64 * 3];
+    let bitmap = MtmdBitmap::from_image_data(64, 64, &pixels)?;
+    let prompt = MtmdInputText {
+        text: "Hello <__media__>".to_string(),
+        add_special: true,
+        parse_special: true,
+    };
+    let chunks = mtmd_ctx.tokenize(prompt, &[&bitmap])?;
+    let original = chunks.get(0).context("missing first chunk")?;
+    let duplicate = original.copy()?;
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-    )]
-    fn encode_chunk_succeeds_for_image_chunk(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mtmd_ctx = fixture
-            .mtmd_context
-            .expect("mmproj_file declared in attribute");
-        let image_data = vec![128u8; 64 * 64 * 3];
-        let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?;
-        let input_text = MtmdInputText {
-            text: "Describe: <__media__>".to_string(),
-            add_special: true,
-            parse_special: true,
-        };
-        let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
-
-        for chunk_index in 0..chunks.len() {
-            let chunk = chunks
-                .get(chunk_index)
-                .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?;
-            if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) {
-                let result = mtmd_ctx.encode_chunk(&chunk);
-                assert!(result.is_ok());
-                return Ok(());
-            }
+    assert!(duplicate.owned);
+    assert_eq!(duplicate.n_tokens(), original.n_tokens());
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+)]
+// Encodes the first `Image` chunk produced for the prompt and expects the encode to succeed.
+fn encode_chunk_succeeds_for_image_chunk(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let mtmd_ctx = fixture
+        .mtmd_context
+        .expect("mmproj_file declared in attribute");
+    let image_data = vec![128u8; 64 * 64 * 3];
+    let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?;
+    let input_text = MtmdInputText {
+        text: "Describe: <__media__>".to_string(),
+        add_special: true,
+        parse_special: true,
+    };
+    let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
+    for chunk_index in 0..chunks.len() {
+        let chunk = chunks
+            .get(chunk_index)
+            .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?;
+        if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) {
+            assert!(mtmd_ctx.encode_chunk(&chunk).is_ok());
+            return Ok(());
         }
-        Ok(())
     }
+    // Reaching this point means nothing was encoded; fail instead of passing vacuously.
+    anyhow::bail!("tokenization should produce at least one Image chunk");
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-    )]
-    fn decode_use_non_causal_returns_bool_for_image_chunk(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let mtmd_ctx = fixture
-            .mtmd_context
-            .expect("mmproj_file declared in attribute");
-        let image_data = vec![128u8; 64 * 64 * 3];
-        let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?;
-        let input_text = MtmdInputText {
-            text: "Describe: <__media__>".to_string(),
-            add_special: true,
-            parse_special: true,
-        };
-        let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
-        for chunk_index in 0..chunks.len() {
-            let chunk = chunks
-                .get(chunk_index)
-                .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?;
-            if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) {
-                let value = mtmd_ctx.decode_use_non_causal(&chunk);
-                let printed = format!("{value:?}");
-                assert!(
-                    !printed.is_empty(),
-                    "decode_use_non_causal must return a Debug-printable bool"
-                );
-                return Ok(());
-            }
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+)]
+// Calls `decode_use_non_causal` on the first `Image` chunk; errors if no such chunk exists.
+fn decode_use_non_causal_returns_bool_for_image_chunk(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let mtmd_ctx = fixture
+        .mtmd_context
+        .expect("mmproj_file declared in attribute");
+    let pixels = vec![128u8; 64 * 64 * 3];
+    let bitmap = MtmdBitmap::from_image_data(64, 64, &pixels)?;
+    let prompt = MtmdInputText {
+        text: "Describe: <__media__>".to_string(),
+        add_special: true,
+        parse_special: true,
+    };
+    let chunks = mtmd_ctx.tokenize(prompt, &[&bitmap])?;
+    for chunk_index in 0..chunks.len() {
+        let missing = || anyhow::anyhow!("missing chunk at index {chunk_index}");
+        let chunk = chunks.get(chunk_index).ok_or_else(missing)?;
+        if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) {
+            // NOTE(review): Debug text of a bool is never empty; this only shows the call returned.
+            let rendered = format!("{:?}", mtmd_ctx.decode_use_non_causal(&chunk));
+            assert!(
+                !rendered.is_empty(),
+                "decode_use_non_causal must return a Debug-printable bool"
+            );
+            return Ok(());
         }
-        anyhow::bail!("tokenization should produce at least one Image chunk");
     }
+    anyhow::bail!("tokenization should produce at least one Image chunk");
 }
 
-mod mtmd_chunk_structure {
-    use anyhow::Result;
-    use llama_cpp_bindings::mtmd::MtmdBitmap;
-    use llama_cpp_bindings::mtmd::MtmdInputChunkType;
-    use llama_cpp_bindings::mtmd::MtmdInputText;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
-    fn tokenize_synthetic(
-        fixture: &LlamaFixture<'_>,
-        prompt: &str,
-    ) -> Result<llama_cpp_bindings::mtmd::MtmdInputChunks> {
-        let mtmd_ctx = fixture
-            .mtmd_context
-            .expect("mmproj_file declared in attribute");
-        let image_data = vec![128u8; 64 * 64 * 3];
-        let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?;
-        let input_text = MtmdInputText {
-            text: prompt.to_owned(),
-            add_special: true,
-            parse_special: true,
-        };
-        Ok(mtmd_ctx.tokenize(input_text, &[&bitmap])?)
-    }
+fn tokenize_synthetic(fixture: &LlamaFixture<'_>, prompt: &str) -> Result<MtmdInputChunks> {
+    let mtmd_ctx = fixture
+        .mtmd_context
+        .expect("mmproj_file declared in attribute");
+    let filler = vec![128u8; 64 * 64 * 3]; // synthetic image payload: every byte is 128
+    let bitmap = MtmdBitmap::from_image_data(64, 64, &filler)?;
+    let request = MtmdInputText {
+        text: prompt.to_owned(), // caller-chosen prompt text
+        add_special: true,
+        parse_special: true,
+    };
+    Ok(mtmd_ctx.tokenize(request, &[&bitmap])?) // one bitmap accompanies the prompt
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-    )]
-    fn text_chunk_has_text_type(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?;
-        let first_chunk = chunks
-            .get(0)
-            .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?;
-        assert_eq!(first_chunk.chunk_type()?, MtmdInputChunkType::Text);
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+)]
+// The first chunk of the tokenized prompt is expected to be a `Text` chunk.
+fn text_chunk_has_text_type(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?;
+    let leading = chunks.get(0).context("missing first chunk")?;
+    let leading_type = leading.chunk_type()?;
+    assert_eq!(leading_type, MtmdInputChunkType::Text);
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-    )]
-    fn text_chunk_returns_text_tokens(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?;
-        let first_chunk = chunks
-            .get(0)
-            .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?;
-        let tokens = first_chunk.text_tokens();
-        assert!(tokens.is_some());
-        assert!(!tokens.expect("tokens should be some").is_empty());
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+)]
+// The first chunk must expose a non-empty token list through `text_tokens`.
+fn text_chunk_returns_text_tokens(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?;
+    let leading = chunks.get(0).context("missing first chunk")?;
+
+    let maybe_tokens = leading.text_tokens();
+    assert!(maybe_tokens.is_some());
+    assert!(!maybe_tokens.expect("tokens should be some").is_empty());
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-    )]
-    fn chunk_n_tokens_is_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?;
-        let first_chunk = chunks
-            .get(0)
-            .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?;
-        assert!(first_chunk.n_tokens() > 0);
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+)]
+// The first chunk of the tokenized prompt must report a positive `n_tokens()`.
+fn chunk_n_tokens_is_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?;
+    let leading = chunks.get(0).context("missing first chunk")?;
+    let token_count = leading.n_tokens();
+    assert!(token_count > 0);
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-    )]
-    fn chunk_n_positions_is_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?;
-        let first_chunk = chunks
-            .get(0)
-            .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?;
-        assert!(first_chunk.n_positions() > 0);
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+)]
+// The first chunk of the tokenized prompt must report a positive `n_positions()`.
+fn chunk_n_positions_is_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?;
+    let leading = chunks.get(0).context("missing first chunk")?;
+    let position_count = leading.n_positions();
+    assert!(position_count > 0);
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-    )]
-    fn text_chunk_id_returns_none(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let chunks = tokenize_synthetic(fixture, "Hello <__media__>")?;
-        let first_chunk = chunks
-            .get(0)
-            .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?;
-        assert_eq!(first_chunk.chunk_type()?, MtmdInputChunkType::Text);
-        assert!(first_chunk.id().is_none());
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+)]
+// For the leading `Text` chunk, `id()` must be `None`.
+fn text_chunk_id_returns_none(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let chunks = tokenize_synthetic(fixture, "Hello <__media__>")?;
+    let leading = chunks.get(0).context("missing first chunk")?;
+
+    assert_eq!(leading.chunk_type()?, MtmdInputChunkType::Text);
+    assert!(leading.id().is_none());
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-    )]
-    fn image_chunk_returns_none_for_text_tokens(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let chunks = tokenize_synthetic(fixture, "Hello <__media__>")?;
-        for chunk_index in 0..chunks.len() {
-            let chunk = chunks
-                .get(chunk_index)
-                .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?;
-            if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) {
-                assert!(chunk.text_tokens().is_none());
-                return Ok(());
-            }
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+)]
+// An `Image` chunk must exist for the prompt and must expose no text tokens.
+fn image_chunk_returns_none_for_text_tokens(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let chunks = tokenize_synthetic(fixture, "Hello <__media__>")?;
+    for chunk_index in 0..chunks.len() {
+        let missing = || anyhow::anyhow!("missing chunk at index {chunk_index}");
+        let chunk = chunks.get(chunk_index).ok_or_else(missing)?;
+        if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) {
+            assert!(chunk.text_tokens().is_none());
+            return Ok(());
         }
-        Ok(())
     }
+    anyhow::bail!("tokenization should produce at least one Image chunk");
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-    )]
-    fn image_chunk_id_returns_some(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let chunks = tokenize_synthetic(fixture, "Hello <__media__>")?;
-        for chunk_index in 0..chunks.len() {
-            let chunk = chunks
-                .get(chunk_index)
-                .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?;
-            if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) {
-                assert!(chunk.id().is_some());
-                return Ok(());
-            }
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+)]
+fn image_chunk_id_returns_some(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let chunks = tokenize_synthetic(fixture, "Hello <__media__>")?;
+    for chunk_index in 0..chunks.len() {
+        let chunk = chunks
+            .get(chunk_index)
+            .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?;
+        if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) {
+            assert!(chunk.id().is_some());
+            return Ok(());
         }
-        Ok(())
     }
+    Ok(())
 }
 
-mod mtmd_context {
-    #![expect(
-        clippy::unnecessary_wraps,
-        reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-    )]
-
-    use anyhow::Result;
-    use llama_cpp_bindings::mtmd::MtmdContext;
-    use llama_cpp_bindings::mtmd::MtmdContextParams;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-    )]
-    fn init_and_supports_vision(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mtmd_ctx = fixture
-            .mtmd_context
-            .expect("mmproj_file declared in attribute");
-        assert!(mtmd_ctx.support_vision());
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-    )]
-    fn init_from_file_with_null_byte_in_path_returns_error(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let mtmd_params = MtmdContextParams::default();
-        let result = MtmdContext::init_from_file("path\0null", fixture.model, &mtmd_params);
-
-        assert!(result.is_err());
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-    )]
-    fn decode_use_mrope_is_true_for_qwen_vision(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mtmd_ctx = fixture
-            .mtmd_context
-            .expect("mmproj_file declared in attribute");
-        assert!(
-            mtmd_ctx.decode_use_mrope(),
-            "Qwen 3.5 / 3.6 mmproj uses mrope; decode_use_mrope must return true"
-        );
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-    )]
-    fn support_audio_is_false_for_vision_only_mmproj(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mtmd_ctx = fixture
-            .mtmd_context
-            .expect("mmproj_file declared in attribute");
-        assert!(
-            !mtmd_ctx.support_audio(),
-            "Qwen 3.5 / 3.6 mmproj is vision-only; support_audio must return false"
-        );
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+)]
+fn init_and_supports_vision(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let mtmd_ctx = fixture
+        .mtmd_context
+        .expect("mmproj_file declared in attribute");
+    assert!(mtmd_ctx.support_vision());
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-    )]
-    fn get_audio_sample_rate_is_none_for_vision_only_mmproj(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let mtmd_ctx = fixture
-            .mtmd_context
-            .expect("mmproj_file declared in attribute");
-        assert!(
-            mtmd_ctx.get_audio_sample_rate().is_none(),
-            "Qwen 3.5 / 3.6 mmproj has no audio; get_audio_sample_rate must return None"
-        );
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+)]
+fn init_from_file_with_null_byte_in_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let mtmd_params = MtmdContextParams::default();
+    let result = MtmdContext::init_from_file("path\0null", fixture.model, &mtmd_params);
+
+    assert!(result.is_err());
+    Ok(())
 }
 
-mod mtmd_evaluation {
-    #![expect(
-        clippy::unnecessary_wraps,
-        reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-    )]
-
-    use anyhow::Result;
-    use llama_cpp_bindings::context::LlamaContext;
-    use llama_cpp_bindings::mtmd::MtmdBitmap;
-    use llama_cpp_bindings::mtmd::MtmdEvalError;
-    use llama_cpp_bindings::mtmd::MtmdInputChunks;
-    use llama_cpp_bindings::mtmd::MtmdInputText;
-    use llama_cpp_bindings_tests::test_model;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
-    fn eval_synthetic_bitmap(fixture: &LlamaFixture<'_>, width: u32, height: u32) -> Result<()> {
-        let mtmd_ctx = fixture
-            .mtmd_context
-            .expect("mmproj_file declared in attribute");
-        let image_data = vec![128u8; (width as usize) * (height as usize) * 3];
-        let bitmap = MtmdBitmap::from_image_data(width, height, &image_data)?;
-        let input_text = MtmdInputText {
-            text: "Describe: <__media__>".to_string(),
-            add_special: true,
-            parse_special: true,
-        };
-        let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
-        let n_positions = chunks.total_positions();
-        let required_n_ctx = u32::try_from(n_positions + 256)?;
-        if fixture.context_params.n_ctx < required_n_ctx {
-            anyhow::bail!(
-                "fixture n_ctx ({}) below required ({}) for {}x{} image",
-                fixture.context_params.n_ctx,
-                required_n_ctx,
-                width,
-                height,
-            );
-        }
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+)]
+fn decode_use_mrope_is_true_for_qwen_vision(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let mtmd_ctx = fixture
+        .mtmd_context
+        .expect("mmproj_file declared in attribute");
+    assert!(
+        mtmd_ctx.decode_use_mrope(),
+        "Qwen 3.5 / 3.6 mmproj uses mrope; decode_use_mrope must return true"
+    );
+    Ok(())
+}
 
-        let llama_ctx = LlamaContext::from_model(
-            fixture.model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-        let n_batch = i32::try_from(llama_ctx.n_batch())?;
-        chunks.eval_chunks(mtmd_ctx, &llama_ctx, 0, 0, n_batch, false)?;
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+)]
+fn support_audio_is_false_for_vision_only_mmproj(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let mtmd_ctx = fixture
+        .mtmd_context
+        .expect("mmproj_file declared in attribute");
+    assert!(
+        !mtmd_ctx.support_audio(),
+        "Qwen 3.5 / 3.6 mmproj is vision-only; support_audio must return false"
+    );
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 64,
-        n_batch = 64,
-        n_ubatch = 32,
-        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 64,
-        n_batch = 64,
-        n_ubatch = 32,
-        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-    )]
-    fn eval_chunks_returns_batch_size_exceeds_context_limit_for_huge_batch(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let mtmd_ctx = fixture
-            .mtmd_context
-            .expect("mmproj_file declared in attribute");
-        let llama_ctx = LlamaContext::from_model(
-            fixture.model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-
-        let chunks = MtmdInputChunks::new()?;
-        let huge_batch = i32::try_from(llama_ctx.n_batch() + 1)?;
-
-        let result = chunks.eval_chunks(mtmd_ctx, &llama_ctx, 0, 0, huge_batch, false);
-
-        assert!(matches!(
-            result,
-            Err(MtmdEvalError::BatchSizeExceedsContextLimit { .. })
-        ));
-
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+)]
+fn get_audio_sample_rate_is_none_for_vision_only_mmproj(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let mtmd_ctx = fixture
+        .mtmd_context
+        .expect("mmproj_file declared in attribute");
+    assert!(
+        mtmd_ctx.get_audio_sample_rate().is_none(),
+        "Qwen 3.5 / 3.6 mmproj has no audio; get_audio_sample_rate must return None"
+    );
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 8192,
-        n_batch = 512,
-        n_ubatch = 512,
-        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 8192,
-        n_batch = 512,
-        n_ubatch = 512,
-        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-    )]
-    fn eval_chunks_with_standard_image(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mtmd_ctx = fixture
-            .mtmd_context
-            .expect("mmproj_file declared in attribute");
-
-        let fixtures = test_model::fixtures_dir();
-        let image_path = fixtures.join("llamas.jpg");
-        let image_path_str = image_path
-            .to_str()
-            .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?;
-        let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?;
-        let input_text = MtmdInputText {
-            text: "What is in this image? <__media__>".to_string(),
-            add_special: true,
-            parse_special: true,
-        };
-        let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
-        let n_positions = chunks.total_positions();
-        let required_n_ctx = u32::try_from(n_positions + 256)?;
-        assert!(
-            fixture.context_params.n_ctx >= required_n_ctx,
-            "fixture n_ctx ({}) below required ({}); update the attribute literal",
+fn eval_synthetic_bitmap(fixture: &LlamaFixture<'_>, width: u32, height: u32) -> Result<()> {
+    let mtmd_ctx = fixture
+        .mtmd_context
+        .expect("mmproj_file declared in attribute");
+    let image_data = vec![128u8; (width as usize) * (height as usize) * 3];
+    let bitmap = MtmdBitmap::from_image_data(width, height, &image_data)?;
+    let input_text = MtmdInputText {
+        text: "Describe: <__media__>".to_string(),
+        add_special: true,
+        parse_special: true,
+    };
+    let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
+    let n_positions = chunks.total_positions();
+    let required_n_ctx = u32::try_from(n_positions + 256)?;
+    if fixture.context_params.n_ctx < required_n_ctx {
+        anyhow::bail!(
+            "fixture n_ctx ({}) below required ({}) for {}x{} image",
             fixture.context_params.n_ctx,
             required_n_ctx,
+            width,
+            height,
         );
-
-        let llama_ctx = LlamaContext::from_model(
-            fixture.model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-        let n_batch = i32::try_from(llama_ctx.n_batch())?;
-        let result = chunks.eval_chunks(mtmd_ctx, &llama_ctx, 0, 0, n_batch, false);
-
-        assert!(result.is_ok());
-
-        Ok(())
     }
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 8192,
-        n_batch = 512,
-        n_ubatch = 512,
-        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 8192,
-        n_batch = 512,
-        n_ubatch = 512,
-        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-    )]
-    fn eval_chunks_with_varied_dimensions(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let test_dimensions: [(u32, u32); 4] = [(224, 224), (512, 512), (100, 500), (337, 421)];
-
-        for (width, height) in test_dimensions {
-            let result = eval_synthetic_bitmap(fixture, width, height);
-            assert!(
-                result.is_ok(),
-                "dimension {width}x{height} should succeed: {result:?}"
-            );
-        }
+    let llama_ctx = LlamaContext::from_model(
+        fixture.model,
+        fixture.backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+    let n_batch = i32::try_from(llama_ctx.n_batch())?;
+    chunks.eval_chunks(mtmd_ctx, &llama_ctx, 0, 0, n_batch, false)?;
+    Ok(())
+}
 
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 64,
+    n_batch = 64,
+    n_ubatch = 32,
+    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 64,
+    n_batch = 64,
+    n_ubatch = 32,
+    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+)]
+fn eval_chunks_returns_batch_size_exceeds_context_limit_for_huge_batch(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    let mtmd_ctx = fixture
+        .mtmd_context
+        .expect("mmproj_file declared in attribute");
+    let llama_ctx = LlamaContext::from_model(
+        fixture.model,
+        fixture.backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+
+    let chunks = MtmdInputChunks::new()?;
+    let huge_batch = i32::try_from(llama_ctx.n_batch() + 1)?;
+
+    let result = chunks.eval_chunks(mtmd_ctx, &llama_ctx, 0, 0, huge_batch, false);
+
+    assert!(matches!(
+        result,
+        Err(MtmdEvalError::BatchSizeExceedsContextLimit { .. })
+    ));
+
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 8192,
-        n_batch = 512,
-        n_ubatch = 512,
-        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 8192,
-        n_batch = 512,
-        n_ubatch = 512,
-        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-    )]
-    fn eval_chunks_with_extreme_dimensions_does_not_crash(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let extreme_dimensions: [(u32, u32); 6] = [
-            (1, 1),
-            (7, 13),
-            (3, 1000),
-            (1000, 3),
-            (1920, 1080),
-            (4096, 4096),
-        ];
-
-        let mut any_reached_eval = false;
-
-        for (width, height) in extreme_dimensions {
-            match eval_synthetic_bitmap(fixture, width, height) {
-                Ok(()) => any_reached_eval = true,
-                Err(error) => eprintln!("  {width}x{height} failed: {error}"),
-            }
-        }
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 8192,
+    n_batch = 512,
+    n_ubatch = 512,
+    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 8192,
+    n_batch = 512,
+    n_ubatch = 512,
+    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+)]
+fn eval_chunks_with_standard_image(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let mtmd_ctx = fixture
+        .mtmd_context
+        .expect("mmproj_file declared in attribute");
+
+    let fixtures = fixtures_dir();
+    let image_path = fixtures.join("llamas.jpg");
+    let image_path_str = image_path
+        .to_str()
+        .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?;
+    let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?;
+    let input_text = MtmdInputText {
+        text: "What is in this image? <__media__>".to_string(),
+        add_special: true,
+        parse_special: true,
+    };
+    let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
+    let n_positions = chunks.total_positions();
+    let required_n_ctx = u32::try_from(n_positions + 256)?;
+    assert!(
+        fixture.context_params.n_ctx >= required_n_ctx,
+        "fixture n_ctx ({}) below required ({}); update the attribute literal",
+        fixture.context_params.n_ctx,
+        required_n_ctx,
+    );
+
+    let llama_ctx = LlamaContext::from_model(
+        fixture.model,
+        fixture.backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+    let n_batch = i32::try_from(llama_ctx.n_batch())?;
+    let result = chunks.eval_chunks(mtmd_ctx, &llama_ctx, 0, 0, n_batch, false);
+
+    assert!(result.is_ok());
+
+    Ok(())
+}
 
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 8192,
+    n_batch = 512,
+    n_ubatch = 512,
+    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 8192,
+    n_batch = 512,
+    n_ubatch = 512,
+    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+)]
+fn eval_chunks_with_varied_dimensions(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let test_dimensions: [(u32, u32); 4] = [(224, 224), (512, 512), (100, 500), (337, 421)];
+
+    for (width, height) in test_dimensions {
+        let result = eval_synthetic_bitmap(fixture, width, height);
         assert!(
-            any_reached_eval,
-            "at least one extreme dimension should reach eval_chunks"
+            result.is_ok(),
+            "dimension {width}x{height} should succeed: {result:?}"
         );
-
-        Ok(())
     }
+
+    Ok(())
 }
 
-mod mtmd_tokenization {
-    #![expect(
-        clippy::unnecessary_wraps,
-        reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-    )]
-
-    use anyhow::Result;
-    use llama_cpp_bindings::mtmd::MtmdBitmap;
-    use llama_cpp_bindings::mtmd::MtmdInputText;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-    )]
-    fn tokenize_text_with_image(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mtmd_ctx = fixture
-            .mtmd_context
-            .expect("mmproj_file declared in attribute");
-        let image_data = vec![128u8; 64 * 64 * 3];
-        let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?;
-        let input_text = MtmdInputText {
-            text: "Describe this image: <__media__>".to_string(),
-            add_special: true,
-            parse_special: true,
-        };
-        let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
-
-        assert!(!chunks.is_empty());
-        assert!(chunks.total_tokens() > 0);
-        Ok(())
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 8192,
+    n_batch = 512,
+    n_ubatch = 512,
+    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 8192,
+    n_batch = 512,
+    n_ubatch = 512,
+    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+)]
+fn eval_chunks_with_extreme_dimensions_does_not_crash(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let extreme_dimensions: [(u32, u32); 6] = [
+        (1, 1),
+        (7, 13),
+        (3, 1000),
+        (1000, 3),
+        (1920, 1080),
+        (4096, 4096),
+    ];
+
+    let mut any_reached_eval = false;
+
+    for (width, height) in extreme_dimensions {
+        match eval_synthetic_bitmap(fixture, width, height) {
+            Ok(()) => any_reached_eval = true,
+            Err(error) => eprintln!("  {width}x{height} failed: {error}"),
+        }
     }
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-    )]
-    fn tokenize_bitmap_count_mismatch_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mtmd_ctx = fixture
-            .mtmd_context
-            .expect("mmproj_file declared in attribute");
-        let input_text = MtmdInputText {
-            text: "No media markers here".to_string(),
-            add_special: true,
-            parse_special: true,
-        };
-        let image_data = vec![128u8; 64 * 64 * 3];
-        let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?;
-        let result = mtmd_ctx.tokenize(input_text, &[&bitmap]);
-        assert!(result.is_err());
-        Ok(())
-    }
+    assert!(
+        any_reached_eval,
+        "at least one extreme dimension should reach eval_chunks"
+    );
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-    )]
-    fn tokenize_with_null_byte_in_text_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mtmd_ctx = fixture
-            .mtmd_context
-            .expect("mmproj_file declared in attribute");
-        let input_text = MtmdInputText {
-            text: "text\0null".to_string(),
-            add_special: true,
-            parse_special: true,
-        };
-        let result = mtmd_ctx.tokenize(input_text, &[]);
-        assert!(result.is_err());
-        Ok(())
-    }
+    Ok(())
 }
 
-mod multimodal {
-    use anyhow::{Context, Result};
-    use llama_cpp_bindings::SampledTokenClassifier;
-    use llama_cpp_bindings::context::LlamaContext;
-    use llama_cpp_bindings::llama_batch::LlamaBatch;
-    use llama_cpp_bindings::model::{LlamaChatMessage, LlamaModel};
-    use llama_cpp_bindings::mtmd::{
-        MtmdBitmap, MtmdInputChunkType, MtmdInputChunks, MtmdInputText,
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+)]
+fn tokenize_text_with_image(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let mtmd_ctx = fixture
+        .mtmd_context
+        .expect("mmproj_file declared in attribute");
+    let image_data = vec![128u8; 64 * 64 * 3];
+    let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?;
+    let input_text = MtmdInputText {
+        text: "Describe this image: <__media__>".to_string(),
+        add_special: true,
+        parse_special: true,
     };
-    use llama_cpp_bindings::sampled_token::SampledToken;
-    use llama_cpp_bindings::sampling::LlamaSampler;
-    use llama_cpp_bindings_sys::llama_pos;
-    use llama_cpp_bindings_tests::test_model;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
-    struct ChunkTokenBreakdown {
-        text: u64,
-        image: u64,
-        audio: u64,
-    }
+    let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
 
-    fn count_chunk_tokens_by_type(chunks: &MtmdInputChunks) -> Result<ChunkTokenBreakdown> {
-        let mut breakdown = ChunkTokenBreakdown {
-            text: 0,
-            image: 0,
-            audio: 0,
-        };
-        for index in 0..chunks.len() {
-            let chunk = chunks
-                .get(index)
-                .with_context(|| format!("chunk index {index} is missing"))?;
-            let n_tokens = u64::try_from(chunk.n_tokens())?;
-            match chunk.chunk_type()? {
-                MtmdInputChunkType::Text => breakdown.text += n_tokens,
-                MtmdInputChunkType::Image => breakdown.image += n_tokens,
-                MtmdInputChunkType::Audio => breakdown.audio += n_tokens,
-            }
-        }
+    assert!(!chunks.is_empty());
+    assert!(chunks.total_tokens() > 0);
+    Ok(())
+}
 
-        Ok(breakdown)
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+)]
+fn tokenize_bitmap_count_mismatch_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let mtmd_ctx = fixture
+        .mtmd_context
+        .expect("mmproj_file declared in attribute");
+    let input_text = MtmdInputText {
+        text: "No media markers here".to_string(),
+        add_special: true,
+        parse_special: true,
+    };
+    let image_data = vec![128u8; 64 * 64 * 3];
+    let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?;
+    let result = mtmd_ctx.tokenize(input_text, &[&bitmap]);
+    assert!(result.is_err());
+    Ok(())
+}
 
-    fn build_user_prompt_with_image_marker(model: &LlamaModel, question: &str) -> Result<String> {
-        let marker = llama_cpp_bindings::mtmd::mtmd_default_marker();
-        let user_content = format!("{marker}{question}");
-        let chat_template = model.chat_template(None)?;
-        let messages = [LlamaChatMessage::new("user".to_string(), user_content)?];
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+)]
+fn tokenize_with_null_byte_in_text_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let mtmd_ctx = fixture
+        .mtmd_context
+        .expect("mmproj_file declared in attribute");
+    let input_text = MtmdInputText {
+        text: "text\0null".to_string(),
+        add_special: true,
+        parse_special: true,
+    };
+    let result = mtmd_ctx.tokenize(input_text, &[]);
+    assert!(result.is_err());
+    Ok(())
+}
+struct ChunkTokenBreakdown {
+    text: u64,
+    image: u64,
+    audio: u64,
+}
 
-        Ok(model.apply_chat_template(&chat_template, &messages, true)?)
+fn count_chunk_tokens_by_type(chunks: &MtmdInputChunks) -> Result<ChunkTokenBreakdown> {
+    let mut breakdown = ChunkTokenBreakdown {
+        text: 0,
+        image: 0,
+        audio: 0,
+    };
+    for index in 0..chunks.len() {
+        let chunk = chunks
+            .get(index)
+            .with_context(|| format!("chunk index {index} is missing"))?;
+        let n_tokens = u64::try_from(chunk.n_tokens())?;
+        match chunk.chunk_type()? {
+            MtmdInputChunkType::Text => breakdown.text += n_tokens,
+            MtmdInputChunkType::Image => breakdown.image += n_tokens,
+            MtmdInputChunkType::Audio => breakdown.audio += n_tokens,
+        }
     }
 
-    struct SamplingTotals {
-        generated: String,
-        observed_content: u64,
-        observed_reasoning: u64,
-    }
+    Ok(breakdown)
+}
 
-    fn drive_sampling_loop(
-        classifier: &mut SampledTokenClassifier,
-        model: &LlamaModel,
-        ctx: &mut LlamaContext,
-        starting_position: llama_pos,
-        max_tokens: usize,
-    ) -> Result<SamplingTotals> {
-        let mut sampler = LlamaSampler::greedy();
-        let mut totals = SamplingTotals {
-            generated: String::new(),
-            observed_content: 0,
-            observed_reasoning: 0,
-        };
-        let mut batch = LlamaBatch::new(512, 1)?;
-
-        for (current_position, _) in (starting_position..).zip(0..max_tokens) {
-            let (raw_token, outcomes) = classifier.sample(&mut sampler, ctx, -1)?;
-            for outcome in &outcomes {
-                totals.generated.push_str(&outcome.raw_piece);
-                match outcome.sampled_token {
-                    SampledToken::Content(_) => totals.observed_content += 1,
-                    SampledToken::Reasoning(_) => totals.observed_reasoning += 1,
-                    SampledToken::ToolCall(_) | SampledToken::Undeterminable(_) => {}
-                }
-            }
+fn build_user_prompt_with_image_marker(model: &LlamaModel, question: &str) -> Result<String> {
+    let marker = mtmd_default_marker();
+    let user_content = format!("{marker}{question}");
+    let chat_template = model.chat_template(None)?;
+    let messages = [LlamaChatMessage::new("user".to_string(), user_content)?];
 
-            let raw_as_sampled = SampledToken::Content(raw_token);
-            if model.is_eog_token(&raw_as_sampled) {
-                break;
-            }
+    Ok(model.apply_chat_template(&chat_template, &messages, true)?)
+}
 
-            batch.clear();
-            batch.add(&raw_as_sampled, current_position, &[0], true)?;
+struct SamplingTotals {
+    generated: String,
+    observed_content: u64,
+    observed_reasoning: u64,
+}
 
-            ctx.decode(&mut batch)
-                .with_context(|| "failed to decode generated token")?;
-        }
+fn drive_sampling_loop(
+    classifier: &mut SampledTokenClassifier,
+    model: &LlamaModel,
+    ctx: &mut LlamaContext,
+    starting_position: llama_pos,
+    max_tokens: usize,
+) -> Result<SamplingTotals> {
+    let mut sampler = LlamaSampler::greedy();
+    let mut totals = SamplingTotals {
+        generated: String::new(),
+        observed_content: 0,
+        observed_reasoning: 0,
+    };
+    let mut batch = LlamaBatch::new(512, 1)?;
 
-        for outcome in classifier.flush() {
+    for (current_position, _) in (starting_position..).zip(0..max_tokens) {
+        let (raw_token, outcomes) = classifier.sample(&mut sampler, ctx, -1)?;
+        for outcome in &outcomes {
             totals.generated.push_str(&outcome.raw_piece);
             match outcome.sampled_token {
                 SampledToken::Content(_) => totals.observed_content += 1,
@@ -1102,900 +1008,797 @@ mod multimodal {
             }
         }
 
-        Ok(totals)
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 4096,
-        n_batch = 512,
-        n_ubatch = 512,
-        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-    )]
-    fn multimodal_vision_inference_produces_output(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let model = fixture.model;
-        let mtmd_ctx = fixture
-            .mtmd_context
-            .expect("mmproj_file declared in attribute");
-
-        let mut ctx = LlamaContext::from_model(
-            model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )
-        .with_context(|| "unable to create llama context")?;
-
-        assert!(
-            mtmd_ctx.support_vision(),
-            "model should support vision input"
-        );
-
-        let image_path = test_model::fixtures_dir().join("llamas.jpg");
-        let image_path_str = image_path
-            .to_str()
-            .with_context(|| "image path is not valid UTF-8")?;
-        let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)
-            .with_context(|| "failed to load image from file")?;
-
-        let formatted_prompt =
-            build_user_prompt_with_image_marker(model, "What animals do you see in this image?")?;
+        let raw_as_sampled = SampledToken::Content(raw_token);
+        if model.is_eog_token(&raw_as_sampled) {
+            break;
+        }
 
-        let input_text = MtmdInputText {
-            text: formatted_prompt,
-            add_special: false,
-            parse_special: true,
-        };
+        batch.clear();
+        batch.add(&raw_as_sampled, current_position, &[0], true)?;
 
-        let chunks = mtmd_ctx
-            .tokenize(input_text, &[&bitmap])
-            .with_context(|| "failed to tokenize multimodal input")?;
+        ctx.decode(&mut batch)
+            .with_context(|| "failed to decode generated token")?;
+    }
 
-        assert!(
-            !chunks.is_empty(),
-            "tokenization should produce at least one chunk"
-        );
+    for outcome in classifier.flush() {
+        totals.generated.push_str(&outcome.raw_piece);
+        match outcome.sampled_token {
+            SampledToken::Content(_) => totals.observed_content += 1,
+            SampledToken::Reasoning(_) => totals.observed_reasoning += 1,
+            SampledToken::ToolCall(_) | SampledToken::Undeterminable(_) => {}
+        }
+    }
 
-        let expected = count_chunk_tokens_by_type(&chunks)?;
+    Ok(totals)
+}
 
-        eprintln!(
-            "tokenized into {} chunks, text {} image {} audio {}",
-            chunks.len(),
-            expected.text,
-            expected.image,
-            expected.audio
-        );
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 4096,
+    n_batch = 512,
+    n_ubatch = 512,
+    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+)]
+fn multimodal_vision_inference_produces_output(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let model = fixture.model;
+    let mtmd_ctx = fixture
+        .mtmd_context
+        .expect("mmproj_file declared in attribute");
+
+    let mut ctx = LlamaContext::from_model(
+        model,
+        fixture.backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )
+    .with_context(|| "unable to create llama context")?;
+
+    assert!(
+        mtmd_ctx.support_vision(),
+        "model should support vision input"
+    );
+
+    let image_path = fixtures_dir().join("llamas.jpg");
+    let image_path_str = image_path
+        .to_str()
+        .with_context(|| "image path is not valid UTF-8")?;
+    let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)
+        .with_context(|| "failed to load image from file")?;
+
+    let formatted_prompt =
+        build_user_prompt_with_image_marker(model, "What animals do you see in this image?")?;
+
+    let input_text = MtmdInputText {
+        text: formatted_prompt,
+        add_special: false,
+        parse_special: true,
+    };
 
-        assert!(
-            expected.image > 0,
-            "vision input must produce at least one image chunk"
-        );
+    let chunks = mtmd_ctx
+        .tokenize(input_text, &[&bitmap])
+        .with_context(|| "failed to tokenize multimodal input")?;
 
-        let mut classifier = model.sampled_token_classifier();
-        let n_past = classifier
-            .eval_multimodal_chunks(&chunks, mtmd_ctx, &ctx, 0, 0, 512, true)
-            .with_context(|| "failed to evaluate chunks")?;
+    assert!(
+        !chunks.is_empty(),
+        "tokenization should produce at least one chunk"
+    );
 
-        eprintln!("evaluated chunks, n_past = {n_past}");
+    let expected = count_chunk_tokens_by_type(&chunks)?;
 
-        {
-            let usage = classifier.usage();
-            assert_eq!(usage.prompt_tokens, expected.text);
-            assert_eq!(usage.input_image_tokens, expected.image);
-            assert_eq!(usage.input_audio_tokens, expected.audio);
-        }
+    eprintln!(
+        "tokenized into {} chunks, text {} image {} audio {}",
+        chunks.len(),
+        expected.text,
+        expected.image,
+        expected.audio
+    );
 
-        let totals = drive_sampling_loop(&mut classifier, model, &mut ctx, n_past, 512)?;
+    assert!(
+        expected.image > 0,
+        "vision input must produce at least one image chunk"
+    );
 
-        eprintln!("generated text: {}", totals.generated);
+    let mut classifier = model.sampled_token_classifier();
+    let n_past = classifier
+        .eval_multimodal_chunks(&chunks, mtmd_ctx, &ctx, 0, 0, 512, true)
+        .with_context(|| "failed to evaluate chunks")?;
 
-        assert!(
-            !totals.generated.is_empty(),
-            "model should generate at least one token from image input"
-        );
+    eprintln!("evaluated chunks, n_past = {n_past}");
 
-        let usage = classifier.into_usage();
+    {
+        let usage = classifier.usage();
         assert_eq!(usage.prompt_tokens, expected.text);
         assert_eq!(usage.input_image_tokens, expected.image);
         assert_eq!(usage.input_audio_tokens, expected.audio);
-        assert_eq!(usage.content_tokens, totals.observed_content);
-        assert_eq!(usage.reasoning_tokens, totals.observed_reasoning);
-        assert_eq!(
-            usage.completion_tokens(),
-            totals.observed_content + totals.observed_reasoning
-        );
-
-        Ok(())
-    }
-}
-
-mod eval_multimodal_chunks_records_exact_token_counts {
-    use anyhow::Result;
-    use llama_cpp_bindings::TokenUsage;
-    use llama_cpp_bindings::context::LlamaContext;
-    use llama_cpp_bindings::mtmd::MtmdBitmap;
-    use llama_cpp_bindings::mtmd::MtmdInputChunkType;
-    use llama_cpp_bindings::mtmd::MtmdInputChunks;
-    use llama_cpp_bindings::mtmd::MtmdInputText;
-    use llama_cpp_bindings::mtmd::mtmd_default_marker;
-    use llama_cpp_bindings_tests::test_model::fixtures_dir;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
-    const PROMPT_QUESTION: &str = "What animals do you see in this image?";
-
-    struct ExpectedChunkTotals {
-        text: u64,
-        image: u64,
-        audio: u64,
     }
 
-    fn sum_chunk_token_counts_by_type(chunks: &MtmdInputChunks) -> Result<ExpectedChunkTotals> {
-        let mut totals = ExpectedChunkTotals {
-            text: 0,
-            image: 0,
-            audio: 0,
-        };
-        for index in 0..chunks.len() {
-            let chunk = chunks
-                .get(index)
-                .ok_or_else(|| anyhow::anyhow!("chunk index {index} should exist"))?;
-            let n_tokens = u64::try_from(chunk.n_tokens())?;
-            match chunk.chunk_type()? {
-                MtmdInputChunkType::Text => {
-                    totals.text = totals.text.saturating_add(n_tokens);
-                }
-                MtmdInputChunkType::Image => {
-                    totals.image = totals.image.saturating_add(n_tokens);
-                }
-                MtmdInputChunkType::Audio => {
-                    totals.audio = totals.audio.saturating_add(n_tokens);
-                }
-            }
-        }
-        Ok(totals)
-    }
+    let totals = drive_sampling_loop(&mut classifier, model, &mut ctx, n_past, 512)?;
 
-    fn build_multimodal_chunks_and_eval_into_usage(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<(TokenUsage, ExpectedChunkTotals)> {
-        let model = fixture.model;
-        let mtmd_ctx = fixture
-            .mtmd_context
-            .expect("mmproj_file declared in attribute");
+    eprintln!("generated text: {}", totals.generated);
 
-        let image_path = fixtures_dir().join("llamas.jpg");
-        let image_path_str = image_path
-            .to_str()
-            .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?;
-        let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?;
+    assert!(
+        !totals.generated.is_empty(),
+        "model should generate at least one token from image input"
+    );
 
-        let marker = mtmd_default_marker();
-        let prompt = format!("{marker}{PROMPT_QUESTION}");
+    let usage = classifier.into_usage();
+    assert_eq!(usage.prompt_tokens, expected.text);
+    assert_eq!(usage.input_image_tokens, expected.image);
+    assert_eq!(usage.input_audio_tokens, expected.audio);
+    assert_eq!(usage.content_tokens, totals.observed_content);
+    assert_eq!(usage.reasoning_tokens, totals.observed_reasoning);
+    assert_eq!(
+        usage.completion_tokens(),
+        totals.observed_content + totals.observed_reasoning
+    );
 
-        let input_text = MtmdInputText {
-            text: prompt,
-            add_special: false,
-            parse_special: true,
-        };
+    Ok(())
+}
 
-        let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
-        let expected = sum_chunk_token_counts_by_type(&chunks)?;
+const PROMPT_QUESTION: &str = "What animals do you see in this image?";
+
+fn build_multimodal_chunks_and_eval_into_usage(
+    fixture: &LlamaFixture<'_>,
+) -> Result<(TokenUsage, ChunkTokenBreakdown)> {
+    let model = fixture.model;
+    let mtmd_ctx = fixture
+        .mtmd_context
+        .expect("mmproj_file declared in attribute");
+
+    let image_path = fixtures_dir().join("llamas.jpg");
+    let image_path_str = image_path
+        .to_str()
+        .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?;
+    let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?;
+
+    let marker = mtmd_default_marker();
+    let prompt = format!("{marker}{PROMPT_QUESTION}");
+
+    let input_text = MtmdInputText {
+        text: prompt,
+        add_special: false,
+        parse_special: true,
+    };
 
-        let context_params = (*fixture.context_params).into_llama_context_params();
-        let context = LlamaContext::from_model(model, fixture.backend, context_params)?;
+    let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
+    let expected = count_chunk_tokens_by_type(&chunks)?;
 
-        let mut classifier = model.sampled_token_classifier();
-        classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?;
+    let context_params = (*fixture.context_params).into_llama_context_params();
+    let context = LlamaContext::from_model(model, fixture.backend, context_params)?;
 
-        Ok((classifier.into_usage(), expected))
-    }
+    let mut classifier = model.sampled_token_classifier();
+    classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?;
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 4096,
-        n_batch = 512,
-        n_ubatch = 512,
-        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-    )]
-    fn prompt_tokens_match_text_chunk_total(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let (usage, expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?;
-
-        if usage.prompt_tokens != expected.text {
-            anyhow::bail!(
-                "prompt_tokens must equal sum of text-chunk n_tokens; expected {}, got {}",
-                expected.text,
-                usage.prompt_tokens
-            );
-        }
+    Ok((classifier.into_usage(), expected))
+}
 
-        Ok(())
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 4096,
+    n_batch = 512,
+    n_ubatch = 512,
+    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+)]
+fn prompt_tokens_match_text_chunk_total(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let (usage, expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?;
+
+    if usage.prompt_tokens != expected.text {
+        anyhow::bail!(
+            "prompt_tokens must equal sum of text-chunk n_tokens; expected {}, got {}",
+            expected.text,
+            usage.prompt_tokens
+        );
     }
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 4096,
-        n_batch = 512,
-        n_ubatch = 512,
-        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-    )]
-    fn input_image_tokens_match_image_chunk_total(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let (usage, expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?;
-
-        if usage.input_image_tokens != expected.image {
-            anyhow::bail!(
-                "input_image_tokens must equal sum of image-chunk n_tokens; expected {}, got {}",
-                expected.image,
-                usage.input_image_tokens
-            );
-        }
+    Ok(())
+}
 
-        Ok(())
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 4096,
+    n_batch = 512,
+    n_ubatch = 512,
+    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+)]
+fn input_image_tokens_match_image_chunk_total(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let (usage, expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?;
+
+    if usage.input_image_tokens != expected.image {
+        anyhow::bail!(
+            "input_image_tokens must equal sum of image-chunk n_tokens; expected {}, got {}",
+            expected.image,
+            usage.input_image_tokens
+        );
     }
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 4096,
-        n_batch = 512,
-        n_ubatch = 512,
-        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-    )]
-    fn input_audio_tokens_are_zero_for_image_only_input(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let (usage, expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?;
-
-        if expected.audio != 0 {
-            anyhow::bail!(
-                "fixture invariant: image-only multimodal input should produce zero audio chunk tokens, got {}",
-                expected.audio
-            );
-        }
-        if usage.input_audio_tokens != 0 {
-            anyhow::bail!(
-                "input_audio_tokens must be zero when no audio chunks are evaluated; got {}",
-                usage.input_audio_tokens
-            );
-        }
+    Ok(())
+}
 
-        Ok(())
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 4096,
+    n_batch = 512,
+    n_ubatch = 512,
+    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+)]
+fn input_audio_tokens_are_zero_for_image_only_input(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let (usage, expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?;
+
+    if expected.audio != 0 {
+        anyhow::bail!(
+            "fixture invariant: image-only multimodal input should produce zero audio chunk tokens, got {}",
+            expected.audio
+        );
+    }
+    if usage.input_audio_tokens != 0 {
+        anyhow::bail!(
+            "input_audio_tokens must be zero when no audio chunks are evaluated; got {}",
+            usage.input_audio_tokens
+        );
     }
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 4096,
-        n_batch = 512,
-        n_ubatch = 512,
-        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-    )]
-    fn completion_tokens_are_zero_after_eval_before_generation(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let (usage, _expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?;
-
-        if usage.completion_tokens() != 0 {
-            anyhow::bail!(
-                "completion_tokens must be zero immediately after eval (no generation has occurred); got {}",
-                usage.completion_tokens()
-            );
-        }
+    Ok(())
+}
 
-        Ok(())
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 4096,
+    n_batch = 512,
+    n_ubatch = 512,
+    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+)]
+fn completion_tokens_are_zero_after_eval_before_generation(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    let (usage, _expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?;
+
+    if usage.completion_tokens() != 0 {
+        anyhow::bail!(
+            "completion_tokens must be zero immediately after eval (no generation has occurred); got {}",
+            usage.completion_tokens()
+        );
     }
+
+    Ok(())
 }
 
-mod ingest_prompt_chunk {
-    use anyhow::Result;
-    use llama_cpp_bindings::ingest_prompt_chunk::ingest_prompt_chunk;
-    use llama_cpp_bindings::mtmd::MtmdBitmap;
-    use llama_cpp_bindings::mtmd::MtmdInputChunkType;
-    use llama_cpp_bindings::mtmd::MtmdInputText;
-    use llama_cpp_bindings::mtmd::mtmd_default_marker;
-    use llama_cpp_bindings_tests::test_model::fixtures_dir;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-    )]
-    fn text_chunk_records_prompt_tokens(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let model = fixture.model;
-        let mtmd_ctx = fixture
-            .mtmd_context
-            .expect("mmproj_file declared in attribute");
-
-        let input_text = MtmdInputText {
-            text: "hello world".to_owned(),
-            add_special: false,
-            parse_special: false,
-        };
-        let chunks = mtmd_ctx.tokenize(input_text, &[])?;
-
-        let text_chunk = (0..chunks.len())
-            .filter_map(|index| chunks.get(index))
-            .find(|chunk| chunk.chunk_type() == Ok(MtmdInputChunkType::Text))
-            .ok_or_else(|| {
-                anyhow::anyhow!("text-only tokenization should produce at least one text chunk")
-            })?;
-
-        let n_tokens = u64::try_from(text_chunk.n_tokens())?;
-
-        let mut classifier = model.sampled_token_classifier();
-
-        ingest_prompt_chunk(&mut classifier, &text_chunk)?;
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+)]
+fn text_chunk_records_prompt_tokens(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let model = fixture.model;
+    let mtmd_ctx = fixture
+        .mtmd_context
+        .expect("mmproj_file declared in attribute");
+
+    let input_text = MtmdInputText {
+        text: "hello world".to_owned(),
+        add_special: false,
+        parse_special: false,
+    };
+    let chunks = mtmd_ctx.tokenize(input_text, &[])?;
 
-        let usage = classifier.usage();
-        if usage.prompt_tokens != n_tokens {
-            anyhow::bail!(
-                "text chunk must record n_tokens as prompt_tokens; expected {n_tokens}, got {}",
-                usage.prompt_tokens
-            );
-        }
-        if usage.input_image_tokens != 0 {
-            anyhow::bail!(
-                "text chunk must not bump input_image_tokens; got {}",
-                usage.input_image_tokens
-            );
-        }
-        if usage.input_audio_tokens != 0 {
-            anyhow::bail!(
-                "text chunk must not bump input_audio_tokens; got {}",
-                usage.input_audio_tokens
-            );
-        }
+    let text_chunk = (0..chunks.len())
+        .filter_map(|index| chunks.get(index))
+        .find(|chunk| chunk.chunk_type() == Ok(MtmdInputChunkType::Text))
+        .ok_or_else(|| {
+            anyhow::anyhow!("text-only tokenization should produce at least one text chunk")
+        })?;
 
-        Ok(())
-    }
+    let n_tokens = u64::try_from(text_chunk.n_tokens())?;
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-    )]
-    fn image_chunk_records_input_image_tokens_only(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let model = fixture.model;
-        let mtmd_ctx = fixture
-            .mtmd_context
-            .expect("mmproj_file declared in attribute");
-
-        let image_path = fixtures_dir().join("llamas.jpg");
-        let image_path_str = image_path
-            .to_str()
-            .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?;
-        let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?;
-
-        let marker = mtmd_default_marker();
-        let input_text = MtmdInputText {
-            text: marker.to_owned(),
-            add_special: false,
-            parse_special: true,
-        };
-        let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
-
-        let image_chunk = (0..chunks.len())
-            .filter_map(|index| chunks.get(index))
-            .find(|chunk| chunk.chunk_type() == Ok(MtmdInputChunkType::Image))
-            .ok_or_else(|| {
-                anyhow::anyhow!("multimodal tokenization should produce an image chunk")
-            })?;
-
-        let n_tokens = u64::try_from(image_chunk.n_tokens())?;
-        if n_tokens == 0 {
-            anyhow::bail!("image chunk should report at least one token");
-        }
+    let mut classifier = model.sampled_token_classifier();
 
-        let mut classifier = model.sampled_token_classifier();
+    ingest_prompt_chunk(&mut classifier, &text_chunk)?;
 
-        ingest_prompt_chunk(&mut classifier, &image_chunk)?;
+    let usage = classifier.usage();
+    if usage.prompt_tokens != n_tokens {
+        anyhow::bail!(
+            "text chunk must record n_tokens as prompt_tokens; expected {n_tokens}, got {}",
+            usage.prompt_tokens
+        );
+    }
+    if usage.input_image_tokens != 0 {
+        anyhow::bail!(
+            "text chunk must not bump input_image_tokens; got {}",
+            usage.input_image_tokens
+        );
+    }
+    if usage.input_audio_tokens != 0 {
+        anyhow::bail!(
+            "text chunk must not bump input_audio_tokens; got {}",
+            usage.input_audio_tokens
+        );
+    }
 
-        let usage = classifier.usage();
-        if usage.input_image_tokens != n_tokens {
-            anyhow::bail!(
-                "image chunk must record n_tokens as input_image_tokens; expected {n_tokens}, got {}",
-                usage.input_image_tokens
-            );
-        }
-        if usage.prompt_tokens != 0 {
-            anyhow::bail!(
-                "image chunk must not bump prompt_tokens; got {}",
-                usage.prompt_tokens
-            );
-        }
-        if usage.input_audio_tokens != 0 {
-            anyhow::bail!(
-                "image chunk must not bump input_audio_tokens; got {}",
-                usage.input_audio_tokens
-            );
-        }
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+)]
+fn image_chunk_records_input_image_tokens_only(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let model = fixture.model;
+    let mtmd_ctx = fixture
+        .mtmd_context
+        .expect("mmproj_file declared in attribute");
+
+    let image_path = fixtures_dir().join("llamas.jpg");
+    let image_path_str = image_path
+        .to_str()
+        .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?;
+    let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?;
+
+    let marker = mtmd_default_marker();
+    let input_text = MtmdInputText {
+        text: marker.to_owned(),
+        add_special: false,
+        parse_special: true,
+    };
+    let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
+
+    let image_chunk = (0..chunks.len())
+        .filter_map(|index| chunks.get(index))
+        .find(|chunk| chunk.chunk_type() == Ok(MtmdInputChunkType::Image))
+        .ok_or_else(|| anyhow::anyhow!("multimodal tokenization should produce an image chunk"))?;
 
-        Ok(())
+    let n_tokens = u64::try_from(image_chunk.n_tokens())?;
+    if n_tokens == 0 {
+        anyhow::bail!("image chunk should report at least one token");
     }
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-    )]
-    fn text_chunk_drives_marker_state_machine_to_reasoning(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let model = fixture.model;
-        let mtmd_ctx = fixture
-            .mtmd_context
-            .expect("mmproj_file declared in attribute");
-
-        let input_text = MtmdInputText {
-            text: "<|im_start|>user\nHi<|im_end|>\n<|im_start|>assistant\n<think>\n".to_owned(),
-            add_special: false,
-            parse_special: true,
-        };
-        let chunks = mtmd_ctx.tokenize(input_text, &[])?;
-
-        let mut classifier = model.sampled_token_classifier();
-
-        for index in 0..chunks.len() {
-            let chunk = chunks
-                .get(index)
-                .ok_or_else(|| anyhow::anyhow!("chunk index {index} must exist"))?;
-            ingest_prompt_chunk(&mut classifier, &chunk)?;
-        }
+    let mut classifier = model.sampled_token_classifier();
 
-        if classifier.current_section() != llama_cpp_bindings::SampledTokenSection::Reasoning {
-            anyhow::bail!(
-                "text chunk replay must transition the classifier section to Reasoning when the \
-                 prompt opens a `<think>` block; got {:?}",
-                classifier.current_section()
-            );
-        }
+    ingest_prompt_chunk(&mut classifier, &image_chunk)?;
 
-        Ok(())
+    let usage = classifier.usage();
+    if usage.input_image_tokens != n_tokens {
+        anyhow::bail!(
+            "image chunk must record n_tokens as input_image_tokens; expected {n_tokens}, got {}",
+            usage.input_image_tokens
+        );
     }
+    if usage.prompt_tokens != 0 {
+        anyhow::bail!(
+            "image chunk must not bump prompt_tokens; got {}",
+            usage.prompt_tokens
+        );
+    }
+    if usage.input_audio_tokens != 0 {
+        anyhow::bail!(
+            "image chunk must not bump input_audio_tokens; got {}",
+            usage.input_audio_tokens
+        );
+    }
+
+    Ok(())
 }
 
-mod gemma4_classifier_emits_reasoning_for_multimodal_thinking_prompt {
-    use anyhow::Result;
-    use llama_cpp_bindings::context::LlamaContext;
-    use llama_cpp_bindings::llama_batch::LlamaBatch;
-    use llama_cpp_bindings::mtmd::MtmdBitmap;
-    use llama_cpp_bindings::mtmd::MtmdInputText;
-    use llama_cpp_bindings::mtmd::mtmd_default_marker;
-    use llama_cpp_bindings::sampling::LlamaSampler;
-    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-    use llama_cpp_bindings_tests::test_model::fixtures_dir;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+)]
+fn text_chunk_drives_marker_state_machine_to_reasoning(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let model = fixture.model;
+    let mtmd_ctx = fixture
+        .mtmd_context
+        .expect("mmproj_file declared in attribute");
+
+    let input_text = MtmdInputText {
+        text: "<|im_start|>user\nHi<|im_end|>\n<|im_start|>assistant\n<think>\n".to_owned(),
+        add_special: false,
+        parse_special: true,
+    };
+    let chunks = mtmd_ctx.tokenize(input_text, &[])?;
+
+    let mut classifier = model.sampled_token_classifier();
 
-    const MAX_GENERATED_TOKENS: i32 = 200;
+    for index in 0..chunks.len() {
+        let chunk = chunks
+            .get(index)
+            .ok_or_else(|| anyhow::anyhow!("chunk index {index} must exist"))?;
+        ingest_prompt_chunk(&mut classifier, &chunk)?;
+    }
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 8192,
-        n_batch = 512,
-        n_ubatch = 512,
-        mmproj_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "mmproj-F16.gguf"),
-    )]
-    fn gemma4_classifier_emits_reasoning_for_multimodal_thinking_prompt(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let model = fixture.model;
-        let backend = fixture.backend;
-        let mtmd_ctx = fixture
-            .mtmd_context
-            .expect("mmproj_file declared in attribute");
-
-        let mut context = LlamaContext::from_model(
-            model,
-            backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-
-        let image_path = fixtures_dir().join("llamas.jpg");
-        let image_path_str = image_path
-            .to_str()
-            .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?;
-        let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?;
-
-        let marker = mtmd_default_marker();
-        let prompt = format!(
-            "<bos><start_of_turn>user\n{marker}What animals do you see in this image?<end_of_turn>\n<start_of_turn>model\n<|channel>thought\n"
+    if classifier.current_section() != llama_cpp_bindings::SampledTokenSection::Reasoning {
+        anyhow::bail!(
+            "text chunk replay must transition the classifier section to Reasoning when the \
+             prompt opens a `<think>` block; got {:?}",
+            classifier.current_section()
         );
+    }
 
-        let input_text = MtmdInputText {
-            text: prompt,
-            add_special: false,
-            parse_special: true,
-        };
-
-        let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
-
-        let mut classifier = model.sampled_token_classifier();
-        let n_past =
-            classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?;
-
-        let mut sampler = LlamaSampler::chain_simple([
-            LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
-            LlamaSampler::top_k(40),
-            LlamaSampler::top_p(0.9, 1),
-            LlamaSampler::min_p(0.05, 1),
-            LlamaSampler::temp(0.7),
-            LlamaSampler::dist(0x00C0_FFEE),
-        ]);
-
-        let mut batch = LlamaBatch::new(2048, 1)?;
-        let outcome = ClassifySampleLoop {
-            model,
-            classifier: &mut classifier,
-            sampler: &mut sampler,
-            context: &mut context,
-            batch: &mut batch,
-            initial_position: n_past,
-            max_generated_tokens: MAX_GENERATED_TOKENS,
-        }
-        .run()?;
+    Ok(())
+}
 
-        let usage = classifier.usage();
+#[llama_test(
+    model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 8192,
+    n_batch = 512,
+    n_ubatch = 512,
+    mmproj_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "mmproj-F16.gguf"),
+)]
+fn gemma4_classifier_emits_reasoning_for_multimodal_thinking_prompt(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    const MAX_GENERATED_TOKENS: i32 = 200;
 
-        if outcome.observed_reasoning == 0 {
-            anyhow::bail!(
-                "Gemma 4 multimodal + thinking: classifier must emit at least one Reasoning token \
-                 when the prompt opens a `<|channel>thought` block; outcome={outcome:?}"
-            );
-        }
-        if usage.reasoning_tokens == 0 {
-            anyhow::bail!(
-                "Gemma 4 multimodal + thinking: usage.reasoning_tokens must be non-zero; usage={usage:?}"
-            );
-        }
+    let model = fixture.model;
+    let backend = fixture.backend;
+    let mtmd_ctx = fixture
+        .mtmd_context
+        .expect("mmproj_file declared in attribute");
+
+    let mut context = LlamaContext::from_model(
+        model,
+        backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+
+    let image_path = fixtures_dir().join("llamas.jpg");
+    let image_path_str = image_path
+        .to_str()
+        .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?;
+    let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?;
+
+    let marker = mtmd_default_marker();
+    let prompt = format!(
+        "<bos><start_of_turn>user\n{marker}What animals do you see in this image?<end_of_turn>\n<start_of_turn>model\n<|channel>thought\n"
+    );
+
+    let input_text = MtmdInputText {
+        text: prompt,
+        add_special: false,
+        parse_special: true,
+    };
 
-        Ok(())
+    let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
+
+    let mut classifier = model.sampled_token_classifier();
+    let n_past = classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?;
+
+    let mut sampler = LlamaSampler::chain_simple([
+        LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
+        LlamaSampler::top_k(40),
+        LlamaSampler::top_p(0.9, 1),
+        LlamaSampler::min_p(0.05, 1),
+        LlamaSampler::temp(0.7),
+        LlamaSampler::dist(0x00C0_FFEE),
+    ]);
+
+    let mut batch = LlamaBatch::new(2048, 1)?;
+    let outcome = ClassifySampleLoop {
+        model,
+        classifier: &mut classifier,
+        sampler: &mut sampler,
+        context: &mut context,
+        batch: &mut batch,
+        initial_position: n_past,
+        max_generated_tokens: MAX_GENERATED_TOKENS,
     }
-}
+    .run()?;
 
-mod mistral3_classifier_emits_reasoning_for_multimodal_thinking_prompt {
-    use anyhow::Result;
-    use llama_cpp_bindings::context::LlamaContext;
-    use llama_cpp_bindings::llama_batch::LlamaBatch;
-    use llama_cpp_bindings::mtmd::MtmdBitmap;
-    use llama_cpp_bindings::mtmd::MtmdInputText;
-    use llama_cpp_bindings::mtmd::mtmd_default_marker;
-    use llama_cpp_bindings::sampling::LlamaSampler;
-    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-    use llama_cpp_bindings_tests::test_model::fixtures_dir;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
+    let usage = classifier.usage();
 
-    const MAX_GENERATED_TOKENS: i32 = 768;
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 4096,
-        n_batch = 512,
-        n_ubatch = 512,
-        mmproj_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "mmproj-F16.gguf"),
-    )]
-    fn mistral3_classifier_emits_reasoning_for_multimodal_thinking_prompt(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let model = fixture.model;
-        let backend = fixture.backend;
-        let mtmd_ctx = fixture
-            .mtmd_context
-            .expect("mmproj_file declared in attribute");
-
-        let mut context = LlamaContext::from_model(
-            model,
-            backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-
-        let image_path = fixtures_dir().join("llamas.jpg");
-        let image_path_str = image_path
-            .to_str()
-            .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?;
-        let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?;
-
-        let marker = mtmd_default_marker();
-        let prompt = format!(
-            "[SYSTEM_PROMPT]# HOW YOU SHOULD THINK AND ANSWER\n\n\
-             First draft your thinking process (inner monologue) until you arrive at a response. \
-             Format your response using Markdown, and use LaTeX for any mathematical equations. \
-             Write both your thoughts and the response in the same language as the input.\n\n\
-             Your thinking process must follow the template below:\
-             [THINK]Your thoughts or/and draft, like working through an exercise on scratch paper. \
-             Be as casual and as long as you want until you are confident to generate the response \
-             to the user.[/THINK]Here, provide a self-contained response.[/SYSTEM_PROMPT]\
-             [INST]{marker}What animals do you see in this image?[/INST]"
+    if outcome.observed_reasoning == 0 {
+        anyhow::bail!(
+            "Gemma 4 multimodal + thinking: classifier must emit at least one Reasoning token \
+             when the prompt opens a `<|channel>thought` block; outcome={outcome:?}"
+        );
+    }
+    if usage.reasoning_tokens == 0 {
+        anyhow::bail!(
+            "Gemma 4 multimodal + thinking: usage.reasoning_tokens must be non-zero; usage={usage:?}"
         );
+    }
 
-        let input_text = MtmdInputText {
-            text: prompt,
-            add_special: true,
-            parse_special: true,
-        };
-
-        let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
-
-        let mut classifier = model.sampled_token_classifier();
-        let n_past =
-            classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?;
-
-        let mut sampler = LlamaSampler::greedy();
-        let mut batch = LlamaBatch::new(2048, 1)?;
-        let outcome = ClassifySampleLoop {
-            model,
-            classifier: &mut classifier,
-            sampler: &mut sampler,
-            context: &mut context,
-            batch: &mut batch,
-            initial_position: n_past,
-            max_generated_tokens: MAX_GENERATED_TOKENS,
-        }
-        .run()?;
+    Ok(())
+}
 
-        let usage = classifier.usage();
+#[llama_test(
+    model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 4096,
+    n_batch = 512,
+    n_ubatch = 512,
+    mmproj_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "mmproj-F16.gguf"),
+)]
+fn mistral3_classifier_emits_reasoning_for_multimodal_thinking_prompt(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    const MAX_GENERATED_TOKENS: i32 = 768;
 
-        if outcome.observed_reasoning == 0 {
-            anyhow::bail!(
-                "Mistral 3 multimodal + thinking: classifier must emit at least one Reasoning token \
-                 when the model opens a `[THINK]` block; outcome={outcome:?}"
-            );
-        }
-        if usage.reasoning_tokens == 0 {
-            anyhow::bail!(
-                "Mistral 3 multimodal + thinking: usage.reasoning_tokens must be non-zero; usage={usage:?}"
-            );
-        }
+    let model = fixture.model;
+    let backend = fixture.backend;
+    let mtmd_ctx = fixture
+        .mtmd_context
+        .expect("mmproj_file declared in attribute");
+
+    let mut context = LlamaContext::from_model(
+        model,
+        backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+
+    let image_path = fixtures_dir().join("llamas.jpg");
+    let image_path_str = image_path
+        .to_str()
+        .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?;
+    let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?;
+
+    let marker = mtmd_default_marker();
+    let prompt = format!(
+        "[SYSTEM_PROMPT]# HOW YOU SHOULD THINK AND ANSWER\n\n\
+         First draft your thinking process (inner monologue) until you arrive at a response. \
+         Format your response using Markdown, and use LaTeX for any mathematical equations. \
+         Write both your thoughts and the response in the same language as the input.\n\n\
+         Your thinking process must follow the template below:\
+         [THINK]Your thoughts or/and draft, like working through an exercise on scratch paper. \
+         Be as casual and as long as you want until you are confident to generate the response \
+         to the user.[/THINK]Here, provide a self-contained response.[/SYSTEM_PROMPT]\
+         [INST]{marker}What animals do you see in this image?[/INST]"
+    );
+
+    let input_text = MtmdInputText {
+        text: prompt,
+        add_special: true,
+        parse_special: true,
+    };
 
-        Ok(())
+    let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
+
+    let mut classifier = model.sampled_token_classifier();
+    let n_past = classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?;
+
+    let mut sampler = LlamaSampler::greedy();
+    let mut batch = LlamaBatch::new(2048, 1)?;
+    let outcome = ClassifySampleLoop {
+        model,
+        classifier: &mut classifier,
+        sampler: &mut sampler,
+        context: &mut context,
+        batch: &mut batch,
+        initial_position: n_past,
+        max_generated_tokens: MAX_GENERATED_TOKENS,
     }
-}
-
-mod qwen35_classifier_emits_reasoning_for_multimodal_thinking_prompt {
-    use anyhow::Result;
-    use llama_cpp_bindings::context::LlamaContext;
-    use llama_cpp_bindings::llama_batch::LlamaBatch;
-    use llama_cpp_bindings::mtmd::MtmdBitmap;
-    use llama_cpp_bindings::mtmd::MtmdInputText;
-    use llama_cpp_bindings::mtmd::mtmd_default_marker;
-    use llama_cpp_bindings::sampling::LlamaSampler;
-    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-    use llama_cpp_bindings_tests::test_model::fixtures_dir;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
+    .run()?;
 
-    const MAX_GENERATED_TOKENS: i32 = 200;
+    let usage = classifier.usage();
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 4096,
-        n_batch = 512,
-        n_ubatch = 512,
-        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 4096,
-        n_batch = 512,
-        n_ubatch = 512,
-        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-    )]
-    fn qwen35_classifier_emits_reasoning_for_multimodal_thinking_prompt(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let model = fixture.model;
-        let backend = fixture.backend;
-        let mtmd_ctx = fixture
-            .mtmd_context
-            .expect("mmproj_file declared in attribute");
-
-        let mut context = LlamaContext::from_model(
-            model,
-            backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-
-        let image_path = fixtures_dir().join("llamas.jpg");
-        let image_path_str = image_path
-            .to_str()
-            .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?;
-        let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?;
-
-        let marker = mtmd_default_marker();
-        let prompt = format!(
-            "<|im_start|>user\n{marker}What animals do you see in this image?<|im_end|>\n<|im_start|>assistant\n<think>\n"
+    if outcome.observed_reasoning == 0 {
+        anyhow::bail!(
+            "Mistral 3 multimodal + thinking: classifier must emit at least one Reasoning token \
+             when the model opens a `[THINK]` block; outcome={outcome:?}"
+        );
+    }
+    if usage.reasoning_tokens == 0 {
+        anyhow::bail!(
+            "Mistral 3 multimodal + thinking: usage.reasoning_tokens must be non-zero; usage={usage:?}"
         );
+    }
 
-        let input_text = MtmdInputText {
-            text: prompt,
-            add_special: false,
-            parse_special: true,
-        };
-
-        let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
-
-        let mut classifier = model.sampled_token_classifier();
-        let n_past =
-            classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?;
-
-        let mut sampler = LlamaSampler::chain_simple([
-            LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
-            LlamaSampler::top_k(40),
-            LlamaSampler::top_p(0.9, 1),
-            LlamaSampler::min_p(0.05, 1),
-            LlamaSampler::temp(0.7),
-            LlamaSampler::dist(0x00C0_FFEE),
-        ]);
-
-        let mut batch = LlamaBatch::new(2048, 1)?;
-        let outcome = ClassifySampleLoop {
-            model,
-            classifier: &mut classifier,
-            sampler: &mut sampler,
-            context: &mut context,
-            batch: &mut batch,
-            initial_position: n_past,
-            max_generated_tokens: MAX_GENERATED_TOKENS,
-        }
-        .run()?;
+    Ok(())
+}
 
-        let usage = classifier.usage();
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 4096,
+    n_batch = 512,
+    n_ubatch = 512,
+    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 4096,
+    n_batch = 512,
+    n_ubatch = 512,
+    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+)]
+fn qwen35_classifier_emits_reasoning_for_multimodal_thinking_prompt(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    const MAX_GENERATED_TOKENS: i32 = 200;
 
-        if outcome.observed_reasoning == 0 {
-            anyhow::bail!(
-                "Qwen 3.5 multimodal + thinking: classifier must emit at least one Reasoning token \
-                 when the prompt opens a `<think>` block; outcome={outcome:?}"
-            );
-        }
-        if usage.reasoning_tokens == 0 {
-            anyhow::bail!(
-                "Qwen 3.5 multimodal + thinking: usage.reasoning_tokens must be non-zero; usage={usage:?}"
-            );
-        }
+    let model = fixture.model;
+    let backend = fixture.backend;
+    let mtmd_ctx = fixture
+        .mtmd_context
+        .expect("mmproj_file declared in attribute");
+
+    let mut context = LlamaContext::from_model(
+        model,
+        backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+
+    let image_path = fixtures_dir().join("llamas.jpg");
+    let image_path_str = image_path
+        .to_str()
+        .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?;
+    let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?;
+
+    let marker = mtmd_default_marker();
+    let prompt = format!(
+        "<|im_start|>user\n{marker}What animals do you see in this image?<|im_end|>\n<|im_start|>assistant\n<think>\n"
+    );
+
+    let input_text = MtmdInputText {
+        text: prompt,
+        add_special: false,
+        parse_special: true,
+    };
 
-        Ok(())
+    let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
+
+    let mut classifier = model.sampled_token_classifier();
+    let n_past = classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?;
+
+    let mut sampler = LlamaSampler::chain_simple([
+        LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
+        LlamaSampler::top_k(40),
+        LlamaSampler::top_p(0.9, 1),
+        LlamaSampler::min_p(0.05, 1),
+        LlamaSampler::temp(0.7),
+        LlamaSampler::dist(0x00C0_FFEE),
+    ]);
+
+    let mut batch = LlamaBatch::new(2048, 1)?;
+    let outcome = ClassifySampleLoop {
+        model,
+        classifier: &mut classifier,
+        sampler: &mut sampler,
+        context: &mut context,
+        batch: &mut batch,
+        initial_position: n_past,
+        max_generated_tokens: MAX_GENERATED_TOKENS,
     }
-}
+    .run()?;
 
-mod qwen36_classifier_emits_reasoning_for_multimodal_thinking_prompt {
-    use anyhow::Result;
-    use llama_cpp_bindings::context::LlamaContext;
-    use llama_cpp_bindings::llama_batch::LlamaBatch;
-    use llama_cpp_bindings::mtmd::MtmdBitmap;
-    use llama_cpp_bindings::mtmd::MtmdInputText;
-    use llama_cpp_bindings::mtmd::mtmd_default_marker;
-    use llama_cpp_bindings::sampling::LlamaSampler;
-    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-    use llama_cpp_bindings_tests::test_model::fixtures_dir;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
-    const MAX_GENERATED_TOKENS: i32 = 200;
+    let usage = classifier.usage();
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 8192,
-        n_batch = 512,
-        n_ubatch = 512,
-        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-    )]
-    fn qwen36_classifier_emits_reasoning_for_multimodal_thinking_prompt(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let model = fixture.model;
-        let backend = fixture.backend;
-        let mtmd_ctx = fixture
-            .mtmd_context
-            .expect("mmproj_file declared in attribute");
-
-        let mut context = LlamaContext::from_model(
-            model,
-            backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-
-        let image_path = fixtures_dir().join("llamas.jpg");
-        let image_path_str = image_path
-            .to_str()
-            .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?;
-        let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?;
-
-        let marker = mtmd_default_marker();
-        let prompt = format!(
-            "<|im_start|>user\n{marker}What animals do you see in this image?<|im_end|>\n<|im_start|>assistant\n<think>\n"
+    if outcome.observed_reasoning == 0 {
+        anyhow::bail!(
+            "Qwen 3.5 multimodal + thinking: classifier must emit at least one Reasoning token \
+             when the prompt opens a `<think>` block; outcome={outcome:?}"
         );
+    }
+    if usage.reasoning_tokens == 0 {
+        anyhow::bail!(
+            "Qwen 3.5 multimodal + thinking: usage.reasoning_tokens must be non-zero; usage={usage:?}"
+        );
+    }
 
-        let input_text = MtmdInputText {
-            text: prompt,
-            add_special: false,
-            parse_special: true,
-        };
-
-        let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
-
-        let mut classifier = model.sampled_token_classifier();
-        let n_past =
-            classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?;
-
-        let mut sampler = LlamaSampler::chain_simple([
-            LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
-            LlamaSampler::top_k(40),
-            LlamaSampler::top_p(0.9, 1),
-            LlamaSampler::min_p(0.05, 1),
-            LlamaSampler::temp(0.7),
-            LlamaSampler::dist(0x00C0_FFEE),
-        ]);
-
-        let mut batch = LlamaBatch::new(2048, 1)?;
-        let outcome = ClassifySampleLoop {
-            model,
-            classifier: &mut classifier,
-            sampler: &mut sampler,
-            context: &mut context,
-            batch: &mut batch,
-            initial_position: n_past,
-            max_generated_tokens: MAX_GENERATED_TOKENS,
-        }
-        .run()?;
+    Ok(())
+}
 
-        let usage = classifier.usage();
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 8192,
+    n_batch = 512,
+    n_ubatch = 512,
+    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+)]
+fn qwen36_classifier_emits_reasoning_for_multimodal_thinking_prompt(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    const MAX_GENERATED_TOKENS: i32 = 200;
 
-        if outcome.observed_reasoning == 0 {
-            anyhow::bail!(
-                "Qwen 3.6 multimodal + thinking: classifier must emit at least one Reasoning token; outcome={outcome:?}"
-            );
-        }
-        if usage.reasoning_tokens == 0 {
-            anyhow::bail!(
-                "Qwen 3.6 multimodal + thinking: usage.reasoning_tokens must be non-zero; usage={usage:?}"
-            );
-        }
+    let model = fixture.model;
+    let backend = fixture.backend;
+    let mtmd_ctx = fixture
+        .mtmd_context
+        .expect("mmproj_file declared in attribute");
+
+    let mut context = LlamaContext::from_model(
+        model,
+        backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+
+    let image_path = fixtures_dir().join("llamas.jpg");
+    let image_path_str = image_path
+        .to_str()
+        .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?;
+    let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?;
+
+    let marker = mtmd_default_marker();
+    let prompt = format!(
+        "<|im_start|>user\n{marker}What animals do you see in this image?<|im_end|>\n<|im_start|>assistant\n<think>\n"
+    );
+
+    let input_text = MtmdInputText {
+        text: prompt,
+        add_special: false,
+        parse_special: true,
+    };
 
-        Ok(())
+    let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
+
+    let mut classifier = model.sampled_token_classifier();
+    let n_past = classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?;
+
+    let mut sampler = LlamaSampler::chain_simple([
+        LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
+        LlamaSampler::top_k(40),
+        LlamaSampler::top_p(0.9, 1),
+        LlamaSampler::min_p(0.05, 1),
+        LlamaSampler::temp(0.7),
+        LlamaSampler::dist(0x00C0_FFEE),
+    ]);
+
+    let mut batch = LlamaBatch::new(2048, 1)?;
+    let outcome = ClassifySampleLoop {
+        model,
+        classifier: &mut classifier,
+        sampler: &mut sampler,
+        context: &mut context,
+        batch: &mut batch,
+        initial_position: n_past,
+        max_generated_tokens: MAX_GENERATED_TOKENS,
+    }
+    .run()?;
+
+    let usage = classifier.usage();
+
+    if outcome.observed_reasoning == 0 {
+        anyhow::bail!(
+            "Qwen 3.6 multimodal + thinking: classifier must emit at least one Reasoning token; outcome={outcome:?}"
+        );
+    }
+    if usage.reasoning_tokens == 0 {
+        anyhow::bail!(
+            "Qwen 3.6 multimodal + thinking: usage.reasoning_tokens must be non-zero; usage={usage:?}"
+        );
     }
-}
 
+    Ok(())
+}
 llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/reasoning_markers_and_tool_calls.rs b/llama-cpp-bindings-tests/tests/reasoning_markers_and_tool_calls.rs
index a5aac3d4..d5cad959 100644
--- a/llama-cpp-bindings-tests/tests/reasoning_markers_and_tool_calls.rs
+++ b/llama-cpp-bindings-tests/tests/reasoning_markers_and_tool_calls.rs
@@ -1,2484 +1,2215 @@
+#![expect(
+    clippy::unnecessary_wraps,
+    reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
+)]
+
+use anyhow::Result;
+use anyhow::bail;
+use llama_cpp_bindings::ChatMessageParseOutcome;
+use llama_cpp_bindings::ToolCallArgsShape;
+use llama_cpp_bindings::ToolCallArguments;
+use llama_cpp_bindings::context::LlamaContext;
+use llama_cpp_bindings::llama_batch::LlamaBatch;
+use llama_cpp_bindings::model::AddBos;
+use llama_cpp_bindings::model::LlamaChatMessage;
+use llama_cpp_bindings::sampling::LlamaSampler;
+use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+use llama_cpp_test_harness::LlamaFixture;
+use llama_cpp_test_harness::llama_test;
 use llama_cpp_test_harness::llama_tests_main;
-
-mod deepseek_r1_8b_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt {
-    use anyhow::Result;
-    use llama_cpp_bindings::context::LlamaContext;
-    use llama_cpp_bindings::llama_batch::LlamaBatch;
-    use llama_cpp_bindings::model::AddBos;
-    use llama_cpp_bindings::sampling::LlamaSampler;
-    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
+use serde_json::Value;
+use serde_json::json;
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 8192,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn deepseek_r1_8b_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
     const MAX_GENERATED_TOKENS: i32 = 200;
 
     const DEEPSEEK_R1_8B_THINKING_DISABLED_PROMPT: &str = "\
-    <｜User｜>What is 2 + 2?<｜Assistant｜><think>
+<｜User｜>What is 2 + 2?<｜Assistant｜><think>
 
-    </think>
+</think>
 
-    ";
+";
 
     const FORBIDDEN_MARKERS: &[&str] = &["<think>", "</think>"];
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 8192,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn deepseek_r1_8b_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let model = fixture.model;
-        let backend = fixture.backend;
-
-        let mut classifier = model.sampled_token_classifier();
-        let prompt_tokens =
-            model.str_to_token(DEEPSEEK_R1_8B_THINKING_DISABLED_PROMPT, AddBos::Never)?;
-        let prompt_token_count = u64::try_from(prompt_tokens.len())?;
-
-        let mut batch = LlamaBatch::new(2048, 1)?;
-        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
-
-        let mut context = LlamaContext::from_model(
-            model,
-            backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-
-        context.decode(&mut batch)?;
-
-        let promoted = classifier.commit_prompt_tokens();
-        assert_eq!(promoted, prompt_token_count);
-
-        let mut sampler = LlamaSampler::chain_simple([
-            LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
-            LlamaSampler::top_k(40),
-            LlamaSampler::top_p(0.9, 1),
-            LlamaSampler::min_p(0.05, 1),
-            LlamaSampler::temp(0.7),
-            LlamaSampler::dist(0x00C0_FFEE),
-        ]);
-        let initial_position = batch.n_tokens();
-        let outcome = ClassifySampleLoop {
-            model,
-            classifier: &mut classifier,
-            sampler: &mut sampler,
-            context: &mut context,
-            batch: &mut batch,
-            initial_position,
-            max_generated_tokens: MAX_GENERATED_TOKENS,
-        }
-        .run()?;
-
-        let usage = classifier.usage();
-
-        assert!(
-            !outcome.generated_raw.is_empty(),
-            "DeepSeek-R1-8B: must generate at least one token"
-        );
-        assert_eq!(
-            outcome.observed_reasoning, 0,
-            "DeepSeek-R1-8B thinking-disabled: classifier must not emit any Reasoning token \
-             when the prompt closes the think block before generation begins; \
-             generated={:?}",
-            outcome.generated_raw
-        );
-        assert_eq!(
-            outcome.observed_undeterminable, 0,
-            "DeepSeek-R1-8B thinking-disabled: prompt-token replay must move section to Content \
-             before generation, so no Undeterminable tokens may be emitted; \
-             generated={:?}",
-            outcome.generated_raw
-        );
-        assert_eq!(
-            usage.reasoning_tokens, 0,
-            "DeepSeek-R1-8B thinking-disabled: usage.reasoning_tokens must be zero; usage={usage:?}"
-        );
-        assert_eq!(
-            usage.undeterminable_tokens, 0,
-            "DeepSeek-R1-8B thinking-disabled: usage.undeterminable_tokens must be zero; usage={usage:?}"
-        );
+    let model = fixture.model;
+    let backend = fixture.backend;
+
+    let mut classifier = model.sampled_token_classifier();
+    let prompt_tokens =
+        model.str_to_token(DEEPSEEK_R1_8B_THINKING_DISABLED_PROMPT, AddBos::Never)?;
+    let prompt_token_count = u64::try_from(prompt_tokens.len())?;
+
+    let mut batch = LlamaBatch::new(2048, 1)?;
+    classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
+
+    let mut context = LlamaContext::from_model(
+        model,
+        backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+
+    context.decode(&mut batch)?;
+
+    let promoted = classifier.commit_prompt_tokens();
+    assert_eq!(promoted, prompt_token_count);
+
+    let mut sampler = LlamaSampler::chain_simple([
+        LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
+        LlamaSampler::top_k(40),
+        LlamaSampler::top_p(0.9, 1),
+        LlamaSampler::min_p(0.05, 1),
+        LlamaSampler::temp(0.7),
+        LlamaSampler::dist(0x00C0_FFEE),
+    ]);
+    let initial_position = batch.n_tokens();
+    let outcome = ClassifySampleLoop {
+        model,
+        classifier: &mut classifier,
+        sampler: &mut sampler,
+        context: &mut context,
+        batch: &mut batch,
+        initial_position,
+        max_generated_tokens: MAX_GENERATED_TOKENS,
+    }
+    .run()?;
+
+    let usage = classifier.usage();
+
+    assert!(
+        !outcome.generated_raw.is_empty(),
+        "DeepSeek-R1-8B: must generate at least one token"
+    );
+    assert_eq!(
+        outcome.observed_reasoning, 0,
+        "DeepSeek-R1-8B thinking-disabled: classifier must not emit any Reasoning token \
+         when the prompt closes the think block before generation begins; \
+         generated={:?}",
+        outcome.generated_raw
+    );
+    assert_eq!(
+        outcome.observed_undeterminable, 0,
+        "DeepSeek-R1-8B thinking-disabled: prompt-token replay must move section to Content \
+         before generation, so no Undeterminable tokens may be emitted; \
+         generated={:?}",
+        outcome.generated_raw
+    );
+    assert_eq!(
+        usage.reasoning_tokens, 0,
+        "DeepSeek-R1-8B thinking-disabled: usage.reasoning_tokens must be zero; usage={usage:?}"
+    );
+    assert_eq!(
+        usage.undeterminable_tokens, 0,
+        "DeepSeek-R1-8B thinking-disabled: usage.undeterminable_tokens must be zero; usage={usage:?}"
+    );
+    assert!(
+        outcome.observed_content > 0,
+        "DeepSeek-R1-8B thinking-disabled: classifier must emit at least one Content token"
+    );
+    assert_eq!(
+        usage.completion_tokens(),
+        outcome.observed_content,
+        "DeepSeek-R1-8B thinking-disabled: completion tokens must equal observed Content tokens"
+    );
+
+    for forbidden in FORBIDDEN_MARKERS {
         assert!(
-            outcome.observed_content > 0,
-            "DeepSeek-R1-8B thinking-disabled: classifier must emit at least one Content token"
-        );
-        assert_eq!(
-            usage.completion_tokens(),
-            outcome.observed_content,
-            "DeepSeek-R1-8B thinking-disabled: completion tokens must equal observed Content tokens"
+            !outcome.content_stream.contains(forbidden),
+            "DeepSeek-R1-8B thinking-disabled: content_stream leaked marker {forbidden:?}; \
+             content_stream={:?}",
+            outcome.content_stream
         );
-
-        for forbidden in FORBIDDEN_MARKERS {
-            assert!(
-                !outcome.content_stream.contains(forbidden),
-                "DeepSeek-R1-8B thinking-disabled: content_stream leaked marker {forbidden:?}; \
-                 content_stream={:?}",
-                outcome.content_stream
-            );
-        }
-
-        Ok(())
     }
-}
 
-mod deepseek_r1_8b_classifier_emits_reasoning {
-    use anyhow::Result;
-    use anyhow::bail;
-    use llama_cpp_bindings::ChatMessageParseOutcome;
-    use llama_cpp_bindings::context::LlamaContext;
-    use llama_cpp_bindings::llama_batch::LlamaBatch;
-    use llama_cpp_bindings::model::AddBos;
-    use llama_cpp_bindings::sampling::LlamaSampler;
-    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
+    Ok(())
+}
 
+#[expect(
+    clippy::too_many_lines,
+    reason = "test asserts many distinct properties of DeepSeek-R1-8B reasoning output; shortening messages or splitting the body would reduce diagnostic signal at failure time"
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 8192,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn deepseek_r1_8b_classifier_emits_reasoning_for_thinking_enabled_prompt(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
     const MAX_GENERATED_TOKENS: i32 = 1500;
 
-    // DeepSeek-R1-Distill-Llama-8B uses `<think>...</think>` reasoning markers
-    // and full-width-bar role tokens `<｜User｜>` / `<｜Assistant｜>` (U+FF5C,
-    // not ASCII `|`). The chat template's `add_generation_prompt` ALWAYS appends
-    // `<｜Assistant｜><think>\n` — DeepSeek-R1 is a pure reasoner with no
-    // thinking-disabled mode — so the model resumes generation already inside
-    // the reasoning block.
     const DEEPSEEK_R1_8B_THINKING_PROMPT: &str = "\
-    <｜User｜>What is 2 + 2?<｜Assistant｜><think>
-    ";
+<｜User｜>What is 2 + 2?<｜Assistant｜><think>
+";
 
     const FORBIDDEN_MARKERS: &[&str] = &["<think>", "</think>"];
 
-    #[expect(
-        clippy::too_many_lines,
-        reason = "test asserts many distinct properties of DeepSeek-R1-8B reasoning output; shortening messages or splitting the body would reduce diagnostic signal at failure time"
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 8192,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn deepseek_r1_8b_classifier_emits_reasoning_for_thinking_enabled_prompt(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let model = fixture.model;
-        let backend = fixture.backend;
-
-        let mut classifier = model.sampled_token_classifier();
-        let prompt_tokens = model.str_to_token(DEEPSEEK_R1_8B_THINKING_PROMPT, AddBos::Never)?;
-        let prompt_token_count = u64::try_from(prompt_tokens.len())?;
-
-        let mut batch = LlamaBatch::new(2048, 1)?;
-        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
-
-        let mut context = LlamaContext::from_model(
-            model,
-            backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-
-        context.decode(&mut batch)?;
-
-        let promoted = classifier.commit_prompt_tokens();
-        assert_eq!(promoted, prompt_token_count);
-
-        let mut sampler = LlamaSampler::chain_simple([
-            LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
-            LlamaSampler::top_k(40),
-            LlamaSampler::top_p(0.9, 1),
-            LlamaSampler::min_p(0.05, 1),
-            LlamaSampler::temp(0.7),
-            LlamaSampler::dist(0x00C0_FFEE),
-        ]);
-        let initial_position = batch.n_tokens();
-        let outcome = ClassifySampleLoop {
-            model,
-            classifier: &mut classifier,
-            sampler: &mut sampler,
-            context: &mut context,
-            batch: &mut batch,
-            initial_position,
-            max_generated_tokens: MAX_GENERATED_TOKENS,
-        }
-        .run()?;
-
-        let usage = classifier.usage();
-        let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?;
-        let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
-            bail!(
-                "DeepSeek-R1-8B chat template must be recognised by the parser; got Unrecognized"
-            );
-        };
-
-        assert!(
-            !outcome.generated_raw.is_empty(),
-            "DeepSeek-R1-8B: must generate at least one token"
-        );
-        assert!(
-            outcome.observed_reasoning > 0,
-            "DeepSeek-R1-8B: classifier must emit at least one Reasoning token when the prompt \
-             opens a <think> block; outcome={outcome:?}",
-        );
-        assert!(
-            usage.reasoning_tokens > 0,
-            "DeepSeek-R1-8B: usage.reasoning_tokens must be non-zero when the prompt opens a \
-             <think> block; usage was {usage:?}"
-        );
-        assert_eq!(
-            outcome.observed_undeterminable, 0,
-            "DeepSeek-R1-8B: prompt-token replay must move section to Reasoning before generation, \
-             so no Undeterminable tokens may be emitted; outcome={outcome:?}"
+    let model = fixture.model;
+    let backend = fixture.backend;
+
+    let mut classifier = model.sampled_token_classifier();
+    let prompt_tokens = model.str_to_token(DEEPSEEK_R1_8B_THINKING_PROMPT, AddBos::Never)?;
+    let prompt_token_count = u64::try_from(prompt_tokens.len())?;
+
+    let mut batch = LlamaBatch::new(2048, 1)?;
+    classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
+
+    let mut context = LlamaContext::from_model(
+        model,
+        backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+
+    context.decode(&mut batch)?;
+
+    let promoted = classifier.commit_prompt_tokens();
+    assert_eq!(promoted, prompt_token_count);
+
+    let mut sampler = LlamaSampler::chain_simple([
+        LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
+        LlamaSampler::top_k(40),
+        LlamaSampler::top_p(0.9, 1),
+        LlamaSampler::min_p(0.05, 1),
+        LlamaSampler::temp(0.7),
+        LlamaSampler::dist(0x00C0_FFEE),
+    ]);
+    let initial_position = batch.n_tokens();
+    let outcome = ClassifySampleLoop {
+        model,
+        classifier: &mut classifier,
+        sampler: &mut sampler,
+        context: &mut context,
+        batch: &mut batch,
+        initial_position,
+        max_generated_tokens: MAX_GENERATED_TOKENS,
+    }
+    .run()?;
+
+    let usage = classifier.usage();
+    let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?;
+    let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
+        bail!("DeepSeek-R1-8B chat template must be recognised by the parser; got Unrecognized");
+    };
+
+    assert!(
+        !outcome.generated_raw.is_empty(),
+        "DeepSeek-R1-8B: must generate at least one token"
+    );
+    assert!(
+        outcome.observed_reasoning > 0,
+        "DeepSeek-R1-8B: classifier must emit at least one Reasoning token when the prompt \
+         opens a <think> block; outcome={outcome:?}",
+    );
+    assert!(
+        usage.reasoning_tokens > 0,
+        "DeepSeek-R1-8B: usage.reasoning_tokens must be non-zero when the prompt opens a \
+         <think> block; usage was {usage:?}"
+    );
+    assert_eq!(
+        outcome.observed_undeterminable, 0,
+        "DeepSeek-R1-8B: prompt-token replay must move section to Reasoning before generation, \
+         so no Undeterminable tokens may be emitted; outcome={outcome:?}"
+    );
+    assert_eq!(
+        usage.undeterminable_tokens, 0,
+        "DeepSeek-R1-8B: usage.undeterminable_tokens must be zero; usage={usage:?}"
+    );
+    assert_eq!(
+        usage.completion_tokens(),
+        outcome.observed_content + outcome.observed_reasoning,
+        "DeepSeek-R1-8B: completion tokens must equal observed Content + Reasoning"
+    );
+
+    if parsed.reasoning_content.is_empty() {
+        eprintln!(
+            "DeepSeek-R1-8B didn't close its reasoning block within {MAX_GENERATED_TOKENS} \
+             tokens — skipping strict parser-equality assertions"
         );
+    } else {
         assert_eq!(
-            usage.undeterminable_tokens, 0,
-            "DeepSeek-R1-8B: usage.undeterminable_tokens must be zero; usage={usage:?}"
+            outcome.reasoning_stream, parsed.reasoning_content,
+            "DeepSeek-R1-8B: per-token reasoning stream must equal parser-side reasoning_content \
+             (any difference means a marker leaked into the user-visible stream)",
         );
         assert_eq!(
-            usage.completion_tokens(),
-            outcome.observed_content + outcome.observed_reasoning,
-            "DeepSeek-R1-8B: completion tokens must equal observed Content + Reasoning"
+            outcome.content_stream, parsed.content,
+            "DeepSeek-R1-8B: per-token content stream must equal parser-side content \
+             (any difference means a marker leaked into the user-visible stream)",
         );
+    }
 
-        if parsed.reasoning_content.is_empty() {
-            eprintln!(
-                "DeepSeek-R1-8B didn't close its reasoning block within {MAX_GENERATED_TOKENS} \
-                 tokens — skipping strict parser-equality assertions"
-            );
-        } else {
-            assert_eq!(
-                outcome.reasoning_stream, parsed.reasoning_content,
-                "DeepSeek-R1-8B: per-token reasoning stream must equal parser-side reasoning_content \
-                 (any difference means a marker leaked into the user-visible stream)",
-            );
-            assert_eq!(
-                outcome.content_stream, parsed.content,
-                "DeepSeek-R1-8B: per-token content stream must equal parser-side content \
-                 (any difference means a marker leaked into the user-visible stream)",
-            );
-        }
-
-        for forbidden in FORBIDDEN_MARKERS {
-            assert!(
-                !outcome.reasoning_stream.contains(forbidden),
-                "DeepSeek-R1-8B: reasoning_stream leaked marker {forbidden:?}; \
-                 reasoning_stream={:?}",
-                outcome.reasoning_stream
-            );
-            assert!(
-                !outcome.content_stream.contains(forbidden),
-                "DeepSeek-R1-8B: content_stream leaked marker {forbidden:?}; \
-                 content_stream={:?}",
-                outcome.content_stream
-            );
-        }
-
-        Ok(())
+    for forbidden in FORBIDDEN_MARKERS {
+        assert!(
+            !outcome.reasoning_stream.contains(forbidden),
+            "DeepSeek-R1-8B: reasoning_stream leaked marker {forbidden:?}; \
+             reasoning_stream={:?}",
+            outcome.reasoning_stream
+        );
+        assert!(
+            !outcome.content_stream.contains(forbidden),
+            "DeepSeek-R1-8B: content_stream leaked marker {forbidden:?}; \
+             content_stream={:?}",
+            outcome.content_stream
+        );
     }
-}
 
-mod deepseek_r1_8b_duck_types_gemma_paired_quote {
-    use anyhow::Result;
-    use anyhow::bail;
-    use llama_cpp_bindings::ChatMessageParseOutcome;
-    use llama_cpp_bindings::ToolCallArguments;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
+    Ok(())
+}
 
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn deepseek_r1_8b_duck_types_gemma_paired_quote(fixture: &LlamaFixture<'_>) -> Result<()> {
     const TOOLS_JSON: &str = r#"[
-        {
-            "type": "function",
-            "function": {
-                "name": "get_weather",
-                "description": "Get the current weather for a location",
-                "parameters": {
-                    "type": "object",
-                    "properties": {
-                        "location": {"type": "string", "description": "The city name"}
-                    },
-                    "required": ["location"]
-                }
+    {
+        "type": "function",
+        "function": {
+            "name": "get_weather",
+            "description": "Get the current weather for a location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {"type": "string", "description": "The city name"}
+                },
+                "required": ["location"]
             }
         }
-    ]"#;
+    }
+]"#;
 
     const GEMMA_PAIRED_QUOTE_PAYLOAD: &str =
         "<|tool_call>call:get_weather{location:<|\"|>Paris<|\"|>}";
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn deepseek_r1_8b_duck_types_gemma_paired_quote(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let outcome =
-            fixture
-                .model
-                .parse_chat_message(TOOLS_JSON, GEMMA_PAIRED_QUOTE_PAYLOAD, false)?;
-
-        let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
-            bail!(
-                "duck-type pass must recognise Gemma paired-quote on a model with no registered \
-                 template; got Unrecognized"
-            );
-        };
-        assert_eq!(
-            parsed.tool_calls.len(),
-            1,
-            "expected one tool call; got {:?}",
-            parsed.tool_calls
+    let outcome =
+        fixture
+            .model
+            .parse_chat_message(TOOLS_JSON, GEMMA_PAIRED_QUOTE_PAYLOAD, false)?;
+
+    let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
+        bail!(
+            "duck-type pass must recognise Gemma paired-quote on a model with no registered \
+             template; got Unrecognized"
         );
-        assert_eq!(parsed.tool_calls[0].name, "get_weather");
-        let location = match &parsed.tool_calls[0].arguments {
-            ToolCallArguments::ValidJson(value) => value
-                .get("location")
-                .and_then(|v| v.as_str())
-                .map(str::to_owned),
-            ToolCallArguments::InvalidJson(raw) => {
-                bail!("expected ValidJson, got InvalidJson: {raw}");
-            }
-        };
-        assert_eq!(location.as_deref(), Some("Paris"));
+    };
+    assert_eq!(
+        parsed.tool_calls.len(),
+        1,
+        "expected one tool call; got {:?}",
+        parsed.tool_calls
+    );
+    assert_eq!(parsed.tool_calls[0].name, "get_weather");
+    let location = match &parsed.tool_calls[0].arguments {
+        ToolCallArguments::ValidJson(value) => value
+            .get("location")
+            .and_then(|v| v.as_str())
+            .map(str::to_owned),
+        ToolCallArguments::InvalidJson(raw) => {
+            bail!("expected ValidJson, got InvalidJson: {raw}");
+        }
+    };
+    assert_eq!(location.as_deref(), Some("Paris"));
 
-        Ok(())
-    }
+    Ok(())
 }
 
-mod deepseek_r1_8b_duck_types_glm_key_value_tags {
-    use anyhow::Result;
-    use anyhow::bail;
-    use llama_cpp_bindings::ChatMessageParseOutcome;
-    use llama_cpp_bindings::ToolCallArguments;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn deepseek_r1_8b_duck_types_glm_key_value_tags(fixture: &LlamaFixture<'_>) -> Result<()> {
     const TOOLS_JSON: &str = r#"[
-        {
-            "type": "function",
-            "function": {
-                "name": "get_weather",
-                "description": "Get the current weather for a location",
-                "parameters": {
-                    "type": "object",
-                    "properties": {
-                        "location": {"type": "string", "description": "The city name"}
-                    },
-                    "required": ["location"]
-                }
+    {
+        "type": "function",
+        "function": {
+            "name": "get_weather",
+            "description": "Get the current weather for a location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {"type": "string", "description": "The city name"}
+                },
+                "required": ["location"]
             }
         }
-    ]"#;
+    }
+]"#;
 
     const GLM_KEY_VALUE_PAYLOAD: &str = "<tool_call>get_weather\
-    <arg_key>location</arg_key>\
-    <arg_value>Paris</arg_value>\
-    </tool_call>";
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn deepseek_r1_8b_duck_types_glm_key_value_tags(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let outcome = fixture
-            .model
-            .parse_chat_message(TOOLS_JSON, GLM_KEY_VALUE_PAYLOAD, false)?;
-
-        let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
-            bail!(
-                "duck-type pass must recognise GLM key-value tags on a model with no registered \
-                 template; got Unrecognized"
-            );
-        };
-        assert_eq!(
-            parsed.tool_calls.len(),
-            1,
-            "expected one tool call; got {:?}",
-            parsed.tool_calls
+<arg_key>location</arg_key>\
+<arg_value>Paris</arg_value>\
+</tool_call>";
+
+    let outcome = fixture
+        .model
+        .parse_chat_message(TOOLS_JSON, GLM_KEY_VALUE_PAYLOAD, false)?;
+
+    let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
+        bail!(
+            "duck-type pass must recognise GLM key-value tags on a model with no registered \
+             template; got Unrecognized"
         );
-        assert_eq!(parsed.tool_calls[0].name, "get_weather");
-        let location = match &parsed.tool_calls[0].arguments {
-            ToolCallArguments::ValidJson(value) => value
-                .get("location")
-                .and_then(|v| v.as_str())
-                .map(str::to_owned),
-            ToolCallArguments::InvalidJson(raw) => {
-                bail!("expected ValidJson, got InvalidJson: {raw}");
-            }
-        };
-        assert_eq!(location.as_deref(), Some("Paris"));
+    };
+    assert_eq!(
+        parsed.tool_calls.len(),
+        1,
+        "expected one tool call; got {:?}",
+        parsed.tool_calls
+    );
+    assert_eq!(parsed.tool_calls[0].name, "get_weather");
+    let location = match &parsed.tool_calls[0].arguments {
+        ToolCallArguments::ValidJson(value) => value
+            .get("location")
+            .and_then(|v| v.as_str())
+            .map(str::to_owned),
+        ToolCallArguments::InvalidJson(raw) => {
+            bail!("expected ValidJson, got InvalidJson: {raw}");
+        }
+    };
+    assert_eq!(location.as_deref(), Some("Paris"));
 
-        Ok(())
-    }
+    Ok(())
 }
 
-mod deepseek_r1_8b_duck_types_mistral_bracketed_json {
-    use anyhow::Result;
-    use anyhow::bail;
-    use llama_cpp_bindings::ChatMessageParseOutcome;
-    use llama_cpp_bindings::ToolCallArguments;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn deepseek_r1_8b_duck_types_mistral_bracketed_json(fixture: &LlamaFixture<'_>) -> Result<()> {
     const TOOLS_JSON: &str = r#"[
-        {
-            "type": "function",
-            "function": {
-                "name": "get_weather",
-                "description": "Get the current weather for a location",
-                "parameters": {
-                    "type": "object",
-                    "properties": {
-                        "location": {"type": "string", "description": "The city name"}
-                    },
-                    "required": ["location"]
-                }
+    {
+        "type": "function",
+        "function": {
+            "name": "get_weather",
+            "description": "Get the current weather for a location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {"type": "string", "description": "The city name"}
+                },
+                "required": ["location"]
             }
         }
-    ]"#;
+    }
+]"#;
 
     const MISTRAL_BRACKETED_JSON_PAYLOAD: &str =
         r#"[TOOL_CALLS]get_weather[ARGS]{"location":"Paris"}"#;
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn deepseek_r1_8b_duck_types_mistral_bracketed_json(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let outcome =
-            fixture
-                .model
-                .parse_chat_message(TOOLS_JSON, MISTRAL_BRACKETED_JSON_PAYLOAD, false)?;
-
-        let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
-            bail!(
-                "duck-type pass must recognise Mistral bracketed-JSON on a model with no registered \
-                 template; got Unrecognized"
-            );
-        };
-        assert_eq!(
-            parsed.tool_calls.len(),
-            1,
-            "expected one tool call; got {:?}",
-            parsed.tool_calls
+    let outcome =
+        fixture
+            .model
+            .parse_chat_message(TOOLS_JSON, MISTRAL_BRACKETED_JSON_PAYLOAD, false)?;
+
+    let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
+        bail!(
+            "duck-type pass must recognise Mistral bracketed-JSON on a model with no registered \
+             template; got Unrecognized"
         );
-        assert_eq!(parsed.tool_calls[0].name, "get_weather");
-        let location = match &parsed.tool_calls[0].arguments {
-            ToolCallArguments::ValidJson(value) => value
-                .get("location")
-                .and_then(|v| v.as_str())
-                .map(str::to_owned),
-            ToolCallArguments::InvalidJson(raw) => {
-                bail!("expected ValidJson, got InvalidJson: {raw}");
-            }
-        };
-        assert_eq!(location.as_deref(), Some("Paris"));
+    };
+    assert_eq!(
+        parsed.tool_calls.len(),
+        1,
+        "expected one tool call; got {:?}",
+        parsed.tool_calls
+    );
+    assert_eq!(parsed.tool_calls[0].name, "get_weather");
+    let location = match &parsed.tool_calls[0].arguments {
+        ToolCallArguments::ValidJson(value) => value
+            .get("location")
+            .and_then(|v| v.as_str())
+            .map(str::to_owned),
+        ToolCallArguments::InvalidJson(raw) => {
+            bail!("expected ValidJson, got InvalidJson: {raw}");
+        }
+    };
+    assert_eq!(location.as_deref(), Some("Paris"));
 
-        Ok(())
-    }
+    Ok(())
 }
 
-mod deepseek_r1_8b_duck_types_qwen_xml {
-    use anyhow::Result;
-    use anyhow::bail;
-    use llama_cpp_bindings::ChatMessageParseOutcome;
-    use llama_cpp_bindings::ToolCallArguments;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn deepseek_r1_8b_duck_types_qwen_xml(fixture: &LlamaFixture<'_>) -> Result<()> {
     const TOOLS_JSON: &str = r#"[
-        {
-            "type": "function",
-            "function": {
-                "name": "get_weather",
-                "description": "Get the current weather for a location",
-                "parameters": {
-                    "type": "object",
-                    "properties": {
-                        "location": {"type": "string", "description": "The city name"}
-                    },
-                    "required": ["location"]
-                }
+    {
+        "type": "function",
+        "function": {
+            "name": "get_weather",
+            "description": "Get the current weather for a location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {"type": "string", "description": "The city name"}
+                },
+                "required": ["location"]
             }
         }
-    ]"#;
+    }
+]"#;
 
     const QWEN_XML_PAYLOAD: &str = "<tool_call>\n\
-    <function=get_weather>\n\
-    <parameter=location>\n\
-    Paris\n\
-    </parameter>\n\
-    </function>\n\
-    </tool_call>";
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn deepseek_r1_8b_duck_types_qwen_xml(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let outcome = fixture
-            .model
-            .parse_chat_message(TOOLS_JSON, QWEN_XML_PAYLOAD, false)?;
-
-        let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
-            bail!(
-                "duck-type pass must recognise Qwen XML on a model with no registered template; \
-                 got Unrecognized"
-            );
-        };
-        assert_eq!(
-            parsed.tool_calls.len(),
-            1,
-            "expected one tool call; got {:?}",
-            parsed.tool_calls
+<function=get_weather>\n\
+<parameter=location>\n\
+Paris\n\
+</parameter>\n\
+</function>\n\
+</tool_call>";
+
+    let outcome = fixture
+        .model
+        .parse_chat_message(TOOLS_JSON, QWEN_XML_PAYLOAD, false)?;
+
+    let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
+        bail!(
+            "duck-type pass must recognise Qwen XML on a model with no registered template; \
+             got Unrecognized"
         );
-        assert_eq!(parsed.tool_calls[0].name, "get_weather");
-        let location = match &parsed.tool_calls[0].arguments {
-            ToolCallArguments::ValidJson(value) => value
-                .get("location")
-                .and_then(|v| v.as_str())
-                .map(str::to_owned),
-            ToolCallArguments::InvalidJson(raw) => {
-                bail!("expected ValidJson, got InvalidJson: {raw}");
-            }
-        };
-        assert_eq!(location.as_deref(), Some("Paris"));
+    };
+    assert_eq!(
+        parsed.tool_calls.len(),
+        1,
+        "expected one tool call; got {:?}",
+        parsed.tool_calls
+    );
+    assert_eq!(parsed.tool_calls[0].name, "get_weather");
+    let location = match &parsed.tool_calls[0].arguments {
+        ToolCallArguments::ValidJson(value) => value
+            .get("location")
+            .and_then(|v| v.as_str())
+            .map(str::to_owned),
+        ToolCallArguments::InvalidJson(raw) => {
+            bail!("expected ValidJson, got InvalidJson: {raw}");
+        }
+    };
+    assert_eq!(location.as_deref(), Some("Paris"));
 
-        Ok(())
-    }
+    Ok(())
 }
 
-mod deepseek_r1_8b_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested {
-    use anyhow::Result;
-    use anyhow::bail;
-    use llama_cpp_bindings::ChatMessageParseOutcome;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn deepseek_r1_8b_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
     const TOOLS_JSON: &str = r#"[
-        {
-            "type": "function",
-            "function": {
-                "name": "get_weather",
-                "description": "Get the current weather for a location",
-                "parameters": {
-                    "type": "object",
-                    "properties": {
-                        "location": {"type": "string", "description": "The city name"}
-                    },
-                    "required": ["location"]
-                }
+    {
+        "type": "function",
+        "function": {
+            "name": "get_weather",
+            "description": "Get the current weather for a location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {"type": "string", "description": "The city name"}
+                },
+                "required": ["location"]
             }
         }
-    ]"#;
+    }
+]"#;
 
     const PLAIN_CONTENT: &str = "Sorry, I cannot help with that.";
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn deepseek_r1_8b_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let outcome = fixture
-            .model
-            .parse_chat_message(TOOLS_JSON, PLAIN_CONTENT, false)?;
-
-        let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
-            bail!(
-                "plain content with tools requested must produce Recognized (with empty tool_calls); \
-                 got Unrecognized"
-            );
-        };
-        assert!(
-            parsed.tool_calls.is_empty(),
-            "expected no tool calls; got {:?}",
-            parsed.tool_calls
-        );
+    let outcome = fixture
+        .model
+        .parse_chat_message(TOOLS_JSON, PLAIN_CONTENT, false)?;
 
-        Ok(())
-    }
+    let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
+        bail!(
+            "plain content with tools requested must produce Recognized (with empty tool_calls); \
+             got Unrecognized"
+        );
+    };
+    assert!(
+        parsed.tool_calls.is_empty(),
+        "expected no tool calls; got {:?}",
+        parsed.tool_calls
+    );
+
+    Ok(())
 }
 
-mod deepseek_r1_8b_recognizes_empty_tool_calls_when_tools_not_requested {
-    use anyhow::Result;
-    use anyhow::bail;
-    use llama_cpp_bindings::ChatMessageParseOutcome;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn deepseek_r1_8b_recognizes_empty_tool_calls_when_tools_not_requested(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
     const PLAIN_CONTENT: &str = "Hello there.";
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn deepseek_r1_8b_recognizes_empty_tool_calls_when_tools_not_requested(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let outcome = fixture
-            .model
-            .parse_chat_message("[]", PLAIN_CONTENT, false)?;
+    let outcome = fixture
+        .model
+        .parse_chat_message("[]", PLAIN_CONTENT, false)?;
 
-        let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
-            bail!("plain content with empty tools array must produce Recognized; got Unrecognized");
-        };
-        assert!(
-            parsed.tool_calls.is_empty(),
-            "expected no tool calls; got {:?}",
-            parsed.tool_calls
-        );
+    let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
+        bail!("plain content with empty tools array must produce Recognized; got Unrecognized");
+    };
+    assert!(
+        parsed.tool_calls.is_empty(),
+        "expected no tool calls; got {:?}",
+        parsed.tool_calls
+    );
 
-        Ok(())
-    }
+    Ok(())
 }
 
-mod gemma4_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt {
-    use anyhow::Result;
-    use llama_cpp_bindings::context::LlamaContext;
-    use llama_cpp_bindings::llama_batch::LlamaBatch;
-    use llama_cpp_bindings::model::AddBos;
-    use llama_cpp_bindings::sampling::LlamaSampler;
-    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
+#[llama_test(
+    model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 8192,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn gemma4_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
     const MAX_GENERATED_TOKENS: i32 = 200;
 
     const GEMMA4_THINKING_DISABLED_PROMPT: &str = "\
-    <bos><start_of_turn>user\nReply with the single word: four. Do not explain.<end_of_turn>\n\
-    <start_of_turn>model\n<|channel>thought\n<channel|>\n";
+<bos><start_of_turn>user\nReply with the single word: four. Do not explain.<end_of_turn>\n\
+<start_of_turn>model\n<|channel>thought\n<channel|>\n";
 
     const FORBIDDEN_MARKERS: &[&str] = &["<|channel>thought", "<channel|>"];
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 8192,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn gemma4_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let model = fixture.model;
-        let backend = fixture.backend;
-
-        let mut classifier = model.sampled_token_classifier();
-        let prompt_tokens = model.str_to_token(GEMMA4_THINKING_DISABLED_PROMPT, AddBos::Never)?;
-        let prompt_token_count = u64::try_from(prompt_tokens.len())?;
-
-        let mut batch = LlamaBatch::new(2048, 1)?;
-        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
-
-        let mut context = LlamaContext::from_model(
-            model,
-            backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-
-        context.decode(&mut batch)?;
-
-        let promoted = classifier.commit_prompt_tokens();
-        assert_eq!(promoted, prompt_token_count);
-
-        let mut sampler = LlamaSampler::greedy();
-        let initial_position = batch.n_tokens();
-        let outcome = ClassifySampleLoop {
-            model,
-            classifier: &mut classifier,
-            sampler: &mut sampler,
-            context: &mut context,
-            batch: &mut batch,
-            initial_position,
-            max_generated_tokens: MAX_GENERATED_TOKENS,
-        }
-        .run()?;
-
-        let usage = classifier.usage();
-
-        assert!(
-            !outcome.generated_raw.is_empty(),
-            "Gemma 4 must generate at least one token"
-        );
-        assert_eq!(
-            outcome.observed_reasoning, 0,
-            "Gemma 4 thinking-disabled: classifier must not emit any Reasoning token \
-             when the prompt closes the thought channel before generation begins; \
-             generated={:?}",
-            outcome.generated_raw
-        );
-        assert_eq!(
-            outcome.observed_undeterminable, 0,
-            "Gemma 4 thinking-disabled: prompt-token replay must move section to Content \
-             before generation, so no Undeterminable tokens may be emitted; \
-             generated={:?}",
-            outcome.generated_raw
-        );
-        assert_eq!(
-            usage.reasoning_tokens, 0,
-            "Gemma 4 thinking-disabled: usage.reasoning_tokens must be zero; usage={usage:?}"
-        );
-        assert_eq!(
-            usage.undeterminable_tokens, 0,
-            "Gemma 4 thinking-disabled: usage.undeterminable_tokens must be zero; usage={usage:?}"
-        );
+    let model = fixture.model;
+    let backend = fixture.backend;
+
+    let mut classifier = model.sampled_token_classifier();
+    let prompt_tokens = model.str_to_token(GEMMA4_THINKING_DISABLED_PROMPT, AddBos::Never)?;
+    let prompt_token_count = u64::try_from(prompt_tokens.len())?;
+
+    let mut batch = LlamaBatch::new(2048, 1)?;
+    classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
+
+    let mut context = LlamaContext::from_model(
+        model,
+        backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+
+    context.decode(&mut batch)?;
+
+    let promoted = classifier.commit_prompt_tokens();
+    assert_eq!(promoted, prompt_token_count);
+
+    let mut sampler = LlamaSampler::greedy();
+    let initial_position = batch.n_tokens();
+    let outcome = ClassifySampleLoop {
+        model,
+        classifier: &mut classifier,
+        sampler: &mut sampler,
+        context: &mut context,
+        batch: &mut batch,
+        initial_position,
+        max_generated_tokens: MAX_GENERATED_TOKENS,
+    }
+    .run()?;
+
+    let usage = classifier.usage();
+
+    assert!(
+        !outcome.generated_raw.is_empty(),
+        "Gemma 4 must generate at least one token"
+    );
+    assert_eq!(
+        outcome.observed_reasoning, 0,
+        "Gemma 4 thinking-disabled: classifier must not emit any Reasoning token \
+         when the prompt closes the thought channel before generation begins; \
+         generated={:?}",
+        outcome.generated_raw
+    );
+    assert_eq!(
+        outcome.observed_undeterminable, 0,
+        "Gemma 4 thinking-disabled: prompt-token replay must move section to Content \
+         before generation, so no Undeterminable tokens may be emitted; \
+         generated={:?}",
+        outcome.generated_raw
+    );
+    assert_eq!(
+        usage.reasoning_tokens, 0,
+        "Gemma 4 thinking-disabled: usage.reasoning_tokens must be zero; usage={usage:?}"
+    );
+    assert_eq!(
+        usage.undeterminable_tokens, 0,
+        "Gemma 4 thinking-disabled: usage.undeterminable_tokens must be zero; usage={usage:?}"
+    );
+    assert!(
+        outcome.observed_content > 0,
+        "Gemma 4 thinking-disabled: classifier must emit at least one Content token"
+    );
+    assert_eq!(
+        usage.completion_tokens(),
+        outcome.observed_content,
+        "Gemma 4 thinking-disabled: completion tokens must equal observed Content tokens"
+    );
+
+    for forbidden in FORBIDDEN_MARKERS {
         assert!(
-            outcome.observed_content > 0,
-            "Gemma 4 thinking-disabled: classifier must emit at least one Content token"
+            !outcome.content_stream.contains(forbidden),
+            "Gemma 4 thinking-disabled: content_stream leaked marker {forbidden:?}; \
+             content_stream={:?}",
+            outcome.content_stream
         );
-        assert_eq!(
-            usage.completion_tokens(),
-            outcome.observed_content,
-            "Gemma 4 thinking-disabled: completion tokens must equal observed Content tokens"
-        );
-
-        for forbidden in FORBIDDEN_MARKERS {
-            assert!(
-                !outcome.content_stream.contains(forbidden),
-                "Gemma 4 thinking-disabled: content_stream leaked marker {forbidden:?}; \
-                 content_stream={:?}",
-                outcome.content_stream
-            );
-        }
-
-        Ok(())
     }
-}
 
-mod gemma4_classifier_emits_reasoning {
-    use anyhow::Result;
-    use anyhow::bail;
-    use llama_cpp_bindings::ChatMessageParseOutcome;
-    use llama_cpp_bindings::context::LlamaContext;
-    use llama_cpp_bindings::llama_batch::LlamaBatch;
-    use llama_cpp_bindings::model::AddBos;
-    use llama_cpp_bindings::sampling::LlamaSampler;
-    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
+    Ok(())
+}
 
+#[llama_test(
+    model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 8192,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn gemma4_classifier_emits_reasoning_for_thinking_prompt(fixture: &LlamaFixture<'_>) -> Result<()> {
     const MAX_GENERATED_TOKENS: i32 = 1500;
 
     const GEMMA4_THINKING_PROMPT: &str = "\
-    <bos><start_of_turn>user\nReply with the single word: four. Do not explain.<end_of_turn>\n\
-    <start_of_turn>model\n<|channel>thought\n";
+<bos><start_of_turn>user\nReply with the single word: four. Do not explain.<end_of_turn>\n\
+<start_of_turn>model\n<|channel>thought\n";
 
     const FORBIDDEN_MARKERS: &[&str] = &["<|channel>thought", "<channel|>"];
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 8192,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn gemma4_classifier_emits_reasoning_for_thinking_prompt(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let model = fixture.model;
-        let backend = fixture.backend;
-
-        let mut classifier = model.sampled_token_classifier();
-        let prompt_tokens = model.str_to_token(GEMMA4_THINKING_PROMPT, AddBos::Never)?;
-        let prompt_token_count = u64::try_from(prompt_tokens.len())?;
-
-        let mut batch = LlamaBatch::new(2048, 1)?;
-        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
-
-        let mut context = LlamaContext::from_model(
-            model,
-            backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-
-        context.decode(&mut batch)?;
-
-        let promoted = classifier.commit_prompt_tokens();
-        assert_eq!(promoted, prompt_token_count);
-
-        let mut sampler = LlamaSampler::greedy();
-        let initial_position = batch.n_tokens();
-        let outcome = ClassifySampleLoop {
-            model,
-            classifier: &mut classifier,
-            sampler: &mut sampler,
-            context: &mut context,
-            batch: &mut batch,
-            initial_position,
-            max_generated_tokens: MAX_GENERATED_TOKENS,
-        }
-        .run()?;
-
-        let usage = classifier.usage();
-        let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?;
-        let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
-            bail!("Gemma 4 chat template must be recognised by the parser; got Unrecognized");
-        };
-
-        assert!(
-            !outcome.generated_raw.is_empty(),
-            "Gemma 4 must generate at least one token"
-        );
+    let model = fixture.model;
+    let backend = fixture.backend;
+
+    let mut classifier = model.sampled_token_classifier();
+    let prompt_tokens = model.str_to_token(GEMMA4_THINKING_PROMPT, AddBos::Never)?;
+    let prompt_token_count = u64::try_from(prompt_tokens.len())?;
+
+    let mut batch = LlamaBatch::new(2048, 1)?;
+    classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
+
+    let mut context = LlamaContext::from_model(
+        model,
+        backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+
+    context.decode(&mut batch)?;
+
+    let promoted = classifier.commit_prompt_tokens();
+    assert_eq!(promoted, prompt_token_count);
+
+    let mut sampler = LlamaSampler::greedy();
+    let initial_position = batch.n_tokens();
+    let outcome = ClassifySampleLoop {
+        model,
+        classifier: &mut classifier,
+        sampler: &mut sampler,
+        context: &mut context,
+        batch: &mut batch,
+        initial_position,
+        max_generated_tokens: MAX_GENERATED_TOKENS,
+    }
+    .run()?;
+
+    let usage = classifier.usage();
+    let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?;
+    let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
+        bail!("Gemma 4 chat template must be recognised by the parser; got Unrecognized");
+    };
+
+    assert!(
+        !outcome.generated_raw.is_empty(),
+        "Gemma 4 must generate at least one token"
+    );
+    assert!(
+        outcome.observed_reasoning > 0,
+        "Gemma 4 classifier must emit at least one Reasoning token when the model \
+         emits a `<|channel>thought` block; outcome={outcome:?}",
+    );
+    assert!(
+        usage.reasoning_tokens > 0,
+        "Gemma 4 usage.reasoning_tokens must be non-zero when the model emits a \
+         reasoning block; usage was {usage:?}"
+    );
+    assert_eq!(
+        outcome.observed_undeterminable, 0,
+        "Gemma 4: classifier must not emit Undeterminable when the model emits a \
+         detected `<|channel>thought` marker; outcome={outcome:?}"
+    );
+    assert_eq!(
+        usage.undeterminable_tokens, 0,
+        "Gemma 4: usage.undeterminable_tokens must be zero; usage={usage:?}"
+    );
+    assert_eq!(
+        usage.completion_tokens(),
+        outcome.observed_content + outcome.observed_reasoning,
+        "Gemma 4: completion tokens must equal observed Content + Reasoning"
+    );
+    assert!(
+        !parsed.reasoning_content.is_empty(),
+        "Gemma 4 must close its reasoning block within {MAX_GENERATED_TOKENS} tokens; \
+         increase the budget or pick a more direct prompt. generated={:?}",
+        outcome.generated_raw,
+    );
+
+    for forbidden in FORBIDDEN_MARKERS {
         assert!(
-            outcome.observed_reasoning > 0,
-            "Gemma 4 classifier must emit at least one Reasoning token when the model \
-             emits a `<|channel>thought` block; outcome={outcome:?}",
+            !outcome.reasoning_stream.contains(forbidden),
+            "Gemma 4: reasoning_stream leaked marker {forbidden:?}; \
+             reasoning_stream={:?}",
+            outcome.reasoning_stream
         );
         assert!(
-            usage.reasoning_tokens > 0,
-            "Gemma 4 usage.reasoning_tokens must be non-zero when the model emits a \
-             reasoning block; usage was {usage:?}"
+            !outcome.content_stream.contains(forbidden),
+            "Gemma 4: content_stream leaked marker {forbidden:?}; \
+             content_stream={:?}",
+            outcome.content_stream
         );
-        assert_eq!(
-            outcome.observed_undeterminable, 0,
-            "Gemma 4: classifier must not emit Undeterminable when the model emits a \
-             detected `<|channel>thought` marker; outcome={outcome:?}"
-        );
-        assert_eq!(
-            usage.undeterminable_tokens, 0,
-            "Gemma 4: usage.undeterminable_tokens must be zero; usage={usage:?}"
-        );
-        assert_eq!(
-            usage.completion_tokens(),
-            outcome.observed_content + outcome.observed_reasoning,
-            "Gemma 4: completion tokens must equal observed Content + Reasoning"
-        );
-        assert!(
-            !parsed.reasoning_content.is_empty(),
-            "Gemma 4 must close its reasoning block within {MAX_GENERATED_TOKENS} tokens; \
-             increase the budget or pick a more direct prompt. generated={:?}",
-            outcome.generated_raw,
-        );
-
-        for forbidden in FORBIDDEN_MARKERS {
-            assert!(
-                !outcome.reasoning_stream.contains(forbidden),
-                "Gemma 4: reasoning_stream leaked marker {forbidden:?}; \
-                 reasoning_stream={:?}",
-                outcome.reasoning_stream
-            );
-            assert!(
-                !outcome.content_stream.contains(forbidden),
-                "Gemma 4: content_stream leaked marker {forbidden:?}; \
-                 content_stream={:?}",
-                outcome.content_stream
-            );
-        }
-
-        Ok(())
     }
-}
 
-mod gemma4_parses_tool_call_payload {
-    use anyhow::Result;
-    use anyhow::bail;
-    use llama_cpp_bindings::ChatMessageParseOutcome;
-    use llama_cpp_bindings::ToolCallArguments;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
+    Ok(())
+}
 
+#[llama_test(
+    model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn gemma4_parses_tool_call_payload(fixture: &LlamaFixture<'_>) -> Result<()> {
     const TOOLS_JSON: &str = r#"[
-        {
-            "type": "function",
-            "function": {
-                "name": "get_weather",
-                "description": "Get the current weather for a location",
-                "parameters": {
-                    "type": "object",
-                    "properties": {
-                        "location": {"type": "string", "description": "The city name"}
-                    },
-                    "required": ["location"]
-                }
+    {
+        "type": "function",
+        "function": {
+            "name": "get_weather",
+            "description": "Get the current weather for a location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {"type": "string", "description": "The city name"}
+                },
+                "required": ["location"]
             }
         }
-    ]"#;
+    }
+]"#;
 
     const GEMMA4_PAIRED_QUOTE_PAYLOAD: &str =
         "<|tool_call>call:get_weather{location:<|\"|>Paris<|\"|>}";
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn gemma4_parses_tool_call_payload(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let outcome =
-            fixture
-                .model
-                .parse_chat_message(TOOLS_JSON, GEMMA4_PAIRED_QUOTE_PAYLOAD, false)?;
-
-        let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
-            bail!(
-                "expected Recognized for Gemma 4 PairedQuote on a Gemma-4 model; got Unrecognized"
-            );
-        };
-        assert_eq!(
-            parsed.tool_calls.len(),
-            1,
-            "expected one tool call; got {:?}",
-            parsed.tool_calls
-        );
-        assert_eq!(parsed.tool_calls[0].name, "get_weather");
-        let location = match &parsed.tool_calls[0].arguments {
-            ToolCallArguments::ValidJson(value) => value
-                .get("location")
-                .and_then(|v| v.as_str())
-                .map(str::to_owned),
-            ToolCallArguments::InvalidJson(raw) => {
-                bail!("expected ValidJson, got InvalidJson: {raw}");
-            }
-        };
-        assert_eq!(location.as_deref(), Some("Paris"));
+    let outcome =
+        fixture
+            .model
+            .parse_chat_message(TOOLS_JSON, GEMMA4_PAIRED_QUOTE_PAYLOAD, false)?;
+
+    let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
+        bail!("expected Recognized for Gemma 4 PairedQuote on a Gemma-4 model; got Unrecognized");
+    };
+    assert_eq!(
+        parsed.tool_calls.len(),
+        1,
+        "expected one tool call; got {:?}",
+        parsed.tool_calls
+    );
+    assert_eq!(parsed.tool_calls[0].name, "get_weather");
+    let location = match &parsed.tool_calls[0].arguments {
+        ToolCallArguments::ValidJson(value) => value
+            .get("location")
+            .and_then(|v| v.as_str())
+            .map(str::to_owned),
+        ToolCallArguments::InvalidJson(raw) => {
+            bail!("expected ValidJson, got InvalidJson: {raw}");
+        }
+    };
+    assert_eq!(location.as_deref(), Some("Paris"));
 
-        Ok(())
-    }
+    Ok(())
 }
 
-mod gemma4_template_override_returns_full_markers {
-    #![expect(
-        clippy::unnecessary_wraps,
-        reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-    )]
-
-    use anyhow::Result;
-    use llama_cpp_bindings::ToolCallArgsShape;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn gemma4_template_override_returns_full_markers(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let model = fixture.model;
-        let template = model
-            .chat_template(None)
-            .expect("Gemma 4 chat template must be present");
-        let template_str = template.to_str().expect("template must be valid UTF-8");
-        assert!(
-            template_str.contains("<|tool_call>call:"),
-            "Gemma 4 chat template must contain '<|tool_call>call:' fingerprint; \
-             template starts with: {:?}",
-            &template_str[..template_str.len().min(200)],
-        );
-
-        let markers = model
-            .tool_call_markers()
-            .expect("Gemma 4 must produce ToolCallMarkers via override registry");
-
-        assert_eq!(markers.open, "<|tool_call>call:");
-        assert_eq!(markers.close, "}");
-        let ToolCallArgsShape::PairedQuote(shape) = markers.args_shape else {
-            panic!("expected PairedQuote variant, got {:?}", markers.args_shape);
-        };
-        assert_eq!(shape.name_args_separator, "{");
-        assert_eq!(shape.value_quote.open, "<|\"|>");
-        assert_eq!(shape.value_quote.close, "<|\"|>");
-
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn gemma4_template_override_returns_full_markers(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let model = fixture.model;
+    let template = model
+        .chat_template(None)
+        .expect("Gemma 4 chat template must be present");
+    let template_str = template.to_str().expect("template must be valid UTF-8");
+    assert!(
+        template_str.contains("<|tool_call>call:"),
+        "Gemma 4 chat template must contain '<|tool_call>call:' fingerprint; \
+         template starts with: {:?}",
+        &template_str[..template_str.len().min(200)],
+    );
+
+    let markers = model
+        .tool_call_markers()
+        .expect("Gemma 4 must produce ToolCallMarkers via override registry");
+
+    assert_eq!(markers.open, "<|tool_call>call:");
+    assert_eq!(markers.close, "}");
+    let ToolCallArgsShape::PairedQuote(shape) = markers.args_shape else {
+        panic!("expected PairedQuote variant, got {:?}", markers.args_shape);
+    };
+    assert_eq!(shape.name_args_separator, "{");
+    assert_eq!(shape.value_quote.open, "<|\"|>");
+    assert_eq!(shape.value_quote.close, "<|\"|>");
+
+    Ok(())
 }
 
-mod glm47_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt {
-    use anyhow::Result;
-    use llama_cpp_bindings::context::LlamaContext;
-    use llama_cpp_bindings::llama_batch::LlamaBatch;
-    use llama_cpp_bindings::model::AddBos;
-    use llama_cpp_bindings::sampling::LlamaSampler;
-    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 8192,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn glm47_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
     const MAX_GENERATED_TOKENS: i32 = 200;
 
     const GLM47_THINKING_DISABLED_PROMPT: &str = "\
-    <|user|>
-    What is 2 + 2?
-    <|assistant|>
-    </think>
+<|user|>
+What is 2 + 2?
+<|assistant|>
+</think>
 
-    ";
+";
 
     const FORBIDDEN_MARKERS: &[&str] = &["<think>", "</think>"];
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 8192,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn glm47_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let model = fixture.model;
-        let backend = fixture.backend;
-
-        let mut classifier = model.sampled_token_classifier();
-        let prompt_tokens = model.str_to_token(GLM47_THINKING_DISABLED_PROMPT, AddBos::Never)?;
-        let prompt_token_count = u64::try_from(prompt_tokens.len())?;
-
-        let mut batch = LlamaBatch::new(2048, 1)?;
-        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
-
-        let mut context = LlamaContext::from_model(
-            model,
-            backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-
-        context.decode(&mut batch)?;
-
-        let promoted = classifier.commit_prompt_tokens();
-        assert_eq!(promoted, prompt_token_count);
-
-        let mut sampler = LlamaSampler::chain_simple([
-            LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
-            LlamaSampler::top_k(40),
-            LlamaSampler::top_p(0.9, 1),
-            LlamaSampler::min_p(0.05, 1),
-            LlamaSampler::temp(0.7),
-            LlamaSampler::dist(0x00C0_FFEE),
-        ]);
-        let initial_position = batch.n_tokens();
-        let outcome = ClassifySampleLoop {
-            model,
-            classifier: &mut classifier,
-            sampler: &mut sampler,
-            context: &mut context,
-            batch: &mut batch,
-            initial_position,
-            max_generated_tokens: MAX_GENERATED_TOKENS,
-        }
-        .run()?;
-
-        let usage = classifier.usage();
+    let model = fixture.model;
+    let backend = fixture.backend;
+
+    let mut classifier = model.sampled_token_classifier();
+    let prompt_tokens = model.str_to_token(GLM47_THINKING_DISABLED_PROMPT, AddBos::Never)?;
+    let prompt_token_count = u64::try_from(prompt_tokens.len())?;
+
+    let mut batch = LlamaBatch::new(2048, 1)?;
+    classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
+
+    let mut context = LlamaContext::from_model(
+        model,
+        backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+
+    context.decode(&mut batch)?;
+
+    let promoted = classifier.commit_prompt_tokens();
+    assert_eq!(promoted, prompt_token_count);
+
+    let mut sampler = LlamaSampler::chain_simple([
+        LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
+        LlamaSampler::top_k(40),
+        LlamaSampler::top_p(0.9, 1),
+        LlamaSampler::min_p(0.05, 1),
+        LlamaSampler::temp(0.7),
+        LlamaSampler::dist(0x00C0_FFEE),
+    ]);
+    let initial_position = batch.n_tokens();
+    let outcome = ClassifySampleLoop {
+        model,
+        classifier: &mut classifier,
+        sampler: &mut sampler,
+        context: &mut context,
+        batch: &mut batch,
+        initial_position,
+        max_generated_tokens: MAX_GENERATED_TOKENS,
+    }
+    .run()?;
 
-        assert!(!outcome.generated_raw.is_empty());
-        assert_eq!(outcome.observed_reasoning, 0);
-        assert_eq!(outcome.observed_undeterminable, 0);
-        assert_eq!(usage.reasoning_tokens, 0);
-        assert_eq!(usage.undeterminable_tokens, 0);
-        assert!(outcome.observed_content > 0);
-        assert_eq!(usage.completion_tokens(), outcome.observed_content);
+    let usage = classifier.usage();
 
-        for forbidden in FORBIDDEN_MARKERS {
-            assert!(!outcome.content_stream.contains(forbidden));
-        }
+    assert!(!outcome.generated_raw.is_empty());
+    assert_eq!(outcome.observed_reasoning, 0);
+    assert_eq!(outcome.observed_undeterminable, 0);
+    assert_eq!(usage.reasoning_tokens, 0);
+    assert_eq!(usage.undeterminable_tokens, 0);
+    assert!(outcome.observed_content > 0);
+    assert_eq!(usage.completion_tokens(), outcome.observed_content);
 
-        Ok(())
+    for forbidden in FORBIDDEN_MARKERS {
+        assert!(!outcome.content_stream.contains(forbidden));
     }
-}
 
-mod glm47_classifier_emits_reasoning {
-    use anyhow::Result;
-    use anyhow::bail;
-    use llama_cpp_bindings::ChatMessageParseOutcome;
-    use llama_cpp_bindings::context::LlamaContext;
-    use llama_cpp_bindings::llama_batch::LlamaBatch;
-    use llama_cpp_bindings::model::AddBos;
-    use llama_cpp_bindings::sampling::LlamaSampler;
-    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
+    Ok(())
+}
 
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 8192,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn glm47_classifier_emits_reasoning_for_thinking_enabled_prompt(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
     const MAX_GENERATED_TOKENS: i32 = 1500;
 
     const GLM47_THINKING_PROMPT: &str = "\
-    <|user|>
-    What is 2 + 2?
-    <|assistant|>
-    <think>
-    ";
+<|user|>
+What is 2 + 2?
+<|assistant|>
+<think>
+";
 
     const FORBIDDEN_MARKERS: &[&str] = &["<think>", "</think>"];
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 8192,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn glm47_classifier_emits_reasoning_for_thinking_enabled_prompt(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let model = fixture.model;
-        let backend = fixture.backend;
-
-        let mut classifier = model.sampled_token_classifier();
-        let prompt_tokens = model.str_to_token(GLM47_THINKING_PROMPT, AddBos::Never)?;
-        let prompt_token_count = u64::try_from(prompt_tokens.len())?;
-
-        let mut batch = LlamaBatch::new(2048, 1)?;
-        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
-
-        let mut context = LlamaContext::from_model(
-            model,
-            backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-
-        context.decode(&mut batch)?;
-
-        let promoted = classifier.commit_prompt_tokens();
-        assert_eq!(promoted, prompt_token_count);
-
-        let mut sampler = LlamaSampler::chain_simple([
-            LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
-            LlamaSampler::top_k(40),
-            LlamaSampler::top_p(0.9, 1),
-            LlamaSampler::min_p(0.05, 1),
-            LlamaSampler::temp(0.7),
-            LlamaSampler::dist(0x00C0_FFEE),
-        ]);
-        let initial_position = batch.n_tokens();
-        let outcome = ClassifySampleLoop {
-            model,
-            classifier: &mut classifier,
-            sampler: &mut sampler,
-            context: &mut context,
-            batch: &mut batch,
-            initial_position,
-            max_generated_tokens: MAX_GENERATED_TOKENS,
-        }
-        .run()?;
-
-        let usage = classifier.usage();
-        let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?;
-        let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
-            bail!("GLM-4.7 chat template must be recognised by the parser; got Unrecognized");
-        };
-
-        assert!(!outcome.generated_raw.is_empty());
-        assert!(outcome.observed_reasoning > 0);
-        assert!(usage.reasoning_tokens > 0);
-        assert_eq!(outcome.observed_undeterminable, 0);
-        assert_eq!(usage.undeterminable_tokens, 0);
-        assert_eq!(
-            usage.completion_tokens(),
-            outcome.observed_content + outcome.observed_reasoning
+    let model = fixture.model;
+    let backend = fixture.backend;
+
+    let mut classifier = model.sampled_token_classifier();
+    let prompt_tokens = model.str_to_token(GLM47_THINKING_PROMPT, AddBos::Never)?;
+    let prompt_token_count = u64::try_from(prompt_tokens.len())?;
+
+    let mut batch = LlamaBatch::new(2048, 1)?;
+    classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
+
+    let mut context = LlamaContext::from_model(
+        model,
+        backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+
+    context.decode(&mut batch)?;
+
+    let promoted = classifier.commit_prompt_tokens();
+    assert_eq!(promoted, prompt_token_count);
+
+    let mut sampler = LlamaSampler::chain_simple([
+        LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
+        LlamaSampler::top_k(40),
+        LlamaSampler::top_p(0.9, 1),
+        LlamaSampler::min_p(0.05, 1),
+        LlamaSampler::temp(0.7),
+        LlamaSampler::dist(0x00C0_FFEE),
+    ]);
+    let initial_position = batch.n_tokens();
+    let outcome = ClassifySampleLoop {
+        model,
+        classifier: &mut classifier,
+        sampler: &mut sampler,
+        context: &mut context,
+        batch: &mut batch,
+        initial_position,
+        max_generated_tokens: MAX_GENERATED_TOKENS,
+    }
+    .run()?;
+
+    let usage = classifier.usage();
+    let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?;
+    let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
+        bail!("GLM-4.7 chat template must be recognised by the parser; got Unrecognized");
+    };
+
+    assert!(!outcome.generated_raw.is_empty());
+    assert!(outcome.observed_reasoning > 0);
+    assert!(usage.reasoning_tokens > 0);
+    assert_eq!(outcome.observed_undeterminable, 0);
+    assert_eq!(usage.undeterminable_tokens, 0);
+    assert_eq!(
+        usage.completion_tokens(),
+        outcome.observed_content + outcome.observed_reasoning
+    );
+
+    if parsed.reasoning_content.is_empty() {
+        eprintln!(
+            "GLM-4.7 didn't close its reasoning block within {MAX_GENERATED_TOKENS} tokens — \
+             skipping strict parser-equality assertions"
         );
+    } else {
+        assert_eq!(outcome.reasoning_stream, parsed.reasoning_content);
+        assert_eq!(outcome.content_stream, parsed.content);
+    }
 
-        if parsed.reasoning_content.is_empty() {
-            eprintln!(
-                "GLM-4.7 didn't close its reasoning block within {MAX_GENERATED_TOKENS} tokens — \
-                 skipping strict parser-equality assertions"
-            );
-        } else {
-            assert_eq!(outcome.reasoning_stream, parsed.reasoning_content);
-            assert_eq!(outcome.content_stream, parsed.content);
-        }
-
-        for forbidden in FORBIDDEN_MARKERS {
-            assert!(!outcome.reasoning_stream.contains(forbidden));
-            assert!(!outcome.content_stream.contains(forbidden));
-        }
-
-        Ok(())
+    for forbidden in FORBIDDEN_MARKERS {
+        assert!(!outcome.reasoning_stream.contains(forbidden));
+        assert!(!outcome.content_stream.contains(forbidden));
     }
-}
 
-mod glm47_parses_tool_call_payload {
-    use anyhow::Result;
-    use anyhow::bail;
-    use llama_cpp_bindings::ChatMessageParseOutcome;
-    use llama_cpp_bindings::ToolCallArguments;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
+    Ok(())
+}
 
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn glm47_parses_tool_call_payload(fixture: &LlamaFixture<'_>) -> Result<()> {
     const TOOLS_JSON: &str = r#"[
-        {
-            "type": "function",
-            "function": {
-                "name": "get_weather",
-                "description": "Get the current weather for a location",
-                "parameters": {
-                    "type": "object",
-                    "properties": {
-                        "location": {"type": "string", "description": "The city name"}
-                    },
-                    "required": ["location"]
-                }
+    {
+        "type": "function",
+        "function": {
+            "name": "get_weather",
+            "description": "Get the current weather for a location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {"type": "string", "description": "The city name"}
+                },
+                "required": ["location"]
             }
         }
-    ]"#;
+    }
+]"#;
 
     const GLM47_KEY_VALUE_PAYLOAD: &str = "<tool_call>get_weather\
-    <arg_key>location</arg_key>\
-    <arg_value>Paris</arg_value>\
-    </tool_call>";
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn glm47_parses_tool_call_payload(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let outcome =
-            fixture
-                .model
-                .parse_chat_message(TOOLS_JSON, GLM47_KEY_VALUE_PAYLOAD, false)?;
-
-        let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
-            bail!(
-                "expected Recognized for GLM-4.7 key-value tags on a GLM-4.7-Flash model; got Unrecognized"
-            );
-        };
-        assert_eq!(parsed.tool_calls.len(), 1);
-        assert_eq!(parsed.tool_calls[0].name, "get_weather");
-        let location = match &parsed.tool_calls[0].arguments {
-            ToolCallArguments::ValidJson(value) => value
-                .get("location")
-                .and_then(|v| v.as_str())
-                .map(str::to_owned),
-            ToolCallArguments::InvalidJson(raw) => {
-                bail!("expected ValidJson, got InvalidJson: {raw}");
-            }
-        };
-        assert_eq!(location.as_deref(), Some("Paris"));
+<arg_key>location</arg_key>\
+<arg_value>Paris</arg_value>\
+</tool_call>";
 
-        Ok(())
-    }
-}
+    let outcome = fixture
+        .model
+        .parse_chat_message(TOOLS_JSON, GLM47_KEY_VALUE_PAYLOAD, false)?;
 
-mod glm47_template_override_returns_full_markers {
-    #![expect(
-        clippy::unnecessary_wraps,
-        reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-    )]
-
-    use anyhow::Result;
-    use llama_cpp_bindings::ToolCallArgsShape;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn glm47_template_override_returns_full_markers(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let model = fixture.model;
-        let template = model
-            .chat_template(None)
-            .expect("GLM-4.7 chat template must be present");
-        let template_str = template.to_str().expect("template must be valid UTF-8");
-        assert!(template_str.contains("<arg_key>"));
-
-        let markers = model
-            .tool_call_markers()
-            .expect("GLM-4.7 must produce ToolCallMarkers via override registry");
-
-        assert_eq!(markers.open, "<tool_call>");
-        assert_eq!(markers.close, "</tool_call>");
-        let ToolCallArgsShape::KeyValueXmlTags(shape) = markers.args_shape else {
-            panic!(
-                "expected KeyValueXmlTags variant, got {:?}",
-                markers.args_shape
-            );
-        };
-        assert_eq!(shape.key_open, "<arg_key>");
-        assert_eq!(shape.key_close, "</arg_key>");
-        assert_eq!(shape.value_open, "<arg_value>");
-        assert_eq!(shape.value_close, "</arg_value>");
-
-        Ok(())
-    }
+    let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
+        bail!(
+            "expected Recognized for GLM-4.7 key-value tags on a GLM-4.7-Flash model; got Unrecognized"
+        );
+    };
+    assert_eq!(parsed.tool_calls.len(), 1);
+    assert_eq!(parsed.tool_calls[0].name, "get_weather");
+    let location = match &parsed.tool_calls[0].arguments {
+        ToolCallArguments::ValidJson(value) => value
+            .get("location")
+            .and_then(|v| v.as_str())
+            .map(str::to_owned),
+        ToolCallArguments::InvalidJson(raw) => {
+            bail!("expected ValidJson, got InvalidJson: {raw}");
+        }
+    };
+    assert_eq!(location.as_deref(), Some("Paris"));
+
+    Ok(())
 }
 
-mod mistral3_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt {
-    use anyhow::Result;
-    use llama_cpp_bindings::context::LlamaContext;
-    use llama_cpp_bindings::llama_batch::LlamaBatch;
-    use llama_cpp_bindings::model::AddBos;
-    use llama_cpp_bindings::sampling::LlamaSampler;
-    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn glm47_template_override_returns_full_markers(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let model = fixture.model;
+    let template = model
+        .chat_template(None)
+        .expect("GLM-4.7 chat template must be present");
+    let template_str = template.to_str().expect("template must be valid UTF-8");
+    assert!(template_str.contains("<arg_key>"));
+
+    let markers = model
+        .tool_call_markers()
+        .expect("GLM-4.7 must produce ToolCallMarkers via override registry");
+
+    assert_eq!(markers.open, "<tool_call>");
+    assert_eq!(markers.close, "</tool_call>");
+    let ToolCallArgsShape::KeyValueXmlTags(shape) = markers.args_shape else {
+        panic!(
+            "expected KeyValueXmlTags variant, got {:?}",
+            markers.args_shape
+        );
+    };
+    assert_eq!(shape.key_open, "<arg_key>");
+    assert_eq!(shape.key_close, "</arg_key>");
+    assert_eq!(shape.value_open, "<arg_value>");
+    assert_eq!(shape.value_close, "</arg_value>");
+
+    Ok(())
+}
 
+#[llama_test(
+    model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 8192,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn mistral3_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
     const MAX_GENERATED_TOKENS: i32 = 200;
 
     const MISTRAL3_THINKING_DISABLED_PROMPT: &str = "\
-    [INST]Reply with the single word: four. Do not explain.[/INST][THINK][/THINK]";
+[INST]Reply with the single word: four. Do not explain.[/INST][THINK][/THINK]";
 
     const FORBIDDEN_MARKERS: &[&str] = &["[THINK]", "[/THINK]"];
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 8192,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn mistral3_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let model = fixture.model;
-        let backend = fixture.backend;
-
-        let mut classifier = model.sampled_token_classifier();
-        let prompt_tokens =
-            model.str_to_token(MISTRAL3_THINKING_DISABLED_PROMPT, AddBos::Always)?;
-        let prompt_token_count = u64::try_from(prompt_tokens.len())?;
-
-        let mut batch = LlamaBatch::new(2048, 1)?;
-        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
-
-        let mut context = LlamaContext::from_model(
-            model,
-            backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-
-        context.decode(&mut batch)?;
-
-        let promoted = classifier.commit_prompt_tokens();
-        assert_eq!(promoted, prompt_token_count);
-
-        let mut sampler = LlamaSampler::greedy();
-        let initial_position = batch.n_tokens();
-        let outcome = ClassifySampleLoop {
-            model,
-            classifier: &mut classifier,
-            sampler: &mut sampler,
-            context: &mut context,
-            batch: &mut batch,
-            initial_position,
-            max_generated_tokens: MAX_GENERATED_TOKENS,
-        }
-        .run()?;
+    let model = fixture.model;
+    let backend = fixture.backend;
+
+    let mut classifier = model.sampled_token_classifier();
+    let prompt_tokens = model.str_to_token(MISTRAL3_THINKING_DISABLED_PROMPT, AddBos::Always)?;
+    let prompt_token_count = u64::try_from(prompt_tokens.len())?;
+
+    let mut batch = LlamaBatch::new(2048, 1)?;
+    classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
+
+    let mut context = LlamaContext::from_model(
+        model,
+        backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+
+    context.decode(&mut batch)?;
+
+    let promoted = classifier.commit_prompt_tokens();
+    assert_eq!(promoted, prompt_token_count);
+
+    let mut sampler = LlamaSampler::greedy();
+    let initial_position = batch.n_tokens();
+    let outcome = ClassifySampleLoop {
+        model,
+        classifier: &mut classifier,
+        sampler: &mut sampler,
+        context: &mut context,
+        batch: &mut batch,
+        initial_position,
+        max_generated_tokens: MAX_GENERATED_TOKENS,
+    }
+    .run()?;
 
-        let usage = classifier.usage();
+    let usage = classifier.usage();
 
-        assert!(!outcome.generated_raw.is_empty());
-        assert_eq!(outcome.observed_reasoning, 0);
-        assert_eq!(outcome.observed_undeterminable, 0);
-        assert_eq!(usage.reasoning_tokens, 0);
-        assert_eq!(usage.undeterminable_tokens, 0);
-        assert!(outcome.observed_content > 0);
-        assert_eq!(usage.completion_tokens(), outcome.observed_content);
+    assert!(!outcome.generated_raw.is_empty());
+    assert_eq!(outcome.observed_reasoning, 0);
+    assert_eq!(outcome.observed_undeterminable, 0);
+    assert_eq!(usage.reasoning_tokens, 0);
+    assert_eq!(usage.undeterminable_tokens, 0);
+    assert!(outcome.observed_content > 0);
+    assert_eq!(usage.completion_tokens(), outcome.observed_content);
 
-        for forbidden in FORBIDDEN_MARKERS {
-            assert!(!outcome.content_stream.contains(forbidden));
-        }
-
-        Ok(())
+    for forbidden in FORBIDDEN_MARKERS {
+        assert!(!outcome.content_stream.contains(forbidden));
     }
-}
 
-mod mistral3_classifier_emits_reasoning {
-    use anyhow::Result;
-    use anyhow::bail;
-    use llama_cpp_bindings::ChatMessageParseOutcome;
-    use llama_cpp_bindings::context::LlamaContext;
-    use llama_cpp_bindings::llama_batch::LlamaBatch;
-    use llama_cpp_bindings::model::AddBos;
-    use llama_cpp_bindings::sampling::LlamaSampler;
-    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
+    Ok(())
+}
 
+#[llama_test(
+    model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 8192,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn mistral3_classifier_emits_reasoning_for_thinking_prompt(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
     const MAX_GENERATED_TOKENS: i32 = 768;
 
     const MISTRAL3_THINKING_PROMPT: &str = "\
-    [SYSTEM_PROMPT]# HOW YOU SHOULD THINK AND ANSWER\n\n\
-    First draft your thinking process (inner monologue) until you arrive at a response. \
-    Format your response using Markdown, and use LaTeX for any mathematical equations. \
-    Write both your thoughts and the response in the same language as the input.\n\n\
-    Your thinking process must follow the template below:\
-    [THINK]Your thoughts or/and draft, like working through an exercise on scratch paper. \
-    Be as casual and as long as you want until you are confident to generate the response \
-    to the user.[/THINK]Here, provide a self-contained response.[/SYSTEM_PROMPT]\
-    [INST]Reply with the single word: four. Do not explain.[/INST]";
+[SYSTEM_PROMPT]# HOW YOU SHOULD THINK AND ANSWER\n\n\
+First draft your thinking process (inner monologue) until you arrive at a response. \
+Format your response using Markdown, and use LaTeX for any mathematical equations. \
+Write both your thoughts and the response in the same language as the input.\n\n\
+Your thinking process must follow the template below:\
+[THINK]Your thoughts or/and draft, like working through an exercise on scratch paper. \
+Be as casual and as long as you want until you are confident to generate the response \
+to the user.[/THINK]Here, provide a self-contained response.[/SYSTEM_PROMPT]\
+[INST]Reply with the single word: four. Do not explain.[/INST]";
 
     const FORBIDDEN_MARKERS: &[&str] = &["[THINK]", "[/THINK]"];
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 8192,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn mistral3_classifier_emits_reasoning_for_thinking_prompt(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let model = fixture.model;
-        let backend = fixture.backend;
-
-        let mut classifier = model.sampled_token_classifier();
-        let prompt_tokens = model.str_to_token(MISTRAL3_THINKING_PROMPT, AddBos::Always)?;
-        let prompt_token_count = u64::try_from(prompt_tokens.len())?;
-
-        let mut batch = LlamaBatch::new(2048, 1)?;
-        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
-
-        let mut context = LlamaContext::from_model(
-            model,
-            backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-
-        context.decode(&mut batch)?;
-
-        let promoted = classifier.commit_prompt_tokens();
-        assert_eq!(promoted, prompt_token_count);
-
-        let mut sampler = LlamaSampler::greedy();
-        let initial_position = batch.n_tokens();
-        let outcome = ClassifySampleLoop {
-            model,
-            classifier: &mut classifier,
-            sampler: &mut sampler,
-            context: &mut context,
-            batch: &mut batch,
-            initial_position,
-            max_generated_tokens: MAX_GENERATED_TOKENS,
-        }
-        .run()?;
-
-        let usage = classifier.usage();
-        let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?;
-        let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
-            bail!("Mistral 3 chat template must be recognised by the parser; got Unrecognized");
-        };
-
-        assert!(!outcome.generated_raw.is_empty());
-        assert!(outcome.observed_reasoning > 0);
-        assert!(usage.reasoning_tokens > 0);
-        assert_eq!(outcome.observed_undeterminable, 0);
-        assert_eq!(usage.undeterminable_tokens, 0);
-        assert_eq!(
-            usage.completion_tokens(),
-            outcome.observed_content + outcome.observed_reasoning,
-        );
-        assert!(!parsed.reasoning_content.is_empty());
-        assert_eq!(outcome.reasoning_stream, parsed.reasoning_content);
-        assert_eq!(outcome.content_stream, parsed.content);
-
-        for forbidden in FORBIDDEN_MARKERS {
-            assert!(!outcome.reasoning_stream.contains(forbidden));
-            assert!(!outcome.content_stream.contains(forbidden));
-        }
-
-        Ok(())
+    let model = fixture.model;
+    let backend = fixture.backend;
+
+    let mut classifier = model.sampled_token_classifier();
+    let prompt_tokens = model.str_to_token(MISTRAL3_THINKING_PROMPT, AddBos::Always)?;
+    let prompt_token_count = u64::try_from(prompt_tokens.len())?;
+
+    let mut batch = LlamaBatch::new(2048, 1)?;
+    classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
+
+    let mut context = LlamaContext::from_model(
+        model,
+        backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+
+    context.decode(&mut batch)?;
+
+    let promoted = classifier.commit_prompt_tokens();
+    assert_eq!(promoted, prompt_token_count);
+
+    let mut sampler = LlamaSampler::greedy();
+    let initial_position = batch.n_tokens();
+    let outcome = ClassifySampleLoop {
+        model,
+        classifier: &mut classifier,
+        sampler: &mut sampler,
+        context: &mut context,
+        batch: &mut batch,
+        initial_position,
+        max_generated_tokens: MAX_GENERATED_TOKENS,
+    }
+    .run()?;
+
+    let usage = classifier.usage();
+    let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?;
+    let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
+        bail!("Mistral 3 chat template must be recognised by the parser; got Unrecognized");
+    };
+
+    assert!(!outcome.generated_raw.is_empty());
+    assert!(outcome.observed_reasoning > 0);
+    assert!(usage.reasoning_tokens > 0);
+    assert_eq!(outcome.observed_undeterminable, 0);
+    assert_eq!(usage.undeterminable_tokens, 0);
+    assert_eq!(
+        usage.completion_tokens(),
+        outcome.observed_content + outcome.observed_reasoning,
+    );
+    assert!(!parsed.reasoning_content.is_empty());
+    assert_eq!(outcome.reasoning_stream, parsed.reasoning_content);
+    assert_eq!(outcome.content_stream, parsed.content);
+
+    for forbidden in FORBIDDEN_MARKERS {
+        assert!(!outcome.reasoning_stream.contains(forbidden));
+        assert!(!outcome.content_stream.contains(forbidden));
     }
-}
 
-mod mistral3_parses_tool_call_payload {
-    use anyhow::Result;
-    use anyhow::bail;
-    use llama_cpp_bindings::ChatMessageParseOutcome;
-    use llama_cpp_bindings::ToolCallArguments;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
+    Ok(())
+}
 
+#[llama_test(
+    model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn mistral3_parses_tool_call_payload(fixture: &LlamaFixture<'_>) -> Result<()> {
     const TOOLS_JSON: &str = r#"[
-        {
-            "type": "function",
-            "function": {
-                "name": "get_weather",
-                "description": "Get the current weather for a location",
-                "parameters": {
-                    "type": "object",
-                    "properties": {
-                        "location": {"type": "string", "description": "The city name"}
-                    },
-                    "required": ["location"]
-                }
+    {
+        "type": "function",
+        "function": {
+            "name": "get_weather",
+            "description": "Get the current weather for a location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {"type": "string", "description": "The city name"}
+                },
+                "required": ["location"]
             }
         }
-    ]"#;
+    }
+]"#;
 
     const MISTRAL3_BRACKETED_JSON_PAYLOAD: &str =
         r#"[TOOL_CALLS]get_weather[ARGS]{"location":"Paris"}"#;
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn mistral3_parses_tool_call_payload(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let outcome =
-            fixture
-                .model
-                .parse_chat_message(TOOLS_JSON, MISTRAL3_BRACKETED_JSON_PAYLOAD, false)?;
-
-        let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
-            bail!(
-                "expected Recognized for Mistral 3 BracketedJson on a Mistral-3 model; got Unrecognized"
-            );
-        };
-        assert_eq!(parsed.tool_calls.len(), 1);
-        assert_eq!(parsed.tool_calls[0].name, "get_weather");
-        let location = match &parsed.tool_calls[0].arguments {
-            ToolCallArguments::ValidJson(value) => value
-                .get("location")
-                .and_then(|v| v.as_str())
-                .map(str::to_owned),
-            ToolCallArguments::InvalidJson(raw) => {
-                bail!("expected ValidJson, got InvalidJson: {raw}");
-            }
-        };
-        assert_eq!(location.as_deref(), Some("Paris"));
+    let outcome =
+        fixture
+            .model
+            .parse_chat_message(TOOLS_JSON, MISTRAL3_BRACKETED_JSON_PAYLOAD, false)?;
 
-        Ok(())
-    }
+    let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
+        bail!(
+            "expected Recognized for Mistral 3 BracketedJson on a Mistral-3 model; got Unrecognized"
+        );
+    };
+    assert_eq!(parsed.tool_calls.len(), 1);
+    assert_eq!(parsed.tool_calls[0].name, "get_weather");
+    let location = match &parsed.tool_calls[0].arguments {
+        ToolCallArguments::ValidJson(value) => value
+            .get("location")
+            .and_then(|v| v.as_str())
+            .map(str::to_owned),
+        ToolCallArguments::InvalidJson(raw) => {
+            bail!("expected ValidJson, got InvalidJson: {raw}");
+        }
+    };
+    assert_eq!(location.as_deref(), Some("Paris"));
+
+    Ok(())
 }
 
-mod qwen35_chat_inference_emits_reasoning_when_template_auto_opens {
-    use anyhow::Result;
-    use anyhow::bail;
-    use llama_cpp_bindings::ChatMessageParseOutcome;
-    use llama_cpp_bindings::context::LlamaContext;
-    use llama_cpp_bindings::llama_batch::LlamaBatch;
-    use llama_cpp_bindings::model::AddBos;
-    use llama_cpp_bindings::model::LlamaChatMessage;
-    use llama_cpp_bindings::sampling::LlamaSampler;
-    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 2048,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn qwen35_chat_inference_emits_reasoning_when_template_auto_opens(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let model = fixture.model;
-        let backend = fixture.backend;
-
-        let mut context = LlamaContext::from_model(
-            model,
-            backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-
-        let chat_template = model.chat_template(None)?;
-        let messages = vec![LlamaChatMessage::new(
-            "user".to_owned(),
-            "Hello! How are you?".to_owned(),
-        )?];
-        let prompt = model.apply_chat_template(&chat_template, &messages, true)?;
-
-        let mut classifier = model.sampled_token_classifier();
-        let tokens = model.str_to_token(&prompt, AddBos::Always)?;
-        let prompt_token_count = u64::try_from(tokens.len())?;
-
-        let mut batch = LlamaBatch::new(512, 1)?;
-        classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?;
-
-        context.decode(&mut batch)?;
-
-        let promoted = classifier.commit_prompt_tokens();
-        assert_eq!(promoted, prompt_token_count);
-
-        let mut sampler = LlamaSampler::greedy();
-        let initial_position = batch.n_tokens();
-        let outcome = ClassifySampleLoop {
-            model,
-            classifier: &mut classifier,
-            sampler: &mut sampler,
-            context: &mut context,
-            batch: &mut batch,
-            initial_position,
-            max_generated_tokens: 1024,
-        }
-        .run()?;
-
-        assert!(!outcome.generated_raw.is_empty());
-        assert!(outcome.observed_reasoning > 0);
-        assert!(outcome.observed_content > 0);
-        assert_eq!(outcome.observed_undeterminable, 0);
-        assert_eq!(outcome.observed_tool_call, 0);
-
-        let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?;
-        let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
-            bail!("Qwen3.5 chat template must be recognised by the parser; got Unrecognized");
-        };
-        assert!(!parsed.content.is_empty());
-
-        let usage = classifier.into_usage();
-        assert_eq!(usage.prompt_tokens, prompt_token_count);
-        assert_eq!(usage.reasoning_tokens, outcome.observed_reasoning);
-        assert_eq!(usage.undeterminable_tokens, 0);
-
-        Ok(())
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 2048,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn qwen35_chat_inference_emits_reasoning_when_template_auto_opens(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    let model = fixture.model;
+    let backend = fixture.backend;
+
+    let mut context = LlamaContext::from_model(
+        model,
+        backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+
+    let chat_template = model.chat_template(None)?;
+    let messages = vec![LlamaChatMessage::new(
+        "user".to_owned(),
+        "Hello! How are you?".to_owned(),
+    )?];
+    let prompt = model.apply_chat_template(&chat_template, &messages, true)?;
+
+    let mut classifier = model.sampled_token_classifier();
+    let tokens = model.str_to_token(&prompt, AddBos::Always)?;
+    let prompt_token_count = u64::try_from(tokens.len())?;
+
+    let mut batch = LlamaBatch::new(512, 1)?;
+    classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?;
+
+    context.decode(&mut batch)?;
+
+    let promoted = classifier.commit_prompt_tokens();
+    assert_eq!(promoted, prompt_token_count);
+
+    let mut sampler = LlamaSampler::greedy();
+    let initial_position = batch.n_tokens();
+    let outcome = ClassifySampleLoop {
+        model,
+        classifier: &mut classifier,
+        sampler: &mut sampler,
+        context: &mut context,
+        batch: &mut batch,
+        initial_position,
+        max_generated_tokens: 1024,
     }
+    .run()?;
+
+    assert!(!outcome.generated_raw.is_empty());
+    assert!(outcome.observed_reasoning > 0);
+    assert!(outcome.observed_content > 0);
+    assert_eq!(outcome.observed_undeterminable, 0);
+    assert_eq!(outcome.observed_tool_call, 0);
+
+    let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?;
+    let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
+        bail!("Qwen3.5 chat template must be recognised by the parser; got Unrecognized");
+    };
+    assert!(!parsed.content.is_empty());
+
+    let usage = classifier.into_usage();
+    assert_eq!(usage.prompt_tokens, prompt_token_count);
+    assert_eq!(usage.reasoning_tokens, outcome.observed_reasoning);
+    assert_eq!(usage.undeterminable_tokens, 0);
+
+    Ok(())
 }
 
-mod qwen35_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt {
-    use anyhow::Result;
-    use llama_cpp_bindings::context::LlamaContext;
-    use llama_cpp_bindings::llama_batch::LlamaBatch;
-    use llama_cpp_bindings::model::AddBos;
-    use llama_cpp_bindings::sampling::LlamaSampler;
-    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 8192,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn qwen35_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
     const MAX_GENERATED_TOKENS: i32 = 200;
 
     const QWEN35_THINKING_DISABLED_PROMPT: &str = "\
-    <|im_start|>user
-    What is 2 + 2?<|im_end|>
-    <|im_start|>assistant
-    <think>
+<|im_start|>user
+What is 2 + 2?<|im_end|>
+<|im_start|>assistant
+<think>
 
-    </think>
+</think>
 
-    ";
+";
 
     const FORBIDDEN_MARKERS: &[&str] = &["<think>", "</think>"];
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 8192,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn qwen35_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let model = fixture.model;
-        let backend = fixture.backend;
-
-        let mut classifier = model.sampled_token_classifier();
-        let prompt_tokens = model.str_to_token(QWEN35_THINKING_DISABLED_PROMPT, AddBos::Never)?;
-        let prompt_token_count = u64::try_from(prompt_tokens.len())?;
-
-        let mut batch = LlamaBatch::new(2048, 1)?;
-        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
-
-        let mut context = LlamaContext::from_model(
-            model,
-            backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-
-        context.decode(&mut batch)?;
-
-        let promoted = classifier.commit_prompt_tokens();
-        assert_eq!(promoted, prompt_token_count);
-
-        let mut sampler = LlamaSampler::chain_simple([
-            LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
-            LlamaSampler::top_k(40),
-            LlamaSampler::top_p(0.9, 1),
-            LlamaSampler::min_p(0.05, 1),
-            LlamaSampler::temp(0.7),
-            LlamaSampler::dist(0x00C0_FFEE),
-        ]);
-        let initial_position = batch.n_tokens();
-        let outcome = ClassifySampleLoop {
-            model,
-            classifier: &mut classifier,
-            sampler: &mut sampler,
-            context: &mut context,
-            batch: &mut batch,
-            initial_position,
-            max_generated_tokens: MAX_GENERATED_TOKENS,
-        }
-        .run()?;
-
-        let usage = classifier.usage();
+    let model = fixture.model;
+    let backend = fixture.backend;
+
+    let mut classifier = model.sampled_token_classifier();
+    let prompt_tokens = model.str_to_token(QWEN35_THINKING_DISABLED_PROMPT, AddBos::Never)?;
+    let prompt_token_count = u64::try_from(prompt_tokens.len())?;
+
+    let mut batch = LlamaBatch::new(2048, 1)?;
+    classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
+
+    let mut context = LlamaContext::from_model(
+        model,
+        backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+
+    context.decode(&mut batch)?;
+
+    let promoted = classifier.commit_prompt_tokens();
+    assert_eq!(promoted, prompt_token_count);
+
+    let mut sampler = LlamaSampler::chain_simple([
+        LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
+        LlamaSampler::top_k(40),
+        LlamaSampler::top_p(0.9, 1),
+        LlamaSampler::min_p(0.05, 1),
+        LlamaSampler::temp(0.7),
+        LlamaSampler::dist(0x00C0_FFEE),
+    ]);
+    let initial_position = batch.n_tokens();
+    let outcome = ClassifySampleLoop {
+        model,
+        classifier: &mut classifier,
+        sampler: &mut sampler,
+        context: &mut context,
+        batch: &mut batch,
+        initial_position,
+        max_generated_tokens: MAX_GENERATED_TOKENS,
+    }
+    .run()?;
 
-        assert!(!outcome.generated_raw.is_empty());
-        assert_eq!(outcome.observed_reasoning, 0);
-        assert_eq!(outcome.observed_undeterminable, 0);
-        assert_eq!(usage.reasoning_tokens, 0);
-        assert_eq!(usage.undeterminable_tokens, 0);
-        assert!(outcome.observed_content > 0);
-        assert_eq!(usage.completion_tokens(), outcome.observed_content);
+    let usage = classifier.usage();
 
-        for forbidden in FORBIDDEN_MARKERS {
-            assert!(!outcome.content_stream.contains(forbidden));
-        }
+    assert!(!outcome.generated_raw.is_empty());
+    assert_eq!(outcome.observed_reasoning, 0);
+    assert_eq!(outcome.observed_undeterminable, 0);
+    assert_eq!(usage.reasoning_tokens, 0);
+    assert_eq!(usage.undeterminable_tokens, 0);
+    assert!(outcome.observed_content > 0);
+    assert_eq!(usage.completion_tokens(), outcome.observed_content);
 
-        Ok(())
+    for forbidden in FORBIDDEN_MARKERS {
+        assert!(!outcome.content_stream.contains(forbidden));
     }
-}
 
-mod qwen35_classifier_emits_reasoning {
-    use anyhow::Result;
-    use anyhow::bail;
-    use llama_cpp_bindings::ChatMessageParseOutcome;
-    use llama_cpp_bindings::context::LlamaContext;
-    use llama_cpp_bindings::llama_batch::LlamaBatch;
-    use llama_cpp_bindings::model::AddBos;
-    use llama_cpp_bindings::sampling::LlamaSampler;
-    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
+    Ok(())
+}
 
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 8192,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn qwen35_classifier_emits_reasoning_for_thinking_enabled_prompt(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
     const MAX_GENERATED_TOKENS: i32 = 1500;
 
     const QWEN35_THINKING_PROMPT: &str = "\
-    <|im_start|>user
-    What is 2 + 2?<|im_end|>
-    <|im_start|>assistant
-    <think>
-    ";
+<|im_start|>user
+What is 2 + 2?<|im_end|>
+<|im_start|>assistant
+<think>
+";
 
     const FORBIDDEN_MARKERS: &[&str] = &["<think>", "</think>"];
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 8192,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn qwen35_classifier_emits_reasoning_for_thinking_enabled_prompt(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let model = fixture.model;
-        let backend = fixture.backend;
-
-        let mut classifier = model.sampled_token_classifier();
-        let prompt_tokens = model.str_to_token(QWEN35_THINKING_PROMPT, AddBos::Never)?;
-        let prompt_token_count = u64::try_from(prompt_tokens.len())?;
-
-        let mut batch = LlamaBatch::new(2048, 1)?;
-        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
-
-        let mut context = LlamaContext::from_model(
-            model,
-            backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-
-        context.decode(&mut batch)?;
-
-        let promoted = classifier.commit_prompt_tokens();
-        assert_eq!(promoted, prompt_token_count);
-
-        let mut sampler = LlamaSampler::chain_simple([
-            LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
-            LlamaSampler::top_k(40),
-            LlamaSampler::top_p(0.9, 1),
-            LlamaSampler::min_p(0.05, 1),
-            LlamaSampler::temp(0.7),
-            LlamaSampler::dist(0x00C0_FFEE),
-        ]);
-        let initial_position = batch.n_tokens();
-        let outcome = ClassifySampleLoop {
-            model,
-            classifier: &mut classifier,
-            sampler: &mut sampler,
-            context: &mut context,
-            batch: &mut batch,
-            initial_position,
-            max_generated_tokens: MAX_GENERATED_TOKENS,
-        }
-        .run()?;
-
-        let usage = classifier.usage();
-        let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?;
-        let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
-            bail!("Qwen3.5 chat template must be recognised by the parser; got Unrecognized");
-        };
-
-        assert!(!outcome.generated_raw.is_empty());
-        assert!(outcome.observed_reasoning > 0);
-        assert!(usage.reasoning_tokens > 0);
-        assert_eq!(outcome.observed_undeterminable, 0);
-        assert_eq!(usage.undeterminable_tokens, 0);
-        assert_eq!(
-            usage.completion_tokens(),
-            outcome.observed_content + outcome.observed_reasoning,
+    let model = fixture.model;
+    let backend = fixture.backend;
+
+    let mut classifier = model.sampled_token_classifier();
+    let prompt_tokens = model.str_to_token(QWEN35_THINKING_PROMPT, AddBos::Never)?;
+    let prompt_token_count = u64::try_from(prompt_tokens.len())?;
+
+    let mut batch = LlamaBatch::new(2048, 1)?;
+    classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
+
+    let mut context = LlamaContext::from_model(
+        model,
+        backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+
+    context.decode(&mut batch)?;
+
+    let promoted = classifier.commit_prompt_tokens();
+    assert_eq!(promoted, prompt_token_count);
+
+    let mut sampler = LlamaSampler::chain_simple([
+        LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
+        LlamaSampler::top_k(40),
+        LlamaSampler::top_p(0.9, 1),
+        LlamaSampler::min_p(0.05, 1),
+        LlamaSampler::temp(0.7),
+        LlamaSampler::dist(0x00C0_FFEE),
+    ]);
+    let initial_position = batch.n_tokens();
+    let outcome = ClassifySampleLoop {
+        model,
+        classifier: &mut classifier,
+        sampler: &mut sampler,
+        context: &mut context,
+        batch: &mut batch,
+        initial_position,
+        max_generated_tokens: MAX_GENERATED_TOKENS,
+    }
+    .run()?;
+
+    let usage = classifier.usage();
+    let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?;
+    let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
+        bail!("Qwen3.5 chat template must be recognised by the parser; got Unrecognized");
+    };
+
+    assert!(!outcome.generated_raw.is_empty());
+    assert!(outcome.observed_reasoning > 0);
+    assert!(usage.reasoning_tokens > 0);
+    assert_eq!(outcome.observed_undeterminable, 0);
+    assert_eq!(usage.undeterminable_tokens, 0);
+    assert_eq!(
+        usage.completion_tokens(),
+        outcome.observed_content + outcome.observed_reasoning,
+    );
+
+    if parsed.reasoning_content.is_empty() {
+        eprintln!(
+            "Qwen3.5 didn't close its reasoning block within {MAX_GENERATED_TOKENS} tokens — \
+             skipping strict parser-equality assertions"
         );
+    } else {
+        assert_eq!(outcome.reasoning_stream, parsed.reasoning_content);
+        assert_eq!(outcome.content_stream, parsed.content);
+    }
 
-        if parsed.reasoning_content.is_empty() {
-            eprintln!(
-                "Qwen3.5 didn't close its reasoning block within {MAX_GENERATED_TOKENS} tokens — \
-                 skipping strict parser-equality assertions"
-            );
-        } else {
-            assert_eq!(outcome.reasoning_stream, parsed.reasoning_content);
-            assert_eq!(outcome.content_stream, parsed.content);
-        }
+    for forbidden in FORBIDDEN_MARKERS {
+        assert!(!outcome.reasoning_stream.contains(forbidden));
+        assert!(!outcome.content_stream.contains(forbidden));
+    }
 
-        for forbidden in FORBIDDEN_MARKERS {
-            assert!(!outcome.reasoning_stream.contains(forbidden));
-            assert!(!outcome.content_stream.contains(forbidden));
-        }
+    Ok(())
+}
 
-        Ok(())
+fn arguments_as_json(arguments: &ToolCallArguments) -> Result<&Value> {
+    match arguments {
+        ToolCallArguments::ValidJson(value) => Ok(value),
+        ToolCallArguments::InvalidJson(raw) => {
+            bail!("expected ValidJson arguments, got InvalidJson: {raw}")
+        }
     }
 }
 
-mod qwen35_parses_constrained_schema_payload {
-    use anyhow::Result;
-    use anyhow::bail;
-    use llama_cpp_bindings::ChatMessageParseOutcome;
-    use llama_cpp_bindings::ToolCallArguments;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-    use serde_json::Value;
-    use serde_json::json;
-
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn qwen35_parses_constrained_schema_payload(fixture: &LlamaFixture<'_>) -> Result<()> {
     const NEGOTIATE_WITH_CAT_TOOLS_JSON: &str = r#"[
-        {
-            "type": "function",
-            "function": {
-                "name": "negotiate_with_cat",
-                "description": "Attempt to negotiate with a cat. Outcomes are not guaranteed and may include the silent treatment.",
-                "parameters": {
-                    "type": "object",
-                    "properties": {
-                        "topic": {
-                            "type": "string",
-                            "description": "What you are trying to negotiate, e.g. 'get off the keyboard' or 'stop knocking things off the table'"
-                        },
-                        "bribe": {
-                            "type": "string",
-                            "enum": ["tuna", "salmon", "treats", "ear_scritches", "cardboard_box", "none"],
-                            "description": "What you are offering in exchange"
-                        },
-                        "desperation_level": {
-                            "type": "integer",
-                            "description": "How desperate you are, on a scale from 1 (mildly annoyed human) to 10 (it is 3am)",
-                            "minimum": 1,
-                            "maximum": 10
-                        }
+    {
+        "type": "function",
+        "function": {
+            "name": "negotiate_with_cat",
+            "description": "Attempt to negotiate with a cat. Outcomes are not guaranteed and may include the silent treatment.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "topic": {
+                        "type": "string",
+                        "description": "What you are trying to negotiate, e.g. 'get off the keyboard' or 'stop knocking things off the table'"
+                    },
+                    "bribe": {
+                        "type": "string",
+                        "enum": ["tuna", "salmon", "treats", "ear_scritches", "cardboard_box", "none"],
+                        "description": "What you are offering in exchange"
                     },
-                    "required": ["topic"],
-                    "additionalProperties": false
-                }
+                    "desperation_level": {
+                        "type": "integer",
+                        "description": "How desperate you are, on a scale from 1 (mildly annoyed human) to 10 (it is 3am)",
+                        "minimum": 1,
+                        "maximum": 10
+                    }
+                },
+                "required": ["topic"],
+                "additionalProperties": false
             }
         }
-    ]"#;
+    }
+]"#;
 
     const NEGOTIATE_WITH_CAT_INPUT: &str = "<tool_call>\n\
-    <function=negotiate_with_cat>\n\
-    <parameter=bribe>\n\
-    tuna\n\
-    </parameter>\n\
-    <parameter=desperation_level>\n\
-    8\n\
-    </parameter>\n\
-    <parameter=topic>\n\
-    get off the keyboard\n\
-    </parameter>\n\
-    </function>\n\
-    </tool_call>";
-
-    fn arguments_as_json(arguments: &ToolCallArguments) -> Result<&Value> {
-        match arguments {
-            ToolCallArguments::ValidJson(value) => Ok(value),
-            ToolCallArguments::InvalidJson(raw) => {
-                bail!("expected ValidJson arguments, got InvalidJson: {raw}")
+<function=negotiate_with_cat>\n\
+<parameter=bribe>\n\
+tuna\n\
+</parameter>\n\
+<parameter=desperation_level>\n\
+8\n\
+</parameter>\n\
+<parameter=topic>\n\
+get off the keyboard\n\
+</parameter>\n\
+</function>\n\
+</tool_call>";
+
+    let outcome = fixture.model.parse_chat_message(
+        NEGOTIATE_WITH_CAT_TOOLS_JSON,
+        NEGOTIATE_WITH_CAT_INPUT,
+        false,
+    )?;
+
+    let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
+        bail!(
+            "Qwen 3.5's tool-call payload must be parsed by the wrapper-side duck-type pass; \
+             got Unrecognized"
+        );
+    };
+
+    assert_eq!(parsed.tool_calls.len(), 1);
+    assert_eq!(parsed.tool_calls[0].name, "negotiate_with_cat");
+    assert_eq!(parsed.tool_calls[0].id, "call_0");
+    assert_eq!(
+        arguments_as_json(&parsed.tool_calls[0].arguments)?,
+        &json!({
+            "bribe": "tuna",
+            "desperation_level": 8,
+            "topic": "get off the keyboard",
+        }),
+    );
+
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn qwen35_parses_tool_call_payload(fixture: &LlamaFixture<'_>) -> Result<()> {
+    const TOOLS_JSON: &str = r#"[
+    {
+        "type": "function",
+        "function": {
+            "name": "get_weather",
+            "description": "Get the current weather for a location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {"type": "string", "description": "The city name"}
+                },
+                "required": ["location"]
             }
         }
     }
+]"#;
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn qwen35_parses_constrained_schema_payload(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let outcome = fixture.model.parse_chat_message(
-            NEGOTIATE_WITH_CAT_TOOLS_JSON,
-            NEGOTIATE_WITH_CAT_INPUT,
-            false,
-        )?;
-
-        let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
-            bail!(
-                "Qwen 3.5's tool-call payload must be parsed by the wrapper-side duck-type pass; \
-                 got Unrecognized"
-            );
-        };
-
-        assert_eq!(parsed.tool_calls.len(), 1);
-        assert_eq!(parsed.tool_calls[0].name, "negotiate_with_cat");
-        assert_eq!(parsed.tool_calls[0].id, "call_0");
-        assert_eq!(
-            arguments_as_json(&parsed.tool_calls[0].arguments)?,
-            &json!({
-                "bribe": "tuna",
-                "desperation_level": 8,
-                "topic": "get off the keyboard",
-            }),
-        );
+    const QWEN_XML_PAYLOAD: &str = "<tool_call>\n\
+<function=get_weather>\n\
+<parameter=location>\n\
+Paris\n\
+</parameter>\n\
+</function>\n\
+</tool_call>";
+
+    let outcome = fixture
+        .model
+        .parse_chat_message(TOOLS_JSON, QWEN_XML_PAYLOAD, false)?;
+
+    let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
+        bail!("expected Recognized for Qwen XML on a Qwen-3.5 model; got Unrecognized");
+    };
+    assert_eq!(parsed.tool_calls.len(), 1);
+    assert_eq!(parsed.tool_calls[0].name, "get_weather");
+    let location = match &parsed.tool_calls[0].arguments {
+        ToolCallArguments::ValidJson(value) => value
+            .get("location")
+            .and_then(|v| v.as_str())
+            .map(str::to_owned),
+        ToolCallArguments::InvalidJson(raw) => {
+            bail!("expected ValidJson, got InvalidJson: {raw}");
+        }
+    };
+    assert_eq!(location.as_deref(), Some("Paris"));
 
-        Ok(())
-    }
+    Ok(())
 }
 
-mod qwen35_parses_tool_call_payload {
-    use anyhow::Result;
-    use anyhow::bail;
-    use llama_cpp_bindings::ChatMessageParseOutcome;
-    use llama_cpp_bindings::ToolCallArguments;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn qwen35_parses_partial_tool_call_returns_pending_state(fixture: &LlamaFixture<'_>) -> Result<()> {
     const TOOLS_JSON: &str = r#"[
-        {
-            "type": "function",
-            "function": {
-                "name": "get_weather",
-                "description": "Get the current weather for a location",
-                "parameters": {
-                    "type": "object",
-                    "properties": {
-                        "location": {"type": "string", "description": "The city name"}
-                    },
-                    "required": ["location"]
-                }
+    {
+        "type": "function",
+        "function": {
+            "name": "get_weather",
+            "description": "Get the current weather for a location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {"type": "string", "description": "The city name"}
+                },
+                "required": ["location"]
             }
         }
-    ]"#;
-
-    const QWEN_XML_PAYLOAD: &str = "<tool_call>\n\
-    <function=get_weather>\n\
-    <parameter=location>\n\
-    Paris\n\
-    </parameter>\n\
-    </function>\n\
-    </tool_call>";
+    }
+]"#;
 
     const PARTIAL_QWEN_XML_PAYLOAD: &str = "<tool_call>\n<function=get_weather>\n<parameter=lo";
 
-    const TWO_QWEN_XML_PAYLOADS: &str = "<tool_call>\n\
-    <function=get_weather>\n\
-    <parameter=location>\n\
-    Paris\n\
-    </parameter>\n\
-    </function>\n\
-    </tool_call>\n\
-    <tool_call>\n\
-    <function=get_weather>\n\
-    <parameter=location>\n\
-    Berlin\n\
-    </parameter>\n\
-    </function>\n\
-    </tool_call>";
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn qwen35_parses_tool_call_payload(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let outcome = fixture
-            .model
-            .parse_chat_message(TOOLS_JSON, QWEN_XML_PAYLOAD, false)?;
-
-        let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
-            bail!("expected Recognized for Qwen XML on a Qwen-3.5 model; got Unrecognized");
-        };
-        assert_eq!(parsed.tool_calls.len(), 1);
-        assert_eq!(parsed.tool_calls[0].name, "get_weather");
-        let location = match &parsed.tool_calls[0].arguments {
-            ToolCallArguments::ValidJson(value) => value
-                .get("location")
-                .and_then(|v| v.as_str())
-                .map(str::to_owned),
-            ToolCallArguments::InvalidJson(raw) => {
-                bail!("expected ValidJson, got InvalidJson: {raw}");
-            }
-        };
-        assert_eq!(location.as_deref(), Some("Paris"));
+    let outcome = fixture
+        .model
+        .parse_chat_message(TOOLS_JSON, PARTIAL_QWEN_XML_PAYLOAD, true)?;
 
-        Ok(())
-    }
+    let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
+        bail!("expected Recognized for partial Qwen XML on a Qwen-3.5 model; got Unrecognized");
+    };
+    assert!(parsed.tool_calls.is_empty() || parsed.tool_calls.len() == 1);
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn qwen35_parses_partial_tool_call_returns_pending_state(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let outcome =
-            fixture
-                .model
-                .parse_chat_message(TOOLS_JSON, PARTIAL_QWEN_XML_PAYLOAD, true)?;
-
-        let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
-            bail!("expected Recognized for partial Qwen XML on a Qwen-3.5 model; got Unrecognized");
-        };
-        assert!(parsed.tool_calls.is_empty() || parsed.tool_calls.len() == 1);
-
-        Ok(())
-    }
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn qwen35_parses_multiple_tool_calls(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let outcome = fixture
-            .model
-            .parse_chat_message(TOOLS_JSON, TWO_QWEN_XML_PAYLOADS, false)?;
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn qwen35_parses_multiple_tool_calls(fixture: &LlamaFixture<'_>) -> Result<()> {
+    const TOOLS_JSON: &str = r#"[
+    {
+        "type": "function",
+        "function": {
+            "name": "get_weather",
+            "description": "Get the current weather for a location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {"type": "string", "description": "The city name"}
+                },
+                "required": ["location"]
+            }
+        }
+    }
+]"#;
 
-        let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
-            bail!(
-                "expected Recognized for two Qwen XML payloads on a Qwen-3.5 model; got Unrecognized"
-            );
-        };
-        assert!(
-            !parsed.tool_calls.is_empty(),
-            "expected at least one tool call; got {:?}",
-            parsed.tool_calls
+    const TWO_QWEN_XML_PAYLOADS: &str = "<tool_call>\n\
+<function=get_weather>\n\
+<parameter=location>\n\
+Paris\n\
+</parameter>\n\
+</function>\n\
+</tool_call>\n\
+<tool_call>\n\
+<function=get_weather>\n\
+<parameter=location>\n\
+Berlin\n\
+</parameter>\n\
+</function>\n\
+</tool_call>";
+
+    let outcome = fixture
+        .model
+        .parse_chat_message(TOOLS_JSON, TWO_QWEN_XML_PAYLOADS, false)?;
+
+    let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
+        bail!(
+            "expected Recognized for two Qwen XML payloads on a Qwen-3.5 model; got Unrecognized"
         );
-
-        Ok(())
-    }
+    };
+    assert!(
+        !parsed.tool_calls.is_empty(),
+        "expected at least one tool call; got {:?}",
+        parsed.tool_calls
+    );
+
+    Ok(())
 }
 
-mod qwen35_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested {
-    use anyhow::Result;
-    use anyhow::bail;
-    use llama_cpp_bindings::ChatMessageParseOutcome;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn qwen35_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
     const TOOLS_JSON: &str = r#"[
-        {
-            "type": "function",
-            "function": {
-                "name": "get_weather",
-                "description": "Get the current weather for a location",
-                "parameters": {
-                    "type": "object",
-                    "properties": {
-                        "location": {"type": "string", "description": "The city name"}
-                    },
-                    "required": ["location"]
-                }
+    {
+        "type": "function",
+        "function": {
+            "name": "get_weather",
+            "description": "Get the current weather for a location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {"type": "string", "description": "The city name"}
+                },
+                "required": ["location"]
             }
         }
-    ]"#;
+    }
+]"#;
 
     const PLAIN_CONTENT: &str = "Sorry, I cannot help with that.";
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn qwen35_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let outcome = fixture
-            .model
-            .parse_chat_message(TOOLS_JSON, PLAIN_CONTENT, false)?;
-
-        let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
-            bail!(
-                "Qwen 3.5 with tools requested + plain content must produce Recognized (with empty \
-                 tool_calls); got Unrecognized"
-            );
-        };
-        assert!(
-            parsed.tool_calls.is_empty(),
-            "expected no tool calls; got {:?}",
-            parsed.tool_calls
-        );
+    let outcome = fixture
+        .model
+        .parse_chat_message(TOOLS_JSON, PLAIN_CONTENT, false)?;
 
-        Ok(())
-    }
+    let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
+        bail!(
+            "Qwen 3.5 with tools requested + plain content must produce Recognized (with empty \
+             tool_calls); got Unrecognized"
+        );
+    };
+    assert!(
+        parsed.tool_calls.is_empty(),
+        "expected no tool calls; got {:?}",
+        parsed.tool_calls
+    );
+
+    Ok(())
 }
 
-mod qwen36_chat_inference_emits_reasoning_when_template_auto_opens {
-    use anyhow::Result;
-    use anyhow::bail;
-    use llama_cpp_bindings::ChatMessageParseOutcome;
-    use llama_cpp_bindings::context::LlamaContext;
-    use llama_cpp_bindings::llama_batch::LlamaBatch;
-    use llama_cpp_bindings::model::AddBos;
-    use llama_cpp_bindings::model::LlamaChatMessage;
-    use llama_cpp_bindings::sampling::LlamaSampler;
-    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 2048,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn qwen36_chat_inference_emits_reasoning_when_template_auto_opens(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let model = fixture.model;
-        let backend = fixture.backend;
-
-        let mut context = LlamaContext::from_model(
-            model,
-            backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-
-        let chat_template = model.chat_template(None)?;
-        let messages = vec![LlamaChatMessage::new(
-            "user".to_owned(),
-            "Hello! How are you?".to_owned(),
-        )?];
-        let prompt = model.apply_chat_template(&chat_template, &messages, true)?;
-
-        let mut classifier = model.sampled_token_classifier();
-        let tokens = model.str_to_token(&prompt, AddBos::Always)?;
-        let prompt_token_count = u64::try_from(tokens.len())?;
-
-        let mut batch = LlamaBatch::new(512, 1)?;
-        classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?;
-
-        context.decode(&mut batch)?;
-
-        let promoted = classifier.commit_prompt_tokens();
-        assert_eq!(promoted, prompt_token_count);
-
-        let mut sampler = LlamaSampler::greedy();
-        let initial_position = batch.n_tokens();
-        let outcome = ClassifySampleLoop {
-            model,
-            classifier: &mut classifier,
-            sampler: &mut sampler,
-            context: &mut context,
-            batch: &mut batch,
-            initial_position,
-            max_generated_tokens: 1024,
-        }
-        .run()?;
-
-        assert!(!outcome.generated_raw.is_empty());
-        assert!(outcome.observed_reasoning > 0);
-        assert!(outcome.observed_content > 0);
-        assert_eq!(outcome.observed_undeterminable, 0);
-        assert_eq!(outcome.observed_tool_call, 0);
-
-        let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?;
-        let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
-            bail!("Qwen3.6 chat template must be recognised by the parser; got Unrecognized");
-        };
-        assert!(!parsed.content.is_empty());
-
-        let usage = classifier.into_usage();
-        assert_eq!(usage.prompt_tokens, prompt_token_count);
-        assert_eq!(usage.reasoning_tokens, outcome.observed_reasoning);
-        assert_eq!(usage.undeterminable_tokens, 0);
-
-        Ok(())
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 2048,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn qwen36_chat_inference_emits_reasoning_when_template_auto_opens(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    let model = fixture.model;
+    let backend = fixture.backend;
+
+    let mut context = LlamaContext::from_model(
+        model,
+        backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+
+    let chat_template = model.chat_template(None)?;
+    let messages = vec![LlamaChatMessage::new(
+        "user".to_owned(),
+        "Hello! How are you?".to_owned(),
+    )?];
+    let prompt = model.apply_chat_template(&chat_template, &messages, true)?;
+
+    let mut classifier = model.sampled_token_classifier();
+    let tokens = model.str_to_token(&prompt, AddBos::Always)?;
+    let prompt_token_count = u64::try_from(tokens.len())?;
+
+    let mut batch = LlamaBatch::new(512, 1)?;
+    classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?;
+
+    context.decode(&mut batch)?;
+
+    let promoted = classifier.commit_prompt_tokens();
+    assert_eq!(promoted, prompt_token_count);
+
+    let mut sampler = LlamaSampler::greedy();
+    let initial_position = batch.n_tokens();
+    let outcome = ClassifySampleLoop {
+        model,
+        classifier: &mut classifier,
+        sampler: &mut sampler,
+        context: &mut context,
+        batch: &mut batch,
+        initial_position,
+        max_generated_tokens: 1024,
     }
+    .run()?;
+
+    assert!(!outcome.generated_raw.is_empty());
+    assert!(outcome.observed_reasoning > 0);
+    assert!(outcome.observed_content > 0);
+    assert_eq!(outcome.observed_undeterminable, 0);
+    assert_eq!(outcome.observed_tool_call, 0);
+
+    let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?;
+    let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
+        bail!("Qwen3.6 chat template must be recognised by the parser; got Unrecognized");
+    };
+    assert!(!parsed.content.is_empty());
+
+    let usage = classifier.into_usage();
+    assert_eq!(usage.prompt_tokens, prompt_token_count);
+    assert_eq!(usage.reasoning_tokens, outcome.observed_reasoning);
+    assert_eq!(usage.undeterminable_tokens, 0);
+
+    Ok(())
 }
 
-mod qwen36_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt {
-    use anyhow::Result;
-    use llama_cpp_bindings::context::LlamaContext;
-    use llama_cpp_bindings::llama_batch::LlamaBatch;
-    use llama_cpp_bindings::model::AddBos;
-    use llama_cpp_bindings::sampling::LlamaSampler;
-    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 8192,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn qwen36_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
     const MAX_GENERATED_TOKENS: i32 = 200;
 
     const QWEN36_THINKING_DISABLED_PROMPT: &str = "\
-    <|im_start|>user
-    What is 2 + 2?<|im_end|>
-    <|im_start|>assistant
-    <think>
+<|im_start|>user
+What is 2 + 2?<|im_end|>
+<|im_start|>assistant
+<think>
 
-    </think>
+</think>
 
-    ";
+";
 
     const FORBIDDEN_MARKERS: &[&str] = &["<think>", "</think>"];
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 8192,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn qwen36_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let model = fixture.model;
-        let backend = fixture.backend;
-
-        let mut classifier = model.sampled_token_classifier();
-        let prompt_tokens = model.str_to_token(QWEN36_THINKING_DISABLED_PROMPT, AddBos::Never)?;
-        let prompt_token_count = u64::try_from(prompt_tokens.len())?;
-
-        let mut batch = LlamaBatch::new(2048, 1)?;
-        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
-
-        let mut context = LlamaContext::from_model(
-            model,
-            backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-
-        context.decode(&mut batch)?;
-
-        let promoted = classifier.commit_prompt_tokens();
-        assert_eq!(promoted, prompt_token_count);
-
-        let mut sampler = LlamaSampler::chain_simple([
-            LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
-            LlamaSampler::top_k(40),
-            LlamaSampler::top_p(0.9, 1),
-            LlamaSampler::min_p(0.05, 1),
-            LlamaSampler::temp(0.7),
-            LlamaSampler::dist(0x00C0_FFEE),
-        ]);
-        let initial_position = batch.n_tokens();
-        let outcome = ClassifySampleLoop {
-            model,
-            classifier: &mut classifier,
-            sampler: &mut sampler,
-            context: &mut context,
-            batch: &mut batch,
-            initial_position,
-            max_generated_tokens: MAX_GENERATED_TOKENS,
-        }
-        .run()?;
-
-        let usage = classifier.usage();
+    let model = fixture.model;
+    let backend = fixture.backend;
+
+    let mut classifier = model.sampled_token_classifier();
+    let prompt_tokens = model.str_to_token(QWEN36_THINKING_DISABLED_PROMPT, AddBos::Never)?;
+    let prompt_token_count = u64::try_from(prompt_tokens.len())?;
+
+    let mut batch = LlamaBatch::new(2048, 1)?;
+    classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
+
+    let mut context = LlamaContext::from_model(
+        model,
+        backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+
+    context.decode(&mut batch)?;
+
+    let promoted = classifier.commit_prompt_tokens();
+    assert_eq!(promoted, prompt_token_count);
+
+    let mut sampler = LlamaSampler::chain_simple([
+        LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
+        LlamaSampler::top_k(40),
+        LlamaSampler::top_p(0.9, 1),
+        LlamaSampler::min_p(0.05, 1),
+        LlamaSampler::temp(0.7),
+        LlamaSampler::dist(0x00C0_FFEE),
+    ]);
+    let initial_position = batch.n_tokens();
+    let outcome = ClassifySampleLoop {
+        model,
+        classifier: &mut classifier,
+        sampler: &mut sampler,
+        context: &mut context,
+        batch: &mut batch,
+        initial_position,
+        max_generated_tokens: MAX_GENERATED_TOKENS,
+    }
+    .run()?;
 
-        assert!(!outcome.generated_raw.is_empty());
-        assert_eq!(outcome.observed_reasoning, 0);
-        assert_eq!(outcome.observed_undeterminable, 0);
-        assert_eq!(usage.reasoning_tokens, 0);
-        assert_eq!(usage.undeterminable_tokens, 0);
-        assert!(outcome.observed_content > 0);
-        assert_eq!(usage.completion_tokens(), outcome.observed_content);
+    let usage = classifier.usage();
 
-        for forbidden in FORBIDDEN_MARKERS {
-            assert!(!outcome.content_stream.contains(forbidden));
-        }
+    assert!(!outcome.generated_raw.is_empty());
+    assert_eq!(outcome.observed_reasoning, 0);
+    assert_eq!(outcome.observed_undeterminable, 0);
+    assert_eq!(usage.reasoning_tokens, 0);
+    assert_eq!(usage.undeterminable_tokens, 0);
+    assert!(outcome.observed_content > 0);
+    assert_eq!(usage.completion_tokens(), outcome.observed_content);
 
-        Ok(())
+    for forbidden in FORBIDDEN_MARKERS {
+        assert!(!outcome.content_stream.contains(forbidden));
     }
-}
 
-mod qwen36_classifier_emits_reasoning {
-    use anyhow::Result;
-    use anyhow::bail;
-    use llama_cpp_bindings::ChatMessageParseOutcome;
-    use llama_cpp_bindings::context::LlamaContext;
-    use llama_cpp_bindings::llama_batch::LlamaBatch;
-    use llama_cpp_bindings::model::AddBos;
-    use llama_cpp_bindings::sampling::LlamaSampler;
-    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
+    Ok(())
+}
 
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 8192,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn qwen36_classifier_emits_reasoning_for_thinking_enabled_prompt(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
     const MAX_GENERATED_TOKENS: i32 = 1500;
 
     const QWEN36_THINKING_PROMPT: &str = "\
-    <|im_start|>user
-    What is 2 + 2?<|im_end|>
-    <|im_start|>assistant
-    <think>
-    ";
+<|im_start|>user
+What is 2 + 2?<|im_end|>
+<|im_start|>assistant
+<think>
+";
 
     const FORBIDDEN_MARKERS: &[&str] = &["<think>", "</think>"];
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 8192,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn qwen36_classifier_emits_reasoning_for_thinking_enabled_prompt(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let model = fixture.model;
-        let backend = fixture.backend;
-
-        let mut classifier = model.sampled_token_classifier();
-        let prompt_tokens = model.str_to_token(QWEN36_THINKING_PROMPT, AddBos::Never)?;
-        let prompt_token_count = u64::try_from(prompt_tokens.len())?;
-
-        let mut batch = LlamaBatch::new(2048, 1)?;
-        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
-
-        let mut context = LlamaContext::from_model(
-            model,
-            backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-
-        context.decode(&mut batch)?;
-
-        let promoted = classifier.commit_prompt_tokens();
-        assert_eq!(promoted, prompt_token_count);
-
-        let mut sampler = LlamaSampler::chain_simple([
-            LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
-            LlamaSampler::top_k(40),
-            LlamaSampler::top_p(0.9, 1),
-            LlamaSampler::min_p(0.05, 1),
-            LlamaSampler::temp(0.7),
-            LlamaSampler::dist(0x00C0_FFEE),
-        ]);
-        let initial_position = batch.n_tokens();
-        let outcome = ClassifySampleLoop {
-            model,
-            classifier: &mut classifier,
-            sampler: &mut sampler,
-            context: &mut context,
-            batch: &mut batch,
-            initial_position,
-            max_generated_tokens: MAX_GENERATED_TOKENS,
-        }
-        .run()?;
-
-        let usage = classifier.usage();
-        let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, true)?;
-        let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
-            bail!("Qwen3.6 chat template must be recognised by the parser; got Unrecognized");
-        };
-
-        assert!(!outcome.generated_raw.is_empty());
-        assert!(outcome.observed_reasoning > 0);
-        assert!(usage.reasoning_tokens > 0);
-        assert_eq!(outcome.observed_undeterminable, 0);
-        assert_eq!(usage.undeterminable_tokens, 0);
-        assert_eq!(
-            usage.completion_tokens(),
-            outcome.observed_content + outcome.observed_reasoning,
-        );
-
-        if parsed.reasoning_content.is_empty() {
-            eprintln!(
-                "Qwen3.6 parser returned empty reasoning_content — relying on FORBIDDEN_MARKERS"
-            );
-        } else {
-            assert_eq!(outcome.reasoning_stream, parsed.reasoning_content);
-            assert_eq!(outcome.content_stream, parsed.content);
-        }
-
-        for forbidden in FORBIDDEN_MARKERS {
-            assert!(!outcome.reasoning_stream.contains(forbidden));
-            assert!(!outcome.content_stream.contains(forbidden));
-        }
+    let model = fixture.model;
+    let backend = fixture.backend;
+
+    let mut classifier = model.sampled_token_classifier();
+    let prompt_tokens = model.str_to_token(QWEN36_THINKING_PROMPT, AddBos::Never)?;
+    let prompt_token_count = u64::try_from(prompt_tokens.len())?;
+
+    let mut batch = LlamaBatch::new(2048, 1)?;
+    classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
+
+    let mut context = LlamaContext::from_model(
+        model,
+        backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+
+    context.decode(&mut batch)?;
+
+    let promoted = classifier.commit_prompt_tokens();
+    assert_eq!(promoted, prompt_token_count);
+
+    let mut sampler = LlamaSampler::chain_simple([
+        LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
+        LlamaSampler::top_k(40),
+        LlamaSampler::top_p(0.9, 1),
+        LlamaSampler::min_p(0.05, 1),
+        LlamaSampler::temp(0.7),
+        LlamaSampler::dist(0x00C0_FFEE),
+    ]);
+    let initial_position = batch.n_tokens();
+    let outcome = ClassifySampleLoop {
+        model,
+        classifier: &mut classifier,
+        sampler: &mut sampler,
+        context: &mut context,
+        batch: &mut batch,
+        initial_position,
+        max_generated_tokens: MAX_GENERATED_TOKENS,
+    }
+    .run()?;
+
+    let usage = classifier.usage();
+    let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, true)?;
+    let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
+        bail!("Qwen3.6 chat template must be recognised by the parser; got Unrecognized");
+    };
+
+    assert!(!outcome.generated_raw.is_empty());
+    assert!(outcome.observed_reasoning > 0);
+    assert!(usage.reasoning_tokens > 0);
+    assert_eq!(outcome.observed_undeterminable, 0);
+    assert_eq!(usage.undeterminable_tokens, 0);
+    assert_eq!(
+        usage.completion_tokens(),
+        outcome.observed_content + outcome.observed_reasoning,
+    );
+
+    if parsed.reasoning_content.is_empty() {
+        eprintln!("Qwen3.6 parser returned empty reasoning_content — relying on FORBIDDEN_MARKERS");
+    } else {
+        assert_eq!(outcome.reasoning_stream, parsed.reasoning_content);
+        assert_eq!(outcome.content_stream, parsed.content);
+    }
 
-        Ok(())
+    for forbidden in FORBIDDEN_MARKERS {
+        assert!(!outcome.reasoning_stream.contains(forbidden));
+        assert!(!outcome.content_stream.contains(forbidden));
     }
-}
 
+    Ok(())
+}
 llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/sampling_and_constrained_decoding.rs b/llama-cpp-bindings-tests/tests/sampling_and_constrained_decoding.rs
index dc9395aa..8ceec4d1 100644
--- a/llama-cpp-bindings-tests/tests/sampling_and_constrained_decoding.rs
+++ b/llama-cpp-bindings-tests/tests/sampling_and_constrained_decoding.rs
@@ -1,2518 +1,2436 @@
+#![expect(
+    clippy::unnecessary_wraps,
+    reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
+)]
+
+use std::ffi::CStr;
+use std::io::Write;
+use std::sync::Arc;
+use std::time::Duration;
+
+use anyhow::Context as _;
+use anyhow::Result;
+use llama_cpp_bindings::GrammarError;
+use llama_cpp_bindings::SampledToken;
+use llama_cpp_bindings::context::LlamaContext;
+use llama_cpp_bindings::ggml_time_us;
+use llama_cpp_bindings::json_schema_to_grammar;
+use llama_cpp_bindings::llama_batch::LlamaBatch;
+use llama_cpp_bindings::llguidance_sampler::create_llg_sampler;
+use llama_cpp_bindings::model::AddBos;
+use llama_cpp_bindings::model::LlamaChatMessage;
+use llama_cpp_bindings::sampled_token_classifier::SampledTokenClassifier;
+use llama_cpp_bindings::sampled_token_section::SampledTokenSection;
+use llama_cpp_bindings::sampling::LlamaSampler;
+use llama_cpp_bindings::streaming_markers::StreamingMarkers;
+use llama_cpp_bindings::token::LlamaToken;
+use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+use llama_cpp_test_harness::LlamaFixture;
+use llama_cpp_test_harness::llama_test;
 use llama_cpp_test_harness::llama_tests_main;
 
-mod model_sampling {
-    use anyhow::Result;
-    use llama_cpp_bindings::SampledToken;
-    use llama_cpp_bindings::context::LlamaContext;
-    use llama_cpp_bindings::json_schema_to_grammar;
-    use llama_cpp_bindings::llama_batch::LlamaBatch;
-    use llama_cpp_bindings::model::AddBos;
-    use llama_cpp_bindings::sampling::LlamaSampler;
-    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 256,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 256,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 256,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 256,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn sample_returns_result_and_succeeds_with_valid_index(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let model = fixture.model;
-        let mut context = LlamaContext::from_model(
-            model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-
-        let tokens = model.str_to_token("Hello", AddBos::Always)?;
-        let mut batch = LlamaBatch::new(512, 1)?;
-
-        batch.add_sequence(&tokens, 0, false)?;
-
-        context.decode(&mut batch)?;
-
-        let mut sampler =
-            LlamaSampler::chain_simple([LlamaSampler::temp(0.8), LlamaSampler::greedy()]);
-
-        let result = sampler.sample(&context, batch.n_tokens() - 1);
-
-        assert!(result.is_ok());
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn grammar_sampler_constrains_output_to_yes_or_no(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let model = fixture.model;
-        let mut context = LlamaContext::from_model(
-            model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-
-        let prompt = "<|im_start|>user\nIs the sky blue? Answer yes or no.<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n";
-        let tokens = model.str_to_token(prompt, AddBos::Always)?;
-        let mut batch = LlamaBatch::new(512, 1)?;
-
-        batch.add_sequence(&tokens, 0, false)?;
-
-        context.decode(&mut batch)?;
-
-        let mut sampler = LlamaSampler::chain_simple([
-            LlamaSampler::grammar(model, r"root ::= [Yy] [Ee] [Ss] | [Nn] [Oo]", "root")?,
-            LlamaSampler::temp(0.8),
-            LlamaSampler::greedy(),
-        ]);
-
-        let mut classifier = model.sampled_token_classifier();
-        let (raw_token, mut outcomes) =
-            classifier.sample(&mut sampler, &context, batch.n_tokens() - 1)?;
-        outcomes.extend(classifier.flush());
-
-        assert_eq!(
-            outcomes.len(),
-            1,
-            "expected one finalised outcome after flush"
-        );
-        let outcome = &outcomes[0];
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 256,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 256,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 256,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 256,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn sample_returns_result_and_succeeds_with_valid_index(fixture: &LlamaFixture<'_>) -> Result<()> { // Smoke test: decode a short prompt once, then a single sample call must return Ok.
+    let model = fixture.model;
+    let mut context = LlamaContext::from_model(
+        model,
+        fixture.backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+
+    let tokens = model.str_to_token("Hello", AddBos::Always)?;
+    let mut batch = LlamaBatch::new(512, 1)?;
+
+    batch.add_sequence(&tokens, 0, false)?;
+
+    context.decode(&mut batch)?;
+
+    let mut sampler = LlamaSampler::chain_simple([LlamaSampler::temp(0.8), LlamaSampler::greedy()]);
+
+    let result = sampler.sample(&context, batch.n_tokens() - 1); // n_tokens() - 1: presumably the final entry of the batch just decoded
+
+    assert!(result.is_ok()); // the Result is inspected here rather than propagated with `?`
+    Ok(())
+}
 
-        let raw_as_sampled = SampledToken::Content(raw_token);
-        assert!(
-            !model.is_eog_token(&raw_as_sampled),
-            "Grammar sampler should not allow EOS as first token"
-        );
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn grammar_sampler_constrains_output_to_yes_or_no(fixture: &LlamaFixture<'_>) -> Result<()> { // Samples one token under a yes/no grammar and checks its first character and the usage count.
+    let model = fixture.model;
+    let mut context = LlamaContext::from_model(
+        model,
+        fixture.backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+
+    let prompt = "<|im_start|>user\nIs the sky blue? Answer yes or no.<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"; // the prompt already contains an empty <think></think> block
+    let tokens = model.str_to_token(prompt, AddBos::Always)?;
+    let mut batch = LlamaBatch::new(512, 1)?;
+
+    batch.add_sequence(&tokens, 0, false)?;
+
+    context.decode(&mut batch)?;
+
+    let mut sampler = LlamaSampler::chain_simple([
+        LlamaSampler::grammar(model, r"root ::= [Yy] [Ee] [Ss] | [Nn] [Oo]", "root")?, // case-insensitive "yes" or "no"
+        LlamaSampler::temp(0.8),
+        LlamaSampler::greedy(),
+    ]);
+
+    let mut classifier = model.sampled_token_classifier();
+    let (raw_token, mut outcomes) =
+        classifier.sample(&mut sampler, &context, batch.n_tokens() - 1)?;
+    outcomes.extend(classifier.flush()); // whatever flush() returns is counted together with the sample() outcomes
+
+    assert_eq!(
+        outcomes.len(),
+        1,
+        "expected one finalised outcome after flush"
+    );
+    let outcome = &outcomes[0];
+
+    let raw_as_sampled = SampledToken::Content(raw_token); // wrapped only so it can be handed to is_eog_token
+    assert!(
+        !model.is_eog_token(&raw_as_sampled),
+        "Grammar sampler should not allow EOS as first token"
+    );
+
+    let piece = &outcome.raw_piece;
+    let first_char = piece
+        .chars()
+        .next()
+        .ok_or_else(|| anyhow::anyhow!("piece should have at least one character"))? // an empty piece becomes an error, not a panic
+        .to_lowercase() // yields an iterator here, hence the second next() below
+        .next()
+        .ok_or_else(|| anyhow::anyhow!("lowercase iterator should yield a character"))?;
+
+    assert!(
+        first_char == 'y' || first_char == 'n',
+        "Grammar should constrain first token to start with y/n, got: '{piece}'"
+    );
+    assert_eq!(
+        classifier.usage().completion_tokens(),
+        1,
+        "exactly one completion token sampled"
+    );
+
+    Ok(())
+}
 
-        let piece = &outcome.raw_piece;
-        let first_char = piece
-            .chars()
-            .next()
-            .ok_or_else(|| anyhow::anyhow!("piece should have at least one character"))?
-            .to_lowercase()
-            .next()
-            .ok_or_else(|| anyhow::anyhow!("lowercase iterator should yield a character"))?;
-
-        assert!(
-            first_char == 'y' || first_char == 'n',
-            "Grammar should constrain first token to start with y/n, got: '{piece}'"
-        );
-        assert_eq!(
-            classifier.usage().completion_tokens(),
-            1,
-            "exactly one completion token sampled"
-        );
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn json_schema_grammar_sampler_constrains_output_to_json(fixture: &LlamaFixture<'_>) -> Result<()> { // Samples one token under a grammar derived from a JSON schema and checks it starts with '{'.
+    let model = fixture.model;
+    let mut context = LlamaContext::from_model(
+        model,
+        fixture.backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+
+    let prompt = "<|im_start|>user\nWhat is 2+2? Respond with a JSON object.<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"; // the prompt already contains an empty <think></think> block
+    let tokens = model.str_to_token(prompt, AddBos::Always)?;
+    let mut batch = LlamaBatch::new(512, 1)?;
+
+    batch.add_sequence(&tokens, 0, false)?;
+
+    context.decode(&mut batch)?;
+
+    let grammar_str = json_schema_to_grammar( // schema: an object with a required string property "answer"
+        r#"{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}"#,
+    )?;
+
+    let mut sampler = LlamaSampler::chain_simple([
+        LlamaSampler::grammar(model, &grammar_str, "root")?, // the converted grammar is entered at its "root" rule
+        LlamaSampler::temp(0.8),
+        LlamaSampler::greedy(),
+    ]);
+
+    let mut classifier = model.sampled_token_classifier();
+    let (raw_token, mut outcomes) =
+        classifier.sample(&mut sampler, &context, batch.n_tokens() - 1)?;
+    outcomes.extend(classifier.flush()); // whatever flush() returns is counted together with the sample() outcomes
+
+    assert_eq!(
+        outcomes.len(),
+        1,
+        "expected one finalised outcome after flush"
+    );
+    let outcome = &outcomes[0];
+
+    let raw_as_sampled = SampledToken::Content(raw_token); // wrapped only so it can be handed to is_eog_token
+    assert!(
+        !model.is_eog_token(&raw_as_sampled),
+        "Grammar sampler should not allow EOS as first token"
+    );
+
+    let piece = &outcome.raw_piece;
+
+    assert!(
+        piece.starts_with('{'), // only the first character is checked; the token may carry more text
+        "JSON schema grammar should constrain first token to start with '{{', got: '{piece}'"
+    );
+    assert_eq!(
+        classifier.usage().completion_tokens(),
+        1,
+        "exactly one completion token sampled"
+    );
+
+    Ok(())
+}
 
-        Ok(())
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn sample_with_grammar_produces_constrained_output_in_loop( // Runs ClassifySampleLoop under a "yes" | "no" grammar and checks the text, termination and usage counters.
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    let model = fixture.model;
+    let mut context = LlamaContext::from_model(
+        model,
+        fixture.backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+
+    let prompt = "<|im_start|>user\nIs the sky blue? yes or no<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"; // the prompt already contains an empty <think></think> block
+    let tokens = model.str_to_token(prompt, AddBos::Always)?;
+    let mut batch = LlamaBatch::new(512, 1)?;
+
+    let mut classifier = model.sampled_token_classifier();
+    classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?; // prompt tokens reach the batch through the classifier
+
+    context.decode(&mut batch)?;
+    classifier.commit_prompt_tokens(); // reached only when decode succeeded; `?` above returns early otherwise
+
+    let mut sampler = LlamaSampler::chain_simple([
+        LlamaSampler::grammar(model, r#"root ::= "yes" | "no""#, "root")?, // lowercase literals only
+        LlamaSampler::temp(0.8),
+        LlamaSampler::greedy(),
+    ]);
+
+    let initial_position = batch.n_tokens(); // read before the loop takes `&mut batch`
+    let outcome = ClassifySampleLoop {
+        model,
+        classifier: &mut classifier,
+        sampler: &mut sampler,
+        context: &mut context,
+        batch: &mut batch,
+        initial_position,
+        max_generated_tokens: 10, // loop budget; the eog_seen assertion below expects it not to be exhausted
     }
+    .run()?;
+
+    let lowercase = outcome.generated_raw.to_lowercase();
+    assert!(
+        lowercase == "yes" || lowercase == "no", // whole-output comparison, not a prefix check
+        "Grammar loop should produce 'yes' or 'no', got: '{}'",
+        outcome.generated_raw
+    );
+    assert!(
+        outcome.eog_seen,
+        "loop must terminate via EOG once grammar accepts, not by exhausting the budget; outcome={outcome:?}"
+    );
+    assert_eq!(outcome.observed_reasoning, 0);
+    assert_eq!(outcome.observed_undeterminable, 0);
+    assert_eq!(outcome.observed_tool_call, 0);
+    assert!(outcome.observed_content > 0);
+
+    let usage = classifier.into_usage();
+    assert_eq!(usage.completion_tokens(), outcome.observed_content); // classifier totals must agree with what the loop observed
+    assert_eq!(usage.reasoning_tokens, 0);
+    assert_eq!(usage.undeterminable_tokens, 0);
+
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn json_schema_grammar_sampler_constrains_output_to_json(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let model = fixture.model;
-        let mut context = LlamaContext::from_model(
-            model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-
-        let prompt = "<|im_start|>user\nWhat is 2+2? Respond with a JSON object.<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n";
-        let tokens = model.str_to_token(prompt, AddBos::Always)?;
-        let mut batch = LlamaBatch::new(512, 1)?;
-
-        batch.add_sequence(&tokens, 0, false)?;
-
-        context.decode(&mut batch)?;
-
-        let grammar_str = json_schema_to_grammar(
-            r#"{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}"#,
-        )?;
-
-        let mut sampler = LlamaSampler::chain_simple([
-            LlamaSampler::grammar(model, &grammar_str, "root")?,
-            LlamaSampler::temp(0.8),
-            LlamaSampler::greedy(),
-        ]);
-
-        let mut classifier = model.sampled_token_classifier();
-        let (raw_token, mut outcomes) =
-            classifier.sample(&mut sampler, &context, batch.n_tokens() - 1)?;
-        outcomes.extend(classifier.flush());
-
-        assert_eq!(
-            outcomes.len(),
-            1,
-            "expected one finalised outcome after flush"
-        );
-        let outcome = &outcomes[0];
-
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn sample_without_grammar_produces_multiple_tokens(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let model = fixture.model;
+    let mut context = LlamaContext::from_model(
+        model,
+        fixture.backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+
+    let prompt =
+        "<|im_start|>user\nSay hello<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n";
+    let tokens = model.str_to_token(prompt, AddBos::Always)?;
+    let mut batch = LlamaBatch::new(512, 1)?;
+
+    batch.add_sequence(&tokens, 0, false)?;
+
+    context.decode(&mut batch)?;
+
+    let mut sampler = LlamaSampler::chain_simple([LlamaSampler::temp(0.8), LlamaSampler::greedy()]);
+
+    let mut classifier = model.sampled_token_classifier();
+    let mut sampled_count: u64 = 0;
+
+    for (position, _) in (batch.n_tokens()..).zip(0..5) {
+        let (raw_token, _outcomes) = classifier.sample(&mut sampler, &context, -1)?;
         let raw_as_sampled = SampledToken::Content(raw_token);
-        assert!(
-            !model.is_eog_token(&raw_as_sampled),
-            "Grammar sampler should not allow EOS as first token"
-        );
-
-        let piece = &outcome.raw_piece;
 
-        assert!(
-            piece.starts_with('{'),
-            "JSON schema grammar should constrain first token to start with '{{', got: '{piece}'"
-        );
-        assert_eq!(
-            classifier.usage().completion_tokens(),
-            1,
-            "exactly one completion token sampled"
-        );
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn sample_with_grammar_produces_constrained_output_in_loop(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let model = fixture.model;
-        let mut context = LlamaContext::from_model(
-            model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-
-        let prompt = "<|im_start|>user\nIs the sky blue? yes or no<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n";
-        let tokens = model.str_to_token(prompt, AddBos::Always)?;
-        let mut batch = LlamaBatch::new(512, 1)?;
-
-        let mut classifier = model.sampled_token_classifier();
-        classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?;
-
-        context.decode(&mut batch)?;
-        classifier.commit_prompt_tokens();
-
-        let mut sampler = LlamaSampler::chain_simple([
-            LlamaSampler::grammar(model, r#"root ::= "yes" | "no""#, "root")?,
-            LlamaSampler::temp(0.8),
-            LlamaSampler::greedy(),
-        ]);
-
-        let initial_position = batch.n_tokens();
-        let outcome = ClassifySampleLoop {
-            model,
-            classifier: &mut classifier,
-            sampler: &mut sampler,
-            context: &mut context,
-            batch: &mut batch,
-            initial_position,
-            max_generated_tokens: 10,
+        if model.is_eog_token(&raw_as_sampled) {
+            break;
         }
-        .run()?;
-
-        let lowercase = outcome.generated_raw.to_lowercase();
-        assert!(
-            lowercase == "yes" || lowercase == "no",
-            "Grammar loop should produce 'yes' or 'no', got: '{}'",
-            outcome.generated_raw
-        );
-        assert!(
-            outcome.eog_seen,
-            "loop must terminate via EOG once grammar accepts, not by exhausting the budget; outcome={outcome:?}"
-        );
-        assert_eq!(outcome.observed_reasoning, 0);
-        assert_eq!(outcome.observed_undeterminable, 0);
-        assert_eq!(outcome.observed_tool_call, 0);
-        assert!(outcome.observed_content > 0);
-
-        let usage = classifier.into_usage();
-        assert_eq!(usage.completion_tokens(), outcome.observed_content);
-        assert_eq!(usage.reasoning_tokens, 0);
-        assert_eq!(usage.undeterminable_tokens, 0);
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn sample_without_grammar_produces_multiple_tokens(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let model = fixture.model;
-        let mut context = LlamaContext::from_model(
-            model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
 
-        let prompt =
-            "<|im_start|>user\nSay hello<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n";
-        let tokens = model.str_to_token(prompt, AddBos::Always)?;
-        let mut batch = LlamaBatch::new(512, 1)?;
+        sampled_count += 1;
 
-        batch.add_sequence(&tokens, 0, false)?;
+        batch.clear();
+        batch.add(&raw_as_sampled, position, &[0], true)?;
 
         context.decode(&mut batch)?;
-
-        let mut sampler =
-            LlamaSampler::chain_simple([LlamaSampler::temp(0.8), LlamaSampler::greedy()]);
-
-        let mut classifier = model.sampled_token_classifier();
-        let mut sampled_count: u64 = 0;
-
-        for (position, _) in (batch.n_tokens()..).zip(0..5) {
-            let (raw_token, _outcomes) = classifier.sample(&mut sampler, &context, -1)?;
-            let raw_as_sampled = SampledToken::Content(raw_token);
-
-            if model.is_eog_token(&raw_as_sampled) {
-                break;
-            }
-
-            sampled_count += 1;
-
-            batch.clear();
-            batch.add(&raw_as_sampled, position, &[0], true)?;
-
-            context.decode(&mut batch)?;
-        }
-
-        let _ = classifier.flush();
-
-        assert!(
-            sampled_count > 0,
-            "Should produce at least one token without grammar"
-        );
-        let usage = classifier.into_usage();
-        assert!(
-            usage.completion_tokens() >= sampled_count,
-            "completion_tokens ({}) must include the {sampled_count} non-EOG samples",
-            usage.completion_tokens()
-        );
-
-        Ok(())
-    }
-}
-
-mod sampling {
-    #![expect(
-        clippy::unnecessary_wraps,
-        reason = "every trial returns anyhow::Result<()> to match the LlamaTestFn signature"
-    )]
-
-    use anyhow::Result;
-    use llama_cpp_bindings::GrammarError;
-    use llama_cpp_bindings::context::LlamaContext;
-    use llama_cpp_bindings::llama_batch::LlamaBatch;
-    use llama_cpp_bindings::model::AddBos;
-    use llama_cpp_bindings::sampling::LlamaSampler;
-    use llama_cpp_bindings::token::LlamaToken;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn dry_sampler_with_model(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let breakers: Vec<&[u8]> = vec![b"\n", b"\t"];
-        let _sampler = LlamaSampler::dry(fixture.model, 1.5, 2.0, 128, 2, &breakers);
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn dry_sampler_with_null_byte_in_seq_breakers_returns_error(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let breakers: Vec<&[u8]> = vec![b"hello\0world"];
-        let result = LlamaSampler::dry(fixture.model, 1.5, 2.0, 128, 2, breakers);
-
-        assert!(result.is_err());
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn grammar_returns_sampler_for_valid_grammar(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let sampler = LlamaSampler::grammar(fixture.model, "root ::= \"hello\"", "root");
-
-        assert!(sampler.is_ok());
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn grammar_lazy_returns_sampler_for_valid_grammar_with_triggers(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let trigger_words: Vec<&[u8]> = vec![b"function"];
-        let sampler = LlamaSampler::grammar_lazy(
-            fixture.model,
-            "root ::= \"hello\"",
-            "root",
-            trigger_words,
-            &[],
-        );
-
-        assert!(sampler.is_ok());
-
-        Ok(())
     }
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn grammar_lazy_patterns_returns_sampler_for_valid_grammar_with_patterns(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let patterns = vec!["\\{.*".to_owned()];
-        let sampler = LlamaSampler::grammar_lazy_patterns(
-            fixture.model,
-            "root ::= \"hello\"",
-            "root",
-            &patterns,
-            &[],
-        );
-
-        assert!(sampler.is_ok());
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn grammar_lazy_with_root_not_found_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let trigger_words: Vec<&[u8]> = vec![b"function"];
-        let result = LlamaSampler::grammar_lazy(
-            fixture.model,
-            "expr ::= \"hello\"",
-            "root",
-            trigger_words,
-            &[],
-        );
-
-        assert!(matches!(result, Err(GrammarError::RootNotFound)));
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn grammar_lazy_with_null_byte_in_trigger_word_returns_error(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let trigger_words: Vec<&[u8]> = vec![b"hel\0lo"];
-        let result = LlamaSampler::grammar_lazy(
-            fixture.model,
-            "root ::= \"hello\"",
-            "root",
-            trigger_words,
-            &[],
-        );
-
-        assert!(matches!(result, Err(GrammarError::TriggerWordNullBytes(_))));
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn grammar_lazy_patterns_with_root_not_found_returns_error(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let patterns = vec!["\\{.*".to_owned()];
-        let result = LlamaSampler::grammar_lazy_patterns(
-            fixture.model,
-            "expr ::= \"hello\"",
-            "root",
-            &patterns,
-            &[],
-        );
-
-        assert!(matches!(result, Err(GrammarError::RootNotFound)));
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn grammar_lazy_patterns_with_null_byte_in_pattern_returns_error(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let patterns = vec!["hel\0lo".to_owned()];
-        let result = LlamaSampler::grammar_lazy_patterns(
-            fixture.model,
-            "root ::= \"hello\"",
-            "root",
-            &patterns,
-            &[],
-        );
-
-        assert!(matches!(result, Err(GrammarError::GrammarNullBytes(_))));
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn grammar_lazy_patterns_with_malformed_regex_returns_invalid_trigger_pattern(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let patterns = vec!["[".to_owned()];
-        let result = LlamaSampler::grammar_lazy_patterns(
-            fixture.model,
-            "root ::= \"hello\"",
-            "root",
-            &patterns,
-            &[],
-        );
-
-        assert!(matches!(
-            result,
-            Err(GrammarError::InvalidTriggerPattern { .. }),
-        ));
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn llguidance_method_creates_sampler(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let result = LlamaSampler::llguidance(fixture.model, "regex", r"yes|no");
-
-        assert!(result.is_ok());
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn logit_bias_with_empty_biases_succeeds(_fixture: &LlamaFixture<'_>) -> Result<()> {
-        let result = LlamaSampler::logit_bias(0, &[]);
+    let _ = classifier.flush();
 
-        assert!(result.is_ok());
+    assert!(
+        sampled_count > 0,
+        "Should produce at least one token without grammar"
+    );
+    let usage = classifier.into_usage();
+    assert!(
+        usage.completion_tokens() >= sampled_count,
+        "completion_tokens ({}) must include the {sampled_count} non-EOG samples",
+        usage.completion_tokens()
+    );
 
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn dry_sampler_with_root_not_found_grammar_does_not_apply(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let breakers: Vec<&[u8]> = vec![b"\n"];
-        let _sampler = LlamaSampler::dry(fixture.model, 1.5, 2.0, 128, 2, &breakers);
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn accept_many_iterates_over_borrowed_tokens(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut sampler = LlamaSampler::chain_simple([LlamaSampler::greedy()]);
-        let tokens = vec![fixture.model.token_bos(), fixture.model.token_eos()];
+    Ok(())
+}
 
-        sampler.accept_many(&tokens)?;
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn dry_sampler_with_model(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let breakers: Vec<&[u8]> = vec![b"\n", b"\t"];
+    // `dry` is fallible: assert on its `Result` so a construction error fails the test.
+    assert!(LlamaSampler::dry(fixture.model, 1.5, 2.0, 128, 2, &breakers).is_ok());
+    Ok(())
+}
 
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn dry_sampler_with_null_byte_in_seq_breakers_returns_error(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    // A sequence breaker with an interior NUL byte must be rejected.
+    let seq_breakers: Vec<&[u8]> = vec![b"hello\0world"];
+    let outcome = LlamaSampler::dry(fixture.model, 1.5, 2.0, 128, 2, seq_breakers);
+    assert!(outcome.is_err());
+
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn with_tokens_returns_self_after_accepting_each_token(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let sampler = LlamaSampler::chain_simple([LlamaSampler::greedy()]);
-        let tokens = [fixture.model.token_bos(), fixture.model.token_eos()];
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn grammar_returns_sampler_for_valid_grammar(fixture: &LlamaFixture<'_>) -> Result<()> {
+    // The grammar defines `root`, which is also the requested start rule.
+    let built = LlamaSampler::grammar(fixture.model, "root ::= \"hello\"", "root");
+    assert!(built.is_ok());
+
+    Ok(())
+}
 
-        let _consumed = sampler.with_tokens(tokens.iter().copied())?;
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn grammar_lazy_returns_sampler_for_valid_grammar_with_triggers(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    // One literal trigger word; the trailing slice argument is left empty.
+    let triggers: Vec<&[u8]> = vec![b"function"];
+    let built = LlamaSampler::grammar_lazy(
+        fixture.model,
+        "root ::= \"hello\"",
+        "root",
+        triggers,
+        &[],
+    );
+    assert!(built.is_ok());
+
+    Ok(())
+}
 
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn grammar_lazy_patterns_returns_sampler_for_valid_grammar_with_patterns(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    // One well-formed trigger pattern; the trailing slice argument is left empty.
+    let trigger_patterns = vec![String::from("\\{.*")];
+    let built = LlamaSampler::grammar_lazy_patterns(
+        fixture.model,
+        "root ::= \"hello\"",
+        "root",
+        &trigger_patterns,
+        &[],
+    );
+    assert!(built.is_ok());
+
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn accept_consumes_a_single_token(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut sampler = LlamaSampler::chain_simple([LlamaSampler::greedy()]);
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn grammar_lazy_with_root_not_found_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+    // The grammar defines only `expr`; requesting `root` must yield `RootNotFound`.
+    let triggers: Vec<&[u8]> = vec![b"function"];
+    let outcome = LlamaSampler::grammar_lazy(
+        fixture.model,
+        "expr ::= \"hello\"",
+        "root",
+        triggers,
+        &[],
+    );
+    assert!(matches!(outcome, Err(GrammarError::RootNotFound)));
+
+    Ok(())
+}
 
-        sampler.accept(fixture.model.token_bos())?;
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn grammar_lazy_with_null_byte_in_trigger_word_returns_error(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    // The trigger word carries an interior NUL byte and must be reported as such.
+    let triggers: Vec<&[u8]> = vec![b"hel\0lo"];
+    let outcome = LlamaSampler::grammar_lazy(
+        fixture.model,
+        "root ::= \"hello\"",
+        "root",
+        triggers,
+        &[],
+    );
+    assert!(matches!(outcome, Err(GrammarError::TriggerWordNullBytes(_))));
+
+    Ok(())
+}
 
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn grammar_lazy_patterns_with_root_not_found_returns_error(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    // The grammar defines only `expr`; requesting `root` must yield `RootNotFound`.
+    let trigger_patterns = vec![String::from("\\{.*")];
+    let outcome = LlamaSampler::grammar_lazy_patterns(
+        fixture.model,
+        "expr ::= \"hello\"",
+        "root",
+        &trigger_patterns,
+        &[],
+    );
+    assert!(matches!(outcome, Err(GrammarError::RootNotFound)));
+
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn try_accept_returns_ok_for_a_valid_token(_fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut sampler = LlamaSampler::chain_simple([LlamaSampler::greedy()]);
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn grammar_lazy_patterns_with_null_byte_in_pattern_returns_error(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    // A NUL byte inside a trigger pattern is expected to surface as `GrammarNullBytes`.
+    let trigger_patterns = vec![String::from("hel\0lo")];
+    let outcome = LlamaSampler::grammar_lazy_patterns(
+        fixture.model,
+        "root ::= \"hello\"",
+        "root",
+        &trigger_patterns,
+        &[],
+    );
+    assert!(matches!(outcome, Err(GrammarError::GrammarNullBytes(_))));
+
+    Ok(())
+}
 
-        sampler.try_accept(LlamaToken::new(0))?;
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn grammar_lazy_patterns_with_malformed_regex_returns_invalid_trigger_pattern(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    // A lone `[` is the malformed pattern; the grammar itself is valid.
+    let trigger_patterns = vec![String::from("[")];
+    let outcome = LlamaSampler::grammar_lazy_patterns(
+        fixture.model,
+        "root ::= \"hello\"",
+        "root",
+        &trigger_patterns,
+        &[],
+    );
+    assert!(matches!(
+        outcome,
+        Err(GrammarError::InvalidTriggerPattern { .. }),
+    ));
+
+    Ok(())
+}
 
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn llguidance_method_creates_sampler(fixture: &LlamaFixture<'_>) -> Result<()> {
+    // Passes "regex" with the pattern `yes|no` and expects construction to succeed.
+    let built = LlamaSampler::llguidance(fixture.model, "regex", r"yes|no");
+    assert!(built.is_ok());
+
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn apply_runs_sampler_over_token_data_array(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = LlamaContext::from_model(
-            fixture.model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-        let tokens = fixture.model.str_to_token("Hi", AddBos::Always)?;
-        let mut batch = LlamaBatch::new(512, 1)?;
-        batch.add_sequence(&tokens, 0, false)?;
-        context.decode(&mut batch)?;
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn logit_bias_with_empty_biases_succeeds(_fixture: &LlamaFixture<'_>) -> Result<()> {
+    // An empty bias slice must be accepted; the fixture itself is not consulted.
+    let built = LlamaSampler::logit_bias(0, &[]);
+    assert!(built.is_ok());
+
+    Ok(())
+}
 
-        let mut data_array = context.token_data_array_ith(batch.n_tokens() - 1)?;
-        let sampler = LlamaSampler::greedy();
-        sampler.apply(&mut data_array);
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn dry_sampler_with_root_not_found_grammar_does_not_apply(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    let breakers: Vec<&[u8]> = vec![b"\n"];
+    // NOTE(review): no grammar is involved despite the name; this checks DRY construction.
+    assert!(LlamaSampler::dry(fixture.model, 1.5, 2.0, 128, 2, &breakers).is_ok());
+    Ok(())
+}
 
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn accept_many_iterates_over_borrowed_tokens(fixture: &LlamaFixture<'_>) -> Result<()> {
+    // Feed BOS then EOS by reference; any error propagates through `?`.
+    let mut chain = LlamaSampler::chain_simple([LlamaSampler::greedy()]);
+    let bos_then_eos = vec![fixture.model.token_bos(), fixture.model.token_eos()];
+    chain.accept_many(&bos_then_eos)?;
+
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 2048,
-        n_ubatch = 512,
-    )]
-    fn sample_returns_token_after_decode(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut context = LlamaContext::from_model(
-            fixture.model,
-            fixture.backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-        let tokens = fixture.model.str_to_token("Hello", AddBos::Always)?;
-        let mut batch = LlamaBatch::new(512, 1)?;
-        batch.add_sequence(&tokens, 0, false)?;
-        context.decode(&mut batch)?;
-        let mut sampler =
-            LlamaSampler::chain_simple([LlamaSampler::temp(0.8), LlamaSampler::greedy()]);
-        let result = sampler.sample(&context, batch.n_tokens() - 1);
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn with_tokens_returns_self_after_accepting_each_token(fixture: &LlamaFixture<'_>) -> Result<()> {
+    // Feed BOS then EOS through `with_tokens`; an error propagates via `?`.
+    let chain = LlamaSampler::chain_simple([LlamaSampler::greedy()]);
+    let bos_then_eos = [fixture.model.token_bos(), fixture.model.token_eos()];
+    let _primed = chain.with_tokens(bos_then_eos.iter().copied())?;
+
+    Ok(())
+}
 
-        assert!(result.is_ok());
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn accept_consumes_a_single_token(fixture: &LlamaFixture<'_>) -> Result<()> {
+    // A greedy-only chain is handed the model's BOS token; failure propagates via `?`.
+    let mut chain = LlamaSampler::chain_simple([LlamaSampler::greedy()]);
+    chain.accept(fixture.model.token_bos())?;
+
+    Ok(())
+}
 
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn try_accept_returns_ok_for_a_valid_token(_fixture: &LlamaFixture<'_>) -> Result<()> {
+    // Token id 0 is constructed directly; the fixture's model is not consulted.
+    let mut chain = LlamaSampler::chain_simple([LlamaSampler::greedy()]);
+    chain.try_accept(LlamaToken::new(0))?;
+
+    Ok(())
 }
 
-mod text_generation {
-    use std::io::Write;
-    use std::time::Duration;
-
-    use anyhow::Context as _;
-    use anyhow::Result;
-    use llama_cpp_bindings::context::LlamaContext;
-    use llama_cpp_bindings::ggml_time_us;
-    use llama_cpp_bindings::llama_batch::LlamaBatch;
-    use llama_cpp_bindings::model::AddBos;
-    use llama_cpp_bindings::model::LlamaChatMessage;
-    use llama_cpp_bindings::sampled_token::SampledToken;
-    use llama_cpp_bindings::sampling::LlamaSampler;
-    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn raw_prompt_completion_with_timing(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let model = fixture.model;
-        let backend = fixture.backend;
-        let mut ctx = LlamaContext::from_model(
-            model,
-            backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )
-        .with_context(|| "unable to create context")?;
-
-        let prompt = "Hello my name is";
-        let max_generated_tokens: i32 = 64;
-
-        let mut classifier = model.sampled_token_classifier();
-        let tokens_list = model
-            .str_to_token(prompt, AddBos::Always)
-            .with_context(|| format!("failed to tokenize {prompt}"))?;
-        let prompt_token_count = u64::try_from(tokens_list.len())?;
-
-        let mut decoder = encoding_rs::UTF_8.new_decoder();
-
-        for token in &tokens_list {
-            eprint!(
-                "{}",
-                model.token_to_piece(&SampledToken::Content(*token), &mut decoder, true, None)?
-            );
-        }
-        std::io::stderr().flush()?;
-
-        let mut batch = LlamaBatch::new(512, 1)?;
-        classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens_list, 0, false)?;
-
-        assert_eq!(classifier.pending_prompt_tokens(), prompt_token_count);
-        assert_eq!(classifier.usage().prompt_tokens, 0);
-
-        ctx.decode(&mut batch)
-            .with_context(|| "llama_decode() failed")?;
-
-        let promoted = classifier.commit_prompt_tokens();
-        assert_eq!(promoted, prompt_token_count);
-        assert_eq!(classifier.usage().prompt_tokens, prompt_token_count);
-
-        let mut sampler =
-            LlamaSampler::chain_simple([LlamaSampler::dist(1234), LlamaSampler::greedy()]);
-        let initial_position = batch.n_tokens();
-        let t_main_start = ggml_time_us();
-        let outcome = ClassifySampleLoop {
-            model,
-            classifier: &mut classifier,
-            sampler: &mut sampler,
-            context: &mut ctx,
-            batch: &mut batch,
-            initial_position,
-            max_generated_tokens,
-        }
-        .run()?;
-        let t_main_end = ggml_time_us();
-        let duration = Duration::from_micros(u64::try_from(t_main_end - t_main_start)?);
-        let total_observed =
-            outcome.observed_content + outcome.observed_reasoning + outcome.observed_undeterminable;
-
-        #[expect(
-            clippy::cast_precision_loss,
-            reason = "logged throughput tolerates f32 precision"
-        )]
-        let tokens_per_second = total_observed as f32 / duration.as_secs_f32();
-
-        eprintln!(
-            "\ndecoded {total_observed} tokens in {:.2} s, speed {tokens_per_second:.2} t/s",
-            duration.as_secs_f32(),
-        );
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn apply_runs_sampler_over_token_data_array(fixture: &LlamaFixture<'_>) -> Result<()> {
+    // Decode the tokenized prompt "Hi" in a single batch.
+    let params = (*fixture.context_params).into_llama_context_params();
+    let mut ctx = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+    let prompt_tokens = fixture.model.str_to_token("Hi", AddBos::Always)?;
+    let mut batch = LlamaBatch::new(512, 1)?;
+    batch.add_sequence(&prompt_tokens, 0, false)?;
+    ctx.decode(&mut batch)?;
+
+    // Run a standalone greedy sampler over the last index's token data; nothing is asserted.
+    let last_index = batch.n_tokens() - 1;
+    let mut candidates = ctx.token_data_array_ith(last_index)?;
+    let greedy = LlamaSampler::greedy();
+    greedy.apply(&mut candidates);
+
+    Ok(())
+}
 
-        assert!(
-            !outcome.generated_raw.is_empty(),
-            "model should generate at least one token"
-        );
-        assert_eq!(
-            outcome.observed_tool_call, 0,
-            "raw prompt without tool-call markers must not produce ToolCall tokens; \
-             outcome={outcome:?}"
-        );
-        assert!(
-            total_observed > 0,
-            "model must produce at least one classified token; outcome={outcome:?}"
-        );
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 2048,
+    n_ubatch = 512,
+)]
+fn sample_returns_token_after_decode(fixture: &LlamaFixture<'_>) -> Result<()> {
+    // Decode the tokenized prompt "Hello" in a single batch.
+    let params = (*fixture.context_params).into_llama_context_params();
+    let mut ctx = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+    let prompt_tokens = fixture.model.str_to_token("Hello", AddBos::Always)?;
+    let mut batch = LlamaBatch::new(512, 1)?;
+    batch.add_sequence(&prompt_tokens, 0, false)?;
+    ctx.decode(&mut batch)?;
+
+    // Sample at the batch's last index with a temp(0.8) -> greedy chain.
+    let stages = [LlamaSampler::temp(0.8), LlamaSampler::greedy()];
+    let mut chain = LlamaSampler::chain_simple(stages);
+    let last_index = batch.n_tokens() - 1;
+    assert!(chain.sample(&ctx, last_index).is_ok());
+
+    Ok(())
+}
 
-        let usage = classifier.into_usage();
-        assert_eq!(
-            usage.prompt_tokens, prompt_token_count,
-            "prompt_tokens must equal the tokenizer's prompt length"
-        );
-        assert_eq!(
-            usage.content_tokens, outcome.observed_content,
-            "content_tokens must equal observed Content variants"
-        );
-        assert_eq!(
-            usage.reasoning_tokens, outcome.observed_reasoning,
-            "reasoning_tokens must equal observed Reasoning variants"
-        );
-        assert_eq!(
-            usage.undeterminable_tokens, outcome.observed_undeterminable,
-            "undeterminable_tokens must equal observed Undeterminable variants"
-        );
-        assert_eq!(
-            usage.tool_call_tokens, outcome.observed_tool_call,
-            "tool_call_tokens must equal observed ToolCall variants"
-        );
-        assert_eq!(
-            usage.completion_tokens(),
-            total_observed,
-            "completion_tokens must equal Content + Reasoning + Undeterminable"
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn raw_prompt_completion_with_timing(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let model = fixture.model;
+    let backend = fixture.backend;
+    let mut ctx = LlamaContext::from_model(
+        model,
+        backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )
+    .with_context(|| "unable to create context")?;
+
+    let prompt = "Hello my name is";
+    let max_generated_tokens: i32 = 64;
+
+    let mut classifier = model.sampled_token_classifier();
+    let tokens_list = model
+        .str_to_token(prompt, AddBos::Always)
+        .with_context(|| format!("failed to tokenize {prompt}"))?;
+    let prompt_token_count = u64::try_from(tokens_list.len())?;
+
+    let mut decoder = encoding_rs::UTF_8.new_decoder();
+
+    for token in &tokens_list {
+        eprint!(
+            "{}",
+            model.token_to_piece(&SampledToken::Content(*token), &mut decoder, true, None)?
         );
-
-        Ok(())
     }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 2048,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 2048,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 2048,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 2048,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn chat_inference_produces_coherent_output(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let model = fixture.model;
-        let backend = fixture.backend;
-        let mut context = LlamaContext::from_model(
-            model,
-            backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-
-        let chat_template = model.chat_template(None)?;
-        let messages = vec![LlamaChatMessage::new(
-            "user".to_string(),
-            "Hello! How are you?".to_string(),
-        )?];
-        let prompt = model.apply_chat_template(&chat_template, &messages, true)?;
-
-        let mut classifier = model.sampled_token_classifier();
-        let tokens = model.str_to_token(&prompt, AddBos::Always)?;
-        let prompt_token_count = u64::try_from(tokens.len())?;
-
-        let mut batch = LlamaBatch::new(512, 1)?;
-        classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?;
-
-        assert_eq!(classifier.pending_prompt_tokens(), prompt_token_count);
-        assert_eq!(classifier.usage().prompt_tokens, 0);
-
-        context.decode(&mut batch)?;
-
-        let promoted = classifier.commit_prompt_tokens();
-        assert_eq!(promoted, prompt_token_count);
-
-        let mut sampler = LlamaSampler::greedy();
-        let initial_position = batch.n_tokens();
-        let outcome = ClassifySampleLoop {
-            model,
-            classifier: &mut classifier,
-            sampler: &mut sampler,
-            context: &mut context,
-            batch: &mut batch,
-            initial_position,
-            max_generated_tokens: 1024,
-        }
-        .run()?;
-
-        println!();
-
-        assert!(
-            !outcome.generated_raw.is_empty(),
-            "model should generate at least one token"
-        );
-        let total_observed =
-            outcome.observed_content + outcome.observed_reasoning + outcome.observed_undeterminable;
-        assert!(
-            total_observed > 0,
-            "model must produce at least one classified token; outcome={outcome:?}"
-        );
-        assert_eq!(
-            outcome.observed_tool_call, 0,
-            "chat without tool definitions must not produce ToolCall tokens; outcome={outcome:?}"
-        );
-
-        let usage = classifier.into_usage();
-
-        assert_eq!(
-            usage.prompt_tokens, prompt_token_count,
-            "prompt_tokens must equal the tokenizer's prompt length"
-        );
-        assert_eq!(
-            usage.content_tokens, outcome.observed_content,
-            "content_tokens must equal observed Content variants"
-        );
-        assert_eq!(
-            usage.reasoning_tokens, outcome.observed_reasoning,
-            "reasoning_tokens must equal observed Reasoning variants"
-        );
-        assert_eq!(
-            usage.undeterminable_tokens, outcome.observed_undeterminable,
-            "undeterminable_tokens must equal observed Undeterminable variants"
-        );
-        assert_eq!(
-            usage.completion_tokens(),
-            total_observed,
-            "completion_tokens must equal Content + Reasoning + Undeterminable"
-        );
-        assert_eq!(
-            usage.tool_call_tokens, outcome.observed_tool_call,
-            "tool_call_tokens must equal observed ToolCall variants"
-        );
-
-        Ok(())
+    std::io::stderr().flush()?;
+
+    let mut batch = LlamaBatch::new(512, 1)?;
+    classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens_list, 0, false)?;
+
+    assert_eq!(classifier.pending_prompt_tokens(), prompt_token_count);
+    assert_eq!(classifier.usage().prompt_tokens, 0);
+
+    ctx.decode(&mut batch)
+        .with_context(|| "llama_decode() failed")?;
+
+    let promoted = classifier.commit_prompt_tokens();
+    assert_eq!(promoted, prompt_token_count);
+    assert_eq!(classifier.usage().prompt_tokens, prompt_token_count);
+
+    let mut sampler =
+        LlamaSampler::chain_simple([LlamaSampler::dist(1234), LlamaSampler::greedy()]);
+    let initial_position = batch.n_tokens();
+    let t_main_start = ggml_time_us();
+    let outcome = ClassifySampleLoop {
+        model,
+        classifier: &mut classifier,
+        sampler: &mut sampler,
+        context: &mut ctx,
+        batch: &mut batch,
+        initial_position,
+        max_generated_tokens,
     }
+    .run()?;
+    let t_main_end = ggml_time_us();
+    let duration = Duration::from_micros(u64::try_from(t_main_end - t_main_start)?);
+    let total_observed =
+        outcome.observed_content + outcome.observed_reasoning + outcome.observed_undeterminable;
+
+    #[expect(
+        clippy::cast_precision_loss,
+        reason = "logged throughput tolerates f32 precision"
+    )]
+    let tokens_per_second = total_observed as f32 / duration.as_secs_f32();
+
+    eprintln!(
+        "\ndecoded {total_observed} tokens in {:.2} s, speed {tokens_per_second:.2} t/s",
+        duration.as_secs_f32(),
+    );
+
+    assert!(
+        !outcome.generated_raw.is_empty(),
+        "model should generate at least one token"
+    );
+    assert_eq!(
+        outcome.observed_tool_call, 0,
+        "raw prompt without tool-call markers must not produce ToolCall tokens; \
+         outcome={outcome:?}"
+    );
+    assert!(
+        total_observed > 0,
+        "model must produce at least one classified token; outcome={outcome:?}"
+    );
+
+    let usage = classifier.into_usage();
+    assert_eq!(
+        usage.prompt_tokens, prompt_token_count,
+        "prompt_tokens must equal the tokenizer's prompt length"
+    );
+    assert_eq!(
+        usage.content_tokens, outcome.observed_content,
+        "content_tokens must equal observed Content variants"
+    );
+    assert_eq!(
+        usage.reasoning_tokens, outcome.observed_reasoning,
+        "reasoning_tokens must equal observed Reasoning variants"
+    );
+    assert_eq!(
+        usage.undeterminable_tokens, outcome.observed_undeterminable,
+        "undeterminable_tokens must equal observed Undeterminable variants"
+    );
+    assert_eq!(
+        usage.tool_call_tokens, outcome.observed_tool_call,
+        "tool_call_tokens must equal observed ToolCall variants"
+    );
+    assert_eq!(
+        usage.completion_tokens(),
+        total_observed,
+        "completion_tokens must equal Content + Reasoning + Undeterminable"
+    );
+
+    Ok(())
 }
 
-mod constrained_decoding {
-    use std::io::Write;
-
-    use anyhow::Result;
-    use llama_cpp_bindings::context::LlamaContext;
-    use llama_cpp_bindings::llama_batch::LlamaBatch;
-    use llama_cpp_bindings::model::AddBos;
-    use llama_cpp_bindings::sampled_token::SampledToken;
-    use llama_cpp_bindings::sampling::LlamaSampler;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn json_schema_constrains_output(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let model = fixture.model;
-        let backend = fixture.backend;
-
-        let prompt = "The weather in Paris is sunny and 22 degrees. Extract as JSON:\n";
-
-        let mut ctx = LlamaContext::from_model(
-            model,
-            backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-
-        let tokens_list = model.str_to_token(prompt, AddBos::Always)?;
-
-        let mut batch = LlamaBatch::new(512, 1)?;
-        let last_index = i32::try_from(tokens_list.len())? - 1;
-
-        for (index, token) in (0_i32..).zip(&tokens_list) {
-            batch.add(
-                &SampledToken::Content(*token),
-                index,
-                &[0],
-                index == last_index,
-            )?;
-        }
-
-        ctx.decode(&mut batch)?;
-
-        let schema = r#"{
-      "type": "object",
-      "properties": {
-        "city": { "type": "string" },
-        "temperature": { "type": "number" }
-      },
-      "required": ["city", "temperature"]
-    }"#;
-
-        let llg_sampler = LlamaSampler::llguidance(model, "json", schema)?;
-        let mut sampler = LlamaSampler::chain_simple([llg_sampler, LlamaSampler::greedy()]);
-
-        let mut n_cur = batch.n_tokens();
-        let mut decoder = encoding_rs::UTF_8.new_decoder();
-        let mut generated = String::new();
-
-        while n_cur <= 128 {
-            let token = SampledToken::Content(sampler.sample(&ctx, batch.n_tokens() - 1)?);
-
-            if model.is_eog_token(&token) {
-                break;
-            }
-
-            let output_string = model.token_to_piece(&token, &mut decoder, true, None)?;
-            generated.push_str(&output_string);
-            print!("{output_string}");
-            std::io::stdout().flush()?;
-
-            batch.clear();
-            batch.add(&token, n_cur, &[0], true)?;
-            n_cur += 1;
-            ctx.decode(&mut batch)?;
-        }
-
-        println!();
-
-        let parsed = serde_json::Deserializer::from_str(&generated)
-            .into_iter::<serde_json::Value>()
-            .next()
-            .ok_or_else(|| anyhow::anyhow!("model produced no JSON value"))??;
-
-        assert!(parsed.get("city").is_some());
-        assert!(parsed.get("temperature").is_some());
-
-        Ok(())
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 2048,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 2048,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 2048,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 2048,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn chat_inference_produces_coherent_output(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let model = fixture.model;
+    let backend = fixture.backend;
+    let mut context = LlamaContext::from_model(
+        model,
+        backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+
+    let chat_template = model.chat_template(None)?;
+    let messages = vec![LlamaChatMessage::new(
+        "user".to_string(),
+        "Hello! How are you?".to_string(),
+    )?];
+    let prompt = model.apply_chat_template(&chat_template, &messages, true)?;
+
+    let mut classifier = model.sampled_token_classifier();
+    let tokens = model.str_to_token(&prompt, AddBos::Always)?;
+    let prompt_token_count = u64::try_from(tokens.len())?;
+
+    let mut batch = LlamaBatch::new(512, 1)?;
+    classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?;
+
+    assert_eq!(classifier.pending_prompt_tokens(), prompt_token_count);
+    assert_eq!(classifier.usage().prompt_tokens, 0);
+
+    context.decode(&mut batch)?;
+
+    let promoted = classifier.commit_prompt_tokens();
+    assert_eq!(promoted, prompt_token_count);
+
+    let mut sampler = LlamaSampler::greedy();
+    let initial_position = batch.n_tokens();
+    let outcome = ClassifySampleLoop {
+        model,
+        classifier: &mut classifier,
+        sampler: &mut sampler,
+        context: &mut context,
+        batch: &mut batch,
+        initial_position,
+        max_generated_tokens: 1024,
     }
+    .run()?;
+
+    println!();
+
+    assert!(
+        !outcome.generated_raw.is_empty(),
+        "model should generate at least one token"
+    );
+    let total_observed =
+        outcome.observed_content + outcome.observed_reasoning + outcome.observed_undeterminable;
+    assert!(
+        total_observed > 0,
+        "model must produce at least one classified token; outcome={outcome:?}"
+    );
+    assert_eq!(
+        outcome.observed_tool_call, 0,
+        "chat without tool definitions must not produce ToolCall tokens; outcome={outcome:?}"
+    );
+
+    let usage = classifier.into_usage();
+
+    assert_eq!(
+        usage.prompt_tokens, prompt_token_count,
+        "prompt_tokens must equal the tokenizer's prompt length"
+    );
+    assert_eq!(
+        usage.content_tokens, outcome.observed_content,
+        "content_tokens must equal observed Content variants"
+    );
+    assert_eq!(
+        usage.reasoning_tokens, outcome.observed_reasoning,
+        "reasoning_tokens must equal observed Reasoning variants"
+    );
+    assert_eq!(
+        usage.undeterminable_tokens, outcome.observed_undeterminable,
+        "undeterminable_tokens must equal observed Undeterminable variants"
+    );
+    assert_eq!(
+        usage.completion_tokens(),
+        total_observed,
+        "completion_tokens must equal Content + Reasoning + Undeterminable"
+    );
+    assert_eq!(
+        usage.tool_call_tokens, outcome.observed_tool_call,
+        "tool_call_tokens must equal observed ToolCall variants"
+    );
+
+    Ok(())
 }
 
-mod llguidance {
-    #![expect(
-        clippy::unnecessary_wraps,
-        reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-    )]
-
-    use std::ffi::CStr;
-    use std::sync::Arc;
-
-    use anyhow::Result;
-    use llama_cpp_bindings::context::LlamaContext;
-    use llama_cpp_bindings::llama_batch::LlamaBatch;
-    use llama_cpp_bindings::llguidance_sampler::create_llg_sampler;
-    use llama_cpp_bindings::model::AddBos;
-    use llama_cpp_bindings::sampling::LlamaSampler;
-    use llama_cpp_bindings::token::LlamaToken;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
-    const JSON_SCHEMA: &str =
-        r#"{"type":"object","properties":{"answer":{"type":"string"}},"required":["answer"]}"#;
-    const REGEX_GRAMMAR: &str = r"yes|no";
-    const LARK_GRAMMAR: &str = r#"start: "yes" | "no""#;
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn creates_sampler_with_valid_json_schema(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let sampler = create_llg_sampler(fixture.model, "json", JSON_SCHEMA)?;
-
-        assert!(!sampler.sampler.is_null());
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn creates_sampler_with_valid_regex_grammar(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?;
-
-        assert!(!sampler.sampler.is_null());
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn creates_sampler_with_valid_lark_grammar(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let sampler = create_llg_sampler(fixture.model, "lark", LARK_GRAMMAR)?;
-
-        assert!(!sampler.sampler.is_null());
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn returns_error_for_unknown_grammar_kind(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let result = create_llg_sampler(fixture.model, "not_a_real_kind", "anything");
-
-        assert!(result.is_err());
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn returns_error_for_malformed_json_schema(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let result = create_llg_sampler(fixture.model, "json", "{this is not valid json");
-
-        assert!(result.is_err());
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn returns_error_for_malformed_regex(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let result = create_llg_sampler(fixture.model, "regex", "[invalid");
-
-        assert!(result.is_err());
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn name_callback_returns_llguidance(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?;
-
-        let name_ptr = unsafe { llama_cpp_bindings_sys::llama_sampler_name(sampler.sampler) };
-        assert!(!name_ptr.is_null());
-        let name = unsafe { CStr::from_ptr(name_ptr) }.to_str()?;
-
-        assert_eq!(name, "llguidance");
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn clone_via_ffi_creates_independent_sampler(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?;
-
-        let cloned = unsafe { llama_cpp_bindings_sys::llama_sampler_clone(sampler.sampler) };
-
-        assert!(!cloned.is_null());
-
-        unsafe { llama_cpp_bindings_sys::llama_sampler_free(cloned) };
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn samples_token_constrained_by_grammar(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let model = fixture.model;
-        let backend = fixture.backend;
-        let mut context = LlamaContext::from_model(
-            model,
-            backend,
-            (*fixture.context_params).into_llama_context_params(),
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn json_schema_constrains_output(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let model = fixture.model;
+    let backend = fixture.backend;
+
+    let prompt = "The weather in Paris is sunny and 22 degrees. Extract as JSON:\n";
+
+    let mut ctx = LlamaContext::from_model(
+        model,
+        backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+
+    let tokens_list = model.str_to_token(prompt, AddBos::Always)?;
+
+    let mut batch = LlamaBatch::new(512, 1)?;
+    let last_index = i32::try_from(tokens_list.len())? - 1;
+
+    for (index, token) in (0_i32..).zip(&tokens_list) {
+        batch.add(
+            &SampledToken::Content(*token),
+            index,
+            &[0],
+            index == last_index,
         )?;
-
-        let prompt = "Answer yes or no:";
-        let tokens = model.str_to_token(prompt, AddBos::Always)?;
-        let mut batch = LlamaBatch::new(512, 1)?;
-        batch.add_sequence(&tokens, 0, false)?;
-        context.decode(&mut batch)?;
-
-        let llg_sampler = create_llg_sampler(model, "regex", REGEX_GRAMMAR)?;
-        let mut chain = LlamaSampler::chain_simple([llg_sampler, LlamaSampler::greedy()]);
-
-        let token = chain.sample(&context, batch.n_tokens() - 1)?;
-        chain.accept(token)?;
-
-        Ok(())
     }
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn accept_invalid_token_id_does_not_panic(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?;
-
-        let huge_token = LlamaToken(i32::MAX - 1);
-        let _ = sampler.accept(huge_token);
+    ctx.decode(&mut batch)?;
 
-        Ok(())
-    }
+    let schema = r#"{
+  "type": "object",
+  "properties": {
+    "city": { "type": "string" },
+    "temperature": { "type": "number" }
+  },
+  "required": ["city", "temperature"]
+}"#;
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn approximate_tok_env_returns_same_arc_across_calls(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let first = fixture.model.approximate_tok_env();
-        let second = fixture.model.approximate_tok_env();
+    let llg_sampler = LlamaSampler::llguidance(model, "json", schema)?;
+    let mut sampler = LlamaSampler::chain_simple([llg_sampler, LlamaSampler::greedy()]);
 
-        assert!(Arc::ptr_eq(&first, &second));
+    let mut n_cur = batch.n_tokens();
+    let mut decoder = encoding_rs::UTF_8.new_decoder();
+    let mut generated = String::new();
 
-        Ok(())
-    }
+    while n_cur <= 128 {
+        let token = SampledToken::Content(sampler.sample(&ctx, batch.n_tokens() - 1)?);
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn approximate_tok_env_drives_consistent_grammar_constraint(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let first = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?;
-        let second = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?;
+        if model.is_eog_token(&token) {
+            break;
+        }
 
-        assert!(!first.sampler.is_null());
-        assert!(!second.sampler.is_null());
+        let output_string = model.token_to_piece(&token, &mut decoder, true, None)?;
+        generated.push_str(&output_string);
+        print!("{output_string}");
+        std::io::stdout().flush()?;
 
-        Ok(())
+        batch.clear();
+        batch.add(&token, n_cur, &[0], true)?;
+        n_cur += 1;
+        ctx.decode(&mut batch)?;
     }
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn apply_through_chain_during_sample_does_not_panic(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let model = fixture.model;
-        let backend = fixture.backend;
-        let mut context = LlamaContext::from_model(
-            model,
-            backend,
-            (*fixture.context_params).into_llama_context_params(),
-        )?;
-
-        let tokens = model.str_to_token("Answer:", AddBos::Always)?;
-        let mut batch = LlamaBatch::new(512, 1)?;
-        batch.add_sequence(&tokens, 0, false)?;
-        context.decode(&mut batch)?;
+    println!();
 
-        let llg_sampler = create_llg_sampler(model, "regex", REGEX_GRAMMAR)?;
-        let mut chain = LlamaSampler::chain_simple([llg_sampler, LlamaSampler::greedy()]);
-        let _ = chain.sample(&context, batch.n_tokens() - 1);
+    let parsed = serde_json::Deserializer::from_str(&generated)
+        .into_iter::<serde_json::Value>()
+        .next()
+        .ok_or_else(|| anyhow::anyhow!("model produced no JSON value"))??;
 
-        Ok(())
-    }
+    assert!(parsed.get("city").is_some());
+    assert!(parsed.get("temperature").is_some());
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 512,
-        n_ubatch = 128,
-    )]
-    fn reset_clears_sampler_state(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let mut sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?;
-        let huge_token = LlamaToken(i32::MAX - 1);
-        let _ = sampler.accept(huge_token);
-        sampler.reset();
-        let after = sampler.accept(LlamaToken(0));
-        assert!(
-            after.is_ok() || after.is_err(),
-            "after reset, sampler.accept must return Ok or Err (not panic)"
-        );
-        Ok(())
-    }
+    Ok(())
 }
 
-mod sampled_token_classifier_markers {
-    #![expect(
-        clippy::unnecessary_wraps,
-        reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-    )]
-
-    use anyhow::Result;
-    use llama_cpp_bindings::SampledToken;
-    use llama_cpp_bindings::llama_batch::LlamaBatch;
-    use llama_cpp_bindings::sampled_token_classifier::SampledTokenClassifier;
-    use llama_cpp_bindings::sampled_token_section::SampledTokenSection;
-    use llama_cpp_bindings::streaming_markers::StreamingMarkers;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn classifier_starts_in_pending_section_for_default_fixture(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let classifier = fixture.model.sampled_token_classifier();
-
-        assert_eq!(classifier.current_section(), SampledTokenSection::Pending);
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn classifier_construction_is_idempotent_across_calls(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let first = fixture.model.sampled_token_classifier();
-        let second = fixture.model.sampled_token_classifier();
-
-        assert_eq!(first.current_section(), second.current_section());
-        assert_eq!(first.usage(), second.usage());
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn ingest_with_no_markers_emits_undeterminable_with_visible_and_raw_piece(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let model = fixture.model;
-        let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default());
-
-        let outcomes = classifier.ingest(model.token_bos());
-
-        assert_eq!(outcomes.len(), 1);
-        let outcome = &outcomes[0];
-        assert!(matches!(
-            outcome.sampled_token,
-            SampledToken::Undeterminable(_)
-        ));
-        assert_eq!(outcome.visible_piece, outcome.raw_piece);
-        assert_eq!(classifier.usage().undeterminable_tokens, 1);
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn ingest_with_no_markers_decodes_each_token_independently(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let model = fixture.model;
-        let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default());
-
-        let _ = classifier.ingest(model.token_bos());
-        let _ = classifier.ingest(model.token_eos());
-
-        assert_eq!(classifier.usage().undeterminable_tokens, 2);
-        Ok(())
-    }
+const JSON_SCHEMA: &str =
+    r#"{"type":"object","properties":{"answer":{"type":"string"}},"required":["answer"]}"#;
+const REGEX_GRAMMAR: &str = r"yes|no";
+const LARK_GRAMMAR: &str = r#"start: "yes" | "no""#;
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn creates_sampler_with_valid_json_schema(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let sampler = create_llg_sampler(fixture.model, "json", JSON_SCHEMA)?;
+
+    assert!(!sampler.sampler.is_null());
+
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn ingest_prompt_token_with_no_markers_is_a_noop(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let model = fixture.model;
-        let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default());
-        let usage_before = *classifier.usage();
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn creates_sampler_with_valid_regex_grammar(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?;
+
+    assert!(!sampler.sampler.is_null());
+
+    Ok(())
+}
 
-        classifier.ingest_prompt_token(model.token_bos());
-        classifier.ingest_prompt_tokens(&[model.token_eos(), model.token_nl()]);
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn creates_sampler_with_valid_lark_grammar(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let sampler = create_llg_sampler(fixture.model, "lark", LARK_GRAMMAR)?;
+
+    assert!(!sampler.sampler.is_null());
+
+    Ok(())
+}
 
-        assert_eq!(*classifier.usage(), usage_before);
-        assert_eq!(classifier.current_section(), SampledTokenSection::Pending);
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn returns_error_for_unknown_grammar_kind(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let result = create_llg_sampler(fixture.model, "not_a_real_kind", "anything");
+
+    assert!(result.is_err());
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn feed_prompt_to_batch_increments_pending_prompt_tokens(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let model = fixture.model;
-        let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default());
-        let mut batch = LlamaBatch::new(8, 1)?;
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn returns_error_for_malformed_json_schema(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let result = create_llg_sampler(fixture.model, "json", "{this is not valid json");
+
+    assert!(result.is_err());
+    Ok(())
+}
 
-        classifier.feed_prompt_to_batch(&mut batch, model.token_bos(), 0, &[0], false)?;
-        classifier.feed_prompt_to_batch(&mut batch, model.token_eos(), 1, &[0], false)?;
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn returns_error_for_malformed_regex(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let result = create_llg_sampler(fixture.model, "regex", "[invalid");
+
+    assert!(result.is_err());
+    Ok(())
+}
 
-        assert_eq!(classifier.pending_prompt_tokens(), 2);
-        assert_eq!(batch.n_tokens(), 2);
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn name_callback_returns_llguidance(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?;
+
+    let name_ptr = unsafe { llama_cpp_bindings_sys::llama_sampler_name(sampler.sampler) };
+    assert!(!name_ptr.is_null());
+    let name = unsafe { CStr::from_ptr(name_ptr) }.to_str()?;
+
+    assert_eq!(name, "llguidance");
+
+    Ok(())
+}
 
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn clone_via_ffi_creates_independent_sampler(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?;
+
+    let cloned = unsafe { llama_cpp_bindings_sys::llama_sampler_clone(sampler.sampler) };
+
+    assert!(!cloned.is_null());
+
+    unsafe { llama_cpp_bindings_sys::llama_sampler_free(cloned) };
+
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn feed_prompt_sequence_to_batch_stages_all_tokens(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let model = fixture.model;
-        let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default());
-        let mut batch = LlamaBatch::new(8, 1)?;
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn samples_token_constrained_by_grammar(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let model = fixture.model;
+    let backend = fixture.backend;
+    let mut context = LlamaContext::from_model(
+        model,
+        backend,
+        (*fixture.context_params).into_llama_context_params(),
+    )?;
+
+    let prompt = "Answer yes or no:";
+    let tokens = model.str_to_token(prompt, AddBos::Always)?;
+    let mut batch = LlamaBatch::new(512, 1)?;
+    batch.add_sequence(&tokens, 0, false)?;
+    context.decode(&mut batch)?;
+
+    let llg_sampler = create_llg_sampler(model, "regex", REGEX_GRAMMAR)?;
+    let mut chain = LlamaSampler::chain_simple([llg_sampler, LlamaSampler::greedy()]);
+
+    let token = chain.sample(&context, batch.n_tokens() - 1)?;
+    chain.accept(token)?;
+
+    Ok(())
+}
 
-        let tokens = vec![model.token_bos(), model.token_eos(), model.token_nl()];
-        classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?;
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn accept_invalid_token_id_does_not_panic(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let mut grammar = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?;
+
+    // The outcome is dropped on purpose: the test only requires that `accept` returns.
+    let _ = grammar.accept(LlamaToken(i32::MAX - 1));
+
+    Ok(())
+}
 
-        assert_eq!(classifier.pending_prompt_tokens(), 3);
-        assert_eq!(batch.n_tokens(), 3);
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn approximate_tok_env_returns_same_arc_across_calls(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let model = fixture.model;
+    let (initial, repeated) = (model.approximate_tok_env(), model.approximate_tok_env());
+    // Pointer equality: both calls must hand back the same allocation.
+    assert!(Arc::ptr_eq(&initial, &repeated));
+
+    Ok(())
+}
 
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn approximate_tok_env_drives_consistent_grammar_constraint(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    // Build two samplers from the same model and grammar before checking either one.
+    let earlier = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?;
+    let later = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?;
+    assert!(!earlier.sampler.is_null());
+    assert!(!later.sampler.is_null());
+
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn commit_prompt_tokens_promotes_pending_count_to_usage_and_clears(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let model = fixture.model;
-        let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default());
-        let mut batch = LlamaBatch::new(8, 1)?;
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn apply_through_chain_during_sample_does_not_panic(fixture: &LlamaFixture<'_>) -> Result<()> {
+    // Build a context from the fixture's model, backend and context parameters.
+    let ctx_params = (*fixture.context_params).into_llama_context_params();
+    let mut ctx = LlamaContext::from_model(fixture.model, fixture.backend, ctx_params)?;
+
+    // Tokenize the prompt, place it in a batch and decode that batch.
+    let prompt_tokens = fixture.model.str_to_token("Answer:", AddBos::Always)?;
+    let mut prompt_batch = LlamaBatch::new(512, 1)?;
+    prompt_batch.add_sequence(&prompt_tokens, 0, false)?;
+    ctx.decode(&mut prompt_batch)?;
+
+    // The grammar sampler is listed before the greedy one in the chain.
+    let grammar = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?;
+    let mut samplers = LlamaSampler::chain_simple([grammar, LlamaSampler::greedy()]);
+    // The sampling result is dropped on purpose: returning at all is what is tested.
+    let last_index = prompt_batch.n_tokens() - 1;
+    let _ = samplers.sample(&ctx, last_index);
+
+    Ok(())
+}
 
-        classifier.feed_prompt_to_batch(&mut batch, model.token_bos(), 0, &[0], false)?;
-        classifier.feed_prompt_to_batch(&mut batch, model.token_eos(), 1, &[0], false)?;
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 512,
+    n_ubatch = 128,
+)]
+fn reset_clears_sampler_state(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let mut sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?;
+    // Both `accept` results are discarded: either `Ok` or `Err` is acceptable, the
+    // test only requires that no call panics before or after `reset`.
+    // NOTE(review): nothing here observes that `reset` cleared state, despite the
+    // test name — TODO add a real post-reset expectation once one is defined.
+    let huge_token = LlamaToken(i32::MAX - 1);
+    let _ = sampler.accept(huge_token);
+    sampler.reset();
+    let _ = sampler.accept(LlamaToken(0));
+    Ok(())
+}
 
-        let promoted = classifier.commit_prompt_tokens();
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn classifier_starts_in_pending_section_for_default_fixture(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    let fresh = fixture.model.sampled_token_classifier();
+    // No tokens are ingested: this inspects the section of a just-obtained classifier.
+    assert_eq!(fresh.current_section(), SampledTokenSection::Pending);
+    Ok(())
+}
 
-        assert_eq!(promoted, 2);
-        assert_eq!(classifier.pending_prompt_tokens(), 0);
-        assert_eq!(classifier.usage().prompt_tokens, 2);
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn classifier_construction_is_idempotent_across_calls(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let model = fixture.model;
+    let (earlier, later) = (model.sampled_token_classifier(), model.sampled_token_classifier());
+    // Classifiers obtained from two separate calls must report the same section and usage.
+    assert_eq!(earlier.current_section(), later.current_section());
+    assert_eq!(earlier.usage(), later.usage());
+    Ok(())
+}
 
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn ingest_with_no_markers_emits_undeterminable_with_visible_and_raw_piece(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    // The classifier is built with default streaming markers.
+    let mut under_test = SampledTokenClassifier::new(fixture.model, StreamingMarkers::default());
+
+    let emitted = under_test.ingest(fixture.model.token_bos());
+
+    // Exactly one outcome is expected for the single ingested token.
+    assert_eq!(emitted.len(), 1);
+    let only = &emitted[0];
+    assert!(matches!(only.sampled_token, SampledToken::Undeterminable(_)));
+
+    // Visible and raw pieces must match, and usage must count one undeterminable token.
+    assert_eq!(only.visible_piece, only.raw_piece);
+    assert_eq!(under_test.usage().undeterminable_tokens, 1);
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn discard_pending_prompt_tokens_clears_count_without_recording_usage(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let model = fixture.model;
-        let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default());
-        let mut batch = LlamaBatch::new(8, 1)?;
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn ingest_with_no_markers_decodes_each_token_independently(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    let mut under_test = SampledTokenClassifier::new(fixture.model, StreamingMarkers::default());
+
+    // Outcomes are discarded; only the usage counter is inspected afterwards.
+    let _ = under_test.ingest(fixture.model.token_bos());
+    let _ = under_test.ingest(fixture.model.token_eos());
+
+    assert_eq!(under_test.usage().undeterminable_tokens, 2);
+    Ok(())
+}
 
-        classifier.feed_prompt_to_batch(&mut batch, model.token_bos(), 0, &[0], false)?;
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn ingest_prompt_token_with_no_markers_is_a_noop(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let mut under_test = SampledTokenClassifier::new(fixture.model, StreamingMarkers::default());
+    // Snapshot usage up front so the post-ingest value can be compared against it.
+    let baseline = *under_test.usage();
+
+    under_test.ingest_prompt_token(fixture.model.token_bos());
+    let rest = [fixture.model.token_eos(), fixture.model.token_nl()];
+    under_test.ingest_prompt_tokens(&rest);
+    assert_eq!(*under_test.usage(), baseline);
+    assert_eq!(under_test.current_section(), SampledTokenSection::Pending);
+    Ok(())
+}
 
-        let discarded = classifier.discard_pending_prompt_tokens();
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn feed_prompt_to_batch_increments_pending_prompt_tokens(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let mut under_test = SampledTokenClassifier::new(fixture.model, StreamingMarkers::default());
+    let mut staged = LlamaBatch::new(8, 1)?;
+
+    // Stage two prompt tokens through the classifier into the batch.
+    under_test.feed_prompt_to_batch(&mut staged, fixture.model.token_bos(), 0, &[0], false)?;
+    under_test.feed_prompt_to_batch(&mut staged, fixture.model.token_eos(), 1, &[0], false)?;
+
+    // Both the classifier's pending counter and the batch must account for them.
+    assert_eq!(under_test.pending_prompt_tokens(), 2);
+    assert_eq!(staged.n_tokens(), 2);
+    Ok(())
+}
 
-        assert_eq!(discarded, 1);
-        assert_eq!(classifier.pending_prompt_tokens(), 0);
-        assert_eq!(classifier.usage().prompt_tokens, 0);
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn feed_prompt_sequence_to_batch_stages_all_tokens(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let model = fixture.model;
+    let mut under_test = SampledTokenClassifier::new(model, StreamingMarkers::default());
+    let mut staged = LlamaBatch::new(8, 1)?;
+
+    // Three tokens go in with a single call; both counters below must then read 3.
+    let prompt = vec![model.token_bos(), model.token_eos(), model.token_nl()];
+    under_test.feed_prompt_sequence_to_batch(&mut staged, &prompt, 0, false)?;
+
+    assert_eq!(under_test.pending_prompt_tokens(), 3);
+    assert_eq!(staged.n_tokens(), 3);
+    Ok(())
+}
 
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn commit_prompt_tokens_promotes_pending_count_to_usage_and_clears(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    let mut under_test = SampledTokenClassifier::new(fixture.model, StreamingMarkers::default());
+    let mut staged = LlamaBatch::new(8, 1)?;
+
+    // Stage two prompt tokens so there is a pending count to commit.
+    under_test.feed_prompt_to_batch(&mut staged, fixture.model.token_bos(), 0, &[0], false)?;
+    under_test.feed_prompt_to_batch(&mut staged, fixture.model.token_eos(), 1, &[0], false)?;
+
+    // Committing must report the staged count, zero the pending counter and add to usage.
+    let committed = under_test.commit_prompt_tokens();
+    assert_eq!(committed, 2);
+    assert_eq!(under_test.pending_prompt_tokens(), 0);
+    assert_eq!(under_test.usage().prompt_tokens, 2);
+
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn diagnose_tool_call_synthetic_renders_returns_a_pair_of_strings(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let (left, right) = fixture.model.diagnose_tool_call_synthetic_renders()?;
-        let _ = left;
-        let _ = right;
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn discard_pending_prompt_tokens_clears_count_without_recording_usage(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    let mut under_test = SampledTokenClassifier::new(fixture.model, StreamingMarkers::default());
+    let mut staged = LlamaBatch::new(8, 1)?;
+
+    // Stage a single prompt token so there is a pending count to discard.
+    under_test.feed_prompt_to_batch(&mut staged, fixture.model.token_bos(), 0, &[0], false)?;
+
+    // Discarding must report the staged count and leave prompt usage at zero.
+    let dropped = under_test.discard_pending_prompt_tokens();
+    assert_eq!(dropped, 1);
+    assert_eq!(under_test.pending_prompt_tokens(), 0);
+    assert_eq!(under_test.usage().prompt_tokens, 0);
+
+    Ok(())
 }
 
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn diagnose_tool_call_synthetic_renders_returns_a_pair_of_strings(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    // Only success and the two-element tuple shape are checked; the values are unused.
+    // The underscore bindings keep both halves alive until the function returns.
+    let (_first, _second) = fixture.model.diagnose_tool_call_synthetic_renders()?;
+    Ok(())
+}
 llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/vocabulary_and_metadata.rs b/llama-cpp-bindings-tests/tests/vocabulary_and_metadata.rs
index 7b26c7ee..fc3624f9 100644
--- a/llama-cpp-bindings-tests/tests/vocabulary_and_metadata.rs
+++ b/llama-cpp-bindings-tests/tests/vocabulary_and_metadata.rs
@@ -1,1978 +1,1889 @@
+#![expect(
+    clippy::unnecessary_wraps,
+    reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
+)]
+
+use std::ffi::CString;
+use std::num::NonZeroU16;
+use std::pin::pin;
+
+use anyhow::Result;
+use llama_cpp_bindings::SampledToken;
+use llama_cpp_bindings::context::params::LlamaContextParams;
+use llama_cpp_bindings::max_devices;
+use llama_cpp_bindings::model::AddBos;
+use llama_cpp_bindings::model::params::LlamaModelParams;
+use llama_cpp_test_harness::LlamaFixture;
+use llama_cpp_test_harness::llama_test;
 use llama_cpp_test_harness::llama_tests_main;
 
-mod model_properties {
-    #![expect(
-        clippy::unnecessary_wraps,
-        reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-    )]
-
-    use anyhow::Result;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn model_loads_with_valid_metadata(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let model = fixture.model;
-
-        assert!(model.n_vocab() > 0);
-        assert!(model.n_embd() > 0);
-        assert!(model.n_params() > 0);
-        assert!(model.n_ctx_train()? > 0);
-
-        Ok(())
-    }
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn n_layer_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
-        assert!(fixture.model.n_layer()? > 0);
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn model_loads_with_valid_metadata(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let model = fixture.model;
+
+    assert!(model.n_vocab() > 0);
+    assert!(model.n_embd() > 0);
+    assert!(model.n_params() > 0);
+    assert!(model.n_ctx_train()? > 0);
+
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn n_head_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
-        assert!(fixture.model.n_head()? > 0);
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn n_layer_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
+    assert!(fixture.model.n_layer()? > 0);
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn n_head_kv_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
-        assert!(fixture.model.n_head_kv()? > 0);
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn n_head_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
+    assert!(fixture.model.n_head()? > 0);
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn model_size_returns_nonzero(fixture: &LlamaFixture<'_>) -> Result<()> {
-        assert!(fixture.model.size() > 0);
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn n_head_kv_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
+    assert!(fixture.model.n_head_kv()? > 0);
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn is_recurrent_returns_false_for_transformer(fixture: &LlamaFixture<'_>) -> Result<()> {
-        assert!(!fixture.model.is_recurrent());
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn model_size_returns_nonzero(fixture: &LlamaFixture<'_>) -> Result<()> {
+    assert!(fixture.model.size() > 0);
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn is_hybrid_returns_false_for_non_hybrid_default_models(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        assert!(
-            !fixture.model.is_hybrid(),
-            "DeepSeek-R1-Distill-Llama-8B and GLM-4.7-Flash are pure transformers, not hybrid; got is_hybrid=true"
-        );
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn is_recurrent_returns_false_for_transformer(fixture: &LlamaFixture<'_>) -> Result<()> {
+    assert!(!fixture.model.is_recurrent());
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn is_hybrid_returns_true_for_qwen_default_models(fixture: &LlamaFixture<'_>) -> Result<()> {
-        assert!(
-            fixture.model.is_hybrid(),
-            "Qwen 3.5 and Qwen 3.6 default GGUFs are reported as hybrid by llama.cpp; got is_hybrid=false"
-        );
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn is_hybrid_returns_false_for_non_hybrid_default_models(fixture: &LlamaFixture<'_>) -> Result<()> {
+    assert!(
+        !fixture.model.is_hybrid(),
+        "DeepSeek-R1-Distill-Llama-8B and GLM-4.7-Flash are pure transformers, not hybrid; got is_hybrid=true"
+    );
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn rope_type_returns_a_known_variant_for_rope_carrying_default_models(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        use llama_cpp_bindings::model::rope_type::RopeType;
-        let rope = fixture.model.rope_type();
-        assert!(
-            matches!(
-                rope,
-                Some(RopeType::Norm | RopeType::NeoX | RopeType::MRope | RopeType::Vision)
-            ),
-            "rope_type must be a known variant for DeepSeek and GLM-4.7; got {rope:?}"
-        );
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn is_hybrid_returns_true_for_qwen_default_models(fixture: &LlamaFixture<'_>) -> Result<()> {
+    assert!(
+        fixture.model.is_hybrid(),
+        "Qwen 3.5 and Qwen 3.6 default GGUFs are reported as hybrid by llama.cpp; got is_hybrid=false"
+    );
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn rope_type_returns_none_for_qwen_default_models(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let rope = fixture.model.rope_type();
-        assert!(
-            rope.is_none(),
-            "Qwen 3.5 and Qwen 3.6 default GGUFs do not expose a rope_type in their metadata; got {rope:?}"
-        );
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn rope_type_returns_a_known_variant_for_rope_carrying_default_models(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    use llama_cpp_bindings::model::rope_type::RopeType;
+    let rope = fixture.model.rope_type();
+    assert!(
+        matches!(
+            rope,
+            Some(RopeType::Norm | RopeType::NeoX | RopeType::MRope | RopeType::Vision)
+        ),
+        "rope_type must be a known variant for DeepSeek and GLM-4.7; got {rope:?}"
+    );
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn vocab_type_returns_a_known_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
-        use llama_cpp_bindings::model::vocab_type::VocabType;
-        let vocab = fixture.model.vocab_type()?;
-        assert!(
-            matches!(vocab, VocabType::BPE | VocabType::SPM),
-            "vocab_type must be a known variant; got {vocab:?}"
-        );
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn rope_type_returns_none_for_qwen_default_models(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let rope = fixture.model.rope_type();
+    assert!(
+        rope.is_none(),
+        "Qwen 3.5 and Qwen 3.6 default GGUFs do not expose a rope_type in their metadata; got {rope:?}"
+    );
+    Ok(())
 }
 
-mod model_metadata_kv {
-    #![expect(
-        clippy::unnecessary_wraps,
-        reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-    )]
-
-    use anyhow::Result;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn meta_count_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
-        assert!(fixture.model.meta_count() > 0);
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn vocab_type_returns_a_known_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
+    use llama_cpp_bindings::model::vocab_type::VocabType;
+    let vocab = fixture.model.vocab_type()?;
+    assert!(
+        matches!(vocab, VocabType::BPE | VocabType::SPM),
+        "vocab_type must be a known variant; got {vocab:?}"
+    );
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn meta_key_by_index_returns_valid_key(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let key = fixture.model.meta_key_by_index(0)?;
-        assert!(!key.is_empty());
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn meta_count_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
+    assert!(fixture.model.meta_count() > 0);
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn meta_val_str_by_index_returns_valid_value(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let value = fixture.model.meta_val_str_by_index(0)?;
-        assert!(!value.is_empty());
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn meta_key_by_index_returns_valid_key(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let key = fixture.model.meta_key_by_index(0)?;
+    assert!(!key.is_empty());
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn meta_key_by_index_out_of_range_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let result = fixture.model.meta_key_by_index(999_999);
-        assert!(result.is_err());
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn meta_val_str_by_index_returns_valid_value(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let value = fixture.model.meta_val_str_by_index(0)?;
+    assert!(!value.is_empty());
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn meta_val_str_by_index_out_of_range_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let result = fixture.model.meta_val_str_by_index(999_999);
-        assert!(result.is_err());
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn meta_key_by_index_out_of_range_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let result = fixture.model.meta_key_by_index(999_999);
+    assert!(result.is_err());
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn meta_val_str_returns_value_for_known_key(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let model = fixture.model;
-        let first_key = model.meta_key_by_index(0)?;
-        let value = model.meta_val_str(&first_key)?;
-        assert!(!value.is_empty());
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn meta_val_str_by_index_out_of_range_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let result = fixture.model.meta_val_str_by_index(999_999);
+    assert!(result.is_err());
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn meta_val_str_with_long_value_triggers_buffer_resize(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let model = fixture.model;
-        let count = model.meta_count();
-
-        for index in 0..count {
-            let key = model.meta_key_by_index(index);
-            let value = model.meta_val_str_by_index(index);
-            assert!(key.is_ok());
-            assert!(value.is_ok());
-        }
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn meta_val_str_returns_value_for_known_key(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let model = fixture.model;
+    let first_key = model.meta_key_by_index(0)?;
+    let value = model.meta_val_str(&first_key)?;
+    assert!(!value.is_empty());
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn meta_val_str_with_null_byte_in_key_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let result = fixture.model.meta_val_str("key\0with_null");
-        assert!(result.is_err());
-        Ok(())
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn meta_val_str_with_long_value_triggers_buffer_resize(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let model = fixture.model;
+    let count = model.meta_count();
+
+    for index in 0..count {
+        let key = model.meta_key_by_index(index);
+        let value = model.meta_val_str_by_index(index);
+        assert!(key.is_ok());
+        assert!(value.is_ok());
     }
+    Ok(())
 }
 
-mod model_params {
-    #![expect(
-        clippy::similar_names,
-        reason = "model_path_str and model_path_cstr are both genuinely needed; renaming would not improve clarity"
-    )]
-
-    use std::ffi::CString;
-    use std::pin::pin;
-
-    use anyhow::Result;
-    use llama_cpp_bindings::context::params::LlamaContextParams;
-    use llama_cpp_bindings::max_devices;
-    use llama_cpp_bindings::model::params::LlamaModelParams;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn fit_params_succeeds_with_test_model(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let model_path_str = fixture
-            .model_path
-            .to_str()
-            .ok_or_else(|| anyhow::anyhow!("model path is not valid UTF-8"))?;
-        let model_path_cstr = CString::new(model_path_str)?;
-
-        let mut params = pin!(LlamaModelParams::default());
-        let mut context_params = LlamaContextParams::default();
-        let mut margins = vec![0usize; max_devices()];
-
-        let result = params.as_mut().fit_params(
-            &model_path_cstr,
-            &mut context_params,
-            &mut margins,
-            512,
-            llama_cpp_bindings_sys::GGML_LOG_LEVEL_NONE,
-        );
-
-        let fit =
-            result.map_err(|fit_error| anyhow::anyhow!("fit_params failed: {fit_error:?}"))?;
-        assert!(fit.n_ctx > 0);
-
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn meta_val_str_with_null_byte_in_key_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let result = fixture.model.meta_val_str("key\0with_null");
+    assert!(result.is_err());
+    Ok(())
 }
 
-mod model_special_tokens {
-    #![expect(
-        clippy::unnecessary_wraps,
-        reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-    )]
-
-    use anyhow::Result;
-    use llama_cpp_bindings::SampledToken;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn special_tokens_exist(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let model = fixture.model;
-        let bos = model.token_bos();
-        let eos = model.token_eos();
-
-        assert_ne!(bos, eos);
-        assert!(model.is_eog_token(&SampledToken::Content(eos)));
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[expect(
+    clippy::similar_names,
+    reason = "model_path_str and model_path_cstr are both genuinely needed; renaming would not improve clarity"
+)]
+fn fit_params_succeeds_with_test_model(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let model_path_str = fixture
+        .model_path
+        .to_str()
+        .ok_or_else(|| anyhow::anyhow!("model path is not valid UTF-8"))?;
+    let model_path_cstr = CString::new(model_path_str)?;
+
+    let mut params = pin!(LlamaModelParams::default());
+    let mut context_params = LlamaContextParams::default();
+    let mut margins = vec![0usize; max_devices()];
+
+    let result = params.as_mut().fit_params(
+        &model_path_cstr,
+        &mut context_params,
+        &mut margins,
+        512,
+        llama_cpp_bindings_sys::GGML_LOG_LEVEL_NONE,
+    );
+
+    let fit = result.map_err(|fit_error| anyhow::anyhow!("fit_params failed: {fit_error:?}"))?;
+    assert!(fit.n_ctx > 0);
+
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn token_nl_returns_valid_token(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let nl_token = fixture.model.token_nl();
-        assert!(nl_token.0 >= 0);
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn special_tokens_exist(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let model = fixture.model;
+    let bos = model.token_bos();
+    let eos = model.token_eos();
+
+    assert_ne!(bos, eos);
+    assert!(model.is_eog_token(&SampledToken::Content(eos)));
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn is_eog_token_classifies_reasoning_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let model = fixture.model;
-        let eos = model.token_eos();
-        assert!(model.is_eog_token(&SampledToken::Reasoning(eos)));
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn token_nl_returns_valid_token(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let nl_token = fixture.model.token_nl();
+    assert!(nl_token.0 >= 0);
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn is_eog_token_classifies_tool_call_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let model = fixture.model;
-        let eos = model.token_eos();
-        assert!(model.is_eog_token(&SampledToken::ToolCall(eos)));
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn is_eog_token_classifies_reasoning_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let model = fixture.model;
+    let eos = model.token_eos();
+    assert!(model.is_eog_token(&SampledToken::Reasoning(eos)));
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn is_eog_token_classifies_undeterminable_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let model = fixture.model;
-        let eos = model.token_eos();
-        assert!(model.is_eog_token(&SampledToken::Undeterminable(eos)));
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn is_eog_token_classifies_tool_call_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let model = fixture.model;
+    let eos = model.token_eos();
+    assert!(model.is_eog_token(&SampledToken::ToolCall(eos)));
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn decode_start_token_returns_valid_token(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let model = fixture.model;
-        let token = model.decode_start_token();
-        let n_vocab = model.n_vocab();
-        assert!(
-            token.0 == -1 || (0..n_vocab).contains(&token.0),
-            "decode_start_token must be either -1 (no decoder-start defined) or a valid vocab index < {n_vocab}; got {token}"
-        );
-        assert_eq!(
-            token,
-            model.decode_start_token(),
-            "decode_start_token must be deterministic across calls"
-        );
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn is_eog_token_classifies_undeterminable_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let model = fixture.model;
+    let eos = model.token_eos();
+    assert!(model.is_eog_token(&SampledToken::Undeterminable(eos)));
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn token_sep_returns_valid_token(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let model = fixture.model;
-        let token = model.token_sep();
-        let n_vocab = model.n_vocab();
-        assert!(
-            token.0 == -1 || (0..n_vocab).contains(&token.0),
-            "token_sep must be either -1 (no SEP token defined) or a valid vocab index < {n_vocab}; got {token}"
-        );
-        assert_eq!(
-            token,
-            model.token_sep(),
-            "token_sep must be deterministic across calls"
-        );
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn decode_start_token_returns_valid_token(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let model = fixture.model;
+    let token = model.decode_start_token();
+    let n_vocab = model.n_vocab();
+    assert!(
+        token.0 == -1 || (0..n_vocab).contains(&token.0),
+        "decode_start_token must be either -1 (no decoder-start defined) or a valid vocab index < {n_vocab}; got {token}"
+    );
+    assert_eq!(
+        token,
+        model.decode_start_token(),
+        "decode_start_token must be deterministic across calls"
+    );
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn token_attr_returns_attrs_for_bos(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let model = fixture.model;
-        let bos = model.token_bos();
-        let attrs = model.token_attr(bos)?;
-        let bit_repr = format!("{:?}", *attrs);
-        assert!(
-            !bit_repr.is_empty(),
-            "token_attr(bos) must produce Debug output"
-        );
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn token_sep_returns_valid_token(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let model = fixture.model;
+    let token = model.token_sep();
+    let n_vocab = model.n_vocab();
+    assert!(
+        token.0 == -1 || (0..n_vocab).contains(&token.0),
+        "token_sep must be either -1 (no SEP token defined) or a valid vocab index < {n_vocab}; got {token}"
+    );
+    assert_eq!(
+        token,
+        model.token_sep(),
+        "token_sep must be deterministic across calls"
+    );
+    Ok(())
 }
 
-mod model_str_to_token {
-    use anyhow::Result;
-    use llama_cpp_bindings::model::AddBos;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn str_to_token_roundtrip(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let model = fixture.model;
-        let tokens = model.str_to_token("hello world", AddBos::Never)?;
-        assert!(!tokens.is_empty());
-        let mut decoder = encoding_rs::UTF_8.new_decoder();
-        let piece = model.token_to_piece(
-            &llama_cpp_bindings::SampledToken::Content(tokens[0]),
-            &mut decoder,
-            false,
-            None,
-        )?;
-
-        assert!(!piece.is_empty());
-
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn token_attr_returns_attrs_for_bos(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let model = fixture.model;
+    let bos = model.token_bos();
+    let attrs = model.token_attr(bos)?;
+    let bit_repr = format!("{:?}", *attrs);
+    assert!(
+        !bit_repr.is_empty(),
+        "token_attr(bos) must produce Debug output"
+    );
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn str_to_token_grows_buffer_when_initial_estimation_too_small(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let many_short_chars = "a b c d e f g h i j k l";
-        let tokens = fixture
-            .model
-            .str_to_token(many_short_chars, AddBos::Always)?;
-
-        assert!(
-            tokens.len() > 8,
-            "expected regrow; got {} tokens",
-            tokens.len()
-        );
-
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn str_to_token_roundtrip(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let model = fixture.model;
+    let tokens = model.str_to_token("hello world", AddBos::Never)?;
+    assert!(!tokens.is_empty());
+    let mut decoder = encoding_rs::UTF_8.new_decoder();
+    let piece =
+        model.token_to_piece(&SampledToken::Content(tokens[0]), &mut decoder, false, None)?;
+
+    assert!(!piece.is_empty());
+
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn str_to_token_with_add_bos_never(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let model = fixture.model;
-        let tokens_with_bos = model.str_to_token("hello", AddBos::Always)?;
-        let tokens_without_bos = model.str_to_token("hello", AddBos::Never)?;
-
-        assert!(tokens_with_bos.len() >= tokens_without_bos.len());
-
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn str_to_token_grows_buffer_when_initial_estimation_too_small(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    let many_short_chars = "a b c d e f g h i j k l";
+    let tokens = fixture
+        .model
+        .str_to_token(many_short_chars, AddBos::Always)?;
+
+    assert!(
+        tokens.len() > 8,
+        "expected regrow; got {} tokens",
+        tokens.len()
+    );
+
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn str_to_token_with_many_tokens_triggers_buffer_resize(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        use std::fmt::Write;
-
-        let many_numbers = (0..2000).fold(String::new(), |mut accumulator, number| {
-            let _ = write!(accumulator, "{number} ");
-            accumulator
-        });
-
-        let tokens = fixture.model.str_to_token(&many_numbers, AddBos::Always)?;
-
-        assert!(tokens.len() > many_numbers.len() / 2);
-
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn str_to_token_with_add_bos_never(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let model = fixture.model;
+    let tokens_with_bos = model.str_to_token("hello", AddBos::Always)?;
+    let tokens_without_bos = model.str_to_token("hello", AddBos::Never)?;
+
+    assert!(tokens_with_bos.len() >= tokens_without_bos.len());
+
+    Ok(())
 }
 
-mod model_token_to_piece {
-    #![expect(
-        clippy::unnecessary_wraps,
-        reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-    )]
-
-    use std::num::NonZeroU16;
-
-    use anyhow::Result;
-    use llama_cpp_bindings::SampledToken;
-    use llama_cpp_bindings::model::AddBos;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn token_to_piece_bytes_returns_bytes_for_known_token(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let model = fixture.model;
-        let tokens = model.str_to_token("hello", AddBos::Never)?;
-        let bytes = model.token_to_piece_bytes(tokens[0], 32, false, None)?;
-
-        assert!(!bytes.is_empty());
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn str_to_token_with_many_tokens_triggers_buffer_resize(fixture: &LlamaFixture<'_>) -> Result<()> {
+    use std::fmt::Write;
+
+    let many_numbers = (0..2000).fold(String::new(), |mut accumulator, number| {
+        let _ = write!(accumulator, "{number} ");
+        accumulator
+    });
+
+    let tokens = fixture.model.str_to_token(&many_numbers, AddBos::Always)?;
+
+    assert!(tokens.len() > many_numbers.len() / 2);
+
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn token_to_piece_handles_large_token_requiring_buffer_resize(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let model = fixture.model;
-        let mut decoder = encoding_rs::UTF_8.new_decoder();
-
-        for (token, _) in model.tokens(true).take(200) {
-            let result =
-                model.token_to_piece(&SampledToken::Content(token), &mut decoder, true, None);
-            assert!(result.is_ok());
-        }
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn token_to_piece_bytes_returns_bytes_for_known_token(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let model = fixture.model;
+    let tokens = model.str_to_token("hello", AddBos::Never)?;
+    let bytes = model.token_to_piece_bytes(tokens[0], 32, false, None)?;
+
+    assert!(!bytes.is_empty());
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn token_to_piece_bytes_insufficient_buffer_returns_error(
-        fixture: &LlamaFixture<'_>,
-    ) -> Result<()> {
-        let model = fixture.model;
-        let tokens = model.str_to_token("hello", AddBos::Never)?;
-        let result = model.token_to_piece_bytes(tokens[0], 1, false, None);
-
-        assert!(
-            result
-                .unwrap_err()
-                .to_string()
-                .contains("Insufficient Buffer Space")
-        );
-        Ok(())
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn token_to_piece_handles_large_token_requiring_buffer_resize(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    let model = fixture.model;
+    let mut decoder = encoding_rs::UTF_8.new_decoder();
+
+    for (token, _) in model.tokens(true).take(200) {
+        let result = model.token_to_piece(&SampledToken::Content(token), &mut decoder, true, None);
+        assert!(result.is_ok());
     }
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn token_to_piece_with_lstrip(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let model = fixture.model;
-        let mut decoder = encoding_rs::UTF_8.new_decoder();
-        let tokens = model.str_to_token("hello", AddBos::Never)?;
-        let result = model.token_to_piece(
-            &SampledToken::Content(tokens[0]),
-            &mut decoder,
-            false,
-            NonZeroU16::new(1),
-        );
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn token_to_piece_bytes_insufficient_buffer_returns_error(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    let model = fixture.model;
+    let tokens = model.str_to_token("hello", AddBos::Never)?;
+    let result = model.token_to_piece_bytes(tokens[0], 1, false, None);
+
+    assert!(
+        result
+            .unwrap_err()
+            .to_string()
+            .contains("Insufficient Buffer Space")
+    );
+    Ok(())
+}
 
-        assert!(result.is_ok());
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn token_to_piece_with_lstrip(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let model = fixture.model;
+    let mut decoder = encoding_rs::UTF_8.new_decoder();
+    let tokens = model.str_to_token("hello", AddBos::Never)?;
+    let result = model.token_to_piece(
+        &SampledToken::Content(tokens[0]),
+        &mut decoder,
+        false,
+        NonZeroU16::new(1),
+    );
+
+    assert!(result.is_ok());
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn token_to_piece_decodes_reasoning_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let model = fixture.model;
-        let mut decoder = encoding_rs::UTF_8.new_decoder();
-        let tokens = model.str_to_token("hi", AddBos::Never)?;
-
-        let piece = model.token_to_piece(
-            &SampledToken::Reasoning(tokens[0]),
-            &mut decoder,
-            true,
-            None,
-        )?;
-
-        assert!(!piece.is_empty());
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn token_to_piece_decodes_reasoning_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let model = fixture.model;
+    let mut decoder = encoding_rs::UTF_8.new_decoder();
+    let tokens = model.str_to_token("hi", AddBos::Never)?;
+
+    let piece = model.token_to_piece(
+        &SampledToken::Reasoning(tokens[0]),
+        &mut decoder,
+        true,
+        None,
+    )?;
+
+    assert!(!piece.is_empty());
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn token_to_piece_decodes_tool_call_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let model = fixture.model;
-        let mut decoder = encoding_rs::UTF_8.new_decoder();
-        let tokens = model.str_to_token("hi", AddBos::Never)?;
-
-        let piece =
-            model.token_to_piece(&SampledToken::ToolCall(tokens[0]), &mut decoder, true, None)?;
-
-        assert!(!piece.is_empty());
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn token_to_piece_decodes_tool_call_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let model = fixture.model;
+    let mut decoder = encoding_rs::UTF_8.new_decoder();
+    let tokens = model.str_to_token("hi", AddBos::Never)?;
+
+    let piece =
+        model.token_to_piece(&SampledToken::ToolCall(tokens[0]), &mut decoder, true, None)?;
+
+    assert!(!piece.is_empty());
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn token_to_piece_decodes_undeterminable_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let model = fixture.model;
-        let mut decoder = encoding_rs::UTF_8.new_decoder();
-        let tokens = model.str_to_token("hi", AddBos::Never)?;
-
-        let piece = model.token_to_piece(
-            &SampledToken::Undeterminable(tokens[0]),
-            &mut decoder,
-            true,
-            None,
-        )?;
-
-        assert!(!piece.is_empty());
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn token_to_piece_decodes_undeterminable_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let model = fixture.model;
+    let mut decoder = encoding_rs::UTF_8.new_decoder();
+    let tokens = model.str_to_token("hi", AddBos::Never)?;
+
+    let piece = model.token_to_piece(
+        &SampledToken::Undeterminable(tokens[0]),
+        &mut decoder,
+        true,
+        None,
+    )?;
+
+    assert!(!piece.is_empty());
+    Ok(())
 }
 
-mod model_tokens_iterator {
-    #![expect(
-        clippy::unnecessary_wraps,
-        reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-    )]
-
-    use anyhow::Result;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn tokens_iterator_produces_valid_entries(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let model = fixture.model;
-        let mut count = 0;
-
-        for (token, _piece_result) in model.tokens(false) {
-            assert!(token.0 >= 0);
-            count += 1;
-
-            if count >= 100 {
-                break;
-            }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn tokens_iterator_produces_valid_entries(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let model = fixture.model;
+    let mut count = 0;
+
+    for (token, _piece_result) in model.tokens(false) {
+        assert!(token.0 >= 0);
+        count += 1;
+
+        if count >= 100 {
+            break;
         }
-
-        assert_eq!(count, 100);
-        Ok(())
     }
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 512,
-        n_batch = 128,
-        n_ubatch = 64,
-    )]
-    fn n_vocab_matches_tokens_iterator_count(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let model = fixture.model;
-        let n_vocab = model.n_vocab();
-        let count = model.tokens(false).count();
-
-        assert_eq!(count, usize::try_from(n_vocab)?);
-        Ok(())
-    }
+    assert_eq!(count, 100);
+    Ok(())
 }
 
-mod model_helpers {
-    #![expect(
-        clippy::unnecessary_wraps,
-        reason = "every trial returns anyhow::Result<()> to match the LlamaTestFn signature"
-    )]
-
-    use anyhow::Result;
-    use llama_cpp_test_harness::LlamaFixture;
-    use llama_cpp_test_harness::llama_test;
-
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 2048,
-        n_batch = 512,
-        n_ubatch = 128
-    )]
-    fn debug_format_includes_struct_name_and_model_field(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let formatted = format!("{:?}", fixture.model);
-
-        assert!(formatted.contains("LlamaModel"));
-        assert!(formatted.contains("model"));
-
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn n_vocab_matches_tokens_iterator_count(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let model = fixture.model;
+    let n_vocab = model.n_vocab();
+    let count = model.tokens(false).count();
+
+    assert_eq!(count, usize::try_from(n_vocab)?);
+    Ok(())
+}
 
-    #[llama_test(
-        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-        n_gpu_layers = 999,
-        use_mmap = true,
-        use_mlock = false,
-        n_ctx = 2048,
-        n_batch = 512,
-        n_ubatch = 128
-    )]
-    fn approximate_tok_env_is_cached_across_calls(fixture: &LlamaFixture<'_>) -> Result<()> {
-        let first = fixture.model.approximate_tok_env();
-        let second = fixture.model.approximate_tok_env();
-
-        assert!(std::sync::Arc::ptr_eq(&first, &second));
-
-        Ok(())
-    }
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 2048,
+    n_batch = 512,
+    n_ubatch = 128
+)]
+fn debug_format_includes_struct_name_and_model_field(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let formatted = format!("{:?}", fixture.model);
+
+    assert!(formatted.contains("LlamaModel"));
+    assert!(formatted.contains("model"));
+
+    Ok(())
 }
 
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 2048,
+    n_batch = 512,
+    n_ubatch = 128
+)]
+fn approximate_tok_env_is_cached_across_calls(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let first = fixture.model.approximate_tok_env();
+    let second = fixture.model.approximate_tok_env();
+
+    assert!(std::sync::Arc::ptr_eq(&first, &second));
+
+    Ok(())
+}
 llama_tests_main!();
diff --git a/llama-cpp-bindings/src/batch_add_error.rs b/llama-cpp-bindings/src/batch_add_error.rs
index ea4cb154..e3ec5864 100644
--- a/llama-cpp-bindings/src/batch_add_error.rs
+++ b/llama-cpp-bindings/src/batch_add_error.rs
@@ -1,13 +1,9 @@
-/// Errors that can occur when adding a token to a batch.
 #[derive(thiserror::Error, Debug, PartialEq, Eq)]
 pub enum BatchAddError {
-    /// There was not enough space in the batch to add the token.
     #[error("Insufficient Space of {0}")]
     InsufficientSpace(usize),
-    /// Empty buffer is provided for [`crate::llama_batch::LlamaBatch::get_one`]
     #[error("Empty buffer")]
     EmptyBuffer,
-    /// An integer value exceeded the allowed range.
     #[error("Integer overflow: {0}")]
     IntegerOverflow(String),
 }
diff --git a/llama-cpp-bindings/src/context.rs b/llama-cpp-bindings/src/context.rs
index b980f831..49702de6 100644
--- a/llama-cpp-bindings/src/context.rs
+++ b/llama-cpp-bindings/src/context.rs
@@ -1,5 +1,3 @@
-//! Safe wrapper around `llama_context`.
-
 use std::ffi::c_void;
 use std::fmt::{Debug, Formatter};
 use std::num::NonZeroI32;
@@ -57,11 +55,8 @@ unsafe extern "C" fn abort_callback_trampoline(data: *mut c_void) -> bool {
     flag.load(Ordering::Relaxed)
 }
 
-/// Safe wrapper around `llama_context`.
 pub struct LlamaContext<'model> {
-    /// Raw pointer to the underlying `llama_context`.
     pub context: NonNull<llama_cpp_bindings_sys::llama_context>,
-    /// A reference to the context's model.
     pub model: &'model LlamaModel,
     abort_flag: Option<Arc<AtomicBool>>,
     initialized_logits: Vec<i32>,
@@ -77,7 +72,6 @@ impl Debug for LlamaContext<'_> {
 }
 
 impl<'model> LlamaContext<'model> {
-    /// Wraps existing raw pointers into a new `LlamaContext`.
     #[must_use]
     pub const fn new(
         llama_model: &'model LlamaModel,
@@ -93,11 +87,6 @@ impl<'model> LlamaContext<'model> {
         }
     }
 
-    /// Create a new context bound to `model`.
-    ///
-    /// `_backend` is unused in the body but serves as a compile-time witness that
-    /// the global llama.cpp backend has been initialised before context creation.
-    ///
     /// # Errors
     ///
     /// Returns [`LlamaContextLoadError`] when llama.cpp fails to allocate the context.
@@ -143,29 +132,21 @@ impl<'model> LlamaContext<'model> {
         }
     }
 
-    /// Gets the max number of logical tokens that can be submitted to decode. Must be greater than or equal to [`Self::n_ubatch`].
     #[must_use]
     pub fn n_batch(&self) -> u32 {
         unsafe { llama_cpp_bindings_sys::llama_n_batch(self.context.as_ptr()) }
     }
 
-    /// Gets the max number of physical tokens (hardware level) to decode in batch. Must be less than or equal to [`Self::n_batch`].
     #[must_use]
     pub fn n_ubatch(&self) -> u32 {
         unsafe { llama_cpp_bindings_sys::llama_n_ubatch(self.context.as_ptr()) }
     }
 
-    /// Gets the size of the context.
     #[must_use]
     pub fn n_ctx(&self) -> u32 {
         unsafe { llama_cpp_bindings_sys::llama_n_ctx(self.context.as_ptr()) }
     }
 
-    /// Sets an abort flag that llama.cpp checks during computation.
-    ///
-    /// When the flag is set to `true`, any in-progress `decode()` call will
-    /// abort and return `DecodeError::Aborted`. The `Arc` is stored internally
-    /// to ensure the flag outlives the callback registration.
     #[expect(unsafe_code, reason = "required for FFI abort callback registration")]
     pub fn set_abort_flag(&mut self, flag: Arc<AtomicBool>) {
         let raw_ptr = Arc::as_ptr(&flag) as *mut c_void;
@@ -180,7 +161,6 @@ impl<'model> LlamaContext<'model> {
         }
     }
 
-    /// Clears the abort callback so that decode calls are no longer interruptible.
     #[expect(unsafe_code, reason = "required for FFI abort callback deregistration")]
     pub fn clear_abort_callback(&mut self) {
         self.abort_flag = None;
@@ -194,33 +174,20 @@ impl<'model> LlamaContext<'model> {
         }
     }
 
-    /// Waits for all pending backend operations to complete.
-    ///
-    /// Must be called before freeing the context to prevent hangs
-    /// during resource cleanup.
     #[expect(unsafe_code, reason = "required for FFI synchronization call")]
     pub fn synchronize(&self) {
         unsafe { llama_cpp_bindings_sys::llama_synchronize(self.context.as_ptr()) }
     }
 
-    /// Detaches the threadpool from the context.
-    ///
-    /// Must be called before freeing the context to prevent threadpool
-    /// workers from accessing freed resources.
     #[expect(unsafe_code, reason = "required for FFI threadpool detachment")]
     pub fn detach_threadpool(&self) {
         unsafe { llama_cpp_bindings_sys::llama_detach_threadpool(self.context.as_ptr()) }
     }
 
-    /// Marks a logit index as initialized so it can be read via
-    /// `get_logits_ith`. Use after external decode operations (like
-    /// `eval_chunks`) that bypass the Rust `decode()` method.
     pub fn mark_logits_initialized(&mut self, token_index: i32) {
         self.initialized_logits = vec![token_index];
     }
 
-    /// Decodes the batch.
-    ///
     /// # Errors
     ///
     /// - `DecodeError` if the decoding failed.
@@ -267,8 +234,6 @@ impl<'model> LlamaContext<'model> {
         }
     }
 
-    /// Encodes the batch.
-    ///
     /// # Errors
     ///
     /// - `EncodeError` if the encoding failed.
@@ -318,13 +283,6 @@ impl<'model> LlamaContext<'model> {
         }
     }
 
-    /// Get the embeddings for the given sequence in the current context.
-    ///
-    /// # Returns
-    ///
-    /// A slice containing the embeddings for the last decoded batch.
-    /// The size corresponds to the `n_embd` parameter of the context's model.
-    ///
     /// # Errors
     ///
     /// - When the current context was constructed without enabling embeddings.
@@ -353,13 +311,6 @@ impl<'model> LlamaContext<'model> {
         }
     }
 
-    /// Get the embeddings for the given token in the current context.
-    ///
-    /// # Returns
-    ///
-    /// A slice containing the embeddings for the last decoded batch of the given token.
-    /// The size corresponds to the `n_embd` parameter of the context's model.
-    ///
     /// # Errors
     ///
     /// - When the current context was constructed without enabling embeddings.
@@ -388,12 +339,6 @@ impl<'model> LlamaContext<'model> {
         }
     }
 
-    /// Get the logits for the last token in the context.
-    ///
-    /// # Returns
-    /// An iterator over unsorted `LlamaTokenData` containing the
-    /// logits for the last token in the context.
-    ///
     /// # Errors
     /// Returns `LogitsError` if logits are null or `n_vocab` overflows.
     pub fn candidates(&self) -> Result<impl Iterator<Item = LlamaTokenData> + '_, LogitsError> {
@@ -405,25 +350,12 @@ impl<'model> LlamaContext<'model> {
         }))
     }
 
-    /// Get the token data array for the last token in the context.
-    ///
     /// # Errors
     /// Returns `LogitsError` if logits are null or `n_vocab` overflows.
     pub fn token_data_array(&self) -> Result<LlamaTokenDataArray, LogitsError> {
         Ok(LlamaTokenDataArray::from_iter(self.candidates()?, false))
     }
 
-    /// Token logits obtained from the last call to `decode()`.
-    /// The logits for which `batch.logits[i] != 0` are stored contiguously
-    /// in the order they have appeared in the batch.
-    /// Rows: number of tokens for which `batch.logits[i] != 0`
-    /// Cols: `n_vocab`
-    ///
-    /// # Returns
-    ///
-    /// A slice containing the logits for the last decoded token.
-    /// The size corresponds to the `n_vocab` parameter of the context's model.
-    ///
     /// # Errors
     /// Returns `LogitsError` if the logits pointer is null or `n_vocab` overflows.
     pub fn get_logits(&self) -> Result<&[f32], LogitsError> {
@@ -438,8 +370,6 @@ impl<'model> LlamaContext<'model> {
         Ok(unsafe { slice::from_raw_parts(data, len) })
     }
 
-    /// Get the logits for the ith token in the context.
-    ///
     /// # Errors
     /// Returns `LogitsError` if the token is not initialized or out of range.
     pub fn candidates_ith(
@@ -454,8 +384,6 @@ impl<'model> LlamaContext<'model> {
         }))
     }
 
-    /// Get the token data array for the ith token in the context.
-    ///
     /// # Errors
     /// Returns `LogitsError` if the token is not initialized or out of range.
     pub fn token_data_array_ith(
@@ -468,8 +396,6 @@ impl<'model> LlamaContext<'model> {
         ))
     }
 
-    /// Get the logits for the ith token in the context.
-    ///
     /// # Errors
     /// Returns `LogitsError` if the token is not initialized, out of range, or `n_vocab` overflows.
     pub fn get_logits_ith(&self, token_index: i32) -> Result<&[f32], LogitsError> {
@@ -497,19 +423,15 @@ impl<'model> LlamaContext<'model> {
         Ok(unsafe { slice::from_raw_parts(data, len) })
     }
 
-    /// Reset the timings for the context.
     pub fn reset_timings(&mut self) {
         unsafe { llama_cpp_bindings_sys::llama_perf_context_reset(self.context.as_ptr()) }
     }
 
-    /// Returns the timings for the context.
     pub fn timings(&mut self) -> LlamaTimings {
         let timings = unsafe { llama_cpp_bindings_sys::llama_perf_context(self.context.as_ptr()) };
         LlamaTimings { timings }
     }
 
-    /// Sets a lora adapter.
-    ///
     /// # Errors
     ///
     /// See [`LlamaLoraAdapterSetError`] for more information.
@@ -534,11 +456,6 @@ impl<'model> LlamaContext<'model> {
         Ok(())
     }
 
-    /// Remove all lora adapters.
-    ///
-    /// Note: The upstream API now replaces all adapters at once via
-    /// `llama_set_adapters_lora`. This clears all adapters from the context.
-    ///
     /// # Errors
     ///
     /// See [`LlamaLoraAdapterRemoveError`] for more information.
diff --git a/llama-cpp-bindings/src/context/kv_cache.rs b/llama-cpp-bindings/src/context/kv_cache.rs
index dff5e2aa..80b97a67 100644
--- a/llama-cpp-bindings/src/context/kv_cache.rs
+++ b/llama-cpp-bindings/src/context/kv_cache.rs
@@ -1,5 +1,3 @@
-//! utilities for working with the kv cache
-
 use std::ffi::c_int;
 use std::num::{NonZeroU8, TryFromIntError};
 use std::os::raw::c_char;
@@ -9,44 +7,22 @@ use crate::context::LlamaContext;
 use crate::error::{KvCacheSeqAddError, KvCacheSeqDivError};
 use crate::ffi_error_reader::read_and_free_cpp_error;
 
-/// Errors that can occur when attempting to prepare values for the kv cache
 #[derive(Debug, Eq, PartialEq, thiserror::Error)]
 pub enum KvCacheConversionError {
-    /// Sequence id conversion to i32 failed
     #[error("Provided sequence id is too large for a i32")]
     SeqIdTooLarge(#[source] TryFromIntError),
-    /// Position 0 conversion to i32 failed
     #[error("Provided start position is too large for a i32")]
     P0TooLarge(#[source] TryFromIntError),
-    /// Position 1 conversion to i32 failed
     #[error("Provided end position is too large for a i32")]
     P1TooLarge(#[source] TryFromIntError),
 }
 
 impl LlamaContext<'_> {
-    /// Copy the cache from one sequence to another.
-    ///
-    /// # Parameters
-    ///
-    /// * `src` - The sequence id to copy the cache from.
-    /// * `dest` - The sequence id to copy the cache to.
-    /// * `size` - The size of the cache to copy.
     pub fn copy_cache(&mut self, src: i32, dest: i32, size: i32) {
         let mem = unsafe { llama_cpp_bindings_sys::llama_get_memory(self.context.as_ptr()) };
         unsafe { llama_cpp_bindings_sys::llama_memory_seq_cp(mem, src, dest, 0, size) }
     }
 
-    /// Copy the cache from one sequence to another.
-    ///
-    /// # Returns
-    /// A `Result` indicating whether the operation was successful.
-    ///
-    /// # Parameters
-    /// * `src` - The sequence id to copy the cache from.
-    /// * `dest` - The sequence id to copy the cache to.
-    /// * `p0` - The start position of the cache to clear. If `None`, the entire cache is copied up to `p1`.
-    /// * `p1` - The end position of the cache to clear. If `None`, the entire cache is copied starting from `p0`.
-    ///
     /// # Errors
     /// If either position exceeds [`i32::MAX`].
     pub fn copy_kv_cache_seq(
@@ -67,18 +43,6 @@ impl LlamaContext<'_> {
         Ok(())
     }
 
-    /// Clear the kv cache for the given sequence within the specified range `[p0, p1)`
-    /// Returns `false` only when partial sequence removals fail. Full sequence removals always succeed.
-    ///
-    /// # Returns
-    /// A `Result` indicating whether the operation was successful. If the sequence id or
-    /// either position exceeds the maximum i32 value, no removal is attempted and an `Err` is returned.
-    ///
-    /// # Parameters
-    /// * `src` - The sequence id to clear the cache for. If `None`, matches all sequences
-    /// * `p0` - The start position of the cache to clear. If `None`, the entire cache is cleared up to `p1`.
-    /// * `p1` - The end position of the cache to clear. If `None`, the entire cache is cleared from `p0`.
-    ///
     /// # Errors
     /// If the sequence id or either position exceeds [`i32::MAX`].
     pub fn clear_kv_cache_seq(
@@ -100,38 +64,17 @@ impl LlamaContext<'_> {
         Ok(unsafe { llama_cpp_bindings_sys::llama_memory_seq_rm(mem, src, p0, p1) })
     }
 
-    /// Clear the KV cache, including both metadata and the underlying data buffers.
     pub fn clear_kv_cache(&mut self) {
         let mem = unsafe { llama_cpp_bindings_sys::llama_get_memory(self.context.as_ptr()) };
         let clear_data_buffers = true;
         unsafe { llama_cpp_bindings_sys::llama_memory_clear(mem, clear_data_buffers) }
     }
 
-    /// Removes all tokens that do not belong to the specified sequence
-    ///
-    /// # Parameters
-    ///
-    /// * `seq_id` - The sequence id to keep
     pub fn kv_cache_seq_keep(&mut self, seq_id: i32) {
         let mem = unsafe { llama_cpp_bindings_sys::llama_get_memory(self.context.as_ptr()) };
         unsafe { llama_cpp_bindings_sys::llama_memory_seq_keep(mem, seq_id) }
     }
 
-    /// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in `[p0, p1)`
-    /// If the KV cache is `RoPEd`, the KV data is updated accordingly:
-    ///   - lazily on next [`LlamaContext::decode`]
-    ///   - explicitly with [`Self::kv_cache_update`]
-    ///
-    /// # Returns
-    /// A `Result` indicating whether the operation was successful.
-    ///
-    /// # Parameters
-    ///
-    /// * `seq_id` - The sequence id to update
-    /// * `p0` - The start position of the cache to update. If `None`, the entire cache is updated up to `p1`.
-    /// * `p1` - The end position of the cache to update. If `None`, the entire cache is updated starting from `p0`.
-    /// * `delta` - The relative position to add to the tokens
-    ///
     /// # Errors
     /// If either position exceeds [`i32::MAX`], or the underlying memory operation reports a failure.
     pub fn kv_cache_seq_add(
@@ -177,21 +120,6 @@ impl LlamaContext<'_> {
         }
     }
 
-    /// Integer division of the positions by factor of `d > 1`
-    /// If the KV cache is `RoPEd`, the KV data is updated accordingly:
-    ///   - lazily on next [`LlamaContext::decode`]
-    ///   - explicitly with [`Self::kv_cache_update`]
-    ///
-    /// # Returns
-    /// A `Result` indicating whether the operation was successful.
-    ///
-    /// # Parameters
-    ///
-    /// * `seq_id` - The sequence id to update
-    /// * `p0` - The start position of the cache to update. If `None`, the entire cache is updated up to `p1`.
-    /// * `p1` - The end position of the cache to update. If `None`, the entire cache is updated starting from `p0`.
-    /// * `d` - The factor to divide the positions by
-    ///
     /// # Errors
     /// If either position exceeds [`i32::MAX`], or the underlying memory operation reports a failure.
     pub fn kv_cache_seq_div(
@@ -238,11 +166,6 @@ impl LlamaContext<'_> {
         }
     }
 
-    /// Returns the largest position present in the KV cache for the specified sequence
-    ///
-    /// # Parameters
-    ///
-    /// * `seq_id` - The sequence id to get the max position for
     #[must_use]
     pub fn kv_cache_seq_pos_max(&self, seq_id: i32) -> i32 {
         unsafe {
diff --git a/llama-cpp-bindings/src/context/kv_cache_type.rs b/llama-cpp-bindings/src/context/kv_cache_type.rs
index 661a59e1..3ffc7f11 100644
--- a/llama-cpp-bindings/src/context/kv_cache_type.rs
+++ b/llama-cpp-bindings/src/context/kv_cache_type.rs
@@ -1,4 +1,3 @@
-/// A rusty wrapper around `ggml_type` for KV cache types.
 #[expect(
     non_camel_case_types,
     reason = "variant names mirror llama.cpp's `enum ggml_type` symbol names verbatim so they can \
@@ -11,11 +10,6 @@
 )]
 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
 pub enum KvCacheType {
-    /// Represents an unknown or not-yet-mapped `ggml_type` and carries the raw value.
-    /// When passed through FFI, the raw value is used as-is (if llama.cpp supports it,
-    /// the runtime will operate with that type).
-    /// This variant preserves API compatibility when new `ggml_type` values are
-    /// introduced in the future.
     Unknown(llama_cpp_bindings_sys::ggml_type),
     F32,
     F16,
diff --git a/llama-cpp-bindings/src/context/llama_attention_type.rs b/llama-cpp-bindings/src/context/llama_attention_type.rs
index b785ffb0..79b9f66f 100644
--- a/llama-cpp-bindings/src/context/llama_attention_type.rs
+++ b/llama-cpp-bindings/src/context/llama_attention_type.rs
@@ -1,12 +1,8 @@
-/// A rusty wrapper around `LLAMA_ATTENTION_TYPE`.
 #[repr(i8)]
 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
 pub enum LlamaAttentionType {
-    /// The attention type is unspecified
     Unspecified = -1,
-    /// Causal attention
     Causal = 0,
-    /// Non-causal attention
     NonCausal = 1,
 }
 
diff --git a/llama-cpp-bindings/src/context/llama_pooling_type.rs b/llama-cpp-bindings/src/context/llama_pooling_type.rs
index f0d4486b..651216f3 100644
--- a/llama-cpp-bindings/src/context/llama_pooling_type.rs
+++ b/llama-cpp-bindings/src/context/llama_pooling_type.rs
@@ -1,23 +1,14 @@
-/// A rusty wrapper around `LLAMA_POOLING_TYPE`.
 #[repr(i8)]
 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
 pub enum LlamaPoolingType {
-    /// The pooling type is unspecified
     Unspecified = -1,
-    /// No pooling
     None = 0,
-    /// Mean pooling
     Mean = 1,
-    /// CLS pooling
     Cls = 2,
-    /// Last pooling
     Last = 3,
-    /// Rank pooling
     Rank = 4,
 }
 
-/// Create a `LlamaPoolingType` from a `c_int` - returns `LlamaPoolingType::Unspecified` if
-/// the value is not recognized.
 impl From<i32> for LlamaPoolingType {
     fn from(value: i32) -> Self {
         match value {
@@ -31,7 +22,6 @@ impl From<i32> for LlamaPoolingType {
     }
 }
 
-/// Create a `c_int` from a `LlamaPoolingType`.
 impl From<LlamaPoolingType> for i32 {
     fn from(value: LlamaPoolingType) -> Self {
         match value {
diff --git a/llama-cpp-bindings/src/context/llama_state_seq_flags.rs b/llama-cpp-bindings/src/context/llama_state_seq_flags.rs
index efc66b94..cbe5ccc0 100644
--- a/llama-cpp-bindings/src/context/llama_state_seq_flags.rs
+++ b/llama-cpp-bindings/src/context/llama_state_seq_flags.rs
@@ -1,31 +1,21 @@
-//! Flags for extended state sequence operations on hybrid/recurrent models.
-
-/// Flags controlling which parts of state to save/restore for sequence operations.
-///
-/// Used with the `state_seq_*_ext` methods on [`super::LlamaContext`] to enable
-/// partial state operations (e.g., saving only recurrent/SSM state for hybrid models).
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub struct LlamaStateSeqFlags {
     flags: u32,
 }
 
 impl LlamaStateSeqFlags {
-    /// Save/restore only partial (recurrent/SSM) state, skipping attention KV cache.
     pub const PARTIAL_ONLY: Self = Self { flags: 1 };
 
-    /// No flags set.
     #[must_use]
     pub const fn empty() -> Self {
         Self { flags: 0 }
     }
 
-    /// Returns the raw bit representation.
     #[must_use]
     pub const fn bits(&self) -> u32 {
         self.flags
     }
 
-    /// Returns true if `self` contains all bits in `other`.
     #[must_use]
     pub const fn contains(&self, other: Self) -> bool {
         (self.flags & other.flags) == other.flags
diff --git a/llama-cpp-bindings/src/context/load_seq_state_error.rs b/llama-cpp-bindings/src/context/load_seq_state_error.rs
index 158c8c3b..b3e27983 100644
--- a/llama-cpp-bindings/src/context/load_seq_state_error.rs
+++ b/llama-cpp-bindings/src/context/load_seq_state_error.rs
@@ -1,29 +1,17 @@
-//! Error type for sequence state file load operations.
-
 use std::ffi::NulError;
 use std::path::PathBuf;
 
-/// Failed to load a sequence state file.
 #[derive(Debug, Eq, PartialEq, thiserror::Error)]
 pub enum LoadSeqStateError {
-    /// llama.cpp failed to load the sequence state file
     #[error("Failed to load sequence state file")]
     FailedToLoad,
 
-    /// null byte in string
     #[error("null byte in string {0}")]
     NullError(#[from] NulError),
 
-    /// failed to convert path to str
     #[error("failed to convert path {0} to str")]
     PathToStrError(PathBuf),
 
-    /// Insufficient max length
     #[error("max_length is not large enough to hold {n_out} (was {max_tokens})")]
-    InsufficientMaxLength {
-        /// The length of the loaded sequence
-        n_out: usize,
-        /// The maximum length
-        max_tokens: usize,
-    },
+    InsufficientMaxLength { n_out: usize, max_tokens: usize },
 }
diff --git a/llama-cpp-bindings/src/context/load_session_error.rs b/llama-cpp-bindings/src/context/load_session_error.rs
index e317f278..be514e13 100644
--- a/llama-cpp-bindings/src/context/load_session_error.rs
+++ b/llama-cpp-bindings/src/context/load_session_error.rs
@@ -1,29 +1,17 @@
-//! Error type for session file load operations.
-
 use std::ffi::NulError;
 use std::path::PathBuf;
 
-/// Failed to load a session file.
 #[derive(Debug, Eq, PartialEq, thiserror::Error)]
 pub enum LoadSessionError {
-    /// llama.cpp failed to load the session file
     #[error("Failed to load session file")]
     FailedToLoad,
 
-    /// null byte in string
     #[error("null byte in string {0}")]
     NullError(#[from] NulError),
 
-    /// failed to convert path to str
     #[error("failed to convert path {0} to str")]
     PathToStrError(PathBuf),
 
-    /// Insufficient max length
     #[error("max_length is not large enough to hold {n_out} (was {max_tokens})")]
-    InsufficientMaxLength {
-        /// The length of the session file
-        n_out: usize,
-        /// The maximum length
-        max_tokens: usize,
-    },
+    InsufficientMaxLength { n_out: usize, max_tokens: usize },
 }
diff --git a/llama-cpp-bindings/src/context/params.rs b/llama-cpp-bindings/src/context/params.rs
index 13935e21..0b2f8348 100644
--- a/llama-cpp-bindings/src/context/params.rs
+++ b/llama-cpp-bindings/src/context/params.rs
@@ -1,4 +1,3 @@
-//! A safe wrapper around `llama_context_params`.
 use std::fmt::Debug;
 use std::num::NonZeroU32;
 
@@ -7,21 +6,6 @@ pub use crate::context::llama_attention_type::LlamaAttentionType;
 pub use crate::context::llama_pooling_type::LlamaPoolingType;
 pub use crate::context::rope_scaling_type::RopeScalingType;
 
-/// A safe wrapper around `llama_context_params`.
-///
-/// Generally this should be created with [`Default::default()`] and then modified with `with_*` methods.
-///
-/// # Examples
-///
-/// ```rust
-/// # use std::num::NonZeroU32;
-/// use llama_cpp_bindings::context::params::LlamaContextParams;
-///
-///let ctx_params = LlamaContextParams::default()
-///    .with_n_ctx(NonZeroU32::new(2048));
-///
-/// assert_eq!(ctx_params.n_ctx(), NonZeroU32::new(2048));
-/// ```
 #[derive(Debug, Clone)]
 #[expect(
     missing_docs,
@@ -38,105 +22,43 @@ pub struct LlamaContextParams {
     pub context_params: llama_cpp_bindings_sys::llama_context_params,
 }
 
-/// SAFETY: we do not currently allow setting or reading the pointers that cause this to not be automatically send or sync.
 unsafe impl Send for LlamaContextParams {}
 unsafe impl Sync for LlamaContextParams {}
 
 impl LlamaContextParams {
-    /// Set the side of the context
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// # use std::num::NonZeroU32;
-    /// use llama_cpp_bindings::context::params::LlamaContextParams;
-    /// let params = LlamaContextParams::default();
-    /// let params = params.with_n_ctx(NonZeroU32::new(2048));
-    /// assert_eq!(params.n_ctx(), NonZeroU32::new(2048));
-    /// ```
     #[must_use]
     pub fn with_n_ctx(mut self, n_ctx: Option<NonZeroU32>) -> Self {
         self.context_params.n_ctx = n_ctx.map_or(0, NonZeroU32::get);
         self
     }
 
-    /// Get the size of the context.
-    ///
-    /// [`None`] if the context size is specified by the model and not the context.
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// let params = llama_cpp_bindings::context::params::LlamaContextParams::default();
-    /// assert_eq!(params.n_ctx(), std::num::NonZeroU32::new(512));
     #[must_use]
     pub const fn n_ctx(&self) -> Option<NonZeroU32> {
         NonZeroU32::new(self.context_params.n_ctx)
     }
 
-    /// Set the `n_batch`
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// # use std::num::NonZeroU32;
-    /// use llama_cpp_bindings::context::params::LlamaContextParams;
-    /// let params = LlamaContextParams::default()
-    ///     .with_n_batch(2048);
-    /// assert_eq!(params.n_batch(), 2048);
-    /// ```
     #[must_use]
     pub const fn with_n_batch(mut self, n_batch: u32) -> Self {
         self.context_params.n_batch = n_batch;
         self
     }
 
-    /// Get the `n_batch`
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// use llama_cpp_bindings::context::params::LlamaContextParams;
-    /// let params = LlamaContextParams::default();
-    /// assert_eq!(params.n_batch(), 2048);
-    /// ```
     #[must_use]
     pub const fn n_batch(&self) -> u32 {
         self.context_params.n_batch
     }
 
-    /// Set the `n_ubatch`
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// # use std::num::NonZeroU32;
-    /// use llama_cpp_bindings::context::params::LlamaContextParams;
-    /// let params = LlamaContextParams::default()
-    ///     .with_n_ubatch(512);
-    /// assert_eq!(params.n_ubatch(), 512);
-    /// ```
     #[must_use]
     pub const fn with_n_ubatch(mut self, n_ubatch: u32) -> Self {
         self.context_params.n_ubatch = n_ubatch;
         self
     }
 
-    /// Get the `n_ubatch`
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// use llama_cpp_bindings::context::params::LlamaContextParams;
-    /// let params = LlamaContextParams::default();
-    /// assert_eq!(params.n_ubatch(), 512);
-    /// ```
     #[must_use]
     pub const fn n_ubatch(&self) -> u32 {
         self.context_params.n_ubatch
     }
 
-    /// Set the flash attention policy using llama.cpp enum
     #[must_use]
     pub const fn with_flash_attention_policy(
         mut self,
@@ -146,232 +68,88 @@ impl LlamaContextParams {
         self
     }
 
-    /// Get the flash attention policy
     #[must_use]
     pub const fn flash_attention_policy(&self) -> llama_cpp_bindings_sys::llama_flash_attn_type {
         self.context_params.flash_attn_type
     }
 
-    /// Set the `offload_kqv` parameter to control offloading KV cache & KQV ops to GPU
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// use llama_cpp_bindings::context::params::LlamaContextParams;
-    /// let params = LlamaContextParams::default()
-    ///     .with_offload_kqv(false);
-    /// assert_eq!(params.offload_kqv(), false);
-    /// ```
     #[must_use]
     pub const fn with_offload_kqv(mut self, enabled: bool) -> Self {
         self.context_params.offload_kqv = enabled;
         self
     }
 
-    /// Get the `offload_kqv` parameter
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// use llama_cpp_bindings::context::params::LlamaContextParams;
-    /// let params = LlamaContextParams::default();
-    /// assert_eq!(params.offload_kqv(), true);
-    /// ```
     #[must_use]
     pub const fn offload_kqv(&self) -> bool {
         self.context_params.offload_kqv
     }
 
-    /// Set the type of rope scaling.
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// use llama_cpp_bindings::context::params::{LlamaContextParams, RopeScalingType};
-    /// let params = LlamaContextParams::default()
-    ///     .with_rope_scaling_type(RopeScalingType::Linear);
-    /// assert_eq!(params.rope_scaling_type(), RopeScalingType::Linear);
-    /// ```
     #[must_use]
     pub fn with_rope_scaling_type(mut self, rope_scaling_type: RopeScalingType) -> Self {
         self.context_params.rope_scaling_type = i32::from(rope_scaling_type);
         self
     }
 
-    /// Get the type of rope scaling.
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// let params = llama_cpp_bindings::context::params::LlamaContextParams::default();
-    /// assert_eq!(params.rope_scaling_type(), llama_cpp_bindings::context::params::RopeScalingType::Unspecified);
-    /// ```
     #[must_use]
     pub fn rope_scaling_type(&self) -> RopeScalingType {
         RopeScalingType::from(self.context_params.rope_scaling_type)
     }
 
-    /// Set the rope frequency base.
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// use llama_cpp_bindings::context::params::LlamaContextParams;
-    /// let params = LlamaContextParams::default()
-    ///    .with_rope_freq_base(0.5);
-    /// assert_eq!(params.rope_freq_base(), 0.5);
-    /// ```
     #[must_use]
     pub const fn with_rope_freq_base(mut self, rope_freq_base: f32) -> Self {
         self.context_params.rope_freq_base = rope_freq_base;
         self
     }
 
-    /// Get the rope frequency base.
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// let params = llama_cpp_bindings::context::params::LlamaContextParams::default();
-    /// assert_eq!(params.rope_freq_base(), 0.0);
-    /// ```
     #[must_use]
     pub const fn rope_freq_base(&self) -> f32 {
         self.context_params.rope_freq_base
     }
 
-    /// Set the rope frequency scale.
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// use llama_cpp_bindings::context::params::LlamaContextParams;
-    /// let params = LlamaContextParams::default()
-    ///   .with_rope_freq_scale(0.5);
-    /// assert_eq!(params.rope_freq_scale(), 0.5);
-    /// ```
     #[must_use]
     pub const fn with_rope_freq_scale(mut self, rope_freq_scale: f32) -> Self {
         self.context_params.rope_freq_scale = rope_freq_scale;
         self
     }
 
-    /// Get the rope frequency scale.
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// let params = llama_cpp_bindings::context::params::LlamaContextParams::default();
-    /// assert_eq!(params.rope_freq_scale(), 0.0);
-    /// ```
     #[must_use]
     pub const fn rope_freq_scale(&self) -> f32 {
         self.context_params.rope_freq_scale
     }
 
-    /// Get the number of threads.
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// let params = llama_cpp_bindings::context::params::LlamaContextParams::default();
-    /// assert_eq!(params.n_threads(), 4);
-    /// ```
     #[must_use]
     pub const fn n_threads(&self) -> i32 {
         self.context_params.n_threads
     }
 
-    /// Get the number of threads allocated for batches.
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// let params = llama_cpp_bindings::context::params::LlamaContextParams::default();
-    /// assert_eq!(params.n_threads_batch(), 4);
-    /// ```
     #[must_use]
     pub const fn n_threads_batch(&self) -> i32 {
         self.context_params.n_threads_batch
     }
 
-    /// Set the number of threads.
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// use llama_cpp_bindings::context::params::LlamaContextParams;
-    /// let params = LlamaContextParams::default()
-    ///    .with_n_threads(8);
-    /// assert_eq!(params.n_threads(), 8);
-    /// ```
     #[must_use]
     pub const fn with_n_threads(mut self, n_threads: i32) -> Self {
         self.context_params.n_threads = n_threads;
         self
     }
 
-    /// Set the number of threads allocated for batches.
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// use llama_cpp_bindings::context::params::LlamaContextParams;
-    /// let params = LlamaContextParams::default()
-    ///    .with_n_threads_batch(8);
-    /// assert_eq!(params.n_threads_batch(), 8);
-    /// ```
     #[must_use]
     pub const fn with_n_threads_batch(mut self, n_threads: i32) -> Self {
         self.context_params.n_threads_batch = n_threads;
         self
     }
 
-    /// Check whether embeddings are enabled
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// let params = llama_cpp_bindings::context::params::LlamaContextParams::default();
-    /// assert!(!params.embeddings());
-    /// ```
     #[must_use]
     pub const fn embeddings(&self) -> bool {
         self.context_params.embeddings
     }
 
-    /// Enable the use of embeddings
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// use llama_cpp_bindings::context::params::LlamaContextParams;
-    /// let params = LlamaContextParams::default()
-    ///    .with_embeddings(true);
-    /// assert!(params.embeddings());
-    /// ```
     #[must_use]
     pub const fn with_embeddings(mut self, embedding: bool) -> Self {
         self.context_params.embeddings = embedding;
         self
     }
 
-    /// Set the evaluation callback.
-    ///
-    /// # Examples
-    ///
-    /// ```no_run
-    /// extern "C" fn cb_eval_fn(
-    ///     t: *mut llama_cpp_bindings_sys::ggml_tensor,
-    ///     ask: bool,
-    ///     user_data: *mut std::ffi::c_void,
-    /// ) -> bool {
-    ///     false
-    /// }
-    ///
-    /// use llama_cpp_bindings::context::params::LlamaContextParams;
-    /// let params = LlamaContextParams::default().with_cb_eval(Some(cb_eval_fn));
-    /// ```
     #[must_use]
     pub fn with_cb_eval(
         mut self,
@@ -381,16 +159,6 @@ impl LlamaContextParams {
         self
     }
 
-    /// Set the evaluation callback user data.
-    ///
-    /// # Examples
-    ///
-    /// ```no_run
-    /// use llama_cpp_bindings::context::params::LlamaContextParams;
-    /// let params = LlamaContextParams::default();
-    /// let user_data = std::ptr::null_mut();
-    /// let params = params.with_cb_eval_user_data(user_data);
-    /// ```
     #[must_use]
     pub const fn with_cb_eval_user_data(
         mut self,
@@ -400,382 +168,171 @@ impl LlamaContextParams {
         self
     }
 
-    /// Set the type of pooling.
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// use llama_cpp_bindings::context::params::{LlamaContextParams, LlamaPoolingType};
-    /// let params = LlamaContextParams::default()
-    ///     .with_pooling_type(LlamaPoolingType::Last);
-    /// assert_eq!(params.pooling_type(), LlamaPoolingType::Last);
-    /// ```
     #[must_use]
     pub fn with_pooling_type(mut self, pooling_type: LlamaPoolingType) -> Self {
         self.context_params.pooling_type = i32::from(pooling_type);
         self
     }
 
-    /// Get the type of pooling.
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// let params = llama_cpp_bindings::context::params::LlamaContextParams::default();
-    /// assert_eq!(params.pooling_type(), llama_cpp_bindings::context::params::LlamaPoolingType::Unspecified);
-    /// ```
     #[must_use]
     pub fn pooling_type(&self) -> LlamaPoolingType {
         LlamaPoolingType::from(self.context_params.pooling_type)
     }
 
-    /// Set whether to use full sliding window attention
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// use llama_cpp_bindings::context::params::LlamaContextParams;
-    /// let params = LlamaContextParams::default()
-    ///     .with_swa_full(false);
-    /// assert_eq!(params.swa_full(), false);
-    /// ```
     #[must_use]
     pub const fn with_swa_full(mut self, enabled: bool) -> Self {
         self.context_params.swa_full = enabled;
         self
     }
 
-    /// Get whether full sliding window attention is enabled
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// use llama_cpp_bindings::context::params::LlamaContextParams;
-    /// let params = LlamaContextParams::default();
-    /// assert_eq!(params.swa_full(), true);
-    /// ```
     #[must_use]
     pub const fn swa_full(&self) -> bool {
         self.context_params.swa_full
     }
 
-    /// Set the max number of sequences (i.e. distinct states for recurrent models)
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// use llama_cpp_bindings::context::params::LlamaContextParams;
-    /// let params = LlamaContextParams::default()
-    ///     .with_n_seq_max(64);
-    /// assert_eq!(params.n_seq_max(), 64);
-    /// ```
     #[must_use]
     pub const fn with_n_seq_max(mut self, n_seq_max: u32) -> Self {
         self.context_params.n_seq_max = n_seq_max;
         self
     }
 
-    /// Get the max number of sequences (i.e. distinct states for recurrent models)
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// use llama_cpp_bindings::context::params::LlamaContextParams;
-    /// let params = LlamaContextParams::default();
-    /// assert_eq!(params.n_seq_max(), 1);
-    /// ```
     #[must_use]
     pub const fn n_seq_max(&self) -> u32 {
         self.context_params.n_seq_max
     }
-    /// Set the KV cache data type for K
-    /// use `llama_cpp_bindings::context::params::{LlamaContextParams`, `KvCacheType`};
-    /// let params = `LlamaContextParams::default().with_type_k(KvCacheType::Q4_0)`;
-    /// `assert_eq!(params.type_k()`, `KvCacheType::Q4_0`);
-    /// ```
     #[must_use]
     pub fn with_type_k(mut self, type_k: KvCacheType) -> Self {
         self.context_params.type_k = type_k.into();
         self
     }
 
-    /// Get the KV cache data type for K
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// let params = llama_cpp_bindings::context::params::LlamaContextParams::default();
-    /// let _ = params.type_k();
-    /// ```
     #[must_use]
     pub fn type_k(&self) -> KvCacheType {
         KvCacheType::from(self.context_params.type_k)
     }
 
-    /// Set the KV cache data type for V
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// use llama_cpp_bindings::context::params::{LlamaContextParams, KvCacheType};
-    /// let params = LlamaContextParams::default().with_type_v(KvCacheType::Q4_1);
-    /// assert_eq!(params.type_v(), KvCacheType::Q4_1);
-    /// ```
     #[must_use]
     pub fn with_type_v(mut self, type_v: KvCacheType) -> Self {
         self.context_params.type_v = type_v.into();
         self
     }
 
-    /// Get the KV cache data type for V
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// let params = llama_cpp_bindings::context::params::LlamaContextParams::default();
-    /// let _ = params.type_v();
-    /// ```
     #[must_use]
     pub fn type_v(&self) -> KvCacheType {
         KvCacheType::from(self.context_params.type_v)
     }
 
-    /// Set the attention type
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// use llama_cpp_bindings::context::params::{LlamaContextParams, LlamaAttentionType};
-    /// let params = LlamaContextParams::default()
-    ///     .with_attention_type(LlamaAttentionType::NonCausal);
-    /// assert_eq!(params.attention_type(), LlamaAttentionType::NonCausal);
-    /// ```
     #[must_use]
     pub fn with_attention_type(mut self, attention_type: LlamaAttentionType) -> Self {
         self.context_params.attention_type = i32::from(attention_type);
         self
     }
 
-    /// Get the attention type
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// let params = llama_cpp_bindings::context::params::LlamaContextParams::default();
-    /// assert_eq!(params.attention_type(), llama_cpp_bindings::context::params::LlamaAttentionType::Unspecified);
-    /// ```
     #[must_use]
     pub fn attention_type(&self) -> LlamaAttentionType {
         LlamaAttentionType::from(self.context_params.attention_type)
     }
 
-    /// Set the `YaRN` extrapolation factor
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// use llama_cpp_bindings::context::params::LlamaContextParams;
-    /// let params = LlamaContextParams::default()
-    ///     .with_yarn_ext_factor(1.0);
-    /// assert!((params.yarn_ext_factor() - 1.0).abs() < f32::EPSILON);
-    /// ```
     #[must_use]
     pub const fn with_yarn_ext_factor(mut self, yarn_ext_factor: f32) -> Self {
         self.context_params.yarn_ext_factor = yarn_ext_factor;
         self
     }
 
-    /// Get the `YaRN` extrapolation factor
     #[must_use]
     pub const fn yarn_ext_factor(&self) -> f32 {
         self.context_params.yarn_ext_factor
     }
 
-    /// Set the `YaRN` attention factor
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// use llama_cpp_bindings::context::params::LlamaContextParams;
-    /// let params = LlamaContextParams::default()
-    ///     .with_yarn_attn_factor(2.0);
-    /// assert!((params.yarn_attn_factor() - 2.0).abs() < f32::EPSILON);
-    /// ```
     #[must_use]
     pub const fn with_yarn_attn_factor(mut self, yarn_attn_factor: f32) -> Self {
         self.context_params.yarn_attn_factor = yarn_attn_factor;
         self
     }
 
-    /// Get the `YaRN` attention factor
     #[must_use]
     pub const fn yarn_attn_factor(&self) -> f32 {
         self.context_params.yarn_attn_factor
     }
 
-    /// Set the `YaRN` low correction dim
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// use llama_cpp_bindings::context::params::LlamaContextParams;
-    /// let params = LlamaContextParams::default()
-    ///     .with_yarn_beta_fast(32.0);
-    /// assert!((params.yarn_beta_fast() - 32.0).abs() < f32::EPSILON);
-    /// ```
     #[must_use]
     pub const fn with_yarn_beta_fast(mut self, yarn_beta_fast: f32) -> Self {
         self.context_params.yarn_beta_fast = yarn_beta_fast;
         self
     }
 
-    /// Get the `YaRN` low correction dim
     #[must_use]
     pub const fn yarn_beta_fast(&self) -> f32 {
         self.context_params.yarn_beta_fast
     }
 
-    /// Set the `YaRN` high correction dim
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// use llama_cpp_bindings::context::params::LlamaContextParams;
-    /// let params = LlamaContextParams::default()
-    ///     .with_yarn_beta_slow(1.0);
-    /// assert!((params.yarn_beta_slow() - 1.0).abs() < f32::EPSILON);
-    /// ```
     #[must_use]
     pub const fn with_yarn_beta_slow(mut self, yarn_beta_slow: f32) -> Self {
         self.context_params.yarn_beta_slow = yarn_beta_slow;
         self
     }
 
-    /// Get the `YaRN` high correction dim
     #[must_use]
     pub const fn yarn_beta_slow(&self) -> f32 {
         self.context_params.yarn_beta_slow
     }
 
-    /// Set the `YaRN` original context size
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// use llama_cpp_bindings::context::params::LlamaContextParams;
-    /// let params = LlamaContextParams::default()
-    ///     .with_yarn_orig_ctx(4096);
-    /// assert_eq!(params.yarn_orig_ctx(), 4096);
-    /// ```
     #[must_use]
     pub const fn with_yarn_orig_ctx(mut self, yarn_orig_ctx: u32) -> Self {
         self.context_params.yarn_orig_ctx = yarn_orig_ctx;
         self
     }
 
-    /// Get the `YaRN` original context size
     #[must_use]
     pub const fn yarn_orig_ctx(&self) -> u32 {
         self.context_params.yarn_orig_ctx
     }
 
-    /// Set the KV cache defragmentation threshold
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// use llama_cpp_bindings::context::params::LlamaContextParams;
-    /// let params = LlamaContextParams::default()
-    ///     .with_defrag_thold(0.1);
-    /// assert!((params.defrag_thold() - 0.1).abs() < f32::EPSILON);
-    /// ```
     #[must_use]
     pub const fn with_defrag_thold(mut self, defrag_thold: f32) -> Self {
         self.context_params.defrag_thold = defrag_thold;
         self
     }
 
-    /// Get the KV cache defragmentation threshold
     #[must_use]
     pub const fn defrag_thold(&self) -> f32 {
         self.context_params.defrag_thold
     }
 
-    /// Set whether performance timings are disabled
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// use llama_cpp_bindings::context::params::LlamaContextParams;
-    /// let params = LlamaContextParams::default()
-    ///     .with_no_perf(true);
-    /// assert!(params.no_perf());
-    /// ```
     #[must_use]
     pub const fn with_no_perf(mut self, no_perf: bool) -> Self {
         self.context_params.no_perf = no_perf;
         self
     }
 
-    /// Get whether performance timings are disabled
     #[must_use]
     pub const fn no_perf(&self) -> bool {
         self.context_params.no_perf
     }
 
-    /// Set whether to offload ops to GPU
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// use llama_cpp_bindings::context::params::LlamaContextParams;
-    /// let params = LlamaContextParams::default()
-    ///     .with_op_offload(false);
-    /// assert!(!params.op_offload());
-    /// ```
     #[must_use]
     pub const fn with_op_offload(mut self, op_offload: bool) -> Self {
         self.context_params.op_offload = op_offload;
         self
     }
 
-    /// Get whether ops are offloaded to GPU
     #[must_use]
     pub const fn op_offload(&self) -> bool {
         self.context_params.op_offload
     }
 
-    /// Set whether to use a unified KV cache buffer across input sequences
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// use llama_cpp_bindings::context::params::LlamaContextParams;
-    /// let params = LlamaContextParams::default()
-    ///     .with_kv_unified(true);
-    /// assert!(params.kv_unified());
-    /// ```
     #[must_use]
     pub const fn with_kv_unified(mut self, kv_unified: bool) -> Self {
         self.context_params.kv_unified = kv_unified;
         self
     }
 
-    /// Get whether a unified KV cache buffer is used across input sequences
     #[must_use]
     pub const fn kv_unified(&self) -> bool {
         self.context_params.kv_unified
     }
 }
 
-/// Default parameters for `LlamaContext`. (as defined in llama.cpp by `llama_context_default_params`)
-/// ```
-/// # use std::num::NonZeroU32;
-/// use llama_cpp_bindings::context::params::{LlamaContextParams, RopeScalingType};
-/// let params = LlamaContextParams::default();
-/// assert_eq!(params.n_ctx(), NonZeroU32::new(512), "n_ctx should be 512");
-/// assert_eq!(params.rope_scaling_type(), RopeScalingType::Unspecified);
-/// ```
 impl Default for LlamaContextParams {
     fn default() -> Self {
         let context_params = unsafe { llama_cpp_bindings_sys::llama_context_default_params() };
diff --git a/llama-cpp-bindings/src/context/rope_scaling_type.rs b/llama-cpp-bindings/src/context/rope_scaling_type.rs
index 0bbfa831..92aca372 100644
--- a/llama-cpp-bindings/src/context/rope_scaling_type.rs
+++ b/llama-cpp-bindings/src/context/rope_scaling_type.rs
@@ -1,19 +1,12 @@
-/// A rusty wrapper around `rope_scaling_type`.
 #[repr(i8)]
 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
 pub enum RopeScalingType {
-    /// The scaling type is unspecified
     Unspecified = -1,
-    /// No scaling
     None = 0,
-    /// Linear scaling
     Linear = 1,
-    /// Yarn scaling
     Yarn = 2,
 }
 
-/// Create a `RopeScalingType` from a `c_int` - returns `RopeScalingType::ScalingUnspecified` if
-/// the value is not recognized.
 impl From<i32> for RopeScalingType {
     fn from(value: i32) -> Self {
         match value {
@@ -25,7 +18,6 @@ impl From<i32> for RopeScalingType {
     }
 }
 
-/// Create a `c_int` from a `RopeScalingType`.
 impl From<RopeScalingType> for i32 {
     fn from(value: RopeScalingType) -> Self {
         match value {
diff --git a/llama-cpp-bindings/src/context/save_seq_state_error.rs b/llama-cpp-bindings/src/context/save_seq_state_error.rs
index 129cd1cd..96410430 100644
--- a/llama-cpp-bindings/src/context/save_seq_state_error.rs
+++ b/llama-cpp-bindings/src/context/save_seq_state_error.rs
@@ -1,20 +1,14 @@
-//! Error type for sequence state file save operations.
-
 use std::ffi::NulError;
 use std::path::PathBuf;
 
-/// Failed to save a sequence state file.
 #[derive(Debug, Eq, PartialEq, thiserror::Error)]
 pub enum SaveSeqStateError {
-    /// llama.cpp failed to save the sequence state file
     #[error("Failed to save sequence state file")]
     FailedToSave,
 
-    /// null byte in string
     #[error("null byte in string {0}")]
     NullError(#[from] NulError),
 
-    /// failed to convert path to str
     #[error("failed to convert path {0} to str")]
     PathToStrError(PathBuf),
 }
diff --git a/llama-cpp-bindings/src/context/save_session_error.rs b/llama-cpp-bindings/src/context/save_session_error.rs
index 95999328..6814182e 100644
--- a/llama-cpp-bindings/src/context/save_session_error.rs
+++ b/llama-cpp-bindings/src/context/save_session_error.rs
@@ -1,20 +1,14 @@
-//! Error type for session file save operations.
-
 use std::ffi::NulError;
 use std::path::PathBuf;
 
-/// Failed to save a session file.
 #[derive(Debug, Eq, PartialEq, thiserror::Error)]
 pub enum SaveSessionError {
-    /// llama.cpp failed to save the session file
     #[error("Failed to save session file")]
     FailedToSave,
 
-    /// null byte in string
     #[error("null byte in string {0}")]
     NullError(#[from] NulError),
 
-    /// failed to convert path to str
     #[error("failed to convert path {0} to str")]
     PathToStrError(PathBuf),
 }
diff --git a/llama-cpp-bindings/src/context/session.rs b/llama-cpp-bindings/src/context/session.rs
index 3c7d211c..4a3f16ba 100644
--- a/llama-cpp-bindings/src/context/session.rs
+++ b/llama-cpp-bindings/src/context/session.rs
@@ -1,5 +1,3 @@
-//! utilities for working with session files
-
 use crate::context::LlamaContext;
 use crate::context::llama_state_seq_flags::LlamaStateSeqFlags;
 use crate::context::load_seq_state_error::LoadSeqStateError;
@@ -49,14 +47,6 @@ fn process_seq_load_result(
 }
 
 impl LlamaContext<'_> {
-    /// Save the full state to a file.
-    ///
-    /// # Parameters
-    ///
-    /// * `path_session` - The file to save to.
-    /// * `tokens` - The tokens to associate the state with. This should be a prefix of a sequence
-    ///   of tokens that the context has processed, so that the relevant KV caches are already filled.
-    ///
     /// # Errors
     ///
     /// Fails if the path is not a valid utf8 or llama.cpp fails to save the state file.
@@ -88,18 +78,6 @@ impl LlamaContext<'_> {
         }
     }
 
-    /// Load a state file into the current context.
-    ///
-    /// You still need to pass the returned tokens to the context for inference to work. What this
-    /// function buys you is that the KV caches are already filled with the relevant data.
-    ///
-    /// # Parameters
-    ///
-    /// * `path_session` - The file to load from. It must be a state file from a compatible context,
-    ///   otherwise the function will error.
-    /// * `max_tokens` - The maximum token length of the loaded state. If the state was saved with a
-    ///   longer length, the function will error.
-    ///
     /// # Errors
     ///
     /// Fails if the path is not a valid utf8 or llama.cpp fails to load the state file.
@@ -134,24 +112,10 @@ impl LlamaContext<'_> {
         process_session_load_result(success, n_out, max_tokens, tokens)
     }
 
-    /// Save state for a single sequence to a file.
-    ///
-    /// This enables saving state for individual sequences, which is useful for multi-sequence
-    /// inference scenarios.
-    ///
-    /// # Parameters
-    ///
-    /// * `filepath` - The file to save to.
-    /// * `seq_id` - The sequence ID whose state to save.
-    /// * `tokens` - The tokens to associate with the saved state.
-    ///
     /// # Errors
     ///
     /// Fails if the path is not a valid utf8 or llama.cpp fails to save the sequence state file.
     ///
-    /// # Returns
-    ///
-    /// The number of bytes written on success.
     pub fn state_seq_save_file(
         &self,
         filepath: impl AsRef<Path>,
@@ -184,24 +148,10 @@ impl LlamaContext<'_> {
         }
     }
 
-    /// Load state for a single sequence from a file.
-    ///
-    /// This enables loading state for individual sequences, which is useful for multi-sequence
-    /// inference scenarios.
-    ///
-    /// # Parameters
-    ///
-    /// * `filepath` - The file to load from.
-    /// * `dest_seq_id` - The destination sequence ID to load the state into.
-    /// * `max_tokens` - The maximum number of tokens to read.
-    ///
     /// # Errors
     ///
     /// Fails if the path is not a valid utf8 or llama.cpp fails to load the sequence state file.
     ///
-    /// # Returns
-    ///
-    /// A tuple of `(tokens, bytes_read)` on success.
     pub fn state_seq_load_file(
         &mut self,
         filepath: impl AsRef<Path>,
@@ -236,19 +186,11 @@ impl LlamaContext<'_> {
         process_seq_load_result(bytes_read, n_out, max_tokens, tokens)
     }
 
-    /// Returns the maximum size in bytes of the state (rng, logits, embedding
-    /// and `kv_cache`) - will often be smaller after compacting tokens
     #[must_use]
     pub fn get_state_size(&self) -> usize {
         unsafe { llama_cpp_bindings_sys::llama_state_get_size(self.context.as_ptr()) }
     }
 
-    /// Copies the state to the specified destination buffer.
-    ///
-    /// Use [`get_state_size`](Self::get_state_size) to determine the required buffer size.
-    ///
-    /// Returns the number of bytes copied.
-    ///
     /// # Safety
     ///
     /// The `dest` buffer must be large enough to hold the complete state data.
@@ -262,10 +204,6 @@ impl LlamaContext<'_> {
         }
     }
 
-    /// Set the state reading from the specified buffer.
-    ///
-    /// Returns the number of bytes read.
-    ///
     /// # Safety
     ///
     /// The `src` buffer must contain data previously obtained from [`copy_state_data`](Self::copy_state_data)
@@ -281,10 +219,6 @@ impl LlamaContext<'_> {
         }
     }
 
-    /// Get the size of the state data for a specific sequence, with extended flags.
-    ///
-    /// Useful for hybrid/recurrent models where partial state (e.g., only SSM state)
-    /// may be saved or restored.
     #[must_use]
     pub fn state_seq_get_size_ext(&self, seq_id: i32, flags: &LlamaStateSeqFlags) -> usize {
         unsafe {
@@ -296,13 +230,6 @@ impl LlamaContext<'_> {
         }
     }
 
-    /// Copy state data for a specific sequence into `dest`, with extended flags.
-    ///
-    /// Use [`state_seq_get_size_ext`](Self::state_seq_get_size_ext) to determine the required
-    /// buffer size before calling this method.
-    ///
-    /// Returns the number of bytes written.
-    ///
     /// # Safety
     ///
     /// The `dest` buffer must be large enough to hold the complete state data.
@@ -323,10 +250,6 @@ impl LlamaContext<'_> {
         }
     }
 
-    /// Restore state data for a specific sequence from `src`, with extended flags.
-    ///
-    /// Returns the number of bytes read.
-    ///
     /// # Safety
     ///
     /// The `src` buffer must contain data previously obtained from
diff --git a/llama-cpp-bindings/src/error.rs b/llama-cpp-bindings/src/error.rs
index ba684109..436edad7 100644
--- a/llama-cpp-bindings/src/error.rs
+++ b/llama-cpp-bindings/src/error.rs
@@ -70,5 +70,4 @@ pub use token_to_string_error::TokenToStringError;
 pub use tool_call_format_failure::ToolCallFormatFailure;
 pub use xml_function_tags_failure::XmlFunctionTagsFailure;
 
-/// A failable result from a llama.cpp function.
 pub type Result<TValue> = std::result::Result<TValue, LlamaCppError>;
diff --git a/llama-cpp-bindings/src/error/apply_chat_template_error.rs b/llama-cpp-bindings/src/error/apply_chat_template_error.rs
index 251dda35..363c9f38 100644
--- a/llama-cpp-bindings/src/error/apply_chat_template_error.rs
+++ b/llama-cpp-bindings/src/error/apply_chat_template_error.rs
@@ -1,12 +1,9 @@
 use std::string::FromUtf8Error;
 
-/// Failed to apply model chat template.
 #[derive(Debug, thiserror::Error)]
 pub enum ApplyChatTemplateError {
-    /// the string could not be converted to utf8.
     #[error("{0}")]
     FromUtf8Error(#[from] FromUtf8Error),
-    /// An integer conversion failed.
     #[error("Integer conversion error: {0}")]
     IntConversionError(#[from] std::num::TryFromIntError),
 }
diff --git a/llama-cpp-bindings/src/error/bracketed_args_failure.rs b/llama-cpp-bindings/src/error/bracketed_args_failure.rs
index 8750a9be..dcda30ae 100644
--- a/llama-cpp-bindings/src/error/bracketed_args_failure.rs
+++ b/llama-cpp-bindings/src/error/bracketed_args_failure.rs
@@ -1,4 +1,3 @@
-/// Failures specific to the bracketed-JSON args parser (Mistral 3 `[TOOL_CALLS]name[ARGS]{...}`).
 #[derive(Debug, thiserror::Error)]
 pub enum BracketedArgsFailure {
     #[error("tool call '{tool_name}' arguments are not valid JSON: {message}")]
diff --git a/llama-cpp-bindings/src/error/chat_template_error.rs b/llama-cpp-bindings/src/error/chat_template_error.rs
index 190b96fa..d063de05 100644
--- a/llama-cpp-bindings/src/error/chat_template_error.rs
+++ b/llama-cpp-bindings/src/error/chat_template_error.rs
@@ -1,17 +1,13 @@
 use std::ffi::NulError;
 
-/// There was an error while getting the chat template from a model.
 #[derive(Debug, Eq, PartialEq, thiserror::Error)]
 pub enum ChatTemplateError {
-    /// gguf has no chat template (by that name)
     #[error("chat template not found - returned null pointer")]
     MissingTemplate,
 
-    /// chat template contained a null byte
     #[error("null byte in string {0}")]
     NullError(#[from] NulError),
 
-    /// The chat template was not valid utf8.
     #[error(transparent)]
     Utf8Error(#[from] std::str::Utf8Error),
 }
diff --git a/llama-cpp-bindings/src/error/embeddings_error.rs b/llama-cpp-bindings/src/error/embeddings_error.rs
index a01bb428..9555f196 100644
--- a/llama-cpp-bindings/src/error/embeddings_error.rs
+++ b/llama-cpp-bindings/src/error/embeddings_error.rs
@@ -1,16 +1,11 @@
-/// When embedding related functions fail
 #[derive(Debug, Eq, PartialEq, thiserror::Error)]
 pub enum EmbeddingsError {
-    /// Embeddings weren't enabled in the context options
     #[error("Embeddings weren't enabled in the context options")]
     NotEnabled,
-    /// Logits weren't enabled for the given token
     #[error("Logits were not enabled for the given token")]
     LogitsNotEnabled,
-    /// The given sequence index exceeds the max sequence id
     #[error("Can't use sequence embeddings with a model supporting only LLAMA_POOLING_TYPE_NONE")]
     NonePoolType,
-    /// The embedding dimension does not fit into a usize.
     #[error("Invalid embedding dimension: {0}")]
     InvalidEmbeddingDimension(#[source] std::num::TryFromIntError),
 }
diff --git a/llama-cpp-bindings/src/error/eval_multimodal_chunks_error.rs b/llama-cpp-bindings/src/error/eval_multimodal_chunks_error.rs
index 146bcedb..d5d485f7 100644
--- a/llama-cpp-bindings/src/error/eval_multimodal_chunks_error.rs
+++ b/llama-cpp-bindings/src/error/eval_multimodal_chunks_error.rs
@@ -1,16 +1,12 @@
 use crate::mtmd::MtmdEvalError;
 use crate::mtmd::mtmd_input_chunk_type_error::MtmdInputChunkTypeError;
 
-/// Failed to evaluate multimodal chunks through the request classifier.
 #[derive(Debug, thiserror::Error)]
 pub enum EvalMultimodalChunksError {
-    /// `MtmdInputChunks::eval_chunks` returned an error.
     #[error("{0}")]
     EvalFailed(#[from] MtmdEvalError),
-    /// A chunk reported a type that is not known to this binding.
     #[error("{0}")]
     UnknownChunkType(#[from] MtmdInputChunkTypeError),
-    /// A chunk index that was within `chunks.len()` returned `None` from `chunks.get(index)`.
     #[error("chunk index {0} out of bounds during post-eval walk")]
     ChunkOutOfBounds(usize),
 }
diff --git a/llama-cpp-bindings/src/error/fit_error.rs b/llama-cpp-bindings/src/error/fit_error.rs
index 2d6fe6b5..fbb809c5 100644
--- a/llama-cpp-bindings/src/error/fit_error.rs
+++ b/llama-cpp-bindings/src/error/fit_error.rs
@@ -1,20 +1,13 @@
-/// Returned by [`crate::model::params::LlamaModelParams::fit_params`].
 #[derive(Debug, Clone, PartialEq, Eq, thiserror::Error)]
 pub enum FitError {
-    /// No combination of model parameters fits the available device memory.
     #[error("no parameter combination fits available memory")]
     NoFittingMemoryLayout,
-    /// Parameter fitting was aborted by a hard error reported by the underlying library
-    /// (e.g., model file missing, backend initialization failed).
     #[error("parameter fitting aborted")]
     Aborted,
-    /// The fitting helper returned a status code the wrapper does not recognise.
     #[error("parameter fitting returned an unknown status code: {code}")]
     UnknownStatus { code: i32 },
-    /// Wrapper could not allocate memory for an error message.
     #[error("not enough memory")]
     NotEnoughMemory,
-    /// Generic exception caught at the wrapper boundary, with the underlying message.
     #[error("{message}")]
     Reported { message: String },
 }
diff --git a/llama-cpp-bindings/src/error/json_object_failure.rs b/llama-cpp-bindings/src/error/json_object_failure.rs
index b5d88570..e18868ce 100644
--- a/llama-cpp-bindings/src/error/json_object_failure.rs
+++ b/llama-cpp-bindings/src/error/json_object_failure.rs
@@ -1,4 +1,3 @@
-/// Failures specific to the JSON-object args parser (Qwen 3 `<tool_call>{"name":..., "arguments":...}</tool_call>`).
 #[derive(Debug, thiserror::Error)]
 pub enum JsonObjectFailure {
     #[error("tool call body has malformed JSON: {message}")]
diff --git a/llama-cpp-bindings/src/error/key_value_xml_tags_failure.rs b/llama-cpp-bindings/src/error/key_value_xml_tags_failure.rs
index 3c46093a..83941376 100644
--- a/llama-cpp-bindings/src/error/key_value_xml_tags_failure.rs
+++ b/llama-cpp-bindings/src/error/key_value_xml_tags_failure.rs
@@ -1,4 +1,3 @@
-/// Failures specific to the key-value XML-tags parser (GLM-4.7 `<tool_call>{name}<arg_key>{k}</arg_key><arg_value>{v}</arg_value>...</tool_call>`).
 #[derive(Debug, thiserror::Error)]
 pub enum KeyValueXmlTagsFailure {
     #[error("tool call function tag has empty name")]
diff --git a/llama-cpp-bindings/src/error/llama_lora_adapter_remove_error.rs b/llama-cpp-bindings/src/error/llama_lora_adapter_remove_error.rs
index 3d536c4a..cf9be711 100644
--- a/llama-cpp-bindings/src/error/llama_lora_adapter_remove_error.rs
+++ b/llama-cpp-bindings/src/error/llama_lora_adapter_remove_error.rs
@@ -1,7 +1,5 @@
-/// An error that can occur when loading a model.
 #[derive(Debug, Eq, PartialEq, thiserror::Error)]
 pub enum LlamaLoraAdapterRemoveError {
-    /// llama.cpp returned a non-zero error code.
     #[error("error code from llama cpp")]
     ErrorResult(i32),
 }
diff --git a/llama-cpp-bindings/src/error/llama_lora_adapter_set_error.rs b/llama-cpp-bindings/src/error/llama_lora_adapter_set_error.rs
index 362f6ca1..3bca954f 100644
--- a/llama-cpp-bindings/src/error/llama_lora_adapter_set_error.rs
+++ b/llama-cpp-bindings/src/error/llama_lora_adapter_set_error.rs
@@ -1,7 +1,5 @@
-/// An error that can occur when loading a model.
 #[derive(Debug, Eq, PartialEq, thiserror::Error)]
 pub enum LlamaLoraAdapterSetError {
-    /// llama.cpp returned a non-zero error code.
     #[error("error code from llama cpp")]
     ErrorResult(i32),
 }
diff --git a/llama-cpp-bindings/src/error/logits_error.rs b/llama-cpp-bindings/src/error/logits_error.rs
index f6a198d2..8462a9b9 100644
--- a/llama-cpp-bindings/src/error/logits_error.rs
+++ b/llama-cpp-bindings/src/error/logits_error.rs
@@ -1,24 +1,13 @@
-/// When logits-related functions fail
 #[derive(Debug, Eq, PartialEq, thiserror::Error)]
 pub enum LogitsError {
-    /// The logits data pointer is null.
     #[error("logits data pointer is null")]
     NullLogits,
-    /// The requested token index has not been initialized for logits.
     #[error("logit for token index {0} is not initialized")]
     TokenNotInitialized(i32),
-    /// The token index exceeds the context size.
     #[error("token index {token_index} exceeds context size {context_size}")]
-    TokenIndexExceedsContext {
-        /// The token index that was requested.
-        token_index: u32,
-        /// The context size.
-        context_size: u32,
-    },
-    /// The vocabulary size does not fit into a usize.
+    TokenIndexExceedsContext { token_index: u32, context_size: u32 },
     #[error("n_vocab does not fit into usize: {0}")]
     VocabSizeOverflow(#[source] std::num::TryFromIntError),
-    /// The token index does not fit into a u32.
     #[error("token_index does not fit into u32: {0}")]
     TokenIndexOverflow(#[source] std::num::TryFromIntError),
 }
diff --git a/llama-cpp-bindings/src/error/meta_val_error.rs b/llama-cpp-bindings/src/error/meta_val_error.rs
index 30b07223..ecd86e6b 100644
--- a/llama-cpp-bindings/src/error/meta_val_error.rs
+++ b/llama-cpp-bindings/src/error/meta_val_error.rs
@@ -1,18 +1,14 @@
 use std::ffi::NulError;
 use std::string::FromUtf8Error;
 
-/// Failed fetching metadata value
 #[derive(Debug, Eq, PartialEq, thiserror::Error)]
 pub enum MetaValError {
-    /// The provided string contains an unexpected null-byte
     #[error("null byte in string {0}")]
     NullError(#[from] NulError),
 
-    /// The returned data contains invalid UTF8 data
     #[error("FromUtf8Error {0}")]
     FromUtf8Error(#[from] FromUtf8Error),
 
-    /// Got negative return value. This happens if the key or index queried does not exist.
     #[error("Negative return value. Likely due to a missing index or key. Got return value: {0}")]
     NegativeReturn(i32),
 }
diff --git a/llama-cpp-bindings/src/error/model_params_error.rs b/llama-cpp-bindings/src/error/model_params_error.rs
index 377596f1..8e70ebb4 100644
--- a/llama-cpp-bindings/src/error/model_params_error.rs
+++ b/llama-cpp-bindings/src/error/model_params_error.rs
@@ -1,18 +1,9 @@
-/// Errors that can occur when modifying model parameters.
 #[derive(Debug, Eq, PartialEq, thiserror::Error)]
 pub enum ModelParamsError {
-    /// The internal override vector has no available slot.
     #[error("No available slot in override vector")]
     NoAvailableSlot,
-    /// The first override slot is not empty.
     #[error("Override slot is not empty")]
     SlotNotEmpty,
-    /// A character in the key is not a valid C char.
     #[error("Invalid character in key: byte {byte}, {reason}")]
-    InvalidCharacterInKey {
-        /// The byte value that failed conversion.
-        byte: u8,
-        /// The reason the conversion failed.
-        reason: String,
-    },
+    InvalidCharacterInKey { byte: u8, reason: String },
 }
diff --git a/llama-cpp-bindings/src/error/new_llama_chat_message_error.rs b/llama-cpp-bindings/src/error/new_llama_chat_message_error.rs
index c7076486..d38337bf 100644
--- a/llama-cpp-bindings/src/error/new_llama_chat_message_error.rs
+++ b/llama-cpp-bindings/src/error/new_llama_chat_message_error.rs
@@ -1,9 +1,7 @@
 use std::ffi::NulError;
 
-/// Failed to apply model chat template.
 #[derive(Debug, thiserror::Error)]
 pub enum NewLlamaChatMessageError {
-    /// the string contained a null byte and thus could not be converted to a c string.
     #[error("{0}")]
     NulError(#[from] NulError),
 }
diff --git a/llama-cpp-bindings/src/error/paired_quote_failure.rs b/llama-cpp-bindings/src/error/paired_quote_failure.rs
index 9a2a3d85..53b50aa8 100644
--- a/llama-cpp-bindings/src/error/paired_quote_failure.rs
+++ b/llama-cpp-bindings/src/error/paired_quote_failure.rs
@@ -1,4 +1,3 @@
-/// Failures specific to the paired-quote args parser (Gemma 4 `<|tool_call>call:name{key:<|"|>val<|"|>}`).
 #[derive(Debug, thiserror::Error)]
 pub enum PairedQuoteFailure {
     #[error("empty key in tool call '{tool_name}' arguments")]
diff --git a/llama-cpp-bindings/src/error/sampling_error.rs b/llama-cpp-bindings/src/error/sampling_error.rs
index 7a2e7346..de13b87e 100644
--- a/llama-cpp-bindings/src/error/sampling_error.rs
+++ b/llama-cpp-bindings/src/error/sampling_error.rs
@@ -1,7 +1,5 @@
-/// Errors that can occur when creating a sampling configuration.
 #[derive(Debug, Eq, PartialEq, thiserror::Error)]
 pub enum SamplingError {
-    /// An integer value exceeded the allowed range
     #[error("Integer overflow: {0}")]
     IntegerOverflow(String),
 }
diff --git a/llama-cpp-bindings/src/error/token_sampling_error.rs b/llama-cpp-bindings/src/error/token_sampling_error.rs
index da1bc7f0..90b89dcc 100644
--- a/llama-cpp-bindings/src/error/token_sampling_error.rs
+++ b/llama-cpp-bindings/src/error/token_sampling_error.rs
@@ -1,7 +1,5 @@
-/// Failed to sample a token from the data array.
 #[derive(Debug, Eq, PartialEq, thiserror::Error)]
 pub enum TokenSamplingError {
-    /// The sampler did not select any token.
     #[error("No token was selected by the sampler")]
     NoTokenSelected,
 }
diff --git a/llama-cpp-bindings/src/error/token_to_string_error.rs b/llama-cpp-bindings/src/error/token_to_string_error.rs
index 0fb0eb89..af3ea657 100644
--- a/llama-cpp-bindings/src/error/token_to_string_error.rs
+++ b/llama-cpp-bindings/src/error/token_to_string_error.rs
@@ -1,20 +1,15 @@
 use std::os::raw::c_int;
 use std::string::FromUtf8Error;
 
-/// An error that can occur when converting a token to a string.
 #[derive(Debug, thiserror::Error, Clone)]
 #[non_exhaustive]
 pub enum TokenToStringError {
-    /// the token type was unknown
     #[error("Unknown Token Type")]
     UnknownTokenType,
-    /// There was insufficient buffer space to convert the token to a string.
     #[error("Insufficient Buffer Space {0}")]
     InsufficientBufferSpace(c_int),
-    /// The token was not valid utf8.
     #[error("FromUtf8Error {0}")]
     FromUtf8Error(#[from] FromUtf8Error),
-    /// An integer conversion failed.
     #[error("Integer conversion error: {0}")]
     IntConversionError(#[from] std::num::TryFromIntError),
 }
diff --git a/llama-cpp-bindings/src/error/tool_call_format_failure.rs b/llama-cpp-bindings/src/error/tool_call_format_failure.rs
index ca1bd3d7..e188f81b 100644
--- a/llama-cpp-bindings/src/error/tool_call_format_failure.rs
+++ b/llama-cpp-bindings/src/error/tool_call_format_failure.rs
@@ -4,7 +4,6 @@ use crate::error::key_value_xml_tags_failure::KeyValueXmlTagsFailure;
 use crate::error::paired_quote_failure::PairedQuoteFailure;
 use crate::error::xml_function_tags_failure::XmlFunctionTagsFailure;
 
-/// Top-level failure for the wrapper-side template-override parsers (one variant per supported shape).
 #[derive(Debug, thiserror::Error)]
 pub enum ToolCallFormatFailure {
     #[error("bracketed-args fallback parser: {0}")]
diff --git a/llama-cpp-bindings/src/error/xml_function_tags_failure.rs b/llama-cpp-bindings/src/error/xml_function_tags_failure.rs
index 49180c00..bdff9936 100644
--- a/llama-cpp-bindings/src/error/xml_function_tags_failure.rs
+++ b/llama-cpp-bindings/src/error/xml_function_tags_failure.rs
@@ -1,4 +1,3 @@
-/// Failures specific to the XML function-tags parser (Qwen 3.5+ `<function=name><parameter=key>val</parameter></function>`).
 #[derive(Debug, thiserror::Error)]
 pub enum XmlFunctionTagsFailure {
     #[error("tool call function tag has empty name")]
diff --git a/llama-cpp-bindings/src/ffi_error_reader.rs b/llama-cpp-bindings/src/ffi_error_reader.rs
index 59313c27..77fa0359 100644
--- a/llama-cpp-bindings/src/ffi_error_reader.rs
+++ b/llama-cpp-bindings/src/ffi_error_reader.rs
@@ -1,7 +1,5 @@
 use std::ffi::{CStr, c_char};
 
-/// Reads a C error string, converts to Rust `String`, and frees the C memory.
-///
 /// # Safety
 ///
 /// `error_ptr` must be either null or a valid pointer to a null-terminated
diff --git a/llama-cpp-bindings/src/ffi_status_is_ok.rs b/llama-cpp-bindings/src/ffi_status_is_ok.rs
index f847162a..7127c5c2 100644
--- a/llama-cpp-bindings/src/ffi_status_is_ok.rs
+++ b/llama-cpp-bindings/src/ffi_status_is_ok.rs
@@ -1,4 +1,3 @@
-/// Returns true if the given status indicates success.
 #[must_use]
 pub const fn status_is_ok(status: llama_cpp_bindings_sys::llama_rs_status) -> bool {
     status == llama_cpp_bindings_sys::LLAMA_RS_STATUS_OK
diff --git a/llama-cpp-bindings/src/ffi_status_to_i32.rs b/llama-cpp-bindings/src/ffi_status_to_i32.rs
index a181d57c..faf7e39d 100644
--- a/llama-cpp-bindings/src/ffi_status_to_i32.rs
+++ b/llama-cpp-bindings/src/ffi_status_to_i32.rs
@@ -1,4 +1,3 @@
-/// Converts a status code to its underlying `i32` representation.
 #[must_use]
 pub const fn status_to_i32(status: llama_cpp_bindings_sys::llama_rs_status) -> i32 {
     status
diff --git a/llama-cpp-bindings/src/ggml_time_us.rs b/llama-cpp-bindings/src/ggml_time_us.rs
index 4db4b490..4d9db374 100644
--- a/llama-cpp-bindings/src/ggml_time_us.rs
+++ b/llama-cpp-bindings/src/ggml_time_us.rs
@@ -1,20 +1,3 @@
-/// Get the time in microseconds according to ggml.
-///
-/// ```
-/// # use std::time::Duration;
-/// # use llama_cpp_bindings::llama_backend::LlamaBackend;
-/// let backend = LlamaBackend::init().unwrap();
-/// use llama_cpp_bindings::ggml_time_us;
-///
-/// let start = ggml_time_us();
-///
-/// std::thread::sleep(Duration::from_micros(10));
-///
-/// let end = ggml_time_us();
-///
-/// let elapsed = end - start;
-///
-/// assert!(elapsed >= 10)
 #[must_use]
 pub fn ggml_time_us() -> i64 {
     unsafe { llama_cpp_bindings_sys::ggml_time_us() }
diff --git a/llama-cpp-bindings/src/gguf_context.rs b/llama-cpp-bindings/src/gguf_context.rs
index 7ef7114c..d51e2667 100644
--- a/llama-cpp-bindings/src/gguf_context.rs
+++ b/llama-cpp-bindings/src/gguf_context.rs
@@ -1,7 +1,3 @@
-//! Safe wrapper around `gguf_context` for reading GGUF file metadata.
-//!
-//! Provides metadata-only access to GGUF files without loading tensor data.
-
 use std::ffi::{CStr, CString};
 use std::path::Path;
 use std::ptr::NonNull;
@@ -9,18 +5,12 @@ use std::ptr::NonNull;
 use crate::gguf_context_error::GgufContextError;
 use crate::gguf_type::GgufType;
 
-/// A safe wrapper around `gguf_context`.
-///
-/// Opens a GGUF file in metadata-only mode (`no_alloc = true`), allowing
-/// inspection of key-value pairs and tensor metadata without loading tensor data.
 #[derive(Debug)]
 pub struct GgufContext {
     context: NonNull<llama_cpp_bindings_sys::gguf_context>,
 }
 
 impl GgufContext {
-    /// Open a GGUF file and parse its metadata header.
-    ///
     /// # Errors
     ///
     /// Returns [`GgufContextError::InitFailed`] if the file cannot be opened or parsed.
@@ -46,14 +36,11 @@ impl GgufContext {
         Ok(Self { context })
     }
 
-    /// Returns the number of key-value pairs in the GGUF file.
     #[must_use]
     pub fn n_kv(&self) -> i64 {
         unsafe { llama_cpp_bindings_sys::gguf_get_n_kv(self.context.as_ptr()) }
     }
 
-    /// Find the index of a key by name.
-    ///
     /// # Errors
     ///
     /// Returns [`GgufContextError::KeyNotFound`] if the key does not exist.
@@ -72,8 +59,6 @@ impl GgufContext {
         Ok(index)
     }
 
-    /// Returns the key name at the given index.
-    ///
     /// # Safety considerations
     ///
     /// The caller must ensure `key_id` is in range `[0, n_kv())`.
@@ -92,8 +77,6 @@ impl GgufContext {
         Ok(c_str.to_str()?)
     }
 
-    /// Returns the value type of the key-value pair at the given index.
-    ///
     /// # Safety considerations
     ///
     /// The caller must ensure `key_id` is in range `[0, n_kv())`.
@@ -105,8 +88,6 @@ impl GgufContext {
         GgufType::from_raw(raw)
     }
 
-    /// Returns the u32 value at the given key index.
-    ///
     /// # Safety considerations
     ///
     /// The caller must ensure the key at `key_id` has type [`GgufType::Uint32`].
@@ -115,8 +96,6 @@ impl GgufContext {
         unsafe { llama_cpp_bindings_sys::gguf_get_val_u32(self.context.as_ptr(), key_id) }
     }
 
-    /// Returns the i32 value at the given key index.
-    ///
     /// # Safety considerations
     ///
     /// The caller must ensure the key at `key_id` has type [`GgufType::Int32`].
@@ -125,8 +104,6 @@ impl GgufContext {
         unsafe { llama_cpp_bindings_sys::gguf_get_val_i32(self.context.as_ptr(), key_id) }
     }
 
-    /// Returns the u64 value at the given key index.
-    ///
     /// # Safety considerations
     ///
     /// The caller must ensure the key at `key_id` has type [`GgufType::Uint64`].
@@ -135,8 +112,6 @@ impl GgufContext {
         unsafe { llama_cpp_bindings_sys::gguf_get_val_u64(self.context.as_ptr(), key_id) }
     }
 
-    /// Returns the string value at the given key index.
-    ///
     /// # Safety considerations
     ///
     /// The caller must ensure the key at `key_id` has type [`GgufType::String`].
@@ -155,7 +130,6 @@ impl GgufContext {
         Ok(c_str.to_str()?)
     }
 
-    /// Returns the number of tensors in the GGUF file.
     #[must_use]
     pub fn n_tensors(&self) -> i64 {
         unsafe { llama_cpp_bindings_sys::gguf_get_n_tensors(self.context.as_ptr()) }
diff --git a/llama-cpp-bindings/src/gguf_context_error.rs b/llama-cpp-bindings/src/gguf_context_error.rs
index ba1aa0dc..69523c9d 100644
--- a/llama-cpp-bindings/src/gguf_context_error.rs
+++ b/llama-cpp-bindings/src/gguf_context_error.rs
@@ -1,31 +1,20 @@
-//! Error types for GGUF context operations.
-
 use std::ffi::NulError;
 use std::path::PathBuf;
 
-/// Errors that can occur when working with GGUF contexts.
 #[derive(Debug, thiserror::Error)]
 pub enum GgufContextError {
-    /// Failed to initialize GGUF context from file
     #[error("Failed to initialize GGUF context from file: {0}")]
     InitFailed(PathBuf),
 
-    /// Key not found in GGUF metadata
     #[error("Key not found in GGUF context: {key}")]
-    KeyNotFound {
-        /// The key that was not found
-        key: String,
-    },
+    KeyNotFound { key: String },
 
-    /// Null byte in string
     #[error("null byte in string: {0}")]
     NulError(#[from] NulError),
 
-    /// Path cannot be converted to UTF-8
     #[error("failed to convert path {0} to str")]
     PathToStrError(PathBuf),
 
-    /// Value is not valid UTF-8
     #[error("GGUF value is not valid UTF-8: {0}")]
     Utf8Error(#[from] std::str::Utf8Error),
 }
diff --git a/llama-cpp-bindings/src/gguf_type.rs b/llama-cpp-bindings/src/gguf_type.rs
index 33de25cd..e59451e1 100644
--- a/llama-cpp-bindings/src/gguf_type.rs
+++ b/llama-cpp-bindings/src/gguf_type.rs
@@ -1,39 +1,22 @@
-//! GGUF value types.
-
-/// The type of a value stored in a GGUF key-value pair.
 #[repr(u32)]
 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
 pub enum GgufType {
-    /// 8-bit unsigned integer
     Uint8 = 0,
-    /// 8-bit signed integer
     Int8 = 1,
-    /// 16-bit unsigned integer
     Uint16 = 2,
-    /// 16-bit signed integer
     Int16 = 3,
-    /// 32-bit unsigned integer
     Uint32 = 4,
-    /// 32-bit signed integer
     Int32 = 5,
-    /// 32-bit floating point
     Float32 = 6,
-    /// Boolean
     Bool = 7,
-    /// String
     String = 8,
-    /// Array
     Array = 9,
-    /// 64-bit unsigned integer
     Uint64 = 10,
-    /// 64-bit signed integer
     Int64 = 11,
-    /// 64-bit floating point
     Float64 = 12,
 }
 
 impl GgufType {
-    /// Converts from the raw `gguf_type` value. Returns None for unknown types.
     #[must_use]
     pub const fn from_raw(value: llama_cpp_bindings_sys::gguf_type) -> Option<Self> {
         match value {
@@ -54,7 +37,6 @@ impl GgufType {
         }
     }
 
-    /// Converts to the raw `gguf_type` value.
     #[must_use]
     pub const fn to_raw(self) -> llama_cpp_bindings_sys::gguf_type {
         self as llama_cpp_bindings_sys::gguf_type
diff --git a/llama-cpp-bindings/src/ingest_outcome.rs b/llama-cpp-bindings/src/ingest_outcome.rs
index abf3a44b..56bc0efb 100644
--- a/llama-cpp-bindings/src/ingest_outcome.rs
+++ b/llama-cpp-bindings/src/ingest_outcome.rs
@@ -3,12 +3,6 @@ use crate::sampled_token::SampledToken;
 #[derive(Clone, Debug)]
 pub struct IngestOutcome {
     pub sampled_token: SampledToken,
-    /// Empty when the token is part of a recognised marker boundary; otherwise
-    /// the decoded UTF-8 piece. Callers should stream `visible_piece` and skip
-    /// emission when it is empty.
     pub visible_piece: String,
-    /// Always the decoded UTF-8 piece, even for marker-boundary tokens. Useful
-    /// for accumulating the full raw model output (e.g. for downstream parser
-    /// cross-checks) without losing marker bytes.
     pub raw_piece: String,
 }
diff --git a/llama-cpp-bindings/src/ingest_prompt_chunk.rs b/llama-cpp-bindings/src/ingest_prompt_chunk.rs
index c17b0993..c83ff230 100644
--- a/llama-cpp-bindings/src/ingest_prompt_chunk.rs
+++ b/llama-cpp-bindings/src/ingest_prompt_chunk.rs
@@ -3,17 +3,6 @@ use crate::mtmd::MtmdInputChunkType;
 use crate::mtmd::MtmdInputChunkTypeError;
 use crate::sampled_token_classifier::SampledTokenClassifier;
 
-/// Dispatches a single multimodal chunk into the classifier:
-/// - Text chunks bump `prompt_tokens` and replay every text token through the
-///   marker state machine, so prompt-end markers like `<think>` reach the
-///   classifier and the section transitions before generation begins.
-/// - Image / Audio chunks bump only their own usage counters; they have no
-///   text token IDs to replay.
-///
-/// This is the single canonical per-chunk ingest path for the multimodal
-/// driver. Any future per-chunk invariant (e.g. cached prefix replay) lives
-/// here so it cannot diverge between consumers.
-///
 /// # Errors
 /// Returns [`MtmdInputChunkTypeError`] when the chunk reports a type unknown
 /// to this binding. Counters are not updated on error.
diff --git a/llama-cpp-bindings/src/invalid_numa_strategy.rs b/llama-cpp-bindings/src/invalid_numa_strategy.rs
index 2d00b029..9f80058f 100644
--- a/llama-cpp-bindings/src/invalid_numa_strategy.rs
+++ b/llama-cpp-bindings/src/invalid_numa_strategy.rs
@@ -1,6 +1,2 @@
-/// An invalid numa strategy was provided.
 #[derive(Debug, Eq, PartialEq, Copy, Clone)]
-pub struct InvalidNumaStrategy(
-    /// The invalid numa strategy that was provided.
-    pub llama_cpp_bindings_sys::ggml_numa_strategy,
-);
+pub struct InvalidNumaStrategy(pub llama_cpp_bindings_sys::ggml_numa_strategy);
diff --git a/llama-cpp-bindings/src/lib.rs b/llama-cpp-bindings/src/lib.rs
index 9bed927b..9d3fc7e1 100644
--- a/llama-cpp-bindings/src/lib.rs
+++ b/llama-cpp-bindings/src/lib.rs
@@ -1,14 +1,3 @@
-//! Bindings to the llama.cpp library.
-//!
-//! As llama.cpp is a very fast moving target, this crate does not attempt to create a stable API
-//! with all the rust idioms. Instead it provided safe wrappers around nearly direct bindings to
-//! llama.cpp. This makes it easier to keep up with the changes in llama.cpp, but does mean that
-//! the API is not as nice as it could be.
-//!
-//! # Feature Flags
-//!
-//! - `cuda` enables CUDA gpu support.
-
 pub mod batch_add_error;
 pub mod chat_message_parse_outcome;
 pub mod context;
diff --git a/llama-cpp-bindings/src/llama_backend.rs b/llama-cpp-bindings/src/llama_backend.rs
index 30d83cf0..e6c8f4ee 100644
--- a/llama-cpp-bindings/src/llama_backend.rs
+++ b/llama-cpp-bindings/src/llama_backend.rs
@@ -1,22 +1,15 @@
-//! Representation of an initialized llama backend
-
 use crate::LlamaCppError;
 use crate::llama_backend_numa_strategy::NumaStrategy;
 use llama_cpp_bindings_sys::ggml_log_level;
 use std::sync::atomic::AtomicBool;
 use std::sync::atomic::Ordering::SeqCst;
 
-/// Representation of an initialized llama backend.
-///
-/// This is required as a parameter for most llama functions as the backend must be initialized
-/// before any llama functions are called. This type is proof of initialization.
 #[derive(Eq, PartialEq, Debug)]
 pub struct LlamaBackend {}
 
 static LLAMA_BACKEND_INITIALIZED: AtomicBool = AtomicBool::new(false);
 
 impl LlamaBackend {
-    /// Mark the llama backend as initialized
     fn mark_init() -> crate::Result<()> {
         match LLAMA_BACKEND_INITIALIZED.compare_exchange(false, true, SeqCst, SeqCst) {
             Ok(_was_uninitialized) => Ok(()),
@@ -24,25 +17,6 @@ impl LlamaBackend {
         }
     }
 
-    /// Initialize the llama backend (without numa).
-    ///
-    /// # Examples
-    ///
-    /// ```
-    ///# use llama_cpp_bindings::llama_backend::LlamaBackend;
-    ///# use llama_cpp_bindings::LlamaCppError;
-    ///# use std::error::Error;
-    ///
-    ///# fn main() -> Result<(), Box<dyn Error>> {
-    ///
-    ///
-    /// let backend = LlamaBackend::init()?;
-    /// // the llama backend can only be initialized once
-    /// assert!(matches!(LlamaBackend::init(), Err(LlamaCppError::BackendAlreadyInitialized)));
-    ///
-    ///# Ok(())
-    ///# }
-    /// ```
     /// # Errors
     /// Returns an error if the backend was already initialized.
     pub fn init() -> crate::Result<Self> {
@@ -51,19 +25,6 @@ impl LlamaBackend {
         Ok(Self {})
     }
 
-    /// Initialize the llama backend (with numa).
-    /// ```
-    ///# use llama_cpp_bindings::llama_backend::LlamaBackend;
-    ///# use std::error::Error;
-    ///# use llama_cpp_bindings::llama_backend_numa_strategy::NumaStrategy;
-    ///
-    ///# fn main() -> Result<(), Box<dyn Error>> {
-    ///
-    /// let llama_backend = LlamaBackend::init_numa(NumaStrategy::Mirror)?;
-    ///
-    ///# Ok(())
-    ///# }
-    /// ```
     /// # Errors
     /// Returns an error if the backend was already initialized.
     pub fn init_numa(strategy: NumaStrategy) -> crate::Result<Self> {
@@ -76,25 +37,21 @@ impl LlamaBackend {
         Ok(Self {})
     }
 
-    /// Was the code built for a GPU backend & is a supported one available.
     #[must_use]
     pub fn supports_gpu_offload(&self) -> bool {
         unsafe { llama_cpp_bindings_sys::llama_supports_gpu_offload() }
     }
 
-    /// Does this platform support loading the model via mmap.
     #[must_use]
     pub fn supports_mmap(&self) -> bool {
         unsafe { llama_cpp_bindings_sys::llama_supports_mmap() }
     }
 
-    /// Does this platform support locking the model in RAM.
     #[must_use]
     pub fn supports_mlock(&self) -> bool {
         unsafe { llama_cpp_bindings_sys::llama_supports_mlock() }
     }
 
-    /// Change the output of llama.cpp's logging to be voided instead of pushed to `stderr`.
     pub fn void_logs(&mut self) {
         unsafe {
             llama_cpp_bindings_sys::llama_log_set(Some(void_log), std::ptr::null_mut());
@@ -109,21 +66,6 @@ const unsafe extern "C" fn void_log(
 ) {
 }
 
-/// Drops the llama backend.
-/// ```
-///
-///# use llama_cpp_bindings::llama_backend::LlamaBackend;
-///# use std::error::Error;
-///
-///# fn main() -> Result<(), Box<dyn Error>> {
-/// let backend = LlamaBackend::init()?;
-/// drop(backend);
-/// // can be initialized again after being dropped
-/// let backend = LlamaBackend::init()?;
-///# Ok(())
-///# }
-///
-/// ```
 impl Drop for LlamaBackend {
     fn drop(&mut self) {
         LLAMA_BACKEND_INITIALIZED.store(false, SeqCst);
diff --git a/llama-cpp-bindings/src/llama_backend_device.rs b/llama-cpp-bindings/src/llama_backend_device.rs
index b5851efb..aa7ce51f 100644
--- a/llama-cpp-bindings/src/llama_backend_device.rs
+++ b/llama-cpp-bindings/src/llama_backend_device.rs
@@ -4,26 +4,14 @@ use crate::llama_backend_device_type::device_type_from_raw;
 
 pub use crate::llama_backend_device_type::LlamaBackendDeviceType;
 
-/// A ggml backend device
-///
-/// The index is can be used from `LlamaModelParams::with_devices` to select specific devices.
 #[derive(Debug, Clone)]
 pub struct LlamaBackendDevice {
-    /// The index of the device
-    ///
-    /// The index is can be used from `LlamaModelParams::with_devices` to select specific devices.
     pub index: usize,
-    /// The name of the device (e.g. "Vulkan0")
     pub name: String,
-    /// A description of the device (e.g. "NVIDIA `GeForce` RTX 3080")
     pub description: String,
-    /// The backend of the device (e.g. "Vulkan", "CUDA", "CPU")
     pub backend: String,
-    /// Total memory of the device in bytes
     pub memory_total: usize,
-    /// Free memory of the device in bytes
     pub memory_free: usize,
-    /// Device type
     pub device_type: LlamaBackendDeviceType,
 }
 
@@ -37,7 +25,6 @@ fn cstr_to_string(ptr: *const c_char) -> String {
     }
 }
 
-/// List ggml backend devices
 #[must_use]
 pub fn list_llama_ggml_backend_devices() -> Vec<LlamaBackendDevice> {
     let mut devices = Vec::new();
diff --git a/llama-cpp-bindings/src/llama_backend_device_type.rs b/llama-cpp-bindings/src/llama_backend_device_type.rs
index fd22c8fd..5f1885cd 100644
--- a/llama-cpp-bindings/src/llama_backend_device_type.rs
+++ b/llama-cpp-bindings/src/llama_backend_device_type.rs
@@ -1,15 +1,9 @@
-/// Backend device type
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum LlamaBackendDeviceType {
-    /// CPU device
     Cpu,
-    /// ACCEL device
     Accelerator,
-    /// GPU device
     Gpu,
-    /// iGPU device
     IntegratedGpu,
-    /// Unknown device type
     Unknown,
 }
 
diff --git a/llama-cpp-bindings/src/llama_backend_numa_strategy.rs b/llama-cpp-bindings/src/llama_backend_numa_strategy.rs
index 2be150fa..fccef05a 100644
--- a/llama-cpp-bindings/src/llama_backend_numa_strategy.rs
+++ b/llama-cpp-bindings/src/llama_backend_numa_strategy.rs
@@ -1,17 +1,11 @@
 use crate::invalid_numa_strategy::InvalidNumaStrategy;
 
-/// NUMA (Non-Uniform Memory Access) thread affinity strategy for llama.cpp.
 #[derive(Debug, Eq, PartialEq, Copy, Clone)]
 pub enum NumaStrategy {
-    /// NUMA-aware scheduling is disabled. Threads are not pinned to specific nodes.
     Disabled,
-    /// Distributes threads across NUMA nodes in a round-robin fashion.
     Distribute,
-    /// Pins all threads to the current NUMA node to avoid cross-node memory access.
     Isolate,
-    /// Respects the CPU affinity mask set externally by the `numactl` command.
     Numactl,
-    /// Mirrors memory across NUMA nodes. Currently a no-op in llama.cpp.
     Mirror,
 }
 
diff --git a/llama-cpp-bindings/src/llama_batch.rs b/llama-cpp-bindings/src/llama_batch.rs
index b6b8b189..cc6e93ee 100644
--- a/llama-cpp-bindings/src/llama_batch.rs
+++ b/llama-cpp-bindings/src/llama_batch.rs
@@ -1,5 +1,3 @@
-//! Safe wrapper around `llama_batch`.
-
 use crate::batch_add_error::BatchAddError;
 use crate::sampled_token::SampledToken;
 use crate::token::LlamaToken;
@@ -53,32 +51,20 @@ fn checked_usize_as_llama_pos(value: usize, description: &str) -> Result<llama_p
     })
 }
 
-/// A safe wrapper around `llama_batch`.
-///
-/// `PartialEq` is intentionally not implemented because the underlying `llama_batch`
-/// from the C API contains raw pointers whose address comparison would be meaningless.
 #[derive(Debug)]
 pub struct LlamaBatch<'tokens> {
-    /// The number of tokens the batch was allocated with. they are safe to write to - but not necessarily read from as they are not necessarily initialized
     allocated: usize,
-    /// The logits that are initialized. Used by [`LlamaContext`] to ensure that only initialized logits are accessed.
     pub initialized_logits: Vec<i32>,
-    /// The underlying `llama_batch` from the C API.
     pub llama_batch: llama_batch,
     phantom: PhantomData<&'tokens [LlamaToken]>,
 }
 
 impl<'tokens> LlamaBatch<'tokens> {
-    /// Clear the batch. This does not free the memory associated with the batch, but it does reset
-    /// the number of tokens to 0.
     pub fn clear(&mut self) {
         self.llama_batch.n_tokens = 0;
         self.initialized_logits.clear();
     }
 
-    /// add a token to the batch for sequences `seq_ids` at position `pos`. If `logits` is true, the
-    /// token will be initialized and can be read from after the next decode.
-    ///
     /// # Errors
     ///
     /// Returns an error if there is insufficient space in the buffer or if integer conversions fail.
@@ -126,11 +112,6 @@ impl<'tokens> LlamaBatch<'tokens> {
         Ok(())
     }
 
-    /// Add a sequence of tokens to the batch for the given sequence id. If `logits_all` is true, the
-    /// tokens will be initialized and can be read from after the next decode.
-    ///
-    /// Either way the last token in the sequence will have its logits set to `true`.
-    ///
     /// # Errors
     ///
     /// Returns an error if there is insufficient space in the buffer or if integer conversions fail.
@@ -154,13 +135,6 @@ impl<'tokens> LlamaBatch<'tokens> {
         Ok(())
     }
 
-    /// Create a new `LlamaBatch` that can contain up to `n_tokens` tokens.
-    ///
-    /// # Arguments
-    ///
-    /// - `n_tokens`: the maximum number of tokens that can be added to the batch
-    /// - `n_seq_max`: the maximum number of sequences that can be added to the batch (generally 1 unless you know what you are doing)
-    ///
     /// # Errors
     ///
     /// Returns an error if `n_tokens` exceeds `i32::MAX`.
@@ -176,11 +150,6 @@ impl<'tokens> LlamaBatch<'tokens> {
         })
     }
 
-    /// ``llama_batch_get_one``
-    /// Return batch for single sequence of tokens
-    ///
-    /// NOTE: this is a helper function to facilitate transition to the new batch API
-    ///
     /// # Errors
     ///
     /// Returns an error if the provided token buffer is empty or if integer conversions fail.
@@ -210,7 +179,6 @@ impl<'tokens> LlamaBatch<'tokens> {
         })
     }
 
-    /// Returns the number of tokens in the batch.
     #[must_use]
     pub const fn n_tokens(&self) -> i32 {
         self.llama_batch.n_tokens
@@ -218,17 +186,6 @@ impl<'tokens> LlamaBatch<'tokens> {
 }
 
 impl Drop for LlamaBatch<'_> {
-    /// Drops the `LlamaBatch`.
-    ///
-    /// ```
-    /// # use llama_cpp_bindings::llama_batch::LlamaBatch;
-    /// # use std::error::Error;
-    /// # fn main() -> Result<(), Box<dyn Error>> {
-    /// let batch = LlamaBatch::new(512, 1)?;
-    /// // frees the memory associated with the batch. (allocated by llama.cpp)
-    /// drop(batch);
-    /// # Ok(())
-    /// # }
     fn drop(&mut self) {
         unsafe {
             if self.allocated > 0 {
diff --git a/llama-cpp-bindings/src/llama_time_us.rs b/llama-cpp-bindings/src/llama_time_us.rs
index ee1c707e..63d43ad8 100644
--- a/llama-cpp-bindings/src/llama_time_us.rs
+++ b/llama-cpp-bindings/src/llama_time_us.rs
@@ -1,12 +1,3 @@
-/// Get the time (in microseconds) according to llama.cpp.
-///
-/// ```
-/// # use llama_cpp_bindings::llama_time_us;
-/// # use llama_cpp_bindings::llama_backend::LlamaBackend;
-/// let backend = LlamaBackend::init().unwrap();
-/// let time = llama_time_us();
-/// assert!(time > 0);
-/// ```
 #[must_use]
 pub fn llama_time_us() -> i64 {
     unsafe { llama_cpp_bindings_sys::llama_time_us() }
diff --git a/llama-cpp-bindings/src/llama_token_attr.rs b/llama-cpp-bindings/src/llama_token_attr.rs
index fb9de83c..9af9fb98 100644
--- a/llama-cpp-bindings/src/llama_token_attr.rs
+++ b/llama-cpp-bindings/src/llama_token_attr.rs
@@ -1,28 +1,17 @@
 use enumflags2::bitflags;
 
-/// A rust flavored equivalent of `llama_token_type`.
 #[derive(Eq, PartialEq, Debug, Clone, Copy)]
 #[bitflags]
 #[repr(u32)]
 pub enum LlamaTokenAttr {
-    /// Unknown token attribute.
     Unknown = llama_cpp_bindings_sys::LLAMA_TOKEN_ATTR_UNKNOWN as _,
-    /// Unused token attribute.
     Unused = llama_cpp_bindings_sys::LLAMA_TOKEN_ATTR_UNUSED as _,
-    /// Normal text token.
     Normal = llama_cpp_bindings_sys::LLAMA_TOKEN_ATTR_NORMAL as _,
-    /// Control token (e.g. BOS, EOS).
     Control = llama_cpp_bindings_sys::LLAMA_TOKEN_ATTR_CONTROL as _,
-    /// User-defined token.
     UserDefined = llama_cpp_bindings_sys::LLAMA_TOKEN_ATTR_USER_DEFINED as _,
-    /// Byte-level fallback token.
     Byte = llama_cpp_bindings_sys::LLAMA_TOKEN_ATTR_BYTE as _,
-    /// Token with normalized text.
     Normalized = llama_cpp_bindings_sys::LLAMA_TOKEN_ATTR_NORMALIZED as _,
-    /// Token with left-stripped whitespace.
     LStrip = llama_cpp_bindings_sys::LLAMA_TOKEN_ATTR_LSTRIP as _,
-    /// Token with right-stripped whitespace.
     RStrip = llama_cpp_bindings_sys::LLAMA_TOKEN_ATTR_RSTRIP as _,
-    /// Token representing a single word.
     SingleWord = llama_cpp_bindings_sys::LLAMA_TOKEN_ATTR_SINGLE_WORD as _,
 }
diff --git a/llama-cpp-bindings/src/llama_token_attrs.rs b/llama-cpp-bindings/src/llama_token_attrs.rs
index 872aeb4e..d5ecd6de 100644
--- a/llama-cpp-bindings/src/llama_token_attrs.rs
+++ b/llama-cpp-bindings/src/llama_token_attrs.rs
@@ -15,7 +15,6 @@ const fn llama_token_type_to_u32(value: llama_cpp_bindings_sys::llama_token_type
     value
 }
 
-/// A set of [`LlamaTokenAttr`] flags.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub struct LlamaTokenAttrs(pub BitFlags<LlamaTokenAttr>);
 
diff --git a/llama-cpp-bindings/src/llama_token_attrs_from_int_error.rs b/llama-cpp-bindings/src/llama_token_attrs_from_int_error.rs
index df1ad6c2..f294339d 100644
--- a/llama-cpp-bindings/src/llama_token_attrs_from_int_error.rs
+++ b/llama-cpp-bindings/src/llama_token_attrs_from_int_error.rs
@@ -1,9 +1,5 @@
-/// Returned by [`crate::llama_token_attrs::LlamaTokenAttrs::try_from`] when the
-/// integer bit pattern contains bits not defined by
-/// [`crate::llama_token_attr::LlamaTokenAttr`].
 #[derive(thiserror::Error, Debug, Eq, PartialEq)]
 pub enum LlamaTokenAttrsFromIntError {
-    /// The value is not a valid `llama_token_type`.
     #[error("Unknown Value {0}")]
     UnknownValue(std::ffi::c_uint),
 }
diff --git a/llama-cpp-bindings/src/llguidance_sampler.rs b/llama-cpp-bindings/src/llguidance_sampler.rs
index 76a987a9..c57dfe55 100644
--- a/llama-cpp-bindings/src/llguidance_sampler.rs
+++ b/llama-cpp-bindings/src/llguidance_sampler.rs
@@ -1,8 +1,3 @@
-//! Pure Rust llguidance sampler for constrained decoding.
-//!
-//! Implements a custom `llama_sampler` using the `llguidance` and `toktrie` Rust crates
-//! to enforce grammar constraints (JSON schema, regex, Lark, etc.) during token sampling.
-
 use std::ffi::c_void;
 use std::sync::Arc;
 
@@ -13,7 +8,6 @@ use crate::GrammarError;
 use crate::model::LlamaModel;
 use crate::sampling::LlamaSampler;
 
-/// Internal state for the llguidance sampler.
 struct LlgContext {
     matcher: Matcher,
     tok_env: Arc<ApproximateTokEnv>,
@@ -113,8 +107,6 @@ static mut LLG_SAMPLER_I: llama_cpp_bindings_sys::llama_sampler_i =
         backend_set_input: None,
     };
 
-/// Create an llguidance-based constrained decoding sampler.
-///
 /// # Errors
 ///
 /// Returns `GrammarError` if the parser factory, grammar, or parser cannot be created.
diff --git a/llama-cpp-bindings/src/load_backends.rs b/llama-cpp-bindings/src/load_backends.rs
index 22393a5a..54634f1a 100644
--- a/llama-cpp-bindings/src/load_backends.rs
+++ b/llama-cpp-bindings/src/load_backends.rs
@@ -3,17 +3,8 @@ use std::path::Path;
 use crate::load_backends_error::LoadBackendsError;
 use crate::load_backends_from_path::load_backends_from_path;
 
-/// Compile-time path to the built GGML backend modules directory.
-///
-/// Populated by `llama-cpp-bindings/build.rs` from the `DEP_LLAMA_BACKENDS_DIR` cargo metadata
-/// emitted by `llama-cpp-bindings-sys` when built with the `dynamic-backends` feature. `None`
-/// when the metadata is missing (e.g. when this crate is built outside the cargo workspace).
 pub const BACKENDS_DIR: Option<&str> = option_env!("GGML_BACKENDS_DIR");
 
-/// Load GGML backend modules from the compile-time default directory ([`BACKENDS_DIR`]).
-///
-/// This is a no-op when `BACKENDS_DIR` is `None`.
-///
 /// # Errors
 ///
 /// Returns [`LoadBackendsError::PathNotUtf8`] when `BACKENDS_DIR` cannot be converted to UTF-8
diff --git a/llama-cpp-bindings/src/load_backends_error.rs b/llama-cpp-bindings/src/load_backends_error.rs
index b3d628c3..3f7c5e6a 100644
--- a/llama-cpp-bindings/src/load_backends_error.rs
+++ b/llama-cpp-bindings/src/load_backends_error.rs
@@ -1,13 +1,10 @@
 use std::ffi::NulError;
 use std::path::PathBuf;
 
-/// Error returned when loading GGML backend modules from a path.
 #[derive(Debug, thiserror::Error)]
 pub enum LoadBackendsError {
-    /// The provided path could not be converted to UTF-8.
     #[error("backend directory path is not valid UTF-8: {0}")]
     PathNotUtf8(PathBuf),
-    /// The provided path contained an interior null byte.
     #[error("backend directory path contains a null byte: {0}")]
     PathNullByte(#[from] NulError),
 }
diff --git a/llama-cpp-bindings/src/load_backends_from_path.rs b/llama-cpp-bindings/src/load_backends_from_path.rs
index 7af9cce4..c2434e99 100644
--- a/llama-cpp-bindings/src/load_backends_from_path.rs
+++ b/llama-cpp-bindings/src/load_backends_from_path.rs
@@ -3,11 +3,6 @@ use std::path::Path;
 
 use crate::load_backends_error::LoadBackendsError;
 
-/// Load GGML backend modules from the given directory.
-///
-/// Call this before [`crate::llama_backend::LlamaBackend::init`] to enable runtime hardware
-/// selection (Vulkan, CPU-AVX512, CPU-AVX2, etc.) when built with the `dynamic-backends` feature.
-///
 /// # Errors
 ///
 /// Returns [`LoadBackendsError::PathNotUtf8`] when `path` cannot be converted to UTF-8 and
diff --git a/llama-cpp-bindings/src/log_options.rs b/llama-cpp-bindings/src/log_options.rs
index ca6eacca..6192d0ff 100644
--- a/llama-cpp-bindings/src/log_options.rs
+++ b/llama-cpp-bindings/src/log_options.rs
@@ -1,4 +1,3 @@
-/// Options to configure how llama.cpp logs are intercepted.
 #[derive(Default, Debug, Clone)]
 pub struct LogOptions {
     pub disabled: bool,
@@ -6,8 +5,6 @@ pub struct LogOptions {
 }
 
 impl LogOptions {
-    /// If enabled, logs are dispatched through the `log` crate. If disabled, all logs are
-    /// suppressed. Default is for logs to be dispatched.
     #[must_use]
     pub const fn with_logs_enabled(mut self, enabled: bool) -> Self {
         self.disabled = !enabled;
@@ -15,10 +12,6 @@ impl LogOptions {
         self
     }
 
-    /// When enabled, llama.cpp and ggml INFO logs are dispatched at DEBUG level. WARN and
-    /// ERROR logs retain their original severity. This suppresses verbose informational output
-    /// under a typical INFO-level logger while keeping important diagnostics visible.
-    /// All demoted logs remain available via `RUST_LOG=debug`.
     #[must_use]
     pub const fn with_demote_info_to_debug(mut self, demote: bool) -> Self {
         self.demote_info_to_debug = demote;
diff --git a/llama-cpp-bindings/src/max_devices.rs b/llama-cpp-bindings/src/max_devices.rs
index e5d12b4a..014eeaee 100644
--- a/llama-cpp-bindings/src/max_devices.rs
+++ b/llama-cpp-bindings/src/max_devices.rs
@@ -1,10 +1,3 @@
-/// Get the max number of devices according to llama.cpp (this is generally cuda devices).
-///
-/// ```
-/// # use llama_cpp_bindings::max_devices;
-/// let max_devices = max_devices();
-/// assert!(max_devices >= 0);
-/// ```
 #[must_use]
 pub fn max_devices() -> usize {
     unsafe { llama_cpp_bindings_sys::llama_max_devices() }
diff --git a/llama-cpp-bindings/src/mlock_supported.rs b/llama-cpp-bindings/src/mlock_supported.rs
index 2899a2a2..96bf1728 100644
--- a/llama-cpp-bindings/src/mlock_supported.rs
+++ b/llama-cpp-bindings/src/mlock_supported.rs
@@ -1,12 +1,3 @@
-/// Is memory locking supported according to llama.cpp.
-///
-/// ```
-/// # use llama_cpp_bindings::mlock_supported;
-/// let mlock_supported = mlock_supported();
-/// if mlock_supported {
-///    println!("mlock_supported!");
-/// }
-/// ```
 #[must_use]
 pub fn mlock_supported() -> bool {
     unsafe { llama_cpp_bindings_sys::llama_supports_mlock() }
diff --git a/llama-cpp-bindings/src/mmap_supported.rs b/llama-cpp-bindings/src/mmap_supported.rs
index b00d62c8..47ccbfe7 100644
--- a/llama-cpp-bindings/src/mmap_supported.rs
+++ b/llama-cpp-bindings/src/mmap_supported.rs
@@ -1,12 +1,3 @@
-/// Is memory mapping supported according to llama.cpp.
-///
-/// ```
-/// # use llama_cpp_bindings::mmap_supported;
-/// let mmap_supported = mmap_supported();
-/// if mmap_supported {
-///   println!("mmap_supported!");
-/// }
-/// ```
 #[must_use]
 pub fn mmap_supported() -> bool {
     unsafe { llama_cpp_bindings_sys::llama_supports_mmap() }
diff --git a/llama-cpp-bindings/src/model.rs b/llama-cpp-bindings/src/model.rs
index d55ee679..8c33486d 100644
--- a/llama-cpp-bindings/src/model.rs
+++ b/llama-cpp-bindings/src/model.rs
@@ -1,5 +1,3 @@
-//! A safe wrapper around `llama_model`.
-
 pub mod add_bos;
 pub mod llama_chat_message;
 pub mod llama_chat_template;
@@ -78,9 +76,7 @@ fn cstring_with_validated_len(str: &str) -> Result<(CString, c_int), StringToTok
     Ok((c_string, len))
 }
 
-/// A safe wrapper around `llama_model`.
 pub struct LlamaModel {
-    /// Raw pointer to the underlying `llama_model`.
     pub model: NonNull<llama_cpp_bindings_sys::llama_model>,
     tok_env: OnceLock<Arc<ApproximateTokEnv>>,
 }
@@ -98,14 +94,11 @@ unsafe impl Send for LlamaModel {}
 unsafe impl Sync for LlamaModel {}
 
 impl LlamaModel {
-    /// Returns a raw pointer to the model's vocabulary.
     #[must_use]
     pub fn vocab_ptr(&self) -> *const llama_cpp_bindings_sys::llama_vocab {
         unsafe { llama_cpp_bindings_sys::llama_model_get_vocab(self.model.as_ptr()) }
     }
 
-    /// Get the number of tokens the model was trained on.
-    ///
     /// # Errors
     ///
     /// Returns an error if the value returned by llama.cpp does not fit into a `u32`.
@@ -115,7 +108,6 @@ impl LlamaModel {
         u32::try_from(n_ctx_train)
     }
 
-    /// Get all tokens in the model.
     pub fn tokens(
         &self,
         decode_special: bool,
@@ -136,28 +128,24 @@ impl LlamaModel {
             })
     }
 
-    /// Get the beginning of stream token.
     #[must_use]
     pub fn token_bos(&self) -> LlamaToken {
         let token = unsafe { llama_cpp_bindings_sys::llama_token_bos(self.vocab_ptr()) };
         LlamaToken(token)
     }
 
-    /// Get the end of stream token.
     #[must_use]
     pub fn token_eos(&self) -> LlamaToken {
         let token = unsafe { llama_cpp_bindings_sys::llama_token_eos(self.vocab_ptr()) };
         LlamaToken(token)
     }
 
-    /// Get the newline token.
     #[must_use]
     pub fn token_nl(&self) -> LlamaToken {
         let token = unsafe { llama_cpp_bindings_sys::llama_token_nl(self.vocab_ptr()) };
         LlamaToken(token)
     }
 
-    /// Check if a token represents the end of generation (end of turn, end of sequence, etc.)
     #[must_use]
     pub fn is_eog_token(&self, token: &SampledToken) -> bool {
         let (SampledToken::Content(LlamaToken(id))
@@ -168,7 +156,6 @@ impl LlamaModel {
         unsafe { llama_cpp_bindings_sys::llama_token_is_eog(self.vocab_ptr(), id) }
     }
 
-    /// Get the decoder start token.
     #[must_use]
     pub fn decode_start_token(&self) -> LlamaToken {
         let token =
@@ -176,15 +163,12 @@ impl LlamaModel {
         LlamaToken(token)
     }
 
-    /// Get the separator token (SEP).
     #[must_use]
     pub fn token_sep(&self) -> LlamaToken {
         let token = unsafe { llama_cpp_bindings_sys::llama_vocab_sep(self.vocab_ptr()) };
         LlamaToken(token)
     }
 
-    /// Convert a string to a Vector of tokens.
-    ///
     /// # Errors
     ///
     /// - if [`str`] contains a null byte
@@ -194,14 +178,6 @@ impl LlamaModel {
     /// ```no_run
     /// use llama_cpp_bindings::model::LlamaModel;
     ///
-    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
-    /// use std::path::Path;
-    /// use llama_cpp_bindings::model::AddBos;
-    /// let backend = llama_cpp_bindings::llama_backend::LlamaBackend::init()?;
-    /// let model = LlamaModel::load_from_file(&backend, Path::new("path/to/model"), &Default::default())?;
-    /// let tokens = model.str_to_token("Hello, World!", AddBos::Always)?;
-    /// # Ok(())
-    /// # }
     pub fn str_to_token(
         &self,
         str: &str,
@@ -253,8 +229,6 @@ impl LlamaModel {
         Ok(buffer)
     }
 
-    /// Get the type of a token.
-    ///
     /// # Errors
     ///
     /// Returns an error if the token type is not known to this library.
@@ -268,16 +242,6 @@ impl LlamaModel {
         LlamaTokenAttrs::try_from(token_type)
     }
 
-    /// Convert a token to a string using the underlying llama.cpp `llama_token_to_piece` function.
-    ///
-    /// This is the new default function for token decoding and provides direct access to
-    /// the llama.cpp token decoding functionality without any special logic or filtering.
-    ///
-    /// Decoding raw string requires using an decoder, tokens from language models may not always map
-    /// to full characters depending on the encoding so stateful decoding is required, otherwise partial strings may be lost!
-    /// Invalid characters are mapped to REPLACEMENT CHARACTER making the method safe to use even if the model inherently produces
-    /// garbage.
-    ///
     /// # Errors
     ///
     /// - if the token type is unknown
@@ -310,12 +274,6 @@ impl LlamaModel {
         Ok(output_piece)
     }
 
-    /// Raw token decoding to bytes, use if you want to handle the decoding model output yourself
-    ///
-    /// Convert a token to bytes using the underlying llama.cpp `llama_token_to_piece` function. This is mostly
-    /// a thin wrapper around `llama_token_to_piece` function, that handles rust <-> c type conversions while
-    /// letting the caller handle errors. For a safer interface returning rust strings directly use `token_to_piece` instead!
-    ///
     /// # Errors
     ///
     /// - if the token type is unknown
@@ -356,17 +314,11 @@ impl LlamaModel {
         }
     }
 
-    /// The number of tokens the model was trained on.
-    ///
-    /// This returns a `c_int` for maximum compatibility. Most of the time it can be cast to an i32
-    /// without issue.
     #[must_use]
     pub fn n_vocab(&self) -> i32 {
         unsafe { llama_cpp_bindings_sys::llama_n_vocab(self.vocab_ptr()) }
     }
 
-    /// The type of vocab the model was trained on.
-    ///
     /// # Errors
     ///
     /// Returns an error if llama.cpp emits a vocab type that is not known to this library.
@@ -376,33 +328,26 @@ impl LlamaModel {
         VocabType::try_from(vocab_type)
     }
 
-    /// This returns a `c_int` for maximum compatibility. Most of the time it can be cast to an i32
-    /// without issue.
     #[must_use]
     pub fn n_embd(&self) -> c_int {
         unsafe { llama_cpp_bindings_sys::llama_n_embd(self.model.as_ptr()) }
     }
 
-    /// Returns the total size of all the tensors in the model in bytes.
     #[must_use]
     pub fn size(&self) -> u64 {
         unsafe { llama_cpp_bindings_sys::llama_model_size(self.model.as_ptr()) }
     }
 
-    /// Returns the number of parameters in the model.
     #[must_use]
     pub fn n_params(&self) -> u64 {
         unsafe { llama_cpp_bindings_sys::llama_model_n_params(self.model.as_ptr()) }
     }
 
-    /// Returns whether the model is a recurrent network (Mamba, RWKV, etc)
     #[must_use]
     pub fn is_recurrent(&self) -> bool {
         unsafe { llama_cpp_bindings_sys::llama_model_is_recurrent(self.model.as_ptr()) }
     }
 
-    /// Returns the number of layers within the model.
-    ///
     /// # Errors
     ///
     /// Returns an error if the layer count returned by llama.cpp does not fit into a `u32`.
@@ -410,8 +355,6 @@ impl LlamaModel {
         u32::try_from(unsafe { llama_cpp_bindings_sys::llama_model_n_layer(self.model.as_ptr()) })
     }
 
-    /// Returns the number of attention heads within the model.
-    ///
     /// # Errors
     ///
     /// Returns an error if the head count returned by llama.cpp does not fit into a `u32`.
@@ -419,8 +362,6 @@ impl LlamaModel {
         u32::try_from(unsafe { llama_cpp_bindings_sys::llama_model_n_head(self.model.as_ptr()) })
     }
 
-    /// Returns the number of KV attention heads.
-    ///
     /// # Errors
     ///
     /// Returns an error if the KV head count returned by llama.cpp does not fit into a `u32`.
@@ -428,16 +369,11 @@ impl LlamaModel {
         u32::try_from(unsafe { llama_cpp_bindings_sys::llama_model_n_head_kv(self.model.as_ptr()) })
     }
 
-    /// Returns whether the model is a hybrid network (Jamba, Granite, Qwen3xx, etc.)
-    ///
-    /// Hybrid models have both attention layers and recurrent/SSM layers.
     #[must_use]
     pub fn is_hybrid(&self) -> bool {
         unsafe { llama_cpp_bindings_sys::llama_model_is_hybrid(self.model.as_ptr()) }
     }
 
-    /// Get metadata value as a string by key name
-    ///
     /// # Errors
     /// Returns an error if the key is not found or the value is not valid UTF-8.
     pub fn meta_val_str(&self, key: &str) -> Result<String, MetaValError> {
@@ -457,14 +393,11 @@ impl LlamaModel {
         )
     }
 
-    /// Get the number of metadata key/value pairs
     #[must_use]
     pub fn meta_count(&self) -> i32 {
         unsafe { llama_cpp_bindings_sys::llama_model_meta_count(self.model.as_ptr()) }
     }
 
-    /// Get metadata key name by index
-    ///
     /// # Errors
     /// Returns an error if the index is out of range or the key is not valid UTF-8.
     pub fn meta_key_by_index(&self, index: i32) -> Result<String, MetaValError> {
@@ -481,8 +414,6 @@ impl LlamaModel {
         )
     }
 
-    /// Get metadata value as a string by index
-    ///
     /// # Errors
     /// Returns an error if the index is out of range or the value is not valid UTF-8.
     pub fn meta_val_str_by_index(&self, index: i32) -> Result<String, MetaValError> {
@@ -499,7 +430,6 @@ impl LlamaModel {
         )
     }
 
-    /// Returns the rope type of the model.
     #[must_use]
     pub fn rope_type(&self) -> Option<RopeType> {
         let raw = unsafe { llama_cpp_bindings_sys::llama_model_rope_type(self.model.as_ptr()) };
@@ -507,15 +437,6 @@ impl LlamaModel {
         rope_type::rope_type_from_raw(raw)
     }
 
-    /// Get chat template from model by name. If the name parameter is None, the default chat template will be returned.
-    ///
-    /// You supply this into [`Self::apply_chat_template`] to get back a string with the appropriate template
-    /// substitution applied to convert a list of messages into a prompt the LLM can use to complete
-    /// the chat.
-    ///
-    /// You could also use an external jinja parser, like [minijinja](https://github.com/mitsuhiko/minijinja),
-    /// to parse jinja templates not supported by the llama.cpp template engine.
-    ///
     /// # Errors
     ///
     /// * If the model has no chat template by that name
@@ -546,8 +467,6 @@ impl LlamaModel {
         }
     }
 
-    /// Loads a model from a file.
-    ///
     /// # Errors
     ///
     /// See [`LlamaModelLoadError`] for more information.
@@ -610,8 +529,6 @@ impl LlamaModel {
         }
     }
 
-    /// Initializes a lora adapter from a file.
-    ///
     /// # Errors
     ///
     /// See [`LlamaLoraAdapterInitError`] for more information.
@@ -643,21 +560,6 @@ impl LlamaModel {
         })
     }
 
-    /// Apply the models chat template to some messages.
-    /// See <https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template>
-    ///
-    /// Unlike the llama.cpp `apply_chat_template` which just randomly uses the `ChatML` template when given
-    /// a null pointer for the template, this requires an explicit template to be specified. If you want to
-    /// use "chatml", then just do `LlamaChatTemplate::new("chatml")` or any other model name or template
-    /// string.
-    ///
-    /// Use [`Self::chat_template`] to retrieve the template baked into the model (this is the preferred
-    /// mechanism as using the wrong chat template can result in really unexpected responses from the LLM).
-    ///
-    /// You probably want to set `add_ass` to true so that the generated template string ends with a the
-    /// opening tag of the assistant. If you fail to leave a hanging chat tag, the model will likely generate
-    /// one into the output and the output may also have unexpected output aside from that.
-    ///
     /// # Errors
     /// There are many ways this can fail. See [`ApplyChatTemplateError`] for more information.
     pub fn apply_chat_template(
@@ -720,17 +622,6 @@ impl LlamaModel {
         truncated_buffer_to_string(buff, final_size)
     }
 
-    /// Build a streaming [`SampledTokenClassifier`] for this model.
-    ///
-    /// At construction the bindings detect reasoning markers (via the
-    /// autoparser, with a chunked-thinking fallback for templates that consume
-    /// thoughts via content blocks), tool-call markers, and the trailing
-    /// generation-prompt slice. The classifier then runs a state machine over
-    /// the decoded token stream — no per-model branches.
-    ///
-    /// If the model has no usable chat template the classifier is built in a
-    /// blind mode that classifies every token as
-    /// [`SampledToken::Undeterminable`].
     pub fn sampled_token_classifier(&self) -> SampledTokenClassifier<'_> {
         let markers = match self.streaming_markers() {
             Ok(markers) => markers,
@@ -745,12 +636,6 @@ impl LlamaModel {
         SampledTokenClassifier::new(self, markers)
     }
 
-    /// Detect reasoning / tool-call markers (as token-ID sequences) and the
-    /// trailing generation-prompt slice for this model's chat template. The
-    /// returned `StreamingMarkers` carry tokenised markers — never raw strings
-    /// — so the classifier matches by `LlamaToken` equality rather than text
-    /// scanning.
-    ///
     /// # Errors
     /// Returns [`MarkerDetectionError`] when any underlying FFI call fails.
     pub fn streaming_markers(&self) -> Result<StreamingMarkers, MarkerDetectionError> {
@@ -781,9 +666,6 @@ impl LlamaModel {
         })
     }
 
-    /// When the autoparser-driven FFI returned no tool-call markers, consult the
-    /// per-template override registry so wrapper-known templates (Gemma 4,
-    /// Mistral 3, ...) still drive the classifier.
     fn resolve_tool_call_marker_strings(
         &self,
         autoparser_open: Option<String>,
@@ -828,11 +710,6 @@ impl LlamaModel {
         }
     }
 
-    /// Returns the rich tool-call marker bundle (open / separator / close /
-    /// optional value-quote pair) for this model's chat template, sourced from
-    /// the wrapper's per-template override registry. Returns `None` when no
-    /// registered override matches — callers in that case fall back to
-    /// llama.cpp's autoparser via [`Self::parse_chat_message`].
     #[must_use]
     pub fn tool_call_markers(&self) -> Option<ToolCallMarkers> {
         let template = match self.chat_template(None) {
@@ -873,27 +750,6 @@ impl LlamaModel {
         }
     }
 
-    /// Parse the assistant's output text into structured content, reasoning,
-    /// and tool calls.
-    ///
-    /// Two passes, in order:
-    /// 1. Duck-type the wrapper-side parsers across every known shape
-    ///    (Qwen XML, GLM key-value, Gemma paired-quote, Mistral bracketed-JSON).
-    ///    First match wins. The shapes are ordered so that more restrictive
-    ///    shapes run first, which keeps the duck-type pass safe for inputs
-    ///    that share an open marker but differ in inner structure.
-    /// 2. Delegate to llama.cpp's `common_chat_parse`. If it succeeds the
-    ///    result is `Recognized`; if it throws `ParseException` the result is
-    ///    `Unrecognized` with the raw input plus the FFI's diagnostic, so the
-    ///    caller can pass the unstructured tokens to the client.
-    ///
-    /// Empty tool-call `id` fields are filled with `call_{index}` before
-    /// returning, so callers always see well-formed identifiers.
-    ///
-    /// `tools_json` is a JSON-array string of OpenAI-style tool definitions
-    /// (use `"[]"` when no tools are in scope). `is_partial` switches between
-    /// mid-stream (lenient) and final (strict) parses for the FFI step.
-    ///
     /// # Errors
     ///
     /// Returns [`ParseChatMessageError`] when `tools_json` is not valid JSON,
@@ -1029,11 +885,6 @@ impl LlamaModel {
         }
     }
 
-    /// Render the model's chat template with the autoparser's synthetic
-    /// no-tools and with-tools inputs. Returns `(output_no_tools,
-    /// output_with_tools)`. Either side can be empty when the template throws
-    /// during rendering. Useful for debugging tool-call marker detection.
-    ///
     /// # Errors
     ///
     /// Returns [`MarkerDetectionError`] when the C++ analyzer throws or the FFI
@@ -1049,10 +900,6 @@ impl LlamaModel {
 }
 
 impl LlamaModel {
-    /// Returns a process-cached, approximate token environment built from this model's vocabulary.
-    ///
-    /// The first call iterates the full vocabulary and constructs the trie; subsequent calls
-    /// return the cached `Arc` without further FFI work.
     pub fn approximate_tok_env(&self) -> Arc<ApproximateTokEnv> {
         Arc::clone(self.tok_env.get_or_init(|| build_approximate_tok_env(self)))
     }
diff --git a/llama-cpp-bindings/src/model/add_bos.rs b/llama-cpp-bindings/src/model/add_bos.rs
index 1d38814a..ab257829 100644
--- a/llama-cpp-bindings/src/model/add_bos.rs
+++ b/llama-cpp-bindings/src/model/add_bos.rs
@@ -1,8 +1,5 @@
-/// How to determine if we should prepend a bos token to tokens
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum AddBos {
-    /// Add the beginning of stream token to the start of the string.
     Always,
-    /// Do not add the beginning of stream token to the start of the string.
     Never,
 }
diff --git a/llama-cpp-bindings/src/model/llama_chat_message.rs b/llama-cpp-bindings/src/model/llama_chat_message.rs
index 7920f750..51e1f086 100644
--- a/llama-cpp-bindings/src/model/llama_chat_message.rs
+++ b/llama-cpp-bindings/src/model/llama_chat_message.rs
@@ -2,7 +2,6 @@ use std::ffi::CString;
 
 use crate::NewLlamaChatMessageError;
 
-/// A Safe wrapper around `llama_chat_message`
 #[derive(Debug, Eq, PartialEq, Clone)]
 pub struct LlamaChatMessage {
     pub role: CString,
@@ -10,8 +9,6 @@ pub struct LlamaChatMessage {
 }
 
 impl LlamaChatMessage {
-    /// Create a new `LlamaChatMessage`
-    ///
     /// # Errors
     /// If either of ``role`` or ``content`` contain null bytes.
     pub fn new(role: String, content: String) -> Result<Self, NewLlamaChatMessageError> {
diff --git a/llama-cpp-bindings/src/model/llama_chat_template.rs b/llama-cpp-bindings/src/model/llama_chat_template.rs
index 54e4118a..3e8f86d0 100644
--- a/llama-cpp-bindings/src/model/llama_chat_template.rs
+++ b/llama-cpp-bindings/src/model/llama_chat_template.rs
@@ -1,40 +1,27 @@
 use std::ffi::{CStr, CString};
 use std::str::Utf8Error;
 
-/// A performance-friendly wrapper around [`super::LlamaModel::chat_template`].
-///
-/// This is fed into [`super::LlamaModel::apply_chat_template`] to convert a list of messages into
-/// an LLM prompt. Internally the template is stored as a `CString` to avoid round-trip conversions
-/// within the FFI.
 #[derive(Eq, PartialEq, Clone, PartialOrd, Ord, Hash)]
 pub struct LlamaChatTemplate(pub CString);
 
 impl LlamaChatTemplate {
-    /// Create a new template from a string. This can either be the name of a llama.cpp [chat template](https://github.com/ggerganov/llama.cpp/blob/8a8c4ceb6050bd9392609114ca56ae6d26f5b8f5/src/llama-chat.cpp#L27-L61)
-    /// like "chatml" or "llama3" or an actual Jinja template for llama.cpp to interpret.
-    ///
     /// # Errors
     /// Returns an error if the template string contains null bytes.
     pub fn new(template: &str) -> Result<Self, std::ffi::NulError> {
         Ok(Self(CString::new(template)?))
     }
 
-    /// Accesses the template as a c string reference.
     #[must_use]
     pub fn as_c_str(&self) -> &CStr {
         &self.0
     }
 
-    /// Attempts to convert the `CString` into a Rust str reference.
-    ///
     /// # Errors
     /// Returns an error if the template is not valid UTF-8.
     pub fn to_str(&self) -> Result<&str, Utf8Error> {
         self.0.to_str()
     }
 
-    /// Convenience method to create an owned String.
-    ///
     /// # Errors
     /// Returns an error if the template is not valid UTF-8.
     pub fn to_string(&self) -> Result<String, Utf8Error> {
diff --git a/llama-cpp-bindings/src/model/llama_lora_adapter.rs b/llama-cpp-bindings/src/model/llama_lora_adapter.rs
index a0d754a1..a209a278 100644
--- a/llama-cpp-bindings/src/model/llama_lora_adapter.rs
+++ b/llama-cpp-bindings/src/model/llama_lora_adapter.rs
@@ -1,9 +1,7 @@
 use std::ptr::NonNull;
 
-/// A safe wrapper around `llama_lora_adapter`.
 #[derive(Debug)]
 #[repr(transparent)]
 pub struct LlamaLoraAdapter {
-    /// Raw pointer to the underlying `llama_adapter_lora`.
     pub lora_adapter: NonNull<llama_cpp_bindings_sys::llama_adapter_lora>,
 }
diff --git a/llama-cpp-bindings/src/model/llama_split_mode_parse_error.rs b/llama-cpp-bindings/src/model/llama_split_mode_parse_error.rs
index ed644534..46c246eb 100644
--- a/llama-cpp-bindings/src/model/llama_split_mode_parse_error.rs
+++ b/llama-cpp-bindings/src/model/llama_split_mode_parse_error.rs
@@ -1,8 +1,5 @@
-/// An error that occurs when unknown split mode is encountered.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct LlamaSplitModeParseError {
-    /// The value that could not be parsed as a split mode.
     pub value: i32,
-    /// Additional context about why the parse failed.
     pub context: String,
 }
diff --git a/llama-cpp-bindings/src/model/params.rs b/llama-cpp-bindings/src/model/params.rs
index 3b5bb2aa..58813490 100644
--- a/llama-cpp-bindings/src/model/params.rs
+++ b/llama-cpp-bindings/src/model/params.rs
@@ -1,5 +1,3 @@
-//! A safe wrapper around `llama_model_params`.
-
 use crate::LlamaCppError;
 use crate::context::params::LlamaContextParams;
 use crate::error::{FitError, ModelParamsError};
@@ -18,15 +16,9 @@ pub mod kv_overrides;
 pub mod param_override_value;
 pub mod unknown_kv_override_tag;
 
-/// The maximum number of devices supported.
-///
-/// The real maximum number of devices is the lesser one of this value and the value returned by
-/// `llama_cpp_bindings::max_devices()`.
 pub const LLAMA_CPP_MAX_DEVICES: usize = 16;
 
-/// A safe wrapper around `llama_model_params`.
 pub struct LlamaModelParams {
-    /// The underlying `llama_model_params` from the C API.
     pub params: llama_cpp_bindings_sys::llama_model_params,
     kv_overrides: Vec<llama_cpp_bindings_sys::llama_model_kv_override>,
     buft_overrides: Vec<llama_cpp_bindings_sys::llama_model_tensor_buft_override>,
@@ -50,47 +42,15 @@ impl Debug for LlamaModelParams {
 }
 
 impl LlamaModelParams {
-    /// See [`KvOverrides`]
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// # use llama_cpp_bindings::model::params::LlamaModelParams;
-    /// let params = Box::pin(LlamaModelParams::default());
-    /// let kv_overrides = params.kv_overrides();
-    /// let count = kv_overrides.into_iter().count();
-    /// assert_eq!(count, 0);
-    /// ```
     #[must_use]
     pub const fn kv_overrides(&self) -> KvOverrides<'_> {
         KvOverrides::new(self)
     }
 
-    /// Appends a key-value override to the model parameters. It must be pinned as this creates a self-referential struct.
-    ///
     /// # Errors
     /// Returns [`ModelParamsError`] if the internal override vector has no available slot,
     /// the slot is not empty, or the key contains invalid characters.
     ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// # use std::ffi::{CStr, CString};
-    /// use std::pin::pin;
-    /// # use llama_cpp_bindings::model::params::LlamaModelParams;
-    /// # use llama_cpp_bindings::model::params::param_override_value::ParamOverrideValue;
-    /// let mut params = pin!(LlamaModelParams::default());
-    /// let key = CString::new("key").expect("CString::new failed");
-    /// params.as_mut().append_kv_override(&key, ParamOverrideValue::Int(50)).unwrap();
-    ///
-    /// let kv_overrides = params.kv_overrides().into_iter().collect::<Vec<_>>();
-    /// assert_eq!(kv_overrides.len(), 1);
-    ///
-    /// let (k, v) = &kv_overrides[0];
-    /// assert_eq!(v, &ParamOverrideValue::Int(50));
-    ///
-    /// assert_eq!(k.to_bytes(), b"key", "expected key to be 'key', was {:?}", k);
-    /// ```
     pub fn append_kv_override(
         mut self: Pin<&mut Self>,
         key: &CStr,
@@ -122,10 +82,6 @@ impl LlamaModelParams {
         Ok(())
     }
 
-    /// Pushes the trailing zero-tag sentinel onto `kv_overrides` and refreshes
-    /// `params.kv_overrides`. The cached pointer is nulled before [`Vec::push`]
-    /// so that a relocation-induced panic never leaves a dangling pointer in
-    /// `params`.
     fn push_kv_override_terminator(mut self: Pin<&mut Self>) {
         self.params.kv_overrides = null();
 
@@ -143,8 +99,6 @@ impl LlamaModelParams {
 }
 
 impl LlamaModelParams {
-    /// Adds buffer type overrides to move all mixture-of-experts layers to CPU.
-    ///
     /// # Errors
     /// Returns [`ModelParamsError`] if the internal override vector has no available slot,
     /// the slot is not empty, or the key contains invalid characters.
@@ -152,9 +106,6 @@ impl LlamaModelParams {
         self.add_cpu_buft_override(c"\\.ffn_(up|down|gate)_(ch|)exps")
     }
 
-    /// Appends a buffer type override to the model parameters, to move layers matching pattern to CPU.
-    /// It must be pinned as this creates a self-referential struct.
-    ///
     /// # Errors
     /// Returns [`ModelParamsError`] if the internal override vector has no available slot,
     /// the slot is not empty, or the key contains invalid characters.
@@ -188,10 +139,6 @@ impl LlamaModelParams {
         Ok(())
     }
 
-    /// Pushes the trailing null-pattern sentinel onto `buft_overrides` and
-    /// refreshes `params.tensor_buft_overrides`. The cached pointer is nulled
-    /// before [`Vec::push`] so that a relocation-induced panic never leaves a
-    /// dangling pointer in `params`.
     fn push_buft_override_terminator(mut self: Pin<&mut Self>) {
         self.params.tensor_buft_overrides = null();
 
@@ -206,45 +153,37 @@ impl LlamaModelParams {
 }
 
 impl LlamaModelParams {
-    /// Get the number of layers to offload to the GPU.
     #[must_use]
     pub const fn n_gpu_layers(&self) -> i32 {
         self.params.n_gpu_layers
     }
 
-    /// The GPU that is used for scratch and small tensors
     #[must_use]
     pub const fn main_gpu(&self) -> i32 {
         self.params.main_gpu
     }
 
-    /// only load the vocabulary, no weights
     #[must_use]
     pub const fn vocab_only(&self) -> bool {
         self.params.vocab_only
     }
 
-    /// use mmap if possible
     #[must_use]
     pub const fn use_mmap(&self) -> bool {
         self.params.use_mmap
     }
 
-    /// force system to keep model in RAM
     #[must_use]
     pub const fn use_mlock(&self) -> bool {
         self.params.use_mlock
     }
 
-    /// get the split mode
-    ///
     /// # Errors
     /// Returns `LlamaSplitModeParseError` if the unknown split mode is encountered.
     pub fn split_mode(&self) -> Result<LlamaSplitMode, LlamaSplitModeParseError> {
         LlamaSplitMode::try_from(self.params.split_mode)
     }
 
-    /// get the devices
     #[must_use]
     pub fn devices(&self) -> Vec<usize> {
         let mut backend_devices = Vec::new();
@@ -270,13 +209,6 @@ impl LlamaModelParams {
         devices
     }
 
-    /// sets the number of gpu layers to offload to the GPU.
-    /// ```
-    /// # use llama_cpp_bindings::model::params::LlamaModelParams;
-    /// let params = LlamaModelParams::default();
-    /// let params = params.with_n_gpu_layers(1);
-    /// assert_eq!(params.n_gpu_layers(), 1);
-    /// ```
     #[must_use]
     pub fn with_n_gpu_layers(mut self, n_gpu_layers: u32) -> Self {
         let n_gpu_layers = i32::try_from(n_gpu_layers).unwrap_or(i32::MAX);
@@ -284,54 +216,29 @@ impl LlamaModelParams {
         self
     }
 
-    /// sets the main GPU
-    ///
-    /// To enable this option, you must set `split_mode` to `LlamaSplitMode::None` to enable single GPU mode.
     #[must_use]
     pub const fn with_main_gpu(mut self, main_gpu: i32) -> Self {
         self.params.main_gpu = main_gpu;
         self
     }
 
-    /// sets `vocab_only`
     #[must_use]
     pub const fn with_vocab_only(mut self, vocab_only: bool) -> Self {
         self.params.vocab_only = vocab_only;
         self
     }
 
-    /// sets `use_mmap`
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// # use llama_cpp_bindings::model::params::LlamaModelParams;
-    /// let params = LlamaModelParams::default().with_use_mmap(false);
-    /// assert!(!params.use_mmap());
-    /// ```
     #[must_use]
     pub const fn with_use_mmap(mut self, use_mmap: bool) -> Self {
         self.params.use_mmap = use_mmap;
         self
     }
 
-    /// Get `no_alloc`
     #[must_use]
     pub const fn no_alloc(&self) -> bool {
         self.params.no_alloc
     }
 
-    /// Set `no_alloc`. When enabled, tensor data is not allocated.
-    /// Incompatible with `use_mmap`, so enabling this also disables mmap.
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// # use llama_cpp_bindings::model::params::LlamaModelParams;
-    /// let params = LlamaModelParams::default().with_no_alloc(true);
-    /// assert!(params.no_alloc());
-    /// assert!(!params.use_mmap());
-    /// ```
     #[must_use]
     pub const fn with_no_alloc(mut self, no_alloc: bool) -> Self {
         self.params.no_alloc = no_alloc;
@@ -341,28 +248,18 @@ impl LlamaModelParams {
         self
     }
 
-    /// sets `use_mlock`
     #[must_use]
     pub const fn with_use_mlock(mut self, use_mlock: bool) -> Self {
         self.params.use_mlock = use_mlock;
         self
     }
 
-    /// sets `split_mode`
     #[must_use]
     pub fn with_split_mode(mut self, split_mode: LlamaSplitMode) -> Self {
         self.params.split_mode = split_mode.into();
         self
     }
 
-    /// sets `devices`
-    ///
-    /// The devices are specified as indices that correspond to the ggml backend device indices.
-    ///
-    /// The maximum number of devices is 16.
-    ///
-    /// You don't need to specify CPU or ACCEL devices.
-    ///
     /// # Errors
     /// Returns `LlamaCppError::BackendDeviceNotFound` if any device index is invalid.
     pub fn with_devices(mut self, devices: &[usize]) -> Result<Self, LlamaCppError> {
@@ -387,38 +284,6 @@ impl LlamaModelParams {
 }
 
 impl LlamaModelParams {
-    /// Automatically fit model and context parameters to available device memory.
-    ///
-    /// Wraps llama.cpp's `common_fit_params`. Given a model path, available per-device memory
-    /// margins, and a minimum context size, it fills in `n_gpu_layers`, `tensor_split`, and
-    /// `tensor_buft_overrides` to fit the model to the available VRAM, and may reduce
-    /// `cparams.n_ctx` if needed. On success the model and context params are updated in place.
-    ///
-    /// # Requirements
-    ///
-    /// Per the C API docstring, only parameters that still hold their default value are
-    /// modified. In practice this means:
-    /// - `n_gpu_layers` must be at its default (`-1`). Do not call
-    ///   [`with_n_gpu_layers`](Self::with_n_gpu_layers) before this.
-    /// - No `tensor_buft_overrides` may be set. Do not call
-    ///   [`add_cpu_buft_override`](Self::add_cpu_buft_override) or
-    ///   [`add_cpu_moe_override`](Self::add_cpu_moe_override) before this.
-    /// - `cparams.n_ctx` is only auto-selected if it is `0`; otherwise it is left alone.
-    ///
-    /// # Arguments
-    ///
-    /// - `model_path` — path to the GGUF model file as a C string.
-    /// - `context_params` — context parameters; `n_ctx` may be modified (see above).
-    /// - `margins` — memory margin per device in bytes. Must have at least
-    ///   `crate::max_devices()` elements.
-    /// - `n_ctx_min` — minimum context size to preserve when reducing memory usage.
-    /// - `log_level` — minimum log level for fitting output; lower levels go to the debug log.
-    ///
-    /// # Thread safety
-    ///
-    /// This function is **not** thread safe: the underlying C call mutates the global
-    /// llama logger state.
-    ///
     /// # Errors
     ///
     /// Returns one of the [`FitError`] variants matching the vendored wrapper's status code.
@@ -499,19 +364,6 @@ impl LlamaModelParams {
     }
 }
 
-/// Default parameters for `LlamaModel`. (as defined in llama.cpp by `llama_model_default_params`)
-/// ```
-/// # use llama_cpp_bindings::model::params::LlamaModelParams;
-/// use llama_cpp_bindings::model::split_mode::LlamaSplitMode;
-/// let params = LlamaModelParams::default();
-/// assert_eq!(params.n_gpu_layers(), -1, "n_gpu_layers should be -1");
-/// assert_eq!(params.main_gpu(), 0, "main_gpu should be 0");
-/// assert_eq!(params.vocab_only(), false, "vocab_only should be false");
-/// assert_eq!(params.use_mmap(), true, "use_mmap should be true");
-/// assert_eq!(params.use_mlock(), false, "use_mlock should be false");
-/// assert_eq!(params.split_mode(), Ok(LlamaSplitMode::Layer), "split_mode should be LAYER");
-/// assert_eq!(params.devices().len(), 0, "devices should be empty");
-/// ```
 impl Default for LlamaModelParams {
     fn default() -> Self {
         let default_params = unsafe { llama_cpp_bindings_sys::llama_model_default_params() };
diff --git a/llama-cpp-bindings/src/model/params/fit_result.rs b/llama-cpp-bindings/src/model/params/fit_result.rs
index 2f89978b..655a1069 100644
--- a/llama-cpp-bindings/src/model/params/fit_result.rs
+++ b/llama-cpp-bindings/src/model/params/fit_result.rs
@@ -1,6 +1,4 @@
-/// Result of [`crate::model::params::LlamaModelParams::fit_params`].
 #[derive(Debug, Clone, Copy, Eq, PartialEq)]
 pub struct FitResult {
-    /// The context size after fitting (may have been reduced from the requested value).
     pub n_ctx: u32,
 }
diff --git a/llama-cpp-bindings/src/model/params/kv_override_value_iterator.rs b/llama-cpp-bindings/src/model/params/kv_override_value_iterator.rs
index 6073673d..8bcdb737 100644
--- a/llama-cpp-bindings/src/model/params/kv_override_value_iterator.rs
+++ b/llama-cpp-bindings/src/model/params/kv_override_value_iterator.rs
@@ -4,7 +4,6 @@ use std::fmt::Debug;
 use crate::model::params::LlamaModelParams;
 use crate::model::params::param_override_value::ParamOverrideValue;
 
-/// An iterator over the key-value overrides for a model.
 #[derive(Debug)]
 pub struct KvOverrideValueIterator<'model_params> {
     model_params: &'model_params LlamaModelParams,
@@ -33,8 +32,6 @@ impl Iterator for KvOverrideValueIterator<'_> {
 
         loop {
             // SAFETY: llama.cpp guarantees the last element contains an empty key.
-            // We've checked the previous one in the last iteration, the next one
-            // should be valid or 0 (and thus safe to deref).
             let current = unsafe { *overrides.add(self.current) };
 
             if current.key[0] == 0 {
diff --git a/llama-cpp-bindings/src/model/params/kv_overrides.rs b/llama-cpp-bindings/src/model/params/kv_overrides.rs
index d3f46c28..618fd9cd 100644
--- a/llama-cpp-bindings/src/model/params/kv_overrides.rs
+++ b/llama-cpp-bindings/src/model/params/kv_overrides.rs
@@ -1,18 +1,14 @@
-//! Key-value overrides for a model.
-
 use std::fmt::Debug;
 
 use crate::model::params::LlamaModelParams;
 use crate::model::params::kv_override_value_iterator::KvOverrideValueIterator;
 
-/// A struct implementing [`IntoIterator`] over the key-value overrides for a model.
 #[derive(Debug)]
 pub struct KvOverrides<'model_params> {
     model_params: &'model_params LlamaModelParams,
 }
 
 impl KvOverrides<'_> {
-    /// Creates a new `KvOverrides` view over the given model parameters.
     #[must_use]
     pub const fn new(model_params: &LlamaModelParams) -> KvOverrides<'_> {
         KvOverrides { model_params }
diff --git a/llama-cpp-bindings/src/model/params/param_override_value.rs b/llama-cpp-bindings/src/model/params/param_override_value.rs
index b20e12af..041371a5 100644
--- a/llama-cpp-bindings/src/model/params/param_override_value.rs
+++ b/llama-cpp-bindings/src/model/params/param_override_value.rs
@@ -1,20 +1,14 @@
 use crate::model::params::unknown_kv_override_tag::UnknownKvOverrideTag;
 
-/// An override value for a model parameter.
 #[derive(Debug, Clone, Copy, PartialEq)]
 pub enum ParamOverrideValue {
-    /// A boolean value
     Bool(bool),
-    /// A float value
     Float(f64),
-    /// A integer value
     Int(i64),
-    /// A string value
     Str([std::os::raw::c_char; 128]),
 }
 
 impl ParamOverrideValue {
-    /// Returns the FFI tag corresponding to this override value variant.
     #[must_use]
     pub const fn tag(&self) -> llama_cpp_bindings_sys::llama_model_kv_override_type {
         match self {
@@ -25,7 +19,6 @@ impl ParamOverrideValue {
         }
     }
 
-    /// Returns the FFI union value for this override.
     #[must_use]
     pub const fn value(&self) -> llama_cpp_bindings_sys::llama_model_kv_override__bindgen_ty_1 {
         match self {
diff --git a/llama-cpp-bindings/src/model/params/unknown_kv_override_tag.rs b/llama-cpp-bindings/src/model/params/unknown_kv_override_tag.rs
index 67978bde..da7988d0 100644
--- a/llama-cpp-bindings/src/model/params/unknown_kv_override_tag.rs
+++ b/llama-cpp-bindings/src/model/params/unknown_kv_override_tag.rs
@@ -1,4 +1,3 @@
-/// Unknown KV override tag from the FFI layer.
 #[derive(Debug, thiserror::Error)]
 #[error("unknown KV override tag: {0}")]
 pub struct UnknownKvOverrideTag(pub llama_cpp_bindings_sys::llama_model_kv_override_type);
diff --git a/llama-cpp-bindings/src/model/rope_type.rs b/llama-cpp-bindings/src/model/rope_type.rs
index 35ddaa9c..2dce0526 100644
--- a/llama-cpp-bindings/src/model/rope_type.rs
+++ b/llama-cpp-bindings/src/model/rope_type.rs
@@ -1,18 +1,11 @@
-/// The Rope type that's used within the model.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum RopeType {
-    /// Standard rotary positional encoding.
     Norm,
-    /// GPT-NeoX style rotary positional encoding.
     NeoX,
-    /// Multi-dimensional rotary positional encoding.
     MRope,
-    /// Vision model rotary positional encoding.
     Vision,
 }
 
-/// Converts a raw llama.cpp rope type constant to a `RopeType`.
-/// Returns `None` for unknown or "none" rope types.
 #[must_use]
 pub const fn rope_type_from_raw(raw: i32) -> Option<RopeType> {
     match raw {
diff --git a/llama-cpp-bindings/src/model/split_mode.rs b/llama-cpp-bindings/src/model/split_mode.rs
index 170c5596..d9328a1b 100644
--- a/llama-cpp-bindings/src/model/split_mode.rs
+++ b/llama-cpp-bindings/src/model/split_mode.rs
@@ -1,16 +1,12 @@
 use crate::model::llama_split_mode_parse_error::LlamaSplitModeParseError;
 
-/// A rusty wrapper around `llama_split_mode`.
 #[repr(i8)]
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)]
 pub enum LlamaSplitMode {
-    /// Single GPU
     None = LLAMA_SPLIT_MODE_NONE,
-    /// Split layers and KV across GPUs
+    #[default]
     Layer = LLAMA_SPLIT_MODE_LAYER,
-    /// Split layers and KV across GPUs, use tensor parallelism if supported
     Row = LLAMA_SPLIT_MODE_ROW,
-    /// Experimental tensor parallelism across GPUs
     Tensor = LLAMA_SPLIT_MODE_TENSOR,
 }
 
@@ -35,8 +31,6 @@ const LLAMA_SPLIT_MODE_ROW: i8 = llama_cpp_bindings_sys::LLAMA_SPLIT_MODE_ROW as
 )]
 const LLAMA_SPLIT_MODE_TENSOR: i8 = llama_cpp_bindings_sys::LLAMA_SPLIT_MODE_TENSOR as i8;
 
-/// Create a `LlamaSplitMode` from a `i32`.
-///
 /// # Errors
 /// Returns `LlamaSplitModeParseError` if the value does not correspond to a valid `LlamaSplitMode`.
 impl TryFrom<i32> for LlamaSplitMode {
@@ -63,8 +57,6 @@ impl TryFrom<i32> for LlamaSplitMode {
     }
 }
 
-/// Create a `LlamaSplitMode` from a `u32`.
-///
 /// # Errors
 /// Returns `LlamaSplitModeParseError` if the value does not correspond to a valid `LlamaSplitMode`.
 impl TryFrom<u32> for LlamaSplitMode {
@@ -92,7 +84,6 @@ impl TryFrom<u32> for LlamaSplitMode {
     }
 }
 
-/// Create a `i32` from a `LlamaSplitMode`.
 impl From<LlamaSplitMode> for i32 {
     fn from(value: LlamaSplitMode) -> Self {
         match value {
@@ -104,7 +95,6 @@ impl From<LlamaSplitMode> for i32 {
     }
 }
 
-/// Create a `u32` from a `LlamaSplitMode`.
 impl From<LlamaSplitMode> for u32 {
     fn from(value: LlamaSplitMode) -> Self {
         match value {
@@ -116,13 +106,6 @@ impl From<LlamaSplitMode> for u32 {
     }
 }
 
-/// The default split mode is `Layer` in llama.cpp.
-impl Default for LlamaSplitMode {
-    fn default() -> Self {
-        Self::Layer
-    }
-}
-
 #[cfg(test)]
 mod tests {
     use super::{
diff --git a/llama-cpp-bindings/src/model/vocab_type.rs b/llama-cpp-bindings/src/model/vocab_type.rs
index 4c790755..14e15132 100644
--- a/llama-cpp-bindings/src/model/vocab_type.rs
+++ b/llama-cpp-bindings/src/model/vocab_type.rs
@@ -1,12 +1,9 @@
 use crate::model::vocab_type_from_int_error::VocabTypeFromIntError;
 
-/// a rusty equivalent of `llama_vocab_type`
 #[repr(u32)]
 #[derive(Debug, Eq, Copy, Clone, PartialEq)]
 pub enum VocabType {
-    /// Byte Pair Encoding
     BPE = llama_cpp_bindings_sys::LLAMA_VOCAB_TYPE_BPE as _,
-    /// Sentence Piece Tokenizer
     SPM = llama_cpp_bindings_sys::LLAMA_VOCAB_TYPE_SPM as _,
 }
 
diff --git a/llama-cpp-bindings/src/model/vocab_type_from_int_error.rs b/llama-cpp-bindings/src/model/vocab_type_from_int_error.rs
index 3e7bcf8e..7dd3694e 100644
--- a/llama-cpp-bindings/src/model/vocab_type_from_int_error.rs
+++ b/llama-cpp-bindings/src/model/vocab_type_from_int_error.rs
@@ -1,8 +1,5 @@
-/// Returned by [`crate::model::vocab_type::VocabType::try_from`] when the
-/// integer value does not match a known `llama_vocab_type` discriminant.
 #[derive(thiserror::Error, Debug, Eq, PartialEq)]
 pub enum VocabTypeFromIntError {
-    /// The value is not a valid `llama_vocab_type`. Contains the int value that was invalid.
     #[error("Unknown Value {0}")]
     UnknownValue(llama_cpp_bindings_sys::llama_vocab_type),
 }
diff --git a/llama-cpp-bindings/src/mtmd.rs b/llama-cpp-bindings/src/mtmd.rs
index 7d87980a..393c255a 100644
--- a/llama-cpp-bindings/src/mtmd.rs
+++ b/llama-cpp-bindings/src/mtmd.rs
@@ -1,11 +1,3 @@
-//! Safe wrapper around multimodal (MTMD) functionality in llama.cpp.
-//!
-//! This module provides Rust bindings for llama.cpp's multimodal support,
-//! allowing processing of text, image, and audio inputs through a unified interface.
-//!
-//! # Warning
-//! This API is experimental and subject to breaking changes.
-
 pub mod image_chunk_batch_size_mismatch;
 pub mod mtmd_bitmap;
 pub mod mtmd_bitmap_error;
diff --git a/llama-cpp-bindings/src/mtmd/image_chunk_batch_size_mismatch.rs b/llama-cpp-bindings/src/mtmd/image_chunk_batch_size_mismatch.rs
index dfac7f12..3763791b 100644
--- a/llama-cpp-bindings/src/mtmd/image_chunk_batch_size_mismatch.rs
+++ b/llama-cpp-bindings/src/mtmd/image_chunk_batch_size_mismatch.rs
@@ -1,10 +1,3 @@
-/// Carried by [`super::mtmd_eval_error::MtmdEvalError::ImageChunkExceedsBatchSize`].
-///
-/// `n_batch` is the per-decode batch budget enforced by `cparams.n_batch` in
-/// llama.cpp; `image_tokens` is the number of tokens this image chunk would
-/// hand to `llama_decode`. When `image_tokens > n_batch` the C-side
-/// `GGML_ASSERT(n_tokens_all <= cparams.n_batch)` would abort the process —
-/// the binding refuses the call instead.
 #[derive(Debug)]
 pub struct ImageChunkBatchSizeMismatch {
     pub image_tokens: u32,
diff --git a/llama-cpp-bindings/src/mtmd/mtmd_bitmap.rs b/llama-cpp-bindings/src/mtmd/mtmd_bitmap.rs
index 14ab3664..63dc0299 100644
--- a/llama-cpp-bindings/src/mtmd/mtmd_bitmap.rs
+++ b/llama-cpp-bindings/src/mtmd/mtmd_bitmap.rs
@@ -20,14 +20,8 @@ fn cstr_ptr_to_optional_string(ptr: *const c_char) -> Option<String> {
     }
 }
 
-/// Safe wrapper around `mtmd_bitmap`.
-///
-/// Represents bitmap data for images or audio that can be processed
-/// by the multimodal system. For images, data is stored in RGB format.
-/// For audio, data is stored as PCM F32 samples.
 #[derive(Debug, Clone)]
 pub struct MtmdBitmap {
-    /// Raw pointer to the underlying `mtmd_bitmap`.
     pub bitmap: NonNull<llama_cpp_bindings_sys::mtmd_bitmap>,
 }
 
@@ -35,25 +29,11 @@ unsafe impl Send for MtmdBitmap {}
 unsafe impl Sync for MtmdBitmap {}
 
 impl MtmdBitmap {
-    /// Create a bitmap from image data in RGB format.
-    ///
     /// # Errors
     ///
     /// * `InvalidDataSize` - Data length doesn't match `nx * ny * 3`
     /// * `NullResult` - Underlying C function returned null
     ///
-    /// # Examples
-    ///
-    /// ```
-    /// use llama_cpp_bindings::mtmd::MtmdBitmap;
-    ///
-    /// // Create a 2x2 red image
-    /// let red_pixel = [255, 0, 0]; // RGB values for red
-    /// let image_data = red_pixel.repeat(4); // 2x2 = 4 pixels
-    ///
-    /// let bitmap = MtmdBitmap::from_image_data(2, 2, &image_data);
-    /// assert!(bitmap.is_ok());
-    /// ```
     pub fn from_image_data(nx: u32, ny: u32, data: &[u8]) -> Result<Self, MtmdBitmapError> {
         if nx < 2 || ny < 2 {
             return Err(MtmdBitmapError::ImageDimensionsTooSmall(nx, ny));
@@ -70,25 +50,10 @@ impl MtmdBitmap {
         Ok(Self { bitmap })
     }
 
-    /// Create a bitmap from audio data in PCM F32 format.
-    ///
     /// # Errors
     ///
     /// * `NullResult` - Underlying C function returned null
     ///
-    /// # Examples
-    ///
-    /// ```
-    /// use llama_cpp_bindings::mtmd::MtmdBitmap;
-    ///
-    /// // Create a simple sine wave audio sample
-    /// let audio_data: Vec<f32> = (0..100)
-    ///     .map(|sample_index| (sample_index as f32 * 0.1).sin())
-    ///     .collect();
-    ///
-    /// let bitmap = MtmdBitmap::from_audio_data(&audio_data);
-    /// // Note: This will likely fail without proper MTMD context setup
-    /// ```
     pub fn from_audio_data(data: &[f32]) -> Result<Self, MtmdBitmapError> {
         let bitmap = unsafe {
             llama_cpp_bindings_sys::mtmd_bitmap_init_from_audio(data.len(), data.as_ptr())
@@ -99,12 +64,6 @@ impl MtmdBitmap {
         Ok(Self { bitmap })
     }
 
-    /// Create a bitmap from a file.
-    ///
-    /// Supported formats:
-    /// - Images: formats supported by `stb_image` (jpg, png, bmp, gif, etc.)
-    /// - Audio: formats supported by miniaudio (wav, mp3, flac)
-    ///
     /// # Errors
     ///
     /// Returns an [`MtmdBitmapError`] variant matching the wrapper's status code.
@@ -149,12 +108,6 @@ impl MtmdBitmap {
         }
     }
 
-    /// Create a bitmap from a buffer containing file data.
-    ///
-    /// Supported formats:
-    /// - Images: formats supported by `stb_image` (jpg, png, bmp, gif, etc.)
-    /// - Audio: formats supported by miniaudio (wav, mp3, flac)
-    ///
     /// # Errors
     ///
     /// * `NullResult` - Buffer could not be processed
@@ -172,22 +125,16 @@ impl MtmdBitmap {
         Ok(Self { bitmap })
     }
 
-    /// Get bitmap width in pixels.
     #[must_use]
     pub fn nx(&self) -> u32 {
         unsafe { llama_cpp_bindings_sys::mtmd_bitmap_get_nx(self.bitmap.as_ptr()) }
     }
 
-    /// Get bitmap height in pixels.
     #[must_use]
     pub fn ny(&self) -> u32 {
         unsafe { llama_cpp_bindings_sys::mtmd_bitmap_get_ny(self.bitmap.as_ptr()) }
     }
 
-    /// Get bitmap data as a byte slice.
-    ///
-    /// For images: RGB format with length `nx * ny * 3`
-    /// For audio: PCM F32 format with length `n_samples * 4`
     #[must_use]
     pub fn data(&self) -> &[u8] {
         let ptr = unsafe { llama_cpp_bindings_sys::mtmd_bitmap_get_data(self.bitmap.as_ptr()) };
@@ -195,13 +142,11 @@ impl MtmdBitmap {
         unsafe { slice::from_raw_parts(ptr, len) }
     }
 
-    /// Check if this bitmap contains audio data (vs image data).
     #[must_use]
     pub fn is_audio(&self) -> bool {
         unsafe { llama_cpp_bindings_sys::mtmd_bitmap_is_audio(self.bitmap.as_ptr()) }
     }
 
-    /// Get the bitmap's optional ID string.
     #[must_use]
     pub fn id(&self) -> Option<String> {
         let ptr = unsafe { llama_cpp_bindings_sys::mtmd_bitmap_get_id(self.bitmap.as_ptr()) };
@@ -209,22 +154,10 @@ impl MtmdBitmap {
         cstr_ptr_to_optional_string(ptr)
     }
 
-    /// Set the bitmap's ID string.
-    ///
     /// # Errors
     ///
     /// Returns an error if the ID string contains null bytes.
     ///
-    /// # Examples
-    ///
-    /// ```no_run
-    /// # use llama_cpp_bindings::mtmd::MtmdBitmap;
-    /// # fn example(bitmap: &MtmdBitmap) -> Result<(), Box<dyn std::error::Error>> {
-    /// bitmap.set_id("image_001")?;
-    /// assert_eq!(bitmap.id(), Some("image_001".to_string()));
-    /// # Ok(())
-    /// # }
-    /// ```
     pub fn set_id(&self, id: &str) -> Result<(), std::ffi::NulError> {
         let id_cstr = CString::new(id)?;
         unsafe {
diff --git a/llama-cpp-bindings/src/mtmd/mtmd_context.rs b/llama-cpp-bindings/src/mtmd/mtmd_context.rs
index 21ab2c11..28d4091e 100644
--- a/llama-cpp-bindings/src/mtmd/mtmd_context.rs
+++ b/llama-cpp-bindings/src/mtmd/mtmd_context.rs
@@ -67,13 +67,8 @@ fn map_encode_chunk_status(
     }
 }
 
-/// Safe wrapper around `mtmd_context`.
-///
-/// This represents an initialized multimodal context that can process
-/// text, images, and audio through llama.cpp's multimodal interface.
 #[derive(Debug)]
 pub struct MtmdContext {
-    /// Raw pointer to the underlying `mtmd_context`.
     pub context: NonNull<llama_cpp_bindings_sys::mtmd_context>,
 }
 
@@ -81,8 +76,6 @@ unsafe impl Send for MtmdContext {}
 unsafe impl Sync for MtmdContext {}
 
 impl MtmdContext {
-    /// Initialize MTMD context from a multimodal projection file.
-    ///
     /// # Errors
     ///
     /// Returns an [`MtmdInitError`] variant matching the wrapper's status code.
@@ -132,8 +125,6 @@ impl MtmdContext {
         }
     }
 
-    /// Check whether non-causal attention mask is needed before `llama_decode`
-    /// for the given input chunk.
     #[must_use]
     pub fn decode_use_non_causal(&self, chunk: &MtmdInputChunk) -> bool {
         unsafe {
@@ -144,26 +135,21 @@ impl MtmdContext {
         }
     }
 
-    /// Check whether the current model uses M-RoPE for `llama_decode`.
     #[must_use]
     pub fn decode_use_mrope(&self) -> bool {
         unsafe { llama_cpp_bindings_sys::mtmd_decode_use_mrope(self.context.as_ptr()) }
     }
 
-    /// Check whether the current model supports vision input.
     #[must_use]
     pub fn support_vision(&self) -> bool {
         unsafe { llama_cpp_bindings_sys::mtmd_support_vision(self.context.as_ptr()) }
     }
 
-    /// Check whether the current model supports audio input.
     #[must_use]
     pub fn support_audio(&self) -> bool {
         unsafe { llama_cpp_bindings_sys::mtmd_support_audio(self.context.as_ptr()) }
     }
 
-    /// Get audio sample rate in Hz (e.g., 16000 for Whisper).
-    /// Returns None if audio is not supported.
     #[must_use]
     pub fn get_audio_sample_rate(&self) -> Option<u32> {
         let rate =
@@ -171,12 +157,6 @@ impl MtmdContext {
         (rate > 0).then_some(rate.unsigned_abs())
     }
 
-    /// Tokenize input text and bitmaps into chunks.
-    ///
-    /// The input text must contain media markers (default: `<__media__>`) that will be
-    /// replaced with the corresponding bitmap data from the `bitmaps` array.
-    /// The number of bitmaps must equal the number of markers in the text.
-    ///
     /// # Errors
     ///
     /// Returns an [`MtmdTokenizeError`] variant matching the wrapper's status code.
@@ -217,8 +197,6 @@ impl MtmdContext {
         Ok(chunks)
     }
 
-    /// Encode a chunk for image/audio processing.
-    ///
     /// # Errors
     ///
     /// Returns an [`MtmdEncodeError`] variant matching the wrapper's status code.
diff --git a/llama-cpp-bindings/src/mtmd/mtmd_context_params.rs b/llama-cpp-bindings/src/mtmd/mtmd_context_params.rs
index ec6fe674..b850580b 100644
--- a/llama-cpp-bindings/src/mtmd/mtmd_context_params.rs
+++ b/llama-cpp-bindings/src/mtmd/mtmd_context_params.rs
@@ -1,29 +1,10 @@
 use std::ffi::{CStr, CString};
 
-/// Configuration parameters for MTMD context
-///
-/// # Examples
-///
-/// ```
-/// use llama_cpp_bindings::mtmd::{MtmdContextParams, mtmd_default_marker};
-/// use std::ffi::CString;
-///
-/// let params = MtmdContextParams {
-///     use_gpu: false,
-///     print_timings: true,
-///     n_threads: 4,
-///     media_marker: CString::new(mtmd_default_marker()).unwrap(),
-/// };
-/// ```
 #[derive(Debug, Clone)]
 pub struct MtmdContextParams {
-    /// Whether to use GPU acceleration
     pub use_gpu: bool,
-    /// Whether to print timing information
     pub print_timings: bool,
-    /// Number of threads to use for processing
     pub n_threads: i32,
-    /// Media marker string used to identify media positions in text
     pub media_marker: CString,
 }
 
diff --git a/llama-cpp-bindings/src/mtmd/mtmd_default_marker.rs b/llama-cpp-bindings/src/mtmd/mtmd_default_marker.rs
index 2d559b5e..5209e6f2 100644
--- a/llama-cpp-bindings/src/mtmd/mtmd_default_marker.rs
+++ b/llama-cpp-bindings/src/mtmd/mtmd_default_marker.rs
@@ -1,22 +1,5 @@
 use std::ffi::CStr;
 
-/// Get the default media marker string.
-///
-/// Returns the default marker used to identify media positions in text
-/// (typically `"<__media__>"`). This marker should be used in your input text
-/// to indicate where media content should be inserted.
-///
-/// # Examples
-///
-/// ```
-/// use llama_cpp_bindings::mtmd::mtmd_default_marker;
-///
-/// let marker = mtmd_default_marker();
-/// assert!(!marker.is_empty());
-///
-/// let text = format!("Describe this image: {}", marker);
-/// assert!(text.contains(marker));
-/// ```
 #[must_use]
 pub fn mtmd_default_marker() -> &'static str {
     unsafe {
diff --git a/llama-cpp-bindings/src/mtmd/mtmd_input_chunk.rs b/llama-cpp-bindings/src/mtmd/mtmd_input_chunk.rs
index 88d1358c..f10a5bca 100644
--- a/llama-cpp-bindings/src/mtmd/mtmd_input_chunk.rs
+++ b/llama-cpp-bindings/src/mtmd/mtmd_input_chunk.rs
@@ -34,21 +34,13 @@ const unsafe fn tokens_from_raw_ptr<'chunk>(
     }
 }
 
-/// Safe wrapper around `mtmd_input_chunk`.
-///
-/// Represents a single chunk of input data, which can be either text tokens,
-/// image tokens, or audio tokens. The chunk type determines what kind of
-/// data and operations are available.
 #[derive(Debug)]
 pub struct MtmdInputChunk {
-    /// Raw pointer to the underlying `mtmd_input_chunk`.
     pub chunk: NonNull<llama_cpp_bindings_sys::mtmd_input_chunk>,
     pub owned: bool,
 }
 
 impl MtmdInputChunk {
-    /// Get the type of this chunk
-    ///
     /// # Errors
     /// Returns an error if the chunk type is unknown.
     pub fn chunk_type(&self) -> Result<MtmdInputChunkType, MtmdInputChunkTypeError> {
@@ -57,9 +49,6 @@ impl MtmdInputChunk {
         MtmdInputChunkType::try_from(chunk_type)
     }
 
-    /// Get text tokens from this chunk.
-    ///
-    /// Only valid for text chunks. Returns `None` for image or audio chunks.
     #[must_use]
     pub fn text_tokens(&self) -> Option<&[LlamaToken]> {
         if self.chunk_type() != Ok(MtmdInputChunkType::Text) {
@@ -77,21 +66,16 @@ impl MtmdInputChunk {
         unsafe { tokens_from_raw_ptr(tokens_ptr, n_tokens) }
     }
 
-    /// Get the number of tokens in this chunk
     #[must_use]
     pub fn n_tokens(&self) -> usize {
         unsafe { llama_cpp_bindings_sys::mtmd_input_chunk_get_n_tokens(self.chunk.as_ptr()) }
     }
 
-    /// Get the number of positions in this chunk.
     #[must_use]
     pub fn n_positions(&self) -> i32 {
         unsafe { llama_cpp_bindings_sys::mtmd_input_chunk_get_n_pos(self.chunk.as_ptr()) }
     }
 
-    /// Get chunk ID if available.
-    ///
-    /// Returns `None` for text chunks, may return an ID for image/audio chunks.
     #[must_use]
     pub fn id(&self) -> Option<String> {
         let ptr = unsafe { llama_cpp_bindings_sys::mtmd_input_chunk_get_id(self.chunk.as_ptr()) };
@@ -105,8 +89,6 @@ impl MtmdInputChunk {
         }
     }
 
-    /// Create a copy of this chunk that you own.
-    ///
     /// # Errors
     ///
     /// Returns `MtmdInputChunkError::ChunkOperationFailed` if copying fails.
@@ -117,19 +99,6 @@ impl MtmdInputChunk {
         Ok(Self { chunk, owned: true })
     }
 
-    /// Evaluate this single chunk through the multimodal helper.
-    ///
-    /// Mirrors `MtmdInputChunks::eval_chunks` but for one chunk at a time, so
-    /// callers can interleave per-chunk decode with per-chunk bookkeeping
-    /// (token counting, marker state-machine replay) inside one loop instead
-    /// of running the helper-level all-chunks eval and a separate ingest pass.
-    ///
-    /// Image chunks are decoded as one `llama_decode` call inside the helper,
-    /// so their token count must fit in `n_batch`. When it would not, the
-    /// binding refuses the call up front because the C-side
-    /// `GGML_ASSERT(n_tokens_all <= cparams.n_batch)` would otherwise abort
-    /// the process.
-    ///
     /// # Errors
     ///
     /// Returns [`MtmdEvalError::ImageChunkExceedsBatchSize`] when this is an
diff --git a/llama-cpp-bindings/src/mtmd/mtmd_input_chunk_type.rs b/llama-cpp-bindings/src/mtmd/mtmd_input_chunk_type.rs
index ef628b89..5392d85e 100644
--- a/llama-cpp-bindings/src/mtmd/mtmd_input_chunk_type.rs
+++ b/llama-cpp-bindings/src/mtmd/mtmd_input_chunk_type.rs
@@ -1,29 +1,10 @@
 use crate::mtmd::mtmd_input_chunk_type_error::MtmdInputChunkTypeError;
 
-/// Input chunk types for multimodal data
-///
-/// # Examples
-///
-/// ```
-/// use llama_cpp_bindings::mtmd::MtmdInputChunkType;
-///
-/// let text_chunk = MtmdInputChunkType::Text;
-/// let image_chunk = MtmdInputChunkType::Image;
-/// let audio_chunk = MtmdInputChunkType::Audio;
-///
-/// assert_eq!(text_chunk, MtmdInputChunkType::Text);
-/// let converted: MtmdInputChunkType = llama_cpp_bindings_sys::MTMD_INPUT_CHUNK_TYPE_TEXT.try_into().unwrap();
-/// assert_eq!(text_chunk, converted);
-/// assert_ne!(text_chunk, image_chunk);
-/// ```
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 #[repr(u32)]
 pub enum MtmdInputChunkType {
-    /// Text input chunk
     Text = llama_cpp_bindings_sys::MTMD_INPUT_CHUNK_TYPE_TEXT as _,
-    /// Image input chunk
     Image = llama_cpp_bindings_sys::MTMD_INPUT_CHUNK_TYPE_IMAGE as _,
-    /// Audio input chunk
     Audio = llama_cpp_bindings_sys::MTMD_INPUT_CHUNK_TYPE_AUDIO as _,
 }
 
diff --git a/llama-cpp-bindings/src/mtmd/mtmd_input_chunk_type_error.rs b/llama-cpp-bindings/src/mtmd/mtmd_input_chunk_type_error.rs
index ae3ca7e8..0bc0a6c7 100644
--- a/llama-cpp-bindings/src/mtmd/mtmd_input_chunk_type_error.rs
+++ b/llama-cpp-bindings/src/mtmd/mtmd_input_chunk_type_error.rs
@@ -1,4 +1,3 @@
-/// Error when converting from an unknown MTMD input chunk type value.
 #[derive(Debug, PartialEq, Eq, thiserror::Error)]
 #[error("Unknown MTMD input chunk type: {0}")]
 pub struct MtmdInputChunkTypeError(pub llama_cpp_bindings_sys::mtmd_input_chunk_type);
diff --git a/llama-cpp-bindings/src/mtmd/mtmd_input_chunks.rs b/llama-cpp-bindings/src/mtmd/mtmd_input_chunks.rs
index 9ac2705b..f592c42c 100644
--- a/llama-cpp-bindings/src/mtmd/mtmd_input_chunks.rs
+++ b/llama-cpp-bindings/src/mtmd/mtmd_input_chunks.rs
@@ -15,34 +15,17 @@ const fn check_eval_result(result: i32) -> Result<(), MtmdEvalError> {
     }
 }
 
-/// Safe wrapper around `mtmd_input_chunks`.
-///
-/// This is a collection of input chunks created from tokenizing text and media.
-/// The chunks represent the tokenized input that can be processed by the model,
-/// with text chunks containing tokens and media chunks containing embeddings.
 #[derive(Debug)]
 pub struct MtmdInputChunks {
-    /// Raw pointer to the underlying `mtmd_input_chunks`.
     pub chunks: NonNull<llama_cpp_bindings_sys::mtmd_input_chunks>,
 }
 
 impl MtmdInputChunks {
-    /// Create a new empty input chunks collection.
-    ///
     /// # Errors
     ///
     /// Returns `MtmdInputChunksError::ChunksCreationFailed` if the underlying llama.cpp function
     /// returns null.
     ///
-    /// # Examples
-    ///
-    /// ```
-    /// use llama_cpp_bindings::mtmd::MtmdInputChunks;
-    ///
-    /// let chunks = MtmdInputChunks::new().unwrap();
-    /// assert_eq!(chunks.len(), 0);
-    /// assert!(chunks.is_empty());
-    /// ```
     pub fn new() -> Result<Self, MtmdInputChunksError> {
         let chunks = unsafe { llama_cpp_bindings_sys::mtmd_input_chunks_init() };
         let chunks = NonNull::new(chunks).ok_or(MtmdInputChunksError::ChunksCreationFailed)?;
@@ -50,19 +33,16 @@ impl MtmdInputChunks {
         Ok(Self { chunks })
     }
 
-    /// Get the number of chunks
     #[must_use]
     pub fn len(&self) -> usize {
         unsafe { llama_cpp_bindings_sys::mtmd_input_chunks_size(self.chunks.as_ptr()) }
     }
 
-    /// Check if chunks collection is empty
     #[must_use]
     pub fn is_empty(&self) -> bool {
         self.len() == 0
     }
 
-    /// Get a chunk by index
     #[must_use]
     pub fn get(&self, index: usize) -> Option<MtmdInputChunk> {
         if index >= self.len() {
@@ -78,20 +58,16 @@ impl MtmdInputChunks {
         })
     }
 
-    /// Get total number of tokens across all chunks.
     #[must_use]
     pub fn total_tokens(&self) -> usize {
         unsafe { llama_cpp_bindings_sys::mtmd_helper_get_n_tokens(self.chunks.as_ptr()) }
     }
 
-    /// Get total position count across all chunks.
     #[must_use]
     pub fn total_positions(&self) -> i32 {
         unsafe { llama_cpp_bindings_sys::mtmd_helper_get_n_pos(self.chunks.as_ptr()) }
     }
 
-    /// Evaluate chunks using the multimodal context and LLAMA context.
-    ///
     /// # Errors
     ///
     /// Returns `MtmdEvalError::EvalFailure` if any encoding or decoding operation fails.
@@ -113,11 +89,6 @@ impl MtmdInputChunks {
             });
         }
 
-        // mtmd_helper_eval_chunks overwrites `*new_n_past` at the end of its
-        // chunk loop (mtmd-helper.cpp:413), so any seed would be fine — but
-        // we mirror the per-chunk wrapper's `start_position` / `final_position`
-        // shape here for parity, keeping the read-only input and write-only
-        // output strictly separated.
         let mut final_position: llama_cpp_bindings_sys::llama_pos = start_position;
 
         let result = unsafe {
diff --git a/llama-cpp-bindings/src/mtmd/mtmd_input_text.rs b/llama-cpp-bindings/src/mtmd/mtmd_input_text.rs
index db61b6ec..4f99a8f6 100644
--- a/llama-cpp-bindings/src/mtmd/mtmd_input_text.rs
+++ b/llama-cpp-bindings/src/mtmd/mtmd_input_text.rs
@@ -1,22 +1,6 @@
-/// Text input configuration
-///
-/// # Examples
-///
-/// ```
-/// use llama_cpp_bindings::mtmd::MtmdInputText;
-///
-/// let input = MtmdInputText {
-///     text: "Describe this image.".to_string(),
-///     add_special: true,
-///     parse_special: true,
-/// };
-/// ```
 #[derive(Debug, Clone)]
 pub struct MtmdInputText {
-    /// The input text string
     pub text: String,
-    /// Whether to add special tokens
     pub add_special: bool,
-    /// Whether to parse special tokens
     pub parse_special: bool,
 }
diff --git a/llama-cpp-bindings/src/resolved_tool_call_markers.rs b/llama-cpp-bindings/src/resolved_tool_call_markers.rs
index ced6510c..3b2e6a3c 100644
--- a/llama-cpp-bindings/src/resolved_tool_call_markers.rs
+++ b/llama-cpp-bindings/src/resolved_tool_call_markers.rs
@@ -1,9 +1,3 @@
-/// Effective tool-call marker strings resolved from either the autoparser
-/// output or the per-template override registry.
-///
-/// Each side is independently optional because the autoparser may report only
-/// one of the two strings, and the override registry may not match the
-/// template at all.
 #[derive(Clone, Debug, Eq, PartialEq)]
 pub struct ResolvedToolCallMarkers {
     pub open: Option<String>,
diff --git a/llama-cpp-bindings/src/sampled_token_classifier.rs b/llama-cpp-bindings/src/sampled_token_classifier.rs
index aae24fc3..26fa65eb 100644
--- a/llama-cpp-bindings/src/sampled_token_classifier.rs
+++ b/llama-cpp-bindings/src/sampled_token_classifier.rs
@@ -70,15 +70,6 @@ impl<'model> SampledTokenClassifier<'model> {
         }
     }
 
-    /// Ingest one sampled token. Returns the outcomes that have finalised this
-    /// turn — typically a single outcome, occasionally zero (the classifier is
-    /// holding back tokens that may yet form a marker), or several when a
-    /// buffered marker prefix diverges and the held-back tokens flush.
-    ///
-    /// Each [`IngestOutcome`] carries both the [`SampledToken`] variant for
-    /// classification and the decoded `visible_piece` for streaming. Marker
-    /// boundaries get an empty `visible_piece` so their text never reaches
-    /// user-visible streams.
     pub fn ingest(&mut self, token: LlamaToken) -> Vec<IngestOutcome> {
         if !self.markers.has_any() {
             self.usage.record_undeterminable_token();
@@ -120,15 +111,6 @@ impl<'model> SampledTokenClassifier<'model> {
         )
     }
 
-    /// Replay one prompt token through the marker state machine so that the
-    /// section at end-of-prompt reflects the chat template's rendered tail
-    /// (e.g. for Qwen3.5/3.6 with `enable_thinking=false` the prompt ends with
-    /// a closed empty `<think>...</think>` block, leaving the section in
-    /// `Content`; with `enable_thinking=true` it ends inside an open `<think>`,
-    /// leaving the section in `Reasoning`).
-    ///
-    /// Prompt tokens never produce [`IngestOutcome`]s and never increment usage
-    /// counters — they are not generated content.
     pub fn ingest_prompt_token(&mut self, token: LlamaToken) {
         if !self.markers.has_any() {
             return;
@@ -156,9 +138,6 @@ impl<'model> SampledTokenClassifier<'model> {
         }
     }
 
-    /// Drain every still-buffered token. Call once at end of generation (EOG)
-    /// to make sure no decoded text is silently dropped. After `flush()` the
-    /// classifier behaves as if freshly constructed in terms of buffer state.
     pub fn flush(&mut self) -> Vec<IngestOutcome> {
         self.probe_mode = ProbeMode::Idle;
         let mut outcomes = Vec::with_capacity(self.pending.len());
diff --git a/llama-cpp-bindings/src/sampling.rs b/llama-cpp-bindings/src/sampling.rs
index ac1bfb5c..7be49c06 100644
--- a/llama-cpp-bindings/src/sampling.rs
+++ b/llama-cpp-bindings/src/sampling.rs
@@ -1,5 +1,3 @@
-//! Safe wrapper around `llama_sampler`.
-
 use std::borrow::Borrow;
 use std::ffi::{CString, c_char};
 use std::fmt::{Debug, Formatter};
@@ -41,9 +39,7 @@ fn checked_usize_as_i32_sampling(value: usize) -> Result<i32, SamplingError> {
     })
 }
 
-/// A safe wrapper around `llama_sampler`.
 pub struct LlamaSampler {
-    /// Raw pointer to the underlying `llama_sampler`.
     pub sampler: *mut llama_cpp_bindings_sys::llama_sampler,
 }
 
@@ -54,8 +50,6 @@ impl Debug for LlamaSampler {
 }
 
 impl LlamaSampler {
-    /// Sample and accept a token from the idx-th output of the last evaluation.
-    ///
     /// # Errors
     ///
     /// Returns [`SampleError`] if the C++ sampler throws an exception or if the index is invalid.
@@ -86,23 +80,16 @@ impl LlamaSampler {
         }
     }
 
-    /// Applies this sampler to a [`LlamaTokenDataArray`].
     pub fn apply(&self, data_array: &mut LlamaTokenDataArray) {
         data_array.apply_sampler(self);
     }
 
-    /// Accepts a token from the sampler, possibly updating the internal state of certain samplers
-    /// (e.g. grammar, repetition, etc.)
-    ///
     /// # Errors
     /// Returns [`SamplerAcceptError`] if the underlying sampler rejects the token.
     pub fn accept(&mut self, token: LlamaToken) -> Result<(), SamplerAcceptError> {
         self.try_accept(token)
     }
 
-    /// Accepts several tokens from the sampler or context, possibly updating the internal state of
-    /// certain samplers (e.g. grammar, repetition, etc.)
-    ///
     /// # Errors
     /// Returns [`SamplerAcceptError`] if the underlying sampler rejects any token.
     pub fn accept_many(
@@ -116,9 +103,6 @@ impl LlamaSampler {
         Ok(())
     }
 
-    /// Accepts several tokens from the sampler or context, possibly updating the internal state of
-    /// certain samplers (e.g. grammar, repetition, etc.)
-    ///
     /// # Errors
     /// Returns [`SamplerAcceptError`] if the underlying sampler rejects any token.
     pub fn with_tokens(
@@ -130,8 +114,6 @@ impl LlamaSampler {
         Ok(self)
     }
 
-    /// Try accepting a token from the sampler. Returns an error if the sampler throws.
-    ///
     /// # Errors
     /// Returns an error if the underlying sampler rejects the token.
     pub fn try_accept(&mut self, token: LlamaToken) -> Result<(), SamplerAcceptError> {
@@ -148,32 +130,17 @@ impl LlamaSampler {
         check_sampler_accept_status(status, error_ptr)
     }
 
-    /// Resets the internal state of the sampler.
-    ///
-    /// This can be useful when you want to start fresh with a sampler without creating a new instance.
     pub fn reset(&mut self) {
         unsafe {
             llama_cpp_bindings_sys::llama_sampler_reset(self.sampler);
         }
     }
 
-    /// Gets the random seed used by this sampler.
-    ///
-    /// Returns:
-    /// - For random samplers (dist, mirostat, `mirostat_v2)`: returns their current seed
-    /// - For sampler chains: returns the first non-default seed found in reverse order
-    /// - For all other samplers: returns 0xFFFFFFFF
     #[must_use]
     pub fn get_seed(&self) -> u32 {
         unsafe { llama_cpp_bindings_sys::llama_sampler_get_seed(self.sampler) }
     }
 
-    /// Combines a list of samplers into a single sampler that applies each component sampler one
-    /// after another.
-    ///
-    /// If you are using a chain to select a token, the chain should always end with one of
-    /// [`LlamaSampler::greedy`], [`LlamaSampler::dist`], [`LlamaSampler::mirostat`], and
-    /// [`LlamaSampler::mirostat_v2`].
     #[must_use]
     pub fn chain(samplers: impl IntoIterator<Item = Self>, no_perf: bool) -> Self {
         unsafe {
@@ -190,74 +157,17 @@ impl LlamaSampler {
         }
     }
 
-    /// Same as [`Self::chain`] with `no_perf = false`.
-    ///
-    /// # Example
-    /// ```rust
-    /// use llama_cpp_bindings::token::{
-    ///    LlamaToken,
-    ///    data::LlamaTokenData,
-    ///    data_array::LlamaTokenDataArray
-    /// };
-    /// use llama_cpp_bindings::sampling::LlamaSampler;
-    /// use llama_cpp_bindings::llama_backend::LlamaBackend;
-    /// let backend = LlamaBackend::init().unwrap();
-    ///
-    /// let mut data_array = LlamaTokenDataArray::new(vec![
-    ///     LlamaTokenData::new(LlamaToken(0), 0., 0.),
-    ///     LlamaTokenData::new(LlamaToken(1), 1., 0.),
-    ///     LlamaTokenData::new(LlamaToken(2), 2., 0.),
-    /// ], false);
-    ///
-    /// data_array.apply_sampler(&mut LlamaSampler::chain_simple([
-    ///     LlamaSampler::temp(0.5),
-    ///     LlamaSampler::greedy(),
-    /// ]));
-    ///
-    /// assert_eq!(data_array.data[0].logit(), 0.);
-    /// assert_eq!(data_array.data[1].logit(), 2.);
-    /// assert_eq!(data_array.data[2].logit(), 4.);
-    ///
-    /// assert_eq!(data_array.data.len(), 3);
-    /// assert_eq!(data_array.selected_token(), Some(LlamaToken(2)));
-    /// ```
     #[must_use]
     pub fn chain_simple(samplers: impl IntoIterator<Item = Self>) -> Self {
         Self::chain(samplers, false)
     }
 
-    /// Updates the logits `l_i' = l_i/t`. When `t <= 0.0`, the maximum logit is kept at its original
-    /// value, the rest are set to -inf
-    ///
-    /// # Example:
-    /// ```rust
-    /// use llama_cpp_bindings::token::{
-    ///    LlamaToken,
-    ///    data::LlamaTokenData,
-    ///    data_array::LlamaTokenDataArray
-    /// };
-    /// use llama_cpp_bindings::sampling::LlamaSampler;
-    ///
-    /// let mut data_array = LlamaTokenDataArray::new(vec![
-    ///     LlamaTokenData::new(LlamaToken(0), 0., 0.),
-    ///     LlamaTokenData::new(LlamaToken(1), 1., 0.),
-    ///     LlamaTokenData::new(LlamaToken(2), 2., 0.),
-    /// ], false);
-    ///
-    /// data_array.apply_sampler(&mut LlamaSampler::temp(0.5));
-    ///
-    /// assert_eq!(data_array.data[0].logit(), 0.);
-    /// assert_eq!(data_array.data[1].logit(), 2.);
-    /// assert_eq!(data_array.data[2].logit(), 4.);
-    /// ```
     #[must_use]
     pub fn temp(t: f32) -> Self {
         let sampler = unsafe { llama_cpp_bindings_sys::llama_sampler_init_temp(t) };
         Self { sampler }
     }
 
-    /// Dynamic temperature implementation (a.k.a. entropy) described in the paper
-    /// <https://arxiv.org/abs/2309.02772>.
     #[must_use]
     pub fn temp_ext(t: f32, delta: f32, exponent: f32) -> Self {
         let sampler =
@@ -265,91 +175,36 @@ impl LlamaSampler {
         Self { sampler }
     }
 
-    /// Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration"
-    /// <https://arxiv.org/abs/1904.09751>
-    ///
-    /// # Example:
-    /// ```rust
-    /// use llama_cpp_bindings::token::{
-    ///    LlamaToken,
-    ///    data::LlamaTokenData,
-    ///    data_array::LlamaTokenDataArray
-    /// };
-    /// use llama_cpp_bindings::sampling::LlamaSampler;
-    ///
-    /// let mut data_array = LlamaTokenDataArray::new(vec![
-    ///     LlamaTokenData::new(LlamaToken(0), 0., 0.),
-    ///     LlamaTokenData::new(LlamaToken(1), 1., 0.),
-    ///     LlamaTokenData::new(LlamaToken(2), 2., 0.),
-    ///     LlamaTokenData::new(LlamaToken(3), 3., 0.),
-    /// ], false);
-    ///
-    /// data_array.apply_sampler(&mut LlamaSampler::top_k(2));
-    ///
-    /// assert_eq!(data_array.data.len(), 2);
-    /// assert_eq!(data_array.data[0].id(), LlamaToken(3));
-    /// assert_eq!(data_array.data[1].id(), LlamaToken(2));
-    /// ```
     #[must_use]
     pub fn top_k(k: i32) -> Self {
         let sampler = unsafe { llama_cpp_bindings_sys::llama_sampler_init_top_k(k) };
         Self { sampler }
     }
 
-    /// Top-nσ sampling as described in academic paper "Top-nσ: Not All Logits Are You Need"
-    /// <https://arxiv.org/pdf/2411.07641>
-    ///
-    /// This method filters logits by selecting only those within *n* standard deviations of the mean.
-    ///
-    /// # Parameters
-    /// - `n`: Number of standard deviations from the mean to include in sampling
-    ///
-    /// # Example
-    /// ```rust
-    /// use llama_cpp_bindings::sampling::LlamaSampler;
-    /// use llama_cpp_bindings::token::{
-    ///     LlamaToken,
-    ///     data::LlamaTokenData,
-    ///     data_array::LlamaTokenDataArray
-    /// };
-    ///
-    /// let mut data_array = LlamaTokenDataArray::new(vec![
-    ///     LlamaTokenData::new(LlamaToken(0), 0.0, 0.0),
-    ///     LlamaTokenData::new(LlamaToken(1), 1.0, 0.0),
-    ///     LlamaTokenData::new(LlamaToken(2), 2.0, 0.0),
-    /// ], false);
-    ///
-    /// data_array.apply_sampler(&mut LlamaSampler::top_n_sigma(2.0));
-    /// ```
     #[must_use]
     pub fn top_n_sigma(n: f32) -> Self {
         let sampler = unsafe { llama_cpp_bindings_sys::llama_sampler_init_top_n_sigma(n) };
         Self { sampler }
     }
 
-    /// Locally Typical Sampling implementation described in the paper <https://arxiv.org/abs/2202.00666>.
     #[must_use]
     pub fn typical(p: f32, min_keep: usize) -> Self {
         let sampler = unsafe { llama_cpp_bindings_sys::llama_sampler_init_typical(p, min_keep) };
         Self { sampler }
     }
 
-    /// Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration"
-    /// <https://arxiv.org/abs/1904.09751>
     #[must_use]
     pub fn top_p(p: f32, min_keep: usize) -> Self {
         let sampler = unsafe { llama_cpp_bindings_sys::llama_sampler_init_top_p(p, min_keep) };
         Self { sampler }
     }
 
-    /// Minimum P sampling as described in <https://github.com/ggerganov/llama.cpp/pull/3841>
     #[must_use]
     pub fn min_p(p: f32, min_keep: usize) -> Self {
         let sampler = unsafe { llama_cpp_bindings_sys::llama_sampler_init_min_p(p, min_keep) };
         Self { sampler }
     }
 
-    /// XTC sampler as described in <https://github.com/oobabooga/text-generation-webui/pull/6335>
     #[must_use]
     pub fn xtc(p: f32, t: f32, min_keep: usize, seed: u32) -> Self {
         let sampler =
@@ -357,8 +212,6 @@ impl LlamaSampler {
         Self { sampler }
     }
 
-    /// Grammar sampler
-    ///
     /// # Errors
     /// Returns an error if the grammar is invalid or the sampler cannot be initialized.
     pub fn grammar(
@@ -401,10 +254,6 @@ impl LlamaSampler {
         }
     }
 
-    /// Lazy grammar sampler, introduced in <https://github.com/ggerganov/llama.cpp/pull/9639>
-    ///
-    /// This sampler enforces grammar rules only when specific trigger words or tokens are encountered.
-    ///
     /// # Errors
     /// Returns an error if the grammar or trigger words are invalid.
     pub fn grammar_lazy(
@@ -457,12 +306,6 @@ impl LlamaSampler {
         }
     }
 
-    /// Lazy grammar sampler using regex trigger patterns.
-    ///
-    /// Trigger patterns are regular expressions matched from the start of the
-    /// generation output. The grammar sampler will be fed content starting from
-    /// the first match group.
-    ///
     /// # Errors
     /// Returns an error if the grammar or trigger patterns are invalid.
     pub fn grammar_lazy_patterns(
@@ -519,11 +362,6 @@ impl LlamaSampler {
         }
     }
 
-    /// `LLGuidance` sampler for constrained decoding.
-    ///
-    /// Uses the `llguidance` and `toktrie` Rust crates to enforce grammar constraints
-    /// during token sampling. Supports JSON schema, regex, Lark, and other grammar types.
-    ///
     /// # Errors
     ///
     /// Returns [`GrammarError`] if the grammar is invalid or the sampler cannot be initialized.
@@ -567,10 +405,6 @@ impl LlamaSampler {
             .collect()
     }
 
-    /// DRY sampler, designed by p-e-w, as described in:
-    /// <https://github.com/oobabooga/text-generation-webui/pull/5677>, porting Koboldcpp
-    /// implementation authored by pi6am: <https://github.com/LostRuins/koboldcpp/pull/982>
-    ///
     /// # Errors
     /// Returns an error if any string in `seq_breakers` contains null bytes.
     pub fn dry(
@@ -612,13 +446,6 @@ impl LlamaSampler {
         Ok(Self { sampler })
     }
 
-    /// Penalizes tokens for being present in the context.
-    ///
-    /// Parameters:
-    /// - ``penalty_last_n``: last n tokens to penalize (0 = disable penalty, -1 = context size)
-    /// - ``penalty_repeat``: 1.0 = disabled
-    /// - ``penalty_freq``: 0.0 = disabled
-    /// - ``penalty_present``: 0.0 = disabled
     #[must_use]
     pub fn penalties(
         penalty_last_n: i32,
@@ -637,21 +464,6 @@ impl LlamaSampler {
         Self { sampler }
     }
 
-    /// Mirostat 1.0 algorithm described in the paper <https://arxiv.org/abs/2007.14966>. Uses tokens instead of words.
-    ///
-    /// # Parameters:
-    /// - ``n_vocab``: [`LlamaModel::n_vocab`]
-    /// - ``seed``: Seed to initialize random generation with.
-    /// - ``tau``: The target cross-entropy (or surprise) value you want to achieve for the
-    ///   generated text. A higher value corresponds to more surprising or less predictable text,
-    ///   while a lower value corresponds to less surprising or more predictable text.
-    /// - ``eta``: The learning rate used to update `mu` based on the error between the target and
-    ///   observed surprisal of the sampled word. A larger learning rate will cause `mu` to be
-    ///   updated more quickly, while a smaller learning rate will result in slower updates.
-    /// - ``m``: The number of tokens considered in the estimation of `s_hat`. This is an arbitrary
-    ///   value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`.
-    ///   In the paper, they use `m = 100`, but you can experiment with different values to see how
-    ///   it affects the performance of the algorithm.
     #[must_use]
     pub fn mirostat(n_vocab: i32, seed: u32, tau: f32, eta: f32, m: i32) -> Self {
         let sampler = unsafe {
@@ -660,16 +472,6 @@ impl LlamaSampler {
         Self { sampler }
     }
 
-    /// Mirostat 2.0 algorithm described in the paper <https://arxiv.org/abs/2007.14966>. Uses tokens instead of words.
-    ///
-    /// # Parameters:
-    /// - ``seed``: Seed to initialize random generation with.
-    /// - ``tau``: The target cross-entropy (or surprise) value you want to achieve for the
-    ///   generated text. A higher value corresponds to more surprising or less predictable text,
-    ///   while a lower value corresponds to less surprising or more predictable text.
-    /// - ``eta``: The learning rate used to update `mu` based on the error between the target and
-    ///   observed surprisal of the sampled word. A larger learning rate will cause `mu` to be
-    ///   updated more quickly, while a smaller learning rate will result in slower updates.
     #[must_use]
     pub fn mirostat_v2(seed: u32, tau: f32, eta: f32) -> Self {
         let sampler =
@@ -677,62 +479,21 @@ impl LlamaSampler {
         Self { sampler }
     }
 
-    /// Selects a token at random based on each token's probabilities
     #[must_use]
     pub fn dist(seed: u32) -> Self {
         let sampler = unsafe { llama_cpp_bindings_sys::llama_sampler_init_dist(seed) };
         Self { sampler }
     }
 
-    /// Selects the most likely token
-    ///
-    /// # Example:
-    /// ```rust
-    /// use llama_cpp_bindings::token::{
-    ///    LlamaToken,
-    ///    data::LlamaTokenData,
-    ///    data_array::LlamaTokenDataArray
-    /// };
-    /// use llama_cpp_bindings::sampling::LlamaSampler;
-    ///
-    /// let mut data_array = LlamaTokenDataArray::new(vec![
-    ///     LlamaTokenData::new(LlamaToken(0), 0., 0.),
-    ///     LlamaTokenData::new(LlamaToken(1), 1., 0.),
-    /// ], false);
-    ///
-    /// data_array.apply_sampler(&mut LlamaSampler::greedy());
-    ///
-    /// assert_eq!(data_array.data.len(), 2);
-    /// assert_eq!(data_array.selected_token(), Some(LlamaToken(1)));
-    /// ```
     #[must_use]
     pub fn greedy() -> Self {
         let sampler = unsafe { llama_cpp_bindings_sys::llama_sampler_init_greedy() };
         Self { sampler }
     }
 
-    /// Creates a sampler that applies bias values to specific tokens during sampling.
-    ///
-    /// # Parameters
-    /// - ``n_vocab``: [`LlamaModel::n_vocab`]
-    /// - ``biases``: Slice of [`LlamaLogitBias`] values specifying token-bias pairs
-    ///
     /// # Errors
     /// Returns [`SamplingError::IntegerOverflow`] if `biases.len()` exceeds `i32::MAX`.
     ///
-    /// # Example
-    /// ```rust
-    /// use llama_cpp_bindings::token::{LlamaToken, logit_bias::LlamaLogitBias};
-    /// use llama_cpp_bindings::sampling::LlamaSampler;
-    ///
-    /// let biases = vec![
-    ///     LlamaLogitBias::new(LlamaToken(1), 1.5),  // Increase probability of token 1
-    ///     LlamaLogitBias::new(LlamaToken(2), -1.0), // Decrease probability of token 2
-    /// ];
-    ///
-    /// // Assuming vocab_size of 32000
-    /// let sampler = LlamaSampler::logit_bias(32000, &biases).unwrap();
-    /// ```
     pub fn logit_bias(n_vocab: i32, biases: &[LlamaLogitBias]) -> Result<Self, SamplingError> {
         let bias_count = checked_usize_as_i32_sampling(biases.len())?;
         let data = biases
diff --git a/llama-cpp-bindings/src/streaming_json_probe.rs b/llama-cpp-bindings/src/streaming_json_probe.rs
index 3560be7b..9e17bd9a 100644
--- a/llama-cpp-bindings/src/streaming_json_probe.rs
+++ b/llama-cpp-bindings/src/streaming_json_probe.rs
@@ -449,8 +449,6 @@ mod tests {
 
     #[test]
     fn syntactically_malformed_object_is_failed() {
-        // Input starts with `{` (passes the cheap prefix check) but cannot parse — the syntax
-        // error path classifies as `Category::Syntax`, surfacing the `Failed` arm.
         assert_eq!(
             JsonProbeOutcome::validate_prefix("{,}"),
             JsonProbeOutcome::Failed,
diff --git a/llama-cpp-bindings/src/streaming_markers.rs b/llama-cpp-bindings/src/streaming_markers.rs
index 9eaaddf2..e34636f7 100644
--- a/llama-cpp-bindings/src/streaming_markers.rs
+++ b/llama-cpp-bindings/src/streaming_markers.rs
@@ -8,11 +8,6 @@ pub enum MarkerKind {
     ToolCallClose,
 }
 
-/// Tokenized marker sequences (token IDs, not strings).
-///
-/// Each marker is a `Vec<LlamaToken>` of length `>= 1`; absent markers are
-/// `None`. Sequence matching at every `ingest()` is by token-ID equality,
-/// never by substring scanning of decoded text.
 #[derive(Clone, Debug, Default, Eq, PartialEq)]
 pub struct StreamingMarkers {
     pub reasoning_open: Option<Vec<LlamaToken>>,
diff --git a/llama-cpp-bindings/src/timing.rs b/llama-cpp-bindings/src/timing.rs
index 5c07eab8..e0ea3482 100644
--- a/llama-cpp-bindings/src/timing.rs
+++ b/llama-cpp-bindings/src/timing.rs
@@ -1,23 +1,11 @@
-//! Safe wrapper around `llama_timings`.
 use std::fmt::{Debug, Display, Formatter};
 
-/// A wrapper around `llama_timings`.
 #[derive(Clone, Copy, Debug)]
 pub struct LlamaTimings {
-    /// The underlying `llama_perf_context_data` from the C API.
     pub timings: llama_cpp_bindings_sys::llama_perf_context_data,
 }
 
 impl LlamaTimings {
-    /// Create a new `LlamaTimings`.
-    /// ```
-    /// # use llama_cpp_bindings::timing::LlamaTimings;
-    /// let timings = LlamaTimings::new(1.0, 2.0, 3.0, 4.0, 5, 6, 1);
-    /// let timings_str = "load time = 2.00 ms
-    /// prompt eval time = 3.00 ms / 5 tokens (0.60 ms per token, 1666.67 tokens per second)
-    /// eval time = 4.00 ms / 6 runs (0.67 ms per token, 1500.00 tokens per second)\n";
-    /// assert_eq!(timings_str, format!("{}", timings));
-    /// ```
     #[must_use]
     pub const fn new(
         t_start_ms: f64,
@@ -41,68 +29,56 @@ impl LlamaTimings {
         }
     }
 
-    /// Get the start time in milliseconds.
     #[must_use]
     pub const fn t_start_ms(&self) -> f64 {
         self.timings.t_start_ms
     }
 
-    /// Get the load time in milliseconds.
     #[must_use]
     pub const fn t_load_ms(&self) -> f64 {
         self.timings.t_load_ms
     }
 
-    /// Get the prompt evaluation time in milliseconds.
     #[must_use]
     pub const fn t_p_eval_ms(&self) -> f64 {
         self.timings.t_p_eval_ms
     }
 
-    /// Get the evaluation time in milliseconds.
     #[must_use]
     pub const fn t_eval_ms(&self) -> f64 {
         self.timings.t_eval_ms
     }
 
-    /// Get the number of prompt evaluations.
     #[must_use]
     pub const fn n_p_eval(&self) -> i32 {
         self.timings.n_p_eval
     }
 
-    /// Get the number of evaluations.
     #[must_use]
     pub const fn n_eval(&self) -> i32 {
         self.timings.n_eval
     }
 
-    /// Set the start time in milliseconds.
     pub const fn set_t_start_ms(&mut self, t_start_ms: f64) {
         self.timings.t_start_ms = t_start_ms;
     }
 
-    /// Set the load time in milliseconds.
     pub const fn set_t_load_ms(&mut self, t_load_ms: f64) {
         self.timings.t_load_ms = t_load_ms;
     }
 
-    /// Set the prompt evaluation time in milliseconds.
     pub const fn set_t_p_eval_ms(&mut self, t_p_eval_ms: f64) {
         self.timings.t_p_eval_ms = t_p_eval_ms;
     }
 
-    /// Set the evaluation time in milliseconds.
     pub const fn set_t_eval_ms(&mut self, t_eval_ms: f64) {
         self.timings.t_eval_ms = t_eval_ms;
     }
 
-    /// Set the number of prompt evaluations.
     pub const fn set_n_p_eval(&mut self, n_p_eval: i32) {
         self.timings.n_p_eval = n_p_eval;
     }
 
-    /// Set the number of evaluations.
     pub const fn set_n_eval(&mut self, n_eval: i32) {
         self.timings.n_eval = n_eval;
     }
diff --git a/llama-cpp-bindings/src/token.rs b/llama-cpp-bindings/src/token.rs
index 5249baa9..4b87459e 100644
--- a/llama-cpp-bindings/src/token.rs
+++ b/llama-cpp-bindings/src/token.rs
@@ -1,5 +1,3 @@
-//! Safe wrappers around `llama_token_data` and `llama_token_data_array`.
-
 use std::fmt::Debug;
 use std::fmt::Display;
 
@@ -7,7 +5,6 @@ pub mod data;
 pub mod data_array;
 pub mod logit_bias;
 
-/// A safe wrapper for `llama_token`.
 #[repr(transparent)]
 #[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)]
 pub struct LlamaToken(pub llama_cpp_bindings_sys::llama_token);
@@ -19,13 +16,6 @@ impl Display for LlamaToken {
 }
 
 impl LlamaToken {
-    /// Create a new `LlamaToken` from a i32.
-    ///
-    /// ```
-    /// # use llama_cpp_bindings::token::LlamaToken;
-    /// let token = LlamaToken::new(0);
-    /// assert_eq!(token, LlamaToken(0));
-    /// ```
     #[must_use]
     pub const fn new(token_id: i32) -> Self {
         Self(token_id)
diff --git a/llama-cpp-bindings/src/token/data.rs b/llama-cpp-bindings/src/token/data.rs
index 7f75203f..ce4b2eaf 100644
--- a/llama-cpp-bindings/src/token/data.rs
+++ b/llama-cpp-bindings/src/token/data.rs
@@ -1,10 +1,5 @@
-//! Safe wrapper around `llama_token_data`.
 use crate::token::LlamaToken;
 
-/// A transparent wrapper around `llama_token_data`.
-///
-/// Do not rely on `repr(transparent)` for this type. It should be considered an implementation
-/// detail and may change across minor versions.
 #[derive(Clone, Copy, Debug, PartialEq)]
 #[repr(transparent)]
 pub struct LlamaTokenData {
@@ -12,92 +7,35 @@ pub struct LlamaTokenData {
 }
 
 impl LlamaTokenData {
-    /// Create a new token data from a token, logit, and probability.
-    /// ```
-    /// # use llama_cpp_bindings::token::LlamaToken;
-    /// # use llama_cpp_bindings::token::data::LlamaTokenData;
-    /// let token = LlamaToken::new(1);
-    /// let token_data = LlamaTokenData::new(token, 1.0, 1.0);
     #[must_use]
     pub const fn new(LlamaToken(id): LlamaToken, logit: f32, p: f32) -> Self {
         Self {
             data: llama_cpp_bindings_sys::llama_token_data { id, logit, p },
         }
     }
-    /// Get the token's id
-    /// ```
-    /// # use llama_cpp_bindings::token::LlamaToken;
-    /// # use llama_cpp_bindings::token::data::LlamaTokenData;
-    /// let token = LlamaToken::new(1);
-    /// let token_data = LlamaTokenData::new(token, 1.0, 1.0);
-    /// assert_eq!(token_data.id(), token);
-    /// ```
     #[must_use]
     pub const fn id(&self) -> LlamaToken {
         LlamaToken(self.data.id)
     }
 
-    /// Get the token's logit
-    /// ```
-    /// # use llama_cpp_bindings::token::LlamaToken;
-    /// # use llama_cpp_bindings::token::data::LlamaTokenData;
-    /// let token = LlamaToken::new(1);
-    /// let token_data = LlamaTokenData::new(token, 1.0, 1.0);
-    /// assert_eq!(token_data.logit(), 1.0);
-    /// ```
     #[must_use]
     pub const fn logit(&self) -> f32 {
         self.data.logit
     }
 
-    /// Get the token's probability
-    /// ```
-    /// # use llama_cpp_bindings::token::LlamaToken;
-    /// # use llama_cpp_bindings::token::data::LlamaTokenData;
-    /// let token = LlamaToken::new(1);
-    /// let token_data = LlamaTokenData::new(token, 1.0, 1.0);
-    /// assert_eq!(token_data.p(), 1.0);
-    /// ```
     #[must_use]
     pub const fn p(&self) -> f32 {
         self.data.p
     }
 
-    /// Set the token's id
-    /// ```
-    /// # use llama_cpp_bindings::token::LlamaToken;
-    /// # use llama_cpp_bindings::token::data::LlamaTokenData;
-    /// let token = LlamaToken::new(1);
-    /// let mut token_data = LlamaTokenData::new(token, 1.0, 1.0);
-    /// token_data.set_id(LlamaToken::new(2));
-    /// assert_eq!(token_data.id(), LlamaToken::new(2));
-    /// ```
     pub const fn set_id(&mut self, id: LlamaToken) {
         self.data.id = id.0;
     }
 
-    /// Set the token's logit
-    /// ```
-    /// # use llama_cpp_bindings::token::LlamaToken;
-    /// # use llama_cpp_bindings::token::data::LlamaTokenData;
-    /// let token = LlamaToken::new(1);
-    /// let mut token_data = LlamaTokenData::new(token, 1.0, 1.0);
-    /// token_data.set_logit(2.0);
-    /// assert_eq!(token_data.logit(), 2.0);
-    /// ```
     pub const fn set_logit(&mut self, logit: f32) {
         self.data.logit = logit;
     }
 
-    /// Set the token's probability
-    /// ```
-    /// # use llama_cpp_bindings::token::LlamaToken;
-    /// # use llama_cpp_bindings::token::data::LlamaTokenData;
-    /// let token = LlamaToken::new(1);
-    /// let mut token_data = LlamaTokenData::new(token, 1.0, 1.0);
-    /// token_data.set_p(2.0);
-    /// assert_eq!(token_data.p(), 2.0);
-    /// ```
     pub const fn set_p(&mut self, p: f32) {
         self.data.p = p;
     }
diff --git a/llama-cpp-bindings/src/token/data_array.rs b/llama-cpp-bindings/src/token/data_array.rs
index 40933d7a..3e9f901d 100644
--- a/llama-cpp-bindings/src/token/data_array.rs
+++ b/llama-cpp-bindings/src/token/data_array.rs
@@ -1,4 +1,3 @@
-//! an rusty equivalent of `llama_token_data_array`.
 use std::ptr;
 
 use crate::error::TokenSamplingError;
@@ -7,31 +6,14 @@ use crate::token::data::LlamaTokenData;
 
 use super::LlamaToken;
 
-/// a safe wrapper around `llama_token_data_array`.
 #[derive(Debug, Clone, PartialEq)]
 pub struct LlamaTokenDataArray {
-    /// the underlying data
     pub data: Vec<LlamaTokenData>,
-    /// the index of the selected token in ``data``
     pub selected: Option<usize>,
-    /// is the data sorted?
     pub sorted: bool,
 }
 
 impl LlamaTokenDataArray {
-    /// Create a new `LlamaTokenDataArray` from a vector and whether or not the data is sorted.
-    ///
-    /// ```
-    /// # use llama_cpp_bindings::token::data::LlamaTokenData;
-    /// # use llama_cpp_bindings::token::data_array::LlamaTokenDataArray;
-    /// # use llama_cpp_bindings::token::LlamaToken;
-    /// let array = LlamaTokenDataArray::new(vec![
-    ///         LlamaTokenData::new(LlamaToken(0), 0.0, 0.0),
-    ///         LlamaTokenData::new(LlamaToken(1), 0.1, 0.1)
-    ///    ], false);
-    /// assert_eq!(array.data.len(), 2);
-    /// assert_eq!(array.sorted, false);
-    /// ```
     #[must_use]
     pub const fn new(data: Vec<LlamaTokenData>, sorted: bool) -> Self {
         Self {
@@ -41,17 +23,6 @@ impl LlamaTokenDataArray {
         }
     }
 
-    /// Create a new `LlamaTokenDataArray` from an iterator and whether or not the data is sorted.
-    /// ```
-    /// # use llama_cpp_bindings::token::data::LlamaTokenData;
-    /// # use llama_cpp_bindings::token::data_array::LlamaTokenDataArray;
-    /// # use llama_cpp_bindings::token::LlamaToken;
-    /// let array = LlamaTokenDataArray::from_iter([
-    ///     LlamaTokenData::new(LlamaToken(0), 0.0, 0.0),
-    ///     LlamaTokenData::new(LlamaToken(1), 0.1, 0.1)
-    /// ], false);
-    /// assert_eq!(array.data.len(), 2);
-    /// assert_eq!(array.sorted, false);
     pub fn from_iter<TIterator>(data: TIterator, sorted: bool) -> Self
     where
         TIterator: IntoIterator<Item = LlamaTokenData>,
@@ -59,7 +30,6 @@ impl LlamaTokenDataArray {
         Self::new(data.into_iter().collect(), sorted)
     }
 
-    /// Returns the current selected token, if one exists.
     #[must_use]
     pub fn selected_token(&self) -> Option<LlamaToken> {
         self.data.get(self.selected?).map(LlamaTokenData::id)
@@ -67,8 +37,6 @@ impl LlamaTokenDataArray {
 }
 
 impl LlamaTokenDataArray {
-    /// Modify the underlying data as a `llama_token_data_array`. and reconstruct the `LlamaTokenDataArray`.
-    ///
     /// # Panics
     ///
     /// Panics if some of the safety conditions are not met. (we cannot check all of them at
@@ -125,8 +93,6 @@ impl LlamaTokenDataArray {
         result
     }
 
-    /// Modifies the data array by applying a sampler to it.
-    ///
     /// # Panics
     ///
     /// Panics if the vendored sampler throws a C++ exception. `llama_sampler_apply` is
@@ -149,15 +115,12 @@ impl LlamaTokenDataArray {
         }
     }
 
-    /// Modifies the data array by applying a sampler to it
     #[must_use]
     pub fn with_sampler(mut self, sampler: &mut LlamaSampler) -> Self {
         self.apply_sampler(sampler);
         self
     }
 
-    /// Randomly selects a token from the candidates based on their probabilities.
-    ///
     /// # Errors
     /// Returns [`TokenSamplingError::NoTokenSelected`] if the sampler fails to select a token.
     pub fn sample_token(&mut self, seed: u32) -> Result<LlamaToken, TokenSamplingError> {
@@ -166,8 +129,6 @@ impl LlamaTokenDataArray {
             .ok_or(TokenSamplingError::NoTokenSelected)
     }
 
-    /// Selects the token with the highest probability.
-    ///
     /// # Errors
     /// Returns [`TokenSamplingError::NoTokenSelected`] if the sampler fails to select a token.
     pub fn sample_token_greedy(&mut self) -> Result<LlamaToken, TokenSamplingError> {
diff --git a/llama-cpp-bindings/src/token/logit_bias.rs b/llama-cpp-bindings/src/token/logit_bias.rs
index 52d91522..6d5a1502 100644
--- a/llama-cpp-bindings/src/token/logit_bias.rs
+++ b/llama-cpp-bindings/src/token/logit_bias.rs
@@ -1,13 +1,5 @@
-//! Safe wrapper around `llama_logit_bias`.
 use crate::token::LlamaToken;
 
-/// A transparent wrapper around `llama_logit_bias`.
-///
-/// Represents a bias to be applied to a specific token during text generation.
-/// The bias modifies the likelihood of the token being selected.
-///
-/// Do not rely on `repr(transparent)` for this type. It should be considered an implementation
-/// detail and may change across minor versions.
 #[derive(Clone, Copy, Debug, PartialEq)]
 #[repr(transparent)]
 pub struct LlamaLogitBias {
@@ -15,14 +7,6 @@ pub struct LlamaLogitBias {
 }
 
 impl LlamaLogitBias {
-    /// Creates a new logit bias for a specific token with the given bias value.
-    ///
-    /// # Examples
-    /// ```
-    /// # use llama_cpp_bindings::token::{LlamaToken, logit_bias::LlamaLogitBias};
-    /// let token = LlamaToken::new(1);
-    /// let bias = LlamaLogitBias::new(token, 1.5);
-    /// ```
     #[must_use]
     pub const fn new(LlamaToken(token): LlamaToken, bias: f32) -> Self {
         Self {
@@ -30,59 +14,20 @@ impl LlamaLogitBias {
         }
     }
 
-    /// Gets the token this bias applies to.
-    ///
-    /// # Examples
-    /// ```
-    /// # use llama_cpp_bindings::token::{LlamaToken, logit_bias::LlamaLogitBias};
-    /// let token = LlamaToken::new(1);
-    /// let bias = LlamaLogitBias::new(token, 1.5);
-    /// assert_eq!(bias.token(), token);
-    /// ```
     #[must_use]
     pub const fn token(&self) -> LlamaToken {
         LlamaToken(self.logit_bias.token)
     }
 
-    /// Gets the bias value.
-    ///
-    /// # Examples
-    /// ```
-    /// # use llama_cpp_bindings::token::{LlamaToken, logit_bias::LlamaLogitBias};
-    /// let token = LlamaToken::new(1);
-    /// let bias = LlamaLogitBias::new(token, 1.5);
-    /// assert_eq!(bias.bias(), 1.5);
-    /// ```
     #[must_use]
     pub const fn bias(&self) -> f32 {
         self.logit_bias.bias
     }
 
-    /// Sets the token this bias applies to.
-    ///
-    /// # Examples
-    /// ```
-    /// # use llama_cpp_bindings::token::{LlamaToken, logit_bias::LlamaLogitBias};
-    /// let token = LlamaToken::new(1);
-    /// let mut bias = LlamaLogitBias::new(token, 1.5);
-    /// let new_token = LlamaToken::new(2);
-    /// bias.set_token(new_token);
-    /// assert_eq!(bias.token(), new_token);
-    /// ```
     pub const fn set_token(&mut self, token: LlamaToken) {
         self.logit_bias.token = token.0;
     }
 
-    /// Sets the bias value.
-    ///
-    /// # Examples
-    /// ```
-    /// # use llama_cpp_bindings::token::{LlamaToken, logit_bias::LlamaLogitBias};
-    /// let token = LlamaToken::new(1);
-    /// let mut bias = LlamaLogitBias::new(token, 1.5);
-    /// bias.set_bias(2.0);
-    /// assert_eq!(bias.bias(), 2.0);
-    /// ```
     pub const fn set_bias(&mut self, bias: f32) {
         self.logit_bias.bias = bias;
     }
diff --git a/llama-cpp-bindings/src/tool_call_format/bracketed_args.rs b/llama-cpp-bindings/src/tool_call_format/bracketed_args.rs
index 2ed0cd89..b27878fb 100644
--- a/llama-cpp-bindings/src/tool_call_format/bracketed_args.rs
+++ b/llama-cpp-bindings/src/tool_call_format/bracketed_args.rs
@@ -208,9 +208,6 @@ mod tests {
 
     #[test]
     fn rejects_truncated_json_arguments_with_unterminated_failure() {
-        // serde_json's iterator returns None when the deserializer has no token to start from.
-        // Constructing such an input requires whitespace-only input after the separator — the
-        // iterator finds nothing parseable and yields None, surfacing the Unterminated arm.
         let failure = parse(
             "[TOOL_CALLS]get_weather[ARGS]   ",
             &mistral3_markers(),
@@ -226,8 +223,6 @@ mod tests {
 
     #[test]
     fn returns_empty_vec_for_separator_with_only_whitespace_name() {
-        // `get_weather` is replaced with whitespace before the separator, so `name.trim()` is
-        // empty and the parser returns `ParseStep::Done` — covers the empty-name early return.
         let parsed = parse(
             "[TOOL_CALLS]   [ARGS]{\"x\":1}",
             &mistral3_markers(),
@@ -240,8 +235,6 @@ mod tests {
 
     #[test]
     fn returns_empty_vec_when_shape_has_empty_separator() {
-        // When `name_args_separator` is empty, `parse` short-circuits to `Vec::new()` —
-        // covers the early-return guard.
         let mut shape = mistral3_shape();
         shape.name_args_separator.clear();
         let parsed = parse(
diff --git a/llama-cpp-bindings/src/tool_call_format/json_object.rs b/llama-cpp-bindings/src/tool_call_format/json_object.rs
index c9038152..af9f58ea 100644
--- a/llama-cpp-bindings/src/tool_call_format/json_object.rs
+++ b/llama-cpp-bindings/src/tool_call_format/json_object.rs
@@ -187,9 +187,6 @@ mod tests {
 
     #[test]
     fn returns_empty_when_object_is_not_a_tool_call_shape() {
-        // The body opens with `{` (so try_parse_one_object enters the JSON path) but the parsed
-        // value is a top-level non-object — the early `let Value::Object(map) = value else
-        // { return Ok(None) };` arm fires.
         let parsed = parse("{ \"foo\": 1 }", &qwen3_shape()).expect("must parse");
 
         assert!(parsed.is_empty());
diff --git a/llama-cpp-log-decoder/src/lib.rs b/llama-cpp-log-decoder/src/lib.rs
index b7a96a37..369e4837 100644
--- a/llama-cpp-log-decoder/src/lib.rs
+++ b/llama-cpp-log-decoder/src/lib.rs
@@ -1,13 +1,3 @@
-//! Decoder for the llama.cpp / ggml log callback stream.
-//!
-//! The C side delivers log lines in fragments: a missing trailing newline
-//! signals that more fragments will follow at `GGML_LOG_LEVEL_CONT`. This
-//! crate is a pure `&mut self` transducer — feed `(level, text)` pairs, get
-//! complete [`LogLine`]s back when the trailing newline arrives. No globals,
-//! no atomics, no FFI, no logger.
-//!
-//! [`LogLine`]: log_line::LogLine
-
 pub mod decode_anomaly;
 pub mod decode_output;
 pub mod decode_result;
diff --git a/llama-cpp-test-harness-macros/src/lib.rs b/llama-cpp-test-harness-macros/src/lib.rs
index b36048fc..4021ea43 100644
--- a/llama-cpp-test-harness-macros/src/lib.rs
+++ b/llama-cpp-test-harness-macros/src/lib.rs
@@ -1,9 +1,3 @@
-//! Procedural macros for `llama-cpp-test-harness`.
-//!
-//! Provides the `#[llama_test(...)]` attribute that declaratively binds a test function to a
-//! specific GGUF model and inference parameter set. The macro emits the original function plus
-//! an `inventory::submit!` block that registers the test with the harness runtime.
-
 mod expand;
 mod parsed_args;
 mod parsed_context_params;
@@ -22,9 +16,6 @@ fn dispatch(attribute: TokenStream2, item: TokenStream2) -> TokenStream2 {
     }
 }
 
-/// Registers a function as a llama-cpp test with explicit model + inference parameters.
-///
-/// See the `llama-cpp-test-harness` crate for the full attribute schema and usage.
 #[proc_macro_attribute]
 pub fn llama_test(attribute: TokenStream, item: TokenStream) -> TokenStream {
     dispatch(attribute.into(), item.into()).into()
diff --git a/llama-cpp-test-harness-macros/src/parsed_args.rs b/llama-cpp-test-harness-macros/src/parsed_args.rs
index 795261f3..c5b50788 100644
--- a/llama-cpp-test-harness-macros/src/parsed_args.rs
+++ b/llama-cpp-test-harness-macros/src/parsed_args.rs
@@ -869,8 +869,6 @@ mod tests {
 
     #[test]
     fn unparseable_attribute_token_stream_is_rejected() {
-        // `Punctuated::parse_terminated` rejects input that can't be split into Meta items by
-        // commas; passing a stray symbol surfaces that `?` Err arm in `ParsedArgs::parse`.
         let result = parse("@&^!");
 
         assert!(
diff --git a/llama-cpp-bindings-tests/fixtures/ggml-vocab-bert-bge.gguf b/llama-cpp-test-harness/fixtures/ggml-vocab-bert-bge.gguf
similarity index 100%
rename from llama-cpp-bindings-tests/fixtures/ggml-vocab-bert-bge.gguf
rename to llama-cpp-test-harness/fixtures/ggml-vocab-bert-bge.gguf
diff --git a/llama-cpp-bindings-tests/fixtures/llamas.jpg b/llama-cpp-test-harness/fixtures/llamas.jpg
similarity index 100%
rename from llama-cpp-bindings-tests/fixtures/llamas.jpg
rename to llama-cpp-test-harness/fixtures/llamas.jpg
diff --git a/llama-cpp-test-harness/src/download_model.rs b/llama-cpp-test-harness/src/download_model.rs
index 3ffd5a5b..e7cf1aa2 100644
--- a/llama-cpp-test-harness/src/download_model.rs
+++ b/llama-cpp-test-harness/src/download_model.rs
@@ -2,8 +2,6 @@ use std::path::PathBuf;
 
 use anyhow::Result;
 
-/// Downloads a single file from a Hugging Face repo via `hf-hub`'s sync API.
-///
 /// # Errors
 ///
 /// Returns an error if the HF client cannot be built or the file cannot be downloaded
diff --git a/llama-cpp-test-harness/src/execution_plan.rs b/llama-cpp-test-harness/src/execution_plan.rs
index 927c87a8..669b7524 100644
--- a/llama-cpp-test-harness/src/execution_plan.rs
+++ b/llama-cpp-test-harness/src/execution_plan.rs
@@ -1,18 +1,3 @@
-//! Deterministic execution plan for the test harness.
-//!
-//! [`ExecutionPlan::from_registrations`] takes the registrations collected from `inventory` and
-//! groups them into [`ExecutionPhase`]s by [`crate::LoadKey`]. The result is a sorted list of
-//! phases — each phase corresponds to exactly one model-load cycle (load → run trials → drop).
-//!
-//! # Invariants
-//!
-//! - For every distinct [`crate::LoadKey`] the planner produces exactly one
-//!   [`ExecutionPhase`]; the same key never produces two phases.
-//! - Phases are sorted by [`crate::LoadKey`] (lexicographic order on the full key tuple).
-//! - Registrations inside a phase are sorted by their `name`.
-//! - [`crate::ContextParams`] differences within registrations sharing a key do **not** split a
-//!   phase — the model loads once and each trial constructs its own `LlamaContext`.
-
 use std::collections::BTreeMap;
 use std::sync::Arc;
 
diff --git a/llama-cpp-bindings-tests/src/test_model.rs b/llama-cpp-test-harness/src/fixtures_dir.rs
similarity index 72%
rename from llama-cpp-bindings-tests/src/test_model.rs
rename to llama-cpp-test-harness/src/fixtures_dir.rs
index 22082498..55f44c60 100644
--- a/llama-cpp-bindings-tests/src/test_model.rs
+++ b/llama-cpp-test-harness/src/fixtures_dir.rs
@@ -1,8 +1,5 @@
-//! Path helper for image and audio fixtures used by multimodal integration tests.
-
 use std::path::PathBuf;
 
-/// Returns the absolute path to the test fixtures directory.
 #[must_use]
 pub fn fixtures_dir() -> PathBuf {
     PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("fixtures")
diff --git a/llama-cpp-test-harness/src/lib.rs b/llama-cpp-test-harness/src/lib.rs
index 8f112b9f..656513fc 100644
--- a/llama-cpp-test-harness/src/lib.rs
+++ b/llama-cpp-test-harness/src/lib.rs
@@ -1,16 +1,8 @@
-//! Declarative, deterministic, phase-batched integration-test harness for `llama-cpp-bindings`.
-//!
-//! Tests tag their functions with `#[llama_test(model_source = HuggingFace("…", "…"), …)]`
-//! (or `model_source = LocalPath("…")` for a local GGUF). The harness groups tests with
-//! identical [`LoadKey`]s into [`ExecutionPhase`]s, loads each phase's model exactly once, and
-//! runs every test in the phase sequentially against the shared [`LlamaFixture`].
-//!
-//! See the workspace README and `tests/` directory for usage examples.
-
 pub mod context_params;
 pub mod download_model;
 pub mod execution_phase;
 pub mod execution_plan;
+pub mod fixtures_dir;
 pub mod harness_arguments_error;
 pub mod llama_fixture;
 pub mod llama_test_fn;
diff --git a/llama-cpp-test-harness/src/llama_fixture.rs b/llama-cpp-test-harness/src/llama_fixture.rs
index 04ae60b4..f427f8aa 100644
--- a/llama-cpp-test-harness/src/llama_fixture.rs
+++ b/llama-cpp-test-harness/src/llama_fixture.rs
@@ -1,5 +1,7 @@
 use std::path::Path;
 
+use anyhow::Result;
+use llama_cpp_bindings::context::LlamaContext;
 use llama_cpp_bindings::llama_backend::LlamaBackend;
 use llama_cpp_bindings::model::LlamaModel;
 use llama_cpp_bindings::mtmd::MtmdContext;
@@ -13,3 +15,15 @@ pub struct LlamaFixture<'fixture> {
     pub mtmd_context: Option<&'fixture MtmdContext>,
     pub model_path: &'fixture Path,
 }
+
+impl LlamaFixture<'_> {
+    /// # Errors
+    ///
+    /// Propagates any error returned by [`LlamaContext::from_model`] unchanged.
+    pub fn build_context(&self) -> Result<LlamaContext<'_>> {
+        let llama_context_params = (*self.context_params).into_llama_context_params();
+        let context = LlamaContext::from_model(self.model, self.backend, llama_context_params)?;
+
+        Ok(context)
+    }
+}
diff --git a/llama-cpp-test-harness/src/llama_tests_main_macro.rs b/llama-cpp-test-harness/src/llama_tests_main_macro.rs
index fc047cfc..18ed6dab 100644
--- a/llama-cpp-test-harness/src/llama_tests_main_macro.rs
+++ b/llama-cpp-test-harness/src/llama_tests_main_macro.rs
@@ -1,6 +1,3 @@
-/// Generates a `fn main() -> ExitCode` that dispatches via the harness.
-///
-/// Place once at module scope in a test binary that uses `#[llama_test(...)]`.
 #[macro_export]
 macro_rules! llama_tests_main {
     () => {
diff --git a/llama-cpp-test-harness/src/load_key.rs b/llama-cpp-test-harness/src/load_key.rs
index af34b972..5fad7200 100644
--- a/llama-cpp-test-harness/src/load_key.rs
+++ b/llama-cpp-test-harness/src/load_key.rs
@@ -1,22 +1,3 @@
-//! Identity of one model-load operation.
-//!
-//! Two registrations with different [`LoadKey`]s require separate model loads. Two registrations
-//! with identical [`LoadKey`]s share one load — even if every other attribute (such as
-//! [`crate::ContextParams`]) differs.
-//!
-//! # What forces a model reload
-//!
-//! Only the fields of [`LoadKey`]: the model source ([`crate::ModelSource`]), the mmproj source
-//! (optional [`crate::MmprojSource`]), and the [`crate::ModelLoadParams`] (`n_gpu_layers`,
-//! `use_mmap`, `use_mlock`).
-//!
-//! # What is runtime-flexible
-//!
-//! Every `LlamaContextParams` setter (`n_ctx`, `n_batch`, `n_ubatch`, `n_seq_max`,
-//! `n_threads_batch`, `embeddings`, and the further setters not yet surfaced in the attribute
-//! schema). The harness builds a fresh `LlamaContext` per trial from `fixture.context_params`,
-//! so differences here never reload the model.
-
 use std::sync::Arc;
 
 use anyhow::Result;
@@ -38,9 +19,6 @@ pub struct LoadKey {
 }
 
 impl LoadKey {
-    /// Downloads (or resolves) the model and optional mmproj, loads them, and returns the live
-    /// [`PhaseState`] that the harness keeps alive for the duration of the phase.
-    ///
     /// # Errors
     ///
     /// Returns an error if any of: source resolution fails, loading the model into llama.cpp
@@ -139,12 +117,7 @@ mod tests {
         assert_ne!(baseline(), other);
     }
 
-    // The next three tests exercise the three error-propagation paths inside
-    // `load_phase_state` — model load failure, mmproj download failure, and mmproj load failure.
-    // Each constructs a LoadKey whose resolution succeeds (so the path is computed) but whose
-    // subsequent load step deliberately fails, then asserts the appropriate `Err` propagates.
     //
-    // They share BACKEND_INIT_GATE because `LlamaBackend::init` is once-per-process.
 
     use std::sync::Arc;
 
@@ -152,9 +125,6 @@ mod tests {
 
     use crate::test_backend_gate::BACKEND_INIT_GATE;
 
-    /// Path to the workspace `Cargo.toml`, which exists at test time but isn't a valid GGUF and
-    /// isn't a valid mmproj — perfect for exercising the `load_from_file` / `init_from_file`
-    /// error arms in `load_phase_state`.
     const NON_GGUF_PATH: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/Cargo.toml");
 
     #[test]
diff --git a/llama-cpp-test-harness/src/mmproj_source.rs b/llama-cpp-test-harness/src/mmproj_source.rs
index e33fa0c4..6fb3b7fb 100644
--- a/llama-cpp-test-harness/src/mmproj_source.rs
+++ b/llama-cpp-test-harness/src/mmproj_source.rs
@@ -1,9 +1,3 @@
-//! Identity of the mmproj GGUF file the harness optionally loads for a phase.
-//!
-//! Same shape and semantics as [`crate::ModelSource`], but for the multimodal projection file.
-//! Independent of the model's source — a test may mix any combination (HF model + local mmproj,
-//! local model + HF mmproj, both local, both HF).
-
 use std::path::PathBuf;
 
 use anyhow::Result;
@@ -20,8 +14,6 @@ pub enum MmprojSource {
 }
 
 impl MmprojSource {
-    /// Resolves the source to an on-disk path.
-    ///
     /// # Errors
     ///
     /// Returns an error if the HF download fails. `LocalPath` is infallible here — file
diff --git a/llama-cpp-test-harness/src/model_source.rs b/llama-cpp-test-harness/src/model_source.rs
index c29d9205..b3dd347d 100644
--- a/llama-cpp-test-harness/src/model_source.rs
+++ b/llama-cpp-test-harness/src/model_source.rs
@@ -1,14 +1,3 @@
-//! Identity of the GGUF file the harness loads for a phase.
-//!
-//! Two variants, mutually exclusive by construction:
-//! - [`ModelSource::HuggingFace`] — pull via `hf-hub` (cached); the on-disk path is wherever the
-//!   cache resolves to.
-//! - [`ModelSource::LocalPath`] — use the file at the given absolute path verbatim; no download,
-//!   no cache.
-//!
-//! Mutual exclusion is enforced at compile time by the enum's variant set. There is no string
-//! heuristic anywhere — the proc-macro dispatches on syntactic path identifiers.
-
 use std::path::PathBuf;
 
 use anyhow::Result;
@@ -25,8 +14,6 @@ pub enum ModelSource {
 }
 
 impl ModelSource {
-    /// Resolves the source to an on-disk path.
-    ///
     /// # Errors
     ///
     /// Returns an error if the HF download fails. `LocalPath` is infallible here — file
diff --git a/llama-cpp-test-harness/src/no_op.rs b/llama-cpp-test-harness/src/no_op.rs
index 7672de54..95c62c54 100644
--- a/llama-cpp-test-harness/src/no_op.rs
+++ b/llama-cpp-test-harness/src/no_op.rs
@@ -1,11 +1,5 @@
 use crate::llama_fixture::LlamaFixture;
 
-/// No-op test function with the [`crate::LlamaTestFn`] signature. Always returns `Ok(())`.
-///
-/// Useful as a placeholder for [`crate::LlamaTestRegistration`] in unit tests that exercise
-/// grouping/sorting logic without needing real trial bodies. Also covered by a self-test
-/// trial so the function shows up in coverage.
-///
 /// # Errors
 ///
 /// Never; always returns `Ok(())`. The `Result` return type matches `LlamaTestFn`.
diff --git a/llama-cpp-test-harness/src/parse_harness_arguments.rs b/llama-cpp-test-harness/src/parse_harness_arguments.rs
index b4b3ce72..176f3df5 100644
--- a/llama-cpp-test-harness/src/parse_harness_arguments.rs
+++ b/llama-cpp-test-harness/src/parse_harness_arguments.rs
@@ -12,11 +12,6 @@ fn validate(mut arguments: Arguments) -> Result<Arguments, HarnessArgumentsError
     }
 }
 
-/// Parses the test-binary CLI into [`libtest_mimic::Arguments`], enforcing the harness's
-/// single-thread requirement.
-///
-/// `--test-threads` left unset is treated as `1`; `--test-threads=1` is accepted unchanged.
-///
 /// # Errors
 ///
 /// Returns [`HarnessArgumentsError::ConflictingTestThreads`] when `--test-threads` is set to
diff --git a/llama-cpp-test-harness/src/run_to_conclusions.rs b/llama-cpp-test-harness/src/run_to_conclusions.rs
index 67c90003..5af64dab 100644
--- a/llama-cpp-test-harness/src/run_to_conclusions.rs
+++ b/llama-cpp-test-harness/src/run_to_conclusions.rs
@@ -6,12 +6,6 @@ use llama_cpp_bindings::llama_backend::LlamaBackend;
 use crate::execution_plan::ExecutionPlan;
 use crate::parse_harness_arguments::parse_harness_arguments;
 
-/// Runs every registered test against its declared model and returns one [`Conclusion`] per phase.
-///
-/// Self-tests use this entry point to inspect pass/fail counts without surrendering the
-/// binary's exit code to libtest-mimic. Initializes the backend; panics with a descriptive
-/// message if init fails (that's a programming error in test setup).
-///
 /// # Panics
 ///
 /// Panics if [`LlamaBackend::init`] fails or if the CLI arguments conflict with the harness's
@@ -43,9 +37,6 @@ mod tests {
 
     #[test]
     fn empty_inventory_yields_no_conclusions_and_skips_void_logs() {
-        // The lib's own inventory has no #[llama_test] registrations, so
-        // ExecutionPlan::from_inventory() returns an empty plan. requests_void_logs() returns
-        // false → the `backend.void_logs()` branch is skipped — this test covers that path.
         let _gate = BACKEND_INIT_GATE
             .lock()
             .unwrap_or_else(std::sync::PoisonError::into_inner);
diff --git a/llama-cpp-test-harness/src/test_backend_gate.rs b/llama-cpp-test-harness/src/test_backend_gate.rs
index 74fa6245..c762fece 100644
--- a/llama-cpp-test-harness/src/test_backend_gate.rs
+++ b/llama-cpp-test-harness/src/test_backend_gate.rs
@@ -1,8 +1,2 @@
-//! Process-wide serialization for tests that need to initialize `LlamaBackend`.
-//!
-//! `LlamaBackend::init` is a once-per-process operation; concurrent attempts collide. Tests in
-//! multiple modules each need access to a shared mutex so they take turns. This module exports
-//! that shared mutex.
-
 #[cfg(test)]
 pub static BACKEND_INIT_GATE: std::sync::Mutex<()> = std::sync::Mutex::new(());
diff --git a/llama-cpp-test-harness/tests/harness_self_test.rs b/llama-cpp-test-harness/tests/harness_self_test.rs
index eea30660..db17915d 100644
--- a/llama-cpp-test-harness/tests/harness_self_test.rs
+++ b/llama-cpp-test-harness/tests/harness_self_test.rs
@@ -12,10 +12,6 @@ use llama_cpp_test_harness::llama_test;
 use llama_cpp_test_harness::no_op;
 use llama_cpp_test_harness::run_to_conclusions;
 
-// Phase A: small Qwen text model, three trials sharing the exact same attribute tuple.
-// Two of these pass, one bails — exercising both branches of trial-body dispatch on the same
-// loaded model.
-
 #[llama_test(
     model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
     n_gpu_layers = 999,
@@ -59,8 +55,6 @@ fn phase_a_intentionally_failing_trial(_fixture: &LlamaFixture<'_>) -> Result<()
     bail!("intentional failure to exercise the trial-failure dispatch path");
 }
 
-// Phase B: distinct model (smaller embedding GGUF). Two trials share this key.
-
 #[llama_test(
     model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
     n_gpu_layers = 999,
@@ -89,13 +83,7 @@ fn phase_b_second_passing_trial(fixture: &LlamaFixture<'_>) -> Result<()> {
     Ok(())
 }
 
-// Phase C: intentionally invalid HF repo. The phase-setup path fails to download the model,
-// which routes the trial through `failing_trials` (one failed trial per registration).
 //
-// The trial function is shared with an additional Phase A registration so that the function
-// itself is exercised at least once (Phase A's setup succeeds and dispatches into the body).
-// Phase C's setup fails before reaching the body, but the registration still exercises the
-// `failing_trials` path in `ExecutionPhase::run`.
 
 #[llama_test(
     model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
@@ -136,16 +124,10 @@ fn phase_b_second_passing_trial(fixture: &LlamaFixture<'_>) -> Result<()> {
     mmproj_source = LocalPath("/nonexistent/llama-cpp-test-harness/no-such-mmproj.gguf"),
 )]
 fn shared_setup_failure_and_phase_a_trial(fixture: &LlamaFixture<'_>) -> Result<()> {
-    // Phase A reaches the body and verifies the fixture is wired up; the failure phases
-    // (Phase C model download, mmproj download, mmproj load) never reach it.
     assert!(fixture.model_path.exists());
     Ok(())
 }
 
-// Phase D: same text model as Phase A but with mmproj — exercises the multimodal-load path
-// in LoadKey::load_phase_state. Distinct LoadKey (mmproj_file differs) → distinct phase +
-// distinct model load.
-
 #[llama_test(
     model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
     n_gpu_layers = 999,