diff --git a/Cargo.lock b/Cargo.lock index f9c99776..05880f38 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1234,6 +1234,7 @@ dependencies = [ "libtest-mimic", "llama-cpp-bindings", "llama-cpp-test-harness-macros", + "thiserror", ] [[package]] diff --git a/llama-cpp-bindings-tests/Cargo.toml b/llama-cpp-bindings-tests/Cargo.toml index c17b881d..cba73b08 100644 --- a/llama-cpp-bindings-tests/Cargo.toml +++ b/llama-cpp-bindings-tests/Cargo.toml @@ -15,135 +15,19 @@ llama-cpp-test-harness = { workspace = true } serde_json = { workspace = true } [[test]] -name = "context" +name = "backend_initialization" harness = false [[test]] -name = "llama_backend" +name = "chat_template_and_message_parsing" harness = false [[test]] -name = "context_kv_cache" +name = "embedding_and_encoder" harness = false [[test]] -name = "deepseek_r1_8b_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt" -harness = false - -[[test]] -name = "deepseek_r1_8b_classifier_emits_reasoning" -harness = false - -[[test]] -name = "deepseek_r1_8b_duck_types_gemma_paired_quote" -harness = false - -[[test]] -name = "deepseek_r1_8b_duck_types_glm_key_value_tags" -harness = false - -[[test]] -name = "deepseek_r1_8b_duck_types_mistral_bracketed_json" -harness = false - -[[test]] -name = "deepseek_r1_8b_duck_types_qwen_xml" -harness = false - -[[test]] -name = "deepseek_r1_8b_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested" -harness = false - -[[test]] -name = "deepseek_r1_8b_recognizes_empty_tool_calls_when_tools_not_requested" -harness = false - -[[test]] -name = "context_session" -harness = false - -[[test]] -name = "embeddings" -harness = false - -[[test]] -name = "gemma4_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt" -harness = false - -[[test]] -name = "gemma4_classifier_emits_reasoning" -harness = false - -[[test]] -name = "gemma4_classifier_emits_reasoning_for_multimodal_thinking_prompt" -harness = false - -[[test]] -name = 
"gemma4_parses_tool_call_payload" -harness = false - -[[test]] -name = "gemma4_template_override_returns_full_markers" -harness = false - -[[test]] -name = "glm47_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt" -harness = false - -[[test]] -name = "glm47_classifier_emits_reasoning" -harness = false - -[[test]] -name = "glm47_parses_tool_call_payload" -harness = false - -[[test]] -name = "glm47_template_override_returns_full_markers" -harness = false - -[[test]] -name = "mistral3_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt" -harness = false - -[[test]] -name = "mistral3_classifier_emits_reasoning" -harness = false - -[[test]] -name = "mistral3_classifier_emits_reasoning_for_multimodal_thinking_prompt" -harness = false - -[[test]] -name = "mistral3_parses_tool_call_payload" -harness = false - -[[test]] -name = "eval_multimodal_chunks_records_exact_token_counts" -harness = false - -[[test]] -name = "ingest_prompt_chunk" -harness = false - -[[test]] -name = "llguidance" -harness = false - -[[test]] -name = "model_chat_template" -harness = false - -[[test]] -name = "model_context_creation" -harness = false - -[[test]] -name = "model_helpers" -harness = false - -[[test]] -name = "model_params" +name = "kv_cache_and_session" harness = false [[test]] @@ -151,127 +35,19 @@ name = "model_loading_errors" harness = false [[test]] -name = "model_lora_adapter_errors" -harness = false - -[[test]] -name = "model_metadata_kv" -harness = false - -[[test]] -name = "model_properties" -harness = false - -[[test]] -name = "model_sampling" -harness = false - -[[test]] -name = "model_special_tokens" -harness = false - -[[test]] -name = "model_str_to_token" -harness = false - -[[test]] -name = "model_token_to_piece" -harness = false - -[[test]] -name = "model_tokens_iterator" -harness = false - -[[test]] -name = "mtmd_bitmap" -harness = false - -[[test]] -name = "mtmd_chunk_operations" -harness = false - -[[test]] -name = "mtmd_chunk_structure" -harness 
= false - -[[test]] -name = "mtmd_context" -harness = false - -[[test]] -name = "mtmd_evaluation" -harness = false - -[[test]] -name = "mtmd_tokenization" -harness = false - -[[test]] -name = "multimodal" -harness = false - -[[test]] -name = "parse_chat_message" -harness = false - -[[test]] -name = "qwen35_chat_inference_emits_reasoning_when_template_auto_opens" -harness = false - -[[test]] -name = "qwen35_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt" -harness = false - -[[test]] -name = "qwen35_classifier_emits_reasoning" -harness = false - -[[test]] -name = "qwen35_classifier_emits_reasoning_for_multimodal_thinking_prompt" -harness = false - -[[test]] -name = "qwen35_parses_constrained_schema_payload" -harness = false - -[[test]] -name = "qwen35_parses_tool_call_payload" -harness = false - -[[test]] -name = "qwen35_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested" -harness = false - -[[test]] -name = "qwen36_chat_inference_emits_reasoning_when_template_auto_opens" -harness = false - -[[test]] -name = "qwen36_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt" -harness = false - -[[test]] -name = "qwen36_classifier_emits_reasoning" -harness = false - -[[test]] -name = "qwen36_classifier_emits_reasoning_for_multimodal_thinking_prompt" -harness = false - -[[test]] -name = "reranker" +name = "multimodal_vision" harness = false [[test]] -name = "sampled_token_classifier_markers" +name = "reasoning_markers_and_tool_calls" harness = false [[test]] -name = "sampling" +name = "sampling_and_constrained_decoding" harness = false [[test]] -name = "text_generation" +name = "vocabulary_and_metadata" harness = false [features] diff --git a/llama-cpp-bindings-tests/tests/llama_backend.rs b/llama-cpp-bindings-tests/tests/backend_initialization.rs similarity index 100% rename from llama-cpp-bindings-tests/tests/llama_backend.rs rename to llama-cpp-bindings-tests/tests/backend_initialization.rs diff --git 
a/llama-cpp-bindings-tests/tests/chat_template_and_message_parsing.rs b/llama-cpp-bindings-tests/tests/chat_template_and_message_parsing.rs new file mode 100644 index 00000000..a7e18245 --- /dev/null +++ b/llama-cpp-bindings-tests/tests/chat_template_and_message_parsing.rs @@ -0,0 +1,567 @@ +use llama_cpp_test_harness::llama_tests_main; + +mod model_chat_template { + #![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" + )] + + use anyhow::Result; + use llama_cpp_bindings::ChatTemplateError; + use llama_cpp_bindings::model::LlamaChatMessage; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn chat_template_returns_non_empty(fixture: &LlamaFixture<'_>) -> Result<()> { + let template = fixture.model.chat_template(None); + assert!(template.is_ok()); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + 
use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn apply_chat_template_produces_prompt(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let template = model.chat_template(None)?; + let message = LlamaChatMessage::new("user".to_string(), "hello".to_string())?; + let prompt = model.apply_chat_template(&template, &[message], true); + + assert!(prompt.is_ok()); + assert!(!prompt?.is_empty()); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + 
n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn apply_chat_template_buffer_resize_with_long_messages( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let model = fixture.model; + let template = model.chat_template(None)?; + let long_content = "a".repeat(2000); + let message = LlamaChatMessage::new("user".to_string(), long_content)?; + let prompt = model.apply_chat_template(&template, &[message], true); + + assert!(prompt.is_ok()); + assert!(!prompt?.is_empty()); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn chat_template_with_nonexistent_name_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let result = fixture + .model + .chat_template(Some("nonexistent_template_name_xyz")); + assert_eq!(result.unwrap_err(), ChatTemplateError::MissingTemplate); + Ok(()) + } +} + +mod parse_chat_message { + #![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" + )] + + use anyhow::Result; + use anyhow::bail; + use llama_cpp_bindings::ChatMessageParseOutcome; + use 
llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn parses_pure_content_response(fixture: &LlamaFixture<'_>) -> Result<()> { + let outcome = fixture + .model + .parse_chat_message("[]", "hello world", false)?; + + let ChatMessageParseOutcome::Recognized(parsed) = outcome else { + bail!("expected Recognized for plain content; got Unrecognized"); + }; + assert!(parsed.tool_calls.is_empty()); + assert!(!parsed.is_empty()); + assert!(parsed.content.contains("hello world")); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", 
"Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn parses_reasoning_section_into_reasoning_content(fixture: &LlamaFixture<'_>) -> Result<()> { + let input = "step one, step two\n\nactual response"; + let outcome = fixture.model.parse_chat_message("[]", input, false)?; + + let ChatMessageParseOutcome::Recognized(parsed) = outcome else { + bail!("expected Recognized for reasoning section; got Unrecognized"); + }; + assert!( + parsed.reasoning_content.contains("step") || parsed.content.contains("step"), + "neither content nor reasoning contains 'step'; content={:?} reasoning={:?}", + parsed.content, + parsed.reasoning_content + ); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn parses_empty_input_yields_empty_message(fixture: &LlamaFixture<'_>) -> Result<()> { + let 
outcome = fixture.model.parse_chat_message("[]", "", false)?; + + let ChatMessageParseOutcome::Recognized(parsed) = outcome else { + bail!("expected Recognized for empty input; got Unrecognized"); + }; + assert!(parsed.tool_calls.is_empty()); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn parses_malformed_tools_json_returns_tools_json_invalid_error( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let result = fixture + .model + .parse_chat_message("not_a_json[}", "hello", false); + + assert!(matches!( + result, + Err(llama_cpp_bindings::ParseChatMessageError::ToolsJsonInvalid( + _ + )) + )); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( 
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn parses_non_array_tools_json_returns_tools_json_not_array_error( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let result = fixture + .model + .parse_chat_message("{\"foo\": 1}", "hello", false); + + assert!(matches!( + result, + Err(llama_cpp_bindings::ParseChatMessageError::ToolsJsonNotArray) + )); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn parses_with_tools_null_byte_returns_tools_json_invalid_error( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let result = fixture + .model + .parse_chat_message("[]\0extra", "hello", false); + + assert!(matches!( + result, + Err(llama_cpp_bindings::ParseChatMessageError::ToolsJsonInvalid( + _ + )) + )); + 
Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn parses_with_input_null_byte_returns_tools_serialization_error( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let result = fixture + .model + .parse_chat_message("[]", "hello\0world", false); + + assert!(matches!( + result, + Err(llama_cpp_bindings::ParseChatMessageError::ToolsSerialization(_)) + )); + Ok(()) + } +} + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/constrained_decoding.rs b/llama-cpp-bindings-tests/tests/constrained_decoding.rs deleted file mode 100644 index 533981c9..00000000 --- a/llama-cpp-bindings-tests/tests/constrained_decoding.rs +++ /dev/null @@ -1,124 +0,0 @@ -use std::io::Write; - -use anyhow::Result; -use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::llama_batch::LlamaBatch; -use llama_cpp_bindings::model::AddBos; -use llama_cpp_bindings::sampled_token::SampledToken; -use llama_cpp_bindings::sampling::LlamaSampler; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -#[llama_test( - model_source 
= HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn json_schema_constrains_output(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - - let prompt = "The weather in Paris is sunny and 22 degrees. Extract as JSON:\n"; - - let mut ctx = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let tokens_list = model.str_to_token(prompt, AddBos::Always)?; - - let mut batch = LlamaBatch::new(512, 1)?; - let last_index = i32::try_from(tokens_list.len())? 
- 1; - - for (index, token) in (0_i32..).zip(&tokens_list) { - batch.add( - &SampledToken::Content(*token), - index, - &[0], - index == last_index, - )?; - } - - ctx.decode(&mut batch)?; - - let schema = r#"{ - "type": "object", - "properties": { - "city": { "type": "string" }, - "temperature": { "type": "number" } - }, - "required": ["city", "temperature"] -}"#; - - let llg_sampler = LlamaSampler::llguidance(model, "json", schema)?; - let mut sampler = LlamaSampler::chain_simple([llg_sampler, LlamaSampler::greedy()]); - - let mut n_cur = batch.n_tokens(); - let mut decoder = encoding_rs::UTF_8.new_decoder(); - let mut generated = String::new(); - - while n_cur <= 128 { - let token = SampledToken::Content(sampler.sample(&ctx, batch.n_tokens() - 1)?); - - if model.is_eog_token(&token) { - break; - } - - let output_string = model.token_to_piece(&token, &mut decoder, true, None)?; - generated.push_str(&output_string); - print!("{output_string}"); - std::io::stdout().flush()?; - - batch.clear(); - batch.add(&token, n_cur, &[0], true)?; - n_cur += 1; - ctx.decode(&mut batch)?; - } - - println!(); - - let parsed = serde_json::Deserializer::from_str(&generated) - .into_iter::<serde_json::Value>() - .next() - .ok_or_else(|| anyhow::anyhow!("model produced no JSON value"))??; - - assert!(parsed.get("city").is_some()); - assert!(parsed.get("temperature").is_some()); - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/context.rs b/llama-cpp-bindings-tests/tests/context.rs deleted file mode 100644 index 1e3a6b08..00000000 --- a/llama-cpp-bindings-tests/tests/context.rs +++ /dev/null @@ -1,917 +0,0 @@ -use std::ptr::NonNull; -use std::sync::Arc; -use std::sync::atomic::AtomicBool; - -use anyhow::Result; -use llama_cpp_bindings::DecodeError; -use llama_cpp_bindings::LogitsError; -use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::llama_batch::LlamaBatch; -use llama_cpp_bindings::model::AddBos; -use llama_cpp_bindings::model::LlamaLoraAdapter; -use 
llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -// ========================================================================================= -// Group A: default Qwen model, embeddings=false. Most context tests fall here. -// ========================================================================================= - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, -)] -fn context_creation_and_properties(fixture: &LlamaFixture<'_>) -> Result<()> { - let context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - assert!(context.n_ctx() > 0); - assert!(context.n_batch() > 0); - assert!(context.n_ubatch() > 0); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, -)] -fn decode_and_get_logits(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - - let decode_result = context.decode(&mut batch); - assert!(decode_result.is_ok()); - - let logits = context.get_logits()?; - assert!(!logits.is_empty()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, -)] -fn timings_work(fixture: &LlamaFixture<'_>) -> Result<()> { 
- let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - context.reset_timings(); - let timings = context.timings(); - assert!(timings.t_start_ms() >= 0.0); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, -)] -fn token_data_array_has_entries_after_decode(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - - let token_data_array = context.token_data_array()?; - - assert!(!token_data_array.data.is_empty()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, -)] -fn get_logits_ith_returns_valid_slice(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; - let last_index = i32::try_from(tokens.len() - 1)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - - let logits = context.get_logits_ith(last_index)?; - - assert_eq!(logits.len(), usize::try_from(fixture.model.n_vocab())?); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - 
use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, -)] -fn token_data_array_ith_returns_valid_data(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; - let last_index = i32::try_from(tokens.len() - 1)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - - let token_data_array = context.token_data_array_ith(last_index)?; - - assert_eq!( - token_data_array.data.len(), - usize::try_from(fixture.model.n_vocab())? - ); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, -)] -fn embeddings_ith_returns_error_when_embeddings_disabled(fixture: &LlamaFixture<'_>) -> Result<()> { - let context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let result = context.embeddings_ith(0); - - assert!(result.is_err()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, -)] -fn embeddings_seq_ith_returns_error_when_embeddings_disabled( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let result = context.embeddings_seq_ith(0); - - assert!(result.is_err()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - 
use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, -)] -fn candidates_returns_n_vocab_entries(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - - let count = context.candidates()?.count(); - - assert_eq!(count, usize::try_from(fixture.model.n_vocab())?); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, -)] -fn debug_format_contains_struct_name(fixture: &LlamaFixture<'_>) -> Result<()> { - let context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let debug_output = format!("{context:?}"); - - assert!(debug_output.contains("LlamaContext")); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, -)] -fn candidates_ith_returns_n_vocab_entries(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; - let last_index = i32::try_from(tokens.len() - 1)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - - let count = context.candidates_ith(last_index)?.count(); - - assert_eq!(count, usize::try_from(fixture.model.n_vocab())?); - - Ok(()) 
-} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, -)] -fn lora_adapter_remove_succeeds_with_no_adapters(fixture: &LlamaFixture<'_>) -> Result<()> { - let context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let mut adapter = LlamaLoraAdapter { - lora_adapter: NonNull::dangling(), - }; - - let result = context.lora_adapter_remove(&mut adapter); - - assert!(result.is_ok()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, -)] -fn encode_on_non_encoder_model_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - - let result = context.encode(&mut batch); - - assert!(result.is_err()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, -)] -fn lora_adapter_set_with_dangling_pointer_succeeds_or_errors( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let mut adapter = LlamaLoraAdapter { - lora_adapter: NonNull::dangling(), - }; - - let result = context.lora_adapter_set(&mut adapter, 1.0); - - assert!(result.is_ok()); - - Ok(()) 
-} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - embeddings = true, -)] -fn embeddings_seq_ith_returns_null_embedding_error_for_invalid_seq( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - - let result = context.embeddings_seq_ith(999); - - assert!(result.is_err()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, -)] -fn decode_empty_batch_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let mut batch = LlamaBatch::new(512, 1)?; - - let result = context.decode(&mut batch); - - assert!(result.is_err()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, -)] -fn set_abort_flag_aborts_decode(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let abort_flag = Arc::new(AtomicBool::new(true)); - context.set_abort_flag(abort_flag); - - let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; - let mut batch = 
LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - - let result = context.decode(&mut batch); - - assert_eq!(result, Err(DecodeError::Aborted)); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, -)] -fn set_abort_flag_false_allows_decode(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let abort_flag = Arc::new(AtomicBool::new(false)); - context.set_abort_flag(abort_flag); - - let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - - let result = context.decode(&mut batch); - - assert!(result.is_ok()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, -)] -fn clear_abort_callback_allows_decode_with_flag_true(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let abort_flag = Arc::new(AtomicBool::new(true)); - context.set_abort_flag(abort_flag); - context.clear_abort_callback(); - - let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - - let result = context.decode(&mut batch); - - assert!(result.is_ok()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - 
n_ubatch = 512, -)] -fn synchronize_completes_without_panic(fixture: &LlamaFixture<'_>) -> Result<()> { - let context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - context.synchronize(); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, -)] -fn detach_threadpool_completes_without_panic(fixture: &LlamaFixture<'_>) -> Result<()> { - let context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - context.detach_threadpool(); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, -)] -fn get_logits_ith_returns_token_not_initialized_for_unknown_index( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let result = context.get_logits_ith(7); - - assert!(matches!(result, Err(LogitsError::TokenNotInitialized(7)))); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 64, - n_batch = 2048, - n_ubatch = 512, -)] -fn get_logits_ith_returns_token_index_exceeds_context_for_huge_index( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let huge_index = i32::try_from(context.n_ctx())?; - context.mark_logits_initialized(huge_index); - let result = 
context.get_logits_ith(huge_index); - - assert!(matches!( - result, - Err(LogitsError::TokenIndexExceedsContext { .. }) - )); - - Ok(()) -} - -// ========================================================================================= -// Group B: Qwen embedding model, embeddings=true. Six embedding-specific tests. -// ========================================================================================= - -#[llama_test( - model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - embeddings = true, -)] -fn decode_with_embeddings_enabled(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - - let result = context.decode(&mut batch); - - assert!(result.is_ok()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - embeddings = true, -)] -fn embeddings_seq_ith_returns_valid_embeddings(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - - let embeddings = context.embeddings_seq_ith(0)?; - - assert_eq!(embeddings.len(), usize::try_from(fixture.model.n_embd())?); - - Ok(()) -} - -#[llama_test( - model_source = 
HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - n_seq_max = 4, - embeddings = true, -)] -fn multi_sequence_embeddings_returns_one_embedding_per_sequence( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let inputs = [ - "alpha is here", - "beta runs fast", - "gamma waits", - "delta jumps", - ]; - let mut batch = LlamaBatch::new(64, 4)?; - - for (sequence_index, text) in inputs.iter().enumerate() { - let tokens = fixture.model.str_to_token(text, AddBos::Always)?; - let sequence_id = i32::try_from(sequence_index)?; - - batch.add_sequence(&tokens, sequence_id, true)?; - } - - context.decode(&mut batch)?; - - let n_embd = usize::try_from(fixture.model.n_embd())?; - let mut collected: Vec> = Vec::with_capacity(inputs.len()); - - for sequence_index in 0..inputs.len() { - let sequence_id = i32::try_from(sequence_index)?; - let embedding = context.embeddings_seq_ith(sequence_id)?; - - assert_eq!( - embedding.len(), - n_embd, - "sequence {sequence_index} embedding length mismatch" - ); - - collected.push(embedding.to_vec()); - } - - for (left_index, left) in collected.iter().enumerate() { - for (right_index, right) in collected.iter().enumerate().skip(left_index + 1) { - assert_ne!( - left, right, - "embedding for sequence {left_index} must differ from sequence {right_index}", - ); - } - } - - Ok(()) -} - -/// Reproduces paddler's embedding batching loop exactly with the document strings, batch -/// shape, and iteration pattern from the failing harness test -/// `agent_embedding_batch_distribution_independent_of_context_size`. 
-#[llama_test( - model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - n_seq_max = 4, - embeddings = true, -)] -fn embeddings_returns_distinct_values_when_reused_batch_has_extra_capacity( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let iterations = [ - [ - "This is the first document with enough content to contribute meaningfully to the batch size calculation", - "This is the second document that should be processed in a potentially different batch from the first", - ], - [ - "This is the third document adding more content to ensure the total exceeds the configured chunk limit", - "This is the fourth document which should demonstrate that batching distributes across agent requests", - ], - ]; - - let n_embd = usize::try_from(fixture.model.n_embd())?; - let mut batch = LlamaBatch::new(64, 4)?; - let mut collected: Vec> = Vec::new(); - - for iteration_inputs in iterations { - for (sequence_index, text) in iteration_inputs.iter().enumerate() { - let tokens = fixture.model.str_to_token(text, AddBos::Always)?; - let sequence_id = i32::try_from(sequence_index)?; - - batch.add_sequence(&tokens, sequence_id, true)?; - } - - context.clear_kv_cache(); - context.decode(&mut batch)?; - - for sequence_index in 0..iteration_inputs.len() { - let sequence_id = i32::try_from(sequence_index)?; - let embedding = context.embeddings_seq_ith(sequence_id)?; - - assert_eq!( - embedding.len(), - n_embd, - "iteration sequence {sequence_index} embedding length mismatch" - ); - - collected.push(embedding.to_vec()); - } - - batch.clear(); - } - - assert_eq!( - collected.len(), - iterations.iter().flatten().count(), - "expected one embedding per input across every iteration" - ); - - for 
(left_index, left) in collected.iter().enumerate() { - for (right_index, right) in collected.iter().enumerate().skip(left_index + 1) { - assert_ne!( - left, right, - "embedding {left_index} must differ from embedding {right_index} across reused-batch iterations", - ); - } - } - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - embeddings = true, -)] -fn embeddings_ith_returns_valid_embeddings(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; - let last_index = i32::try_from(tokens.len() - 1)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - - let embeddings = context.embeddings_ith(last_index)?; - - assert_eq!(embeddings.len(), usize::try_from(fixture.model.n_embd())?); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - embeddings = true, -)] -fn embeddings_ith_returns_null_embedding_error_for_non_embedding_token( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let result = context.embeddings_ith(999); - - assert!(result.is_err()); - - Ok(()) -} - -// ========================================================================================= -// Group C: t5-small encoder model, embeddings=true. Single trial. 
-// ========================================================================================= - -#[llama_test( - model_source = HuggingFace("Xiaojian9992024/t5-small-GGUF", "t5-small.bf16.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - embeddings = true, -)] -fn encode_succeeds_with_encoder_model(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let tokens = fixture.model.str_to_token("hello", AddBos::Never)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - - let result = context.encode(&mut batch); - - assert!(result.is_ok()); - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/context_kv_cache.rs b/llama-cpp-bindings-tests/tests/context_kv_cache.rs deleted file mode 100644 index 467a2aa4..00000000 --- a/llama-cpp-bindings-tests/tests/context_kv_cache.rs +++ /dev/null @@ -1,961 +0,0 @@ -use std::num::NonZeroU8; - -use anyhow::Result; -use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::context::kv_cache::KvCacheConversionError; -use llama_cpp_bindings::error::KvCacheSeqAddError; -use llama_cpp_bindings::error::KvCacheSeqDivError; -use llama_cpp_bindings::llama_batch::LlamaBatch; -use llama_cpp_bindings::model::AddBos; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -fn build_context<'context>(fixture: &'context LlamaFixture<'_>) -> Result> { - Ok(LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?) 
-} - -fn decode_hello_world(fixture: &LlamaFixture<'_>, context: &mut LlamaContext<'_>) -> Result<()> { - let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn clear_kv_cache_resets_positions(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - decode_hello_world(fixture, &mut context)?; - - context.clear_kv_cache(); - assert_eq!(context.kv_cache_seq_pos_max(0), -1); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = 
HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn kv_cache_seq_pos_max_is_non_negative_after_decode(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - decode_hello_world(fixture, &mut context)?; - - assert!(context.kv_cache_seq_pos_max(0) >= 0); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn clear_kv_cache_seq_with_range(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - decode_hello_world(fixture, &mut context)?; - - let result = context.clear_kv_cache_seq(Some(0), Some(0), Some(1)); - assert!(result.is_ok()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", 
"DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn copy_kv_cache_seq_succeeds(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - decode_hello_world(fixture, &mut context)?; - - let result = context.copy_kv_cache_seq(0, 1, None, None); - assert!(result.is_ok()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - 
n_ubatch = 128, -)] -fn copy_cache_executes_without_crash(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - decode_hello_world(fixture, &mut context)?; - - let pos_max = context.kv_cache_seq_pos_max(0); - context.copy_cache(0, 1, pos_max + 1); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn kv_cache_seq_add_returns_error_for_mrope_model(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - decode_hello_world(fixture, &mut context)?; - - let result = context.kv_cache_seq_add(0, Some(0), None, 1); - - assert!(matches!( - result.unwrap_err(), - KvCacheSeqAddError::IncompatibleRopeType, - )); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn kv_cache_seq_div_returns_error_for_mrope_model(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - decode_hello_world(fixture, &mut context)?; - - let divisor = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?; - let result = context.kv_cache_seq_div(0, Some(0), None, divisor); - - assert!(matches!( - result.unwrap_err(), - KvCacheSeqDivError::IncompatibleRopeType, - )); - - 
Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn kv_cache_seq_keep_retains_specified_sequence(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - decode_hello_world(fixture, &mut context)?; - - context.kv_cache_seq_keep(0); - - assert!(context.kv_cache_seq_pos_max(0) >= 0); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", 
"Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn copy_kv_cache_seq_with_explicit_range(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - decode_hello_world(fixture, &mut context)?; - - let result = context.copy_kv_cache_seq(0, 2, Some(0), Some(1)); - - assert!(result.is_ok()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn kv_cache_seq_add_succeeds_on_embedding_model(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - decode_hello_world(fixture, &mut context)?; - - let result = context.kv_cache_seq_add(0, Some(0), None, 1); - - assert!(result.is_ok()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn kv_cache_seq_div_succeeds_on_embedding_model(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - decode_hello_world(fixture, &mut context)?; - - let divisor = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?; - let result = context.kv_cache_seq_div(0, Some(0), None, divisor); - - assert!(result.is_ok()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - 
n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn kv_cache_seq_pos_max_returns_negative_one_for_unused_seq( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let context = build_context(fixture)?; - - let result = context.kv_cache_seq_pos_max(999); - - assert_eq!(result, -1); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn copy_kv_cache_seq_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - let result = context.copy_kv_cache_seq(0, 1, Some(u32::MAX), None); - - assert!(matches!( - result.unwrap_err(), - KvCacheConversionError::P0TooLarge(_), - )); - - Ok(()) -} - -#[llama_test( - 
model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn copy_kv_cache_seq_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - let result = context.copy_kv_cache_seq(0, 1, Some(0), Some(u32::MAX)); - - assert!(matches!( - result.unwrap_err(), - KvCacheConversionError::P1TooLarge(_), - )); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", 
"Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn clear_kv_cache_seq_rejects_src_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - let result = context.clear_kv_cache_seq(Some(u32::MAX), None, None); - - assert!(matches!( - result.unwrap_err(), - KvCacheConversionError::SeqIdTooLarge(_), - )); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn clear_kv_cache_seq_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - let result = context.clear_kv_cache_seq(Some(0), Some(u32::MAX), None); - - assert!(matches!( - result.unwrap_err(), - KvCacheConversionError::P0TooLarge(_), - )); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = 
HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn clear_kv_cache_seq_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - let result = context.clear_kv_cache_seq(Some(0), Some(0), Some(u32::MAX)); - - assert!(matches!( - result.unwrap_err(), - KvCacheConversionError::P1TooLarge(_), - )); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn kv_cache_seq_add_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = 
build_context(fixture)?; - - let result = context.kv_cache_seq_add(0, Some(u32::MAX), None, 1); - - assert!(matches!( - result.unwrap_err(), - KvCacheSeqAddError::P0TooLarge(_), - )); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn kv_cache_seq_add_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - let result = context.kv_cache_seq_add(0, Some(0), Some(u32::MAX), 1); - - assert!(matches!( - result.unwrap_err(), - KvCacheSeqAddError::P1TooLarge(_), - )); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", 
"Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn kv_cache_seq_div_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - let divisor = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?; - let result = context.kv_cache_seq_div(0, Some(u32::MAX), None, divisor); - - assert!(matches!( - result.unwrap_err(), - KvCacheSeqDivError::P0TooLarge(_), - )); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn kv_cache_seq_div_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - let divisor = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?; - let result = context.kv_cache_seq_div(0, Some(0), Some(u32::MAX), divisor); - - 
assert!(matches!( - result.unwrap_err(), - KvCacheSeqDivError::P1TooLarge(_), - )); - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/context_session.rs b/llama-cpp-bindings-tests/tests/context_session.rs deleted file mode 100644 index d32f7ecf..00000000 --- a/llama-cpp-bindings-tests/tests/context_session.rs +++ /dev/null @@ -1,1162 +0,0 @@ -use anyhow::Result; -use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::llama_batch::LlamaBatch; -use llama_cpp_bindings::model::AddBos; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -fn build_context<'context>(fixture: &'context LlamaFixture<'_>) -> Result> { - Ok(LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn save_and_load_session_file(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?; - let 
mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - - let session_path = std::env::temp_dir().join("llama_test_session.bin"); - context.state_save_file(&session_path, &tokens)?; - - let loaded_tokens = context.state_load_file(&session_path, 512)?; - assert_eq!(loaded_tokens, tokens); - - std::fs::remove_file(&session_path)?; - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn get_state_size_is_positive(fixture: &LlamaFixture<'_>) -> Result<()> { - let context = build_context(fixture)?; - - assert!(context.get_state_size() > 0); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = 
HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn state_seq_save_and_load_file_roundtrip(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - - let session_path = std::env::temp_dir().join("llama_test_seq_state.bin"); - let bytes_written = context.state_seq_save_file(&session_path, 0, &tokens)?; - assert!(bytes_written > 0); - - let (loaded_tokens, bytes_read) = context.state_seq_load_file(&session_path, 0, 512)?; - assert_eq!(loaded_tokens, tokens); - assert!(bytes_read > 0); - - std::fs::remove_file(&session_path)?; - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, 
- n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn copy_state_data_and_set_state_data_roundtrip(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - - let state_size = context.get_state_size(); - let mut state_data = vec![0u8; state_size]; - let bytes_copied = unsafe { context.copy_state_data(&mut state_data) }; - assert!(bytes_copied > 0); - - let bytes_read = unsafe { context.set_state_data(&state_data) }; - assert!(bytes_read > 0); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn state_load_file_with_nonexistent_file_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - let result = context.state_load_file("/nonexistent/session.bin", 512); - - assert!(result.is_err()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", 
"DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn state_seq_load_file_with_nonexistent_file_returns_error( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let mut context = build_context(fixture)?; - - let result = context.state_seq_load_file("/nonexistent/seq_state.bin", 0, 512); - - assert!(result.is_err()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 
512, - n_ubatch = 128, -)] -fn state_save_file_to_invalid_directory_returns_failed_to_save( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let context = build_context(fixture)?; - - let result = context.state_save_file("/nonexistent_dir/session.bin", &[]); - - assert!(result.is_err()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn state_seq_save_file_to_invalid_directory_returns_failed_to_save( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let context = build_context(fixture)?; - - let result = context.state_seq_save_file("/nonexistent_dir/seq_state.bin", 0, &[]); - - assert!(result.is_err()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( 
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn state_load_file_with_zero_max_tokens_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - - let session_path = std::env::temp_dir().join("llama_test_session_zero_max.bin"); - context.state_save_file(&session_path, &tokens)?; - - let result = context.state_load_file(&session_path, 0); - - assert!(result.is_err()); - let _ = std::fs::remove_file(&session_path); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn 
state_seq_load_file_with_zero_max_tokens_returns_error( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let mut context = build_context(fixture)?; - - let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - - let session_path = std::env::temp_dir().join("llama_test_seq_state_zero_max.bin"); - context.state_seq_save_file(&session_path, 0, &tokens)?; - - let result = context.state_seq_load_file(&session_path, 0, 0); - - assert!(result.is_err()); - let _ = std::fs::remove_file(&session_path); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn state_load_file_with_insufficient_max_tokens_returns_length_error( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let mut context = build_context(fixture)?; - - let tokens = fixture.model.str_to_token( - "Hello world this is a longer string for more tokens", - AddBos::Always, - )?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - - let session_path = 
std::env::temp_dir().join("llama_test_session_insuf.bin"); - context.state_save_file(&session_path, &tokens)?; - - let result = context.state_load_file(&session_path, 1); - - assert!(result.is_err()); - let _ = std::fs::remove_file(&session_path); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn state_seq_load_file_with_insufficient_max_tokens_returns_length_error( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let mut context = build_context(fixture)?; - - let tokens = fixture.model.str_to_token( - "Hello world this is a longer string for more tokens", - AddBos::Always, - )?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - - let session_path = std::env::temp_dir().join("llama_test_seq_state_insuf.bin"); - context.state_seq_save_file(&session_path, 0, &tokens)?; - - let result = context.state_seq_load_file(&session_path, 0, 1); - - assert!(result.is_err()); - let _ = std::fs::remove_file(&session_path); - - Ok(()) -} - -#[cfg(unix)] -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", 
"DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn state_save_file_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { - use std::ffi::OsStr; - use std::os::unix::ffi::OsStrExt; - - let context = build_context(fixture)?; - - let non_utf8_path = std::path::Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.bin")); - let result = context.state_save_file(non_utf8_path, &[]); - - assert!(result.is_err()); - - Ok(()) -} - -#[cfg(unix)] -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", 
"Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn state_load_file_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { - use std::ffi::OsStr; - use std::os::unix::ffi::OsStrExt; - - let mut context = build_context(fixture)?; - - let non_utf8_path = std::path::Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.bin")); - let result = context.state_load_file(non_utf8_path, 512); - - assert!(result.is_err()); - - Ok(()) -} - -#[cfg(unix)] -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn state_seq_save_file_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { - use std::ffi::OsStr; - use std::os::unix::ffi::OsStrExt; - - let context = build_context(fixture)?; - - let non_utf8_path = std::path::Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.bin")); - let result = context.state_seq_save_file(non_utf8_path, 0, &[]); - - assert!(result.is_err()); - - Ok(()) -} - -#[cfg(unix)] -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", 
"DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn state_seq_load_file_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { - use std::ffi::OsStr; - use std::os::unix::ffi::OsStrExt; - - let mut context = build_context(fixture)?; - - let non_utf8_path = std::path::Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.bin")); - let result = context.state_seq_load_file(non_utf8_path, 0, 512); - - assert!(result.is_err()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", 
"Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn state_save_file_with_null_byte_in_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { - let context = build_context(fixture)?; - - let path_with_null = std::path::Path::new("/tmp/foo\0bar.bin"); - let result = context.state_save_file(path_with_null, &[]); - - assert!(result.is_err()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn state_load_file_with_null_byte_in_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = build_context(fixture)?; - - let path_with_null = std::path::Path::new("/tmp/foo\0bar.bin"); - let result = context.state_load_file(path_with_null, 512); - - assert!(result.is_err()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = 
HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn state_seq_save_file_with_null_byte_in_path_returns_error( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let context = build_context(fixture)?; - - let path_with_null = std::path::Path::new("/tmp/foo\0bar.bin"); - let result = context.state_seq_save_file(path_with_null, 0, &[]); - - assert!(result.is_err()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn state_seq_load_file_with_null_byte_in_path_returns_error( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let 
mut context = build_context(fixture)?; - - let path_with_null = std::path::Path::new("/tmp/foo\0bar.bin"); - let result = context.state_seq_load_file(path_with_null, 0, 512); - - assert!(result.is_err()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn state_seq_get_size_ext_returns_size_for_decoded_sequence( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - use llama_cpp_bindings::context::llama_state_seq_flags::LlamaStateSeqFlags; - - let mut context = build_context(fixture)?; - - let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - - let flags = LlamaStateSeqFlags::empty(); - let size = context.state_seq_get_size_ext(0, &flags); - - assert!(size > 0); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = 
HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn state_seq_get_data_ext_and_set_data_ext_round_trip(fixture: &LlamaFixture<'_>) -> Result<()> { - use llama_cpp_bindings::context::llama_state_seq_flags::LlamaStateSeqFlags; - - let mut context = build_context(fixture)?; - - let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - - let flags = LlamaStateSeqFlags::empty(); - let size = context.state_seq_get_size_ext(0, &flags); - let mut buffer = vec![0u8; size]; - let bytes_written = unsafe { context.state_seq_get_data_ext(&mut buffer, 0, &flags) }; - - assert!(bytes_written > 0); - - let bytes_read = unsafe { context.state_seq_set_data_ext(&buffer, 0, &flags) }; - - assert!(bytes_read > 0); - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs b/llama-cpp-bindings-tests/tests/deepseek_r1_8b_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs deleted file mode 100644 index 712397df..00000000 --- a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs +++ /dev/null @@ -1,126 +0,0 @@ -use anyhow::Result; -use llama_cpp_bindings::context::LlamaContext; -use 
llama_cpp_bindings::llama_batch::LlamaBatch; -use llama_cpp_bindings::model::AddBos; -use llama_cpp_bindings::sampling::LlamaSampler; -use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -const MAX_GENERATED_TOKENS: i32 = 200; - -const DEEPSEEK_R1_8B_THINKING_DISABLED_PROMPT: &str = "\ -<|User|>What is 2 + 2?<|Assistant|> - - - -"; - -const FORBIDDEN_MARKERS: &[&str] = &["", ""]; - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 8192, - n_batch = 2048, - n_ubatch = 512, -)] -fn deepseek_r1_8b_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - - let mut classifier = model.sampled_token_classifier(); - let prompt_tokens = - model.str_to_token(DEEPSEEK_R1_8B_THINKING_DISABLED_PROMPT, AddBos::Never)?; - let prompt_token_count = u64::try_from(prompt_tokens.len())?; - - let mut batch = LlamaBatch::new(2048, 1)?; - classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; - - let mut context = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - context.decode(&mut batch)?; - - let promoted = classifier.commit_prompt_tokens(); - assert_eq!(promoted, prompt_token_count); - - let mut sampler = LlamaSampler::chain_simple([ - LlamaSampler::penalties(64, 1.1, 0.0, 0.0), - LlamaSampler::top_k(40), - LlamaSampler::top_p(0.9, 1), - LlamaSampler::min_p(0.05, 1), - LlamaSampler::temp(0.7), - LlamaSampler::dist(0x00C0_FFEE), - ]); - let initial_position = batch.n_tokens(); - let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut 
sampler, - context: &mut context, - batch: &mut batch, - initial_position, - max_generated_tokens: MAX_GENERATED_TOKENS, - } - .run()?; - - let usage = classifier.usage(); - - assert!( - !outcome.generated_raw.is_empty(), - "DeepSeek-R1-8B: must generate at least one token" - ); - assert_eq!( - outcome.observed_reasoning, 0, - "DeepSeek-R1-8B thinking-disabled: classifier must not emit any Reasoning token \ - when the prompt closes the think block before generation begins; \ - generated={:?}", - outcome.generated_raw - ); - assert_eq!( - outcome.observed_undeterminable, 0, - "DeepSeek-R1-8B thinking-disabled: prompt-token replay must move section to Content \ - before generation, so no Undeterminable tokens may be emitted; \ - generated={:?}", - outcome.generated_raw - ); - assert_eq!( - usage.reasoning_tokens, 0, - "DeepSeek-R1-8B thinking-disabled: usage.reasoning_tokens must be zero; usage={usage:?}" - ); - assert_eq!( - usage.undeterminable_tokens, 0, - "DeepSeek-R1-8B thinking-disabled: usage.undeterminable_tokens must be zero; usage={usage:?}" - ); - assert!( - outcome.observed_content > 0, - "DeepSeek-R1-8B thinking-disabled: classifier must emit at least one Content token" - ); - assert_eq!( - usage.completion_tokens(), - outcome.observed_content, - "DeepSeek-R1-8B thinking-disabled: completion tokens must equal observed Content tokens" - ); - - for forbidden in FORBIDDEN_MARKERS { - assert!( - !outcome.content_stream.contains(forbidden), - "DeepSeek-R1-8B thinking-disabled: content_stream leaked marker {forbidden:?}; \ - content_stream={:?}", - outcome.content_stream - ); - } - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_classifier_emits_reasoning.rs b/llama-cpp-bindings-tests/tests/deepseek_r1_8b_classifier_emits_reasoning.rs deleted file mode 100644 index 6bed6bbe..00000000 --- a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_classifier_emits_reasoning.rs +++ /dev/null @@ -1,151 +0,0 @@ -use 
anyhow::Result; -use anyhow::bail; -use llama_cpp_bindings::ChatMessageParseOutcome; -use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::llama_batch::LlamaBatch; -use llama_cpp_bindings::model::AddBos; -use llama_cpp_bindings::sampling::LlamaSampler; -use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -const MAX_GENERATED_TOKENS: i32 = 1500; - -// DeepSeek-R1-Distill-Llama-8B uses `...` reasoning markers -// and full-width-bar role tokens `<|User|>` / `<|Assistant|>` (U+FF5C, -// not ASCII `|`). The chat template's `add_generation_prompt` ALWAYS appends -// `<|Assistant|>\n` — DeepSeek-R1 is a pure reasoner with no -// thinking-disabled mode — so the model resumes generation already inside -// the reasoning block. -const DEEPSEEK_R1_8B_THINKING_PROMPT: &str = "\ -<|User|>What is 2 + 2?<|Assistant|> -"; - -const FORBIDDEN_MARKERS: &[&str] = &["", ""]; - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 8192, - n_batch = 2048, - n_ubatch = 512, -)] -fn deepseek_r1_8b_classifier_emits_reasoning_for_thinking_enabled_prompt( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - - let mut classifier = model.sampled_token_classifier(); - let prompt_tokens = model.str_to_token(DEEPSEEK_R1_8B_THINKING_PROMPT, AddBos::Never)?; - let prompt_token_count = u64::try_from(prompt_tokens.len())?; - - let mut batch = LlamaBatch::new(2048, 1)?; - classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; - - let mut context = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - context.decode(&mut batch)?; - - let 
promoted = classifier.commit_prompt_tokens(); - assert_eq!(promoted, prompt_token_count); - - let mut sampler = LlamaSampler::chain_simple([ - LlamaSampler::penalties(64, 1.1, 0.0, 0.0), - LlamaSampler::top_k(40), - LlamaSampler::top_p(0.9, 1), - LlamaSampler::min_p(0.05, 1), - LlamaSampler::temp(0.7), - LlamaSampler::dist(0x00C0_FFEE), - ]); - let initial_position = batch.n_tokens(); - let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut context, - batch: &mut batch, - initial_position, - max_generated_tokens: MAX_GENERATED_TOKENS, - } - .run()?; - - let usage = classifier.usage(); - let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?; - let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else { - bail!("DeepSeek-R1-8B chat template must be recognised by the parser; got Unrecognized"); - }; - - assert!( - !outcome.generated_raw.is_empty(), - "DeepSeek-R1-8B: must generate at least one token" - ); - assert!( - outcome.observed_reasoning > 0, - "DeepSeek-R1-8B: classifier must emit at least one Reasoning token when the prompt \ - opens a block; outcome={outcome:?}", - ); - assert!( - usage.reasoning_tokens > 0, - "DeepSeek-R1-8B: usage.reasoning_tokens must be non-zero when the prompt opens a \ - block; usage was {usage:?}" - ); - assert_eq!( - outcome.observed_undeterminable, 0, - "DeepSeek-R1-8B: prompt-token replay must move section to Reasoning before generation, \ - so no Undeterminable tokens may be emitted; outcome={outcome:?}" - ); - assert_eq!( - usage.undeterminable_tokens, 0, - "DeepSeek-R1-8B: usage.undeterminable_tokens must be zero; usage={usage:?}" - ); - assert_eq!( - usage.completion_tokens(), - outcome.observed_content + outcome.observed_reasoning, - "DeepSeek-R1-8B: completion tokens must equal observed Content + Reasoning" - ); - - if parsed.reasoning_content.is_empty() { - eprintln!( - "DeepSeek-R1-8B didn't close its reasoning block within 
{MAX_GENERATED_TOKENS} \ - tokens — skipping strict parser-equality assertions" - ); - } else { - assert_eq!( - outcome.reasoning_stream, parsed.reasoning_content, - "DeepSeek-R1-8B: per-token reasoning stream must equal parser-side reasoning_content \ - (any difference means a marker leaked into the user-visible stream)", - ); - assert_eq!( - outcome.content_stream, parsed.content, - "DeepSeek-R1-8B: per-token content stream must equal parser-side content \ - (any difference means a marker leaked into the user-visible stream)", - ); - } - - for forbidden in FORBIDDEN_MARKERS { - assert!( - !outcome.reasoning_stream.contains(forbidden), - "DeepSeek-R1-8B: reasoning_stream leaked marker {forbidden:?}; \ - reasoning_stream={:?}", - outcome.reasoning_stream - ); - assert!( - !outcome.content_stream.contains(forbidden), - "DeepSeek-R1-8B: content_stream leaked marker {forbidden:?}; \ - content_stream={:?}", - outcome.content_stream - ); - } - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_gemma_paired_quote.rs b/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_gemma_paired_quote.rs deleted file mode 100644 index ce2b922d..00000000 --- a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_gemma_paired_quote.rs +++ /dev/null @@ -1,70 +0,0 @@ -use anyhow::Result; -use anyhow::bail; -use llama_cpp_bindings::ChatMessageParseOutcome; -use llama_cpp_bindings::ToolCallArguments; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -const TOOLS_JSON: &str = r#"[ - { - "type": "function", - "function": { - "name": "get_weather", - "description": "Get the current weather for a location", - "parameters": { - "type": "object", - "properties": { - "location": {"type": "string", "description": "The city name"} - }, - "required": ["location"] - } - } - } -]"#; - -const GEMMA_PAIRED_QUOTE_PAYLOAD: &str = 
"<|tool_call>call:get_weather{location:<|\"|>Paris<|\"|>}"; - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn deepseek_r1_8b_duck_types_gemma_paired_quote(fixture: &LlamaFixture<'_>) -> Result<()> { - let outcome = - fixture - .model - .parse_chat_message(TOOLS_JSON, GEMMA_PAIRED_QUOTE_PAYLOAD, false)?; - - let ChatMessageParseOutcome::Recognized(parsed) = outcome else { - bail!( - "duck-type pass must recognise Gemma paired-quote on a model with no registered \ - template; got Unrecognized" - ); - }; - assert_eq!( - parsed.tool_calls.len(), - 1, - "expected one tool call; got {:?}", - parsed.tool_calls - ); - assert_eq!(parsed.tool_calls[0].name, "get_weather"); - let location = match &parsed.tool_calls[0].arguments { - ToolCallArguments::ValidJson(value) => value - .get("location") - .and_then(|v| v.as_str()) - .map(str::to_owned), - ToolCallArguments::InvalidJson(raw) => { - bail!("expected ValidJson, got InvalidJson: {raw}"); - } - }; - assert_eq!(location.as_deref(), Some("Paris")); - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_glm_key_value_tags.rs b/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_glm_key_value_tags.rs deleted file mode 100644 index 7b9e052b..00000000 --- a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_glm_key_value_tags.rs +++ /dev/null @@ -1,72 +0,0 @@ -use anyhow::Result; -use anyhow::bail; -use llama_cpp_bindings::ChatMessageParseOutcome; -use llama_cpp_bindings::ToolCallArguments; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -const TOOLS_JSON: &str = r#"[ - { - "type": "function", - "function": { - "name": "get_weather", - "description": "Get the current weather for a 
location", - "parameters": { - "type": "object", - "properties": { - "location": {"type": "string", "description": "The city name"} - }, - "required": ["location"] - } - } - } -]"#; - -const GLM_KEY_VALUE_PAYLOAD: &str = "get_weather\ -location\ -Paris\ -"; - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn deepseek_r1_8b_duck_types_glm_key_value_tags(fixture: &LlamaFixture<'_>) -> Result<()> { - let outcome = fixture - .model - .parse_chat_message(TOOLS_JSON, GLM_KEY_VALUE_PAYLOAD, false)?; - - let ChatMessageParseOutcome::Recognized(parsed) = outcome else { - bail!( - "duck-type pass must recognise GLM key-value tags on a model with no registered \ - template; got Unrecognized" - ); - }; - assert_eq!( - parsed.tool_calls.len(), - 1, - "expected one tool call; got {:?}", - parsed.tool_calls - ); - assert_eq!(parsed.tool_calls[0].name, "get_weather"); - let location = match &parsed.tool_calls[0].arguments { - ToolCallArguments::ValidJson(value) => value - .get("location") - .and_then(|v| v.as_str()) - .map(str::to_owned), - ToolCallArguments::InvalidJson(raw) => { - bail!("expected ValidJson, got InvalidJson: {raw}"); - } - }; - assert_eq!(location.as_deref(), Some("Paris")); - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_mistral_bracketed_json.rs b/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_mistral_bracketed_json.rs deleted file mode 100644 index 66b4caab..00000000 --- a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_mistral_bracketed_json.rs +++ /dev/null @@ -1,70 +0,0 @@ -use anyhow::Result; -use anyhow::bail; -use llama_cpp_bindings::ChatMessageParseOutcome; -use llama_cpp_bindings::ToolCallArguments; -use llama_cpp_test_harness::LlamaFixture; -use 
llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -const TOOLS_JSON: &str = r#"[ - { - "type": "function", - "function": { - "name": "get_weather", - "description": "Get the current weather for a location", - "parameters": { - "type": "object", - "properties": { - "location": {"type": "string", "description": "The city name"} - }, - "required": ["location"] - } - } - } -]"#; - -const MISTRAL_BRACKETED_JSON_PAYLOAD: &str = r#"[TOOL_CALLS]get_weather[ARGS]{"location":"Paris"}"#; - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn deepseek_r1_8b_duck_types_mistral_bracketed_json(fixture: &LlamaFixture<'_>) -> Result<()> { - let outcome = - fixture - .model - .parse_chat_message(TOOLS_JSON, MISTRAL_BRACKETED_JSON_PAYLOAD, false)?; - - let ChatMessageParseOutcome::Recognized(parsed) = outcome else { - bail!( - "duck-type pass must recognise Mistral bracketed-JSON on a model with no registered \ - template; got Unrecognized" - ); - }; - assert_eq!( - parsed.tool_calls.len(), - 1, - "expected one tool call; got {:?}", - parsed.tool_calls - ); - assert_eq!(parsed.tool_calls[0].name, "get_weather"); - let location = match &parsed.tool_calls[0].arguments { - ToolCallArguments::ValidJson(value) => value - .get("location") - .and_then(|v| v.as_str()) - .map(str::to_owned), - ToolCallArguments::InvalidJson(raw) => { - bail!("expected ValidJson, got InvalidJson: {raw}"); - } - }; - assert_eq!(location.as_deref(), Some("Paris")); - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_qwen_xml.rs b/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_qwen_xml.rs deleted file mode 100644 index 203ae0e8..00000000 --- a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_qwen_xml.rs +++ 
/dev/null @@ -1,75 +0,0 @@ -use anyhow::Result; -use anyhow::bail; -use llama_cpp_bindings::ChatMessageParseOutcome; -use llama_cpp_bindings::ToolCallArguments; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -const TOOLS_JSON: &str = r#"[ - { - "type": "function", - "function": { - "name": "get_weather", - "description": "Get the current weather for a location", - "parameters": { - "type": "object", - "properties": { - "location": {"type": "string", "description": "The city name"} - }, - "required": ["location"] - } - } - } -]"#; - -const QWEN_XML_PAYLOAD: &str = "\n\ -\n\ -\n\ -Paris\n\ -\n\ -\n\ -"; - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn deepseek_r1_8b_duck_types_qwen_xml(fixture: &LlamaFixture<'_>) -> Result<()> { - let outcome = fixture - .model - .parse_chat_message(TOOLS_JSON, QWEN_XML_PAYLOAD, false)?; - - let ChatMessageParseOutcome::Recognized(parsed) = outcome else { - bail!( - "duck-type pass must recognise Qwen XML on a model with no registered template; \ - got Unrecognized" - ); - }; - assert_eq!( - parsed.tool_calls.len(), - 1, - "expected one tool call; got {:?}", - parsed.tool_calls - ); - assert_eq!(parsed.tool_calls[0].name, "get_weather"); - let location = match &parsed.tool_calls[0].arguments { - ToolCallArguments::ValidJson(value) => value - .get("location") - .and_then(|v| v.as_str()) - .map(str::to_owned), - ToolCallArguments::InvalidJson(raw) => { - bail!("expected ValidJson, got InvalidJson: {raw}"); - } - }; - assert_eq!(location.as_deref(), Some("Paris")); - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested.rs 
b/llama-cpp-bindings-tests/tests/deepseek_r1_8b_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested.rs deleted file mode 100644 index 2921b3d6..00000000 --- a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested.rs +++ /dev/null @@ -1,58 +0,0 @@ -use anyhow::Result; -use anyhow::bail; -use llama_cpp_bindings::ChatMessageParseOutcome; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -const TOOLS_JSON: &str = r#"[ - { - "type": "function", - "function": { - "name": "get_weather", - "description": "Get the current weather for a location", - "parameters": { - "type": "object", - "properties": { - "location": {"type": "string", "description": "The city name"} - }, - "required": ["location"] - } - } - } -]"#; - -const PLAIN_CONTENT: &str = "Sorry, I cannot help with that."; - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn deepseek_r1_8b_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let outcome = fixture - .model - .parse_chat_message(TOOLS_JSON, PLAIN_CONTENT, false)?; - - let ChatMessageParseOutcome::Recognized(parsed) = outcome else { - bail!( - "plain content with tools requested must produce Recognized (with empty tool_calls); \ - got Unrecognized" - ); - }; - assert!( - parsed.tool_calls.is_empty(), - "expected no tool calls; got {:?}", - parsed.tool_calls - ); - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_recognizes_empty_tool_calls_when_tools_not_requested.rs 
b/llama-cpp-bindings-tests/tests/deepseek_r1_8b_recognizes_empty_tool_calls_when_tools_not_requested.rs deleted file mode 100644 index cc48350f..00000000 --- a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_recognizes_empty_tool_calls_when_tools_not_requested.rs +++ /dev/null @@ -1,38 +0,0 @@ -use anyhow::Result; -use anyhow::bail; -use llama_cpp_bindings::ChatMessageParseOutcome; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -const PLAIN_CONTENT: &str = "Hello there."; - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn deepseek_r1_8b_recognizes_empty_tool_calls_when_tools_not_requested( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let outcome = fixture - .model - .parse_chat_message("[]", PLAIN_CONTENT, false)?; - - let ChatMessageParseOutcome::Recognized(parsed) = outcome else { - bail!("plain content with empty tools array must produce Recognized; got Unrecognized"); - }; - assert!( - parsed.tool_calls.is_empty(), - "expected no tool calls; got {:?}", - parsed.tool_calls - ); - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/embedding_and_encoder.rs b/llama-cpp-bindings-tests/tests/embedding_and_encoder.rs new file mode 100644 index 00000000..cebd47c1 --- /dev/null +++ b/llama-cpp-bindings-tests/tests/embedding_and_encoder.rs @@ -0,0 +1,707 @@ +use llama_cpp_test_harness::llama_tests_main; + +mod embeddings { + use std::time::Duration; + + use anyhow::{Context, Result}; + use llama_cpp_bindings::context::LlamaContext; + use llama_cpp_bindings::ggml_time_us; + use llama_cpp_bindings::llama_batch::LlamaBatch; + use llama_cpp_bindings::model::AddBos; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + fn 
normalize(input: &[f32]) -> Vec { + let magnitude = input + .iter() + .fold(0.0, |accumulator, &value| value.mul_add(value, accumulator)) + .sqrt(); + + input.iter().map(|&value| value / magnitude).collect() + } + + #[llama_test( + model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + n_threads_batch = 8, + embeddings = true, + )] + fn embedding_generation_produces_vectors(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + + let mut ctx = LlamaContext::from_model( + model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + ) + .with_context(|| "unable to create context")?; + + let prompt = "Hello my name is"; + let tokens = model + .str_to_token(prompt, AddBos::Always) + .with_context(|| format!("failed to tokenize {prompt}"))?; + let prompt_token_count = u64::try_from(tokens.len())?; + + let n_ctx = usize::try_from(ctx.n_ctx())?; + assert!(tokens.len() <= n_ctx, "prompt exceeds context window size"); + + let t_main_start = ggml_time_us(); + + let mut classifier = model.sampled_token_classifier(); + let mut batch = LlamaBatch::new(n_ctx, 1)?; + classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?; + + assert_eq!(classifier.pending_prompt_tokens(), prompt_token_count); + assert_eq!(classifier.usage().prompt_tokens, 0); + + ctx.clear_kv_cache(); + ctx.decode(&mut batch) + .with_context(|| "llama_decode() failed")?; + + let promoted = classifier.commit_prompt_tokens(); + assert_eq!(promoted, prompt_token_count); + + let embedding = ctx + .embeddings_seq_ith(0) + .with_context(|| "failed to get embeddings")?; + let normalized = normalize(embedding); + + let t_main_end = ggml_time_us(); + let duration = Duration::from_micros(u64::try_from(t_main_end - t_main_start)?); + + eprintln!( + "created embedding with {} dimensions in {:.2} s", + 
normalized.len(), + duration.as_secs_f32() + ); + + assert!( + !normalized.is_empty(), + "embedding should have at least one dimension" + ); + + let magnitude: f32 = normalized + .iter() + .map(|value| value * value) + .sum::() + .sqrt(); + assert!( + (magnitude - 1.0).abs() < 0.01, + "normalized embedding magnitude should be approximately 1.0, got {magnitude}" + ); + + let usage = classifier.into_usage(); + assert_eq!(usage.prompt_tokens, prompt_token_count); + assert_eq!(usage.completion_tokens(), 0); + + Ok(()) + } +} + +mod reranker { + use std::time::Duration; + + use anyhow::{Context, Result, bail}; + use llama_cpp_bindings::context::LlamaContext; + use llama_cpp_bindings::ggml_time_us; + use llama_cpp_bindings::llama_batch::LlamaBatch; + use llama_cpp_bindings::model::AddBos; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + fn normalize(input: &[f32]) -> Vec { + let magnitude = input + .iter() + .fold(0.0, |accumulator, &value| value.mul_add(value, accumulator)) + .sqrt(); + + input.iter().map(|&value| value / magnitude).collect() + } + + fn cosine_similarity(vec_a: &[f32], vec_b: &[f32]) -> f32 { + vec_a + .iter() + .zip(vec_b.iter()) + .map(|(left, right)| left * right) + .sum::() + } + + #[llama_test( + model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + n_seq_max = 2, + n_threads_batch = 8, + embeddings = true, + )] + fn reranking_produces_scores(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + + let query = "What is machine learning?"; + let documents = [ + "Machine learning is a subset of artificial intelligence.", + "The weather today is sunny and warm.", + ]; + + let document_count = documents.len(); + assert_eq!( + u32::try_from(document_count)?, + fixture.context_params.n_seq_max, + "attribute n_seq_max must match the document 
count this trial expects", + ); + + let mut ctx = LlamaContext::from_model( + model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + ) + .with_context(|| "unable to create context")?; + + let prompt_lines: Vec = documents + .iter() + .map(|document| format!("{query}{document}")) + .collect(); + + let tokens_lines_list = prompt_lines + .iter() + .map(|line| model.str_to_token(line, AddBos::Always)) + .collect::, _>>() + .with_context(|| "failed to tokenize prompts")?; + + let n_ctx = usize::try_from(ctx.n_ctx())?; + + if tokens_lines_list.iter().any(|tokens| n_ctx < tokens.len()) { + bail!("one of the provided prompts exceeds the size of the context window"); + } + + let mut classifier = model.sampled_token_classifier(); + let mut batch = LlamaBatch::new(2048, i32::try_from(document_count)?)?; + let t_main_start = ggml_time_us(); + + for (sequence_index, tokens) in tokens_lines_list.iter().enumerate() { + classifier.feed_prompt_sequence_to_batch( + &mut batch, + tokens, + i32::try_from(sequence_index)?, + false, + )?; + } + + let total_tokens: usize = tokens_lines_list.iter().map(Vec::len).sum(); + let total_token_count = u64::try_from(total_tokens)?; + + assert_eq!(classifier.pending_prompt_tokens(), total_token_count); + assert_eq!(classifier.usage().prompt_tokens, 0); + + ctx.clear_kv_cache(); + ctx.decode(&mut batch) + .with_context(|| "llama_decode() failed")?; + + let promoted = classifier.commit_prompt_tokens(); + assert_eq!(promoted, total_token_count); + + let mut embeddings = Vec::with_capacity(document_count); + + for sequence_index in 0..document_count { + let raw_embedding = ctx + .embeddings_seq_ith(i32::try_from(sequence_index)?) 
+ .with_context(|| "failed to get sequence embeddings")?; + embeddings.push(normalize(raw_embedding)); + } + + let t_main_end = ggml_time_us(); + let duration = Duration::from_micros(u64::try_from(t_main_end - t_main_start)?); + + #[expect( + clippy::cast_precision_loss, + reason = "logged throughput tolerates f32 precision" + )] + let tokens_per_second = total_tokens as f32 / duration.as_secs_f32(); + + eprintln!( + "created embeddings for {total_tokens} tokens in {:.2} s, speed {tokens_per_second:.2} t/s", + duration.as_secs_f32(), + ); + + assert_eq!( + embeddings.len(), + document_count, + "should produce one embedding per document" + ); + + for (index, embedding) in embeddings.iter().enumerate() { + assert!( + !embedding.is_empty(), + "embedding {index} should not be empty" + ); + } + + let similarity = cosine_similarity(&embeddings[0], &embeddings[1]); + eprintln!("cosine similarity between document embeddings: {similarity:.4}"); + + assert!( + similarity.is_finite(), + "cosine similarity should be a finite number" + ); + + let usage = classifier.into_usage(); + assert_eq!(usage.prompt_tokens, total_token_count); + assert_eq!(usage.completion_tokens(), 0); + + Ok(()) + } +} + +mod context_embedding_and_encoder { + + use anyhow::Result; + + use llama_cpp_bindings::context::LlamaContext; + use llama_cpp_bindings::llama_batch::LlamaBatch; + use llama_cpp_bindings::model::AddBos; + + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + // ========================================================================================= + // Group A: default Qwen model, embeddings=false. Most context tests fall here. 
+ // ========================================================================================= + + #[llama_test( + model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + embeddings = true, + )] + fn decode_with_embeddings_enabled(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + + let result = context.decode(&mut batch); + + assert!(result.is_ok()); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + embeddings = true, + )] + fn embeddings_seq_ith_returns_valid_embeddings(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + + let embeddings = context.embeddings_seq_ith(0)?; + + assert_eq!(embeddings.len(), usize::try_from(fixture.model.n_embd())?); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + n_seq_max = 4, + embeddings = true, + )] + fn multi_sequence_embeddings_returns_one_embedding_per_sequence( + fixture: 
&LlamaFixture<'_>, + ) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let inputs = [ + "alpha is here", + "beta runs fast", + "gamma waits", + "delta jumps", + ]; + let mut batch = LlamaBatch::new(64, 4)?; + + for (sequence_index, text) in inputs.iter().enumerate() { + let tokens = fixture.model.str_to_token(text, AddBos::Always)?; + let sequence_id = i32::try_from(sequence_index)?; + + batch.add_sequence(&tokens, sequence_id, true)?; + } + + context.decode(&mut batch)?; + + let n_embd = usize::try_from(fixture.model.n_embd())?; + let mut collected: Vec> = Vec::with_capacity(inputs.len()); + + for sequence_index in 0..inputs.len() { + let sequence_id = i32::try_from(sequence_index)?; + let embedding = context.embeddings_seq_ith(sequence_id)?; + + assert_eq!( + embedding.len(), + n_embd, + "sequence {sequence_index} embedding length mismatch" + ); + + collected.push(embedding.to_vec()); + } + + for (left_index, left) in collected.iter().enumerate() { + for (right_index, right) in collected.iter().enumerate().skip(left_index + 1) { + assert_ne!( + left, right, + "embedding for sequence {left_index} must differ from sequence {right_index}", + ); + } + } + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + n_seq_max = 4, + embeddings = true, + )] + fn embeddings_returns_distinct_values_when_reused_batch_has_extra_capacity( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let iterations = [ + [ + "This is the first document with enough content to contribute meaningfully to the batch size calculation", + "This is the 
second document that should be processed in a potentially different batch from the first", + ], + [ + "This is the third document adding more content to ensure the total exceeds the configured chunk limit", + "This is the fourth document which should demonstrate that batching distributes across agent requests", + ], + ]; + + let n_embd = usize::try_from(fixture.model.n_embd())?; + let mut batch = LlamaBatch::new(64, 4)?; + let mut collected: Vec> = Vec::new(); + + for iteration_inputs in iterations { + for (sequence_index, text) in iteration_inputs.iter().enumerate() { + let tokens = fixture.model.str_to_token(text, AddBos::Always)?; + let sequence_id = i32::try_from(sequence_index)?; + + batch.add_sequence(&tokens, sequence_id, true)?; + } + + context.clear_kv_cache(); + context.decode(&mut batch)?; + + for sequence_index in 0..iteration_inputs.len() { + let sequence_id = i32::try_from(sequence_index)?; + let embedding = context.embeddings_seq_ith(sequence_id)?; + + assert_eq!( + embedding.len(), + n_embd, + "iteration sequence {sequence_index} embedding length mismatch" + ); + + collected.push(embedding.to_vec()); + } + + batch.clear(); + } + + assert_eq!( + collected.len(), + iterations.iter().flatten().count(), + "expected one embedding per input across every iteration" + ); + + for (left_index, left) in collected.iter().enumerate() { + for (right_index, right) in collected.iter().enumerate().skip(left_index + 1) { + assert_ne!( + left, right, + "embedding {left_index} must differ from embedding {right_index} across reused-batch iterations", + ); + } + } + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + embeddings = true, + )] + fn embeddings_ith_returns_valid_embeddings(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + 
fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; + let last_index = i32::try_from(tokens.len() - 1)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + + let embeddings = context.embeddings_ith(last_index)?; + + assert_eq!(embeddings.len(), usize::try_from(fixture.model.n_embd())?); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + embeddings = true, + )] + fn embeddings_ith_returns_null_embedding_error_for_non_embedding_token( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let result = context.embeddings_ith(999); + + assert!(result.is_err()); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("Xiaojian9992024/t5-small-GGUF", "t5-small.bf16.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + embeddings = true, + )] + fn encode_succeeds_with_encoder_model(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = fixture.model.str_to_token("hello", AddBos::Never)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + + let result = context.encode(&mut batch); + + assert!(result.is_ok()); + + Ok(()) + } +} + +mod context_kv_cache_embedding { + use std::num::NonZeroU8; + + use anyhow::Result; + use llama_cpp_bindings::context::LlamaContext; + + use 
llama_cpp_bindings::llama_batch::LlamaBatch; + use llama_cpp_bindings::model::AddBos; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + fn build_context<'context>( + fixture: &'context LlamaFixture<'_>, + ) -> Result> { + Ok(LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?) + } + + fn decode_hello_world( + fixture: &LlamaFixture<'_>, + context: &mut LlamaContext<'_>, + ) -> Result<()> { + let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn kv_cache_seq_add_succeeds_on_embedding_model(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; + + decode_hello_world(fixture, &mut context)?; + + let result = context.kv_cache_seq_add(0, Some(0), None, 1); + + assert!(result.is_ok()); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn kv_cache_seq_div_succeeds_on_embedding_model(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; + + decode_hello_world(fixture, &mut context)?; + + let divisor = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?; + let result = context.kv_cache_seq_div(0, Some(0), None, divisor); + + assert!(result.is_ok()); + + Ok(()) + } +} + +mod model_helpers_embedding { + #![expect( + clippy::unnecessary_wraps, + reason = "every trial returns anyhow::Result<()> to match the 
LlamaTestFn signature" + )] + + use anyhow::Result; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + #[llama_test( + model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 2048, + n_batch = 512, + n_ubatch = 128 + )] + fn embedding_model_tool_call_markers_call_does_not_panic( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let _markers = fixture.model.tool_call_markers(); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 2048, + n_batch = 512, + n_ubatch = 128 + )] + fn embedding_model_streaming_markers_returns_ok_for_a_model_without_tool_calls( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let _markers = fixture.model.streaming_markers()?; + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 2048, + n_batch = 512, + n_ubatch = 128 + )] + fn approximate_tok_env_falls_back_to_eos_when_eot_unavailable( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let env = fixture.model.approximate_tok_env(); + let env_again = fixture.model.approximate_tok_env(); + + assert!( + std::sync::Arc::ptr_eq(&env, &env_again), + "approximate_tok_env must return the same cached Arc for any model, including \ + the embedding model which lacks an EOT token (forcing the fallback-to-EOS path)" + ); + + Ok(()) + } +} + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/embeddings.rs b/llama-cpp-bindings-tests/tests/embeddings.rs deleted file mode 100644 index 7e531cec..00000000 --- a/llama-cpp-bindings-tests/tests/embeddings.rs +++ /dev/null @@ -1,103 +0,0 @@ -use std::time::Duration; - -use anyhow::{Context, Result}; 
-use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::ggml_time_us; -use llama_cpp_bindings::llama_batch::LlamaBatch; -use llama_cpp_bindings::model::AddBos; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -fn normalize(input: &[f32]) -> Vec { - let magnitude = input - .iter() - .fold(0.0, |accumulator, &value| value.mul_add(value, accumulator)) - .sqrt(); - - input.iter().map(|&value| value / magnitude).collect() -} - -#[llama_test( - model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - n_threads_batch = 8, - embeddings = true, -)] -fn embedding_generation_produces_vectors(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - - let mut ctx = LlamaContext::from_model( - model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - ) - .with_context(|| "unable to create context")?; - - let prompt = "Hello my name is"; - let tokens = model - .str_to_token(prompt, AddBos::Always) - .with_context(|| format!("failed to tokenize {prompt}"))?; - let prompt_token_count = u64::try_from(tokens.len())?; - - let n_ctx = usize::try_from(ctx.n_ctx())?; - assert!(tokens.len() <= n_ctx, "prompt exceeds context window size"); - - let t_main_start = ggml_time_us(); - - let mut classifier = model.sampled_token_classifier(); - let mut batch = LlamaBatch::new(n_ctx, 1)?; - classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?; - - assert_eq!(classifier.pending_prompt_tokens(), prompt_token_count); - assert_eq!(classifier.usage().prompt_tokens, 0); - - ctx.clear_kv_cache(); - ctx.decode(&mut batch) - .with_context(|| "llama_decode() failed")?; - - let promoted = classifier.commit_prompt_tokens(); - assert_eq!(promoted, prompt_token_count); - - let embedding = ctx - 
.embeddings_seq_ith(0) - .with_context(|| "failed to get embeddings")?; - let normalized = normalize(embedding); - - let t_main_end = ggml_time_us(); - let duration = Duration::from_micros(u64::try_from(t_main_end - t_main_start)?); - - eprintln!( - "created embedding with {} dimensions in {:.2} s", - normalized.len(), - duration.as_secs_f32() - ); - - assert!( - !normalized.is_empty(), - "embedding should have at least one dimension" - ); - - let magnitude: f32 = normalized - .iter() - .map(|value| value * value) - .sum::() - .sqrt(); - assert!( - (magnitude - 1.0).abs() < 0.01, - "normalized embedding magnitude should be approximately 1.0, got {magnitude}" - ); - - let usage = classifier.into_usage(); - assert_eq!(usage.prompt_tokens, prompt_token_count); - assert_eq!(usage.completion_tokens(), 0); - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/eval_multimodal_chunks_records_exact_token_counts.rs b/llama-cpp-bindings-tests/tests/eval_multimodal_chunks_records_exact_token_counts.rs deleted file mode 100644 index dcef4ded..00000000 --- a/llama-cpp-bindings-tests/tests/eval_multimodal_chunks_records_exact_token_counts.rs +++ /dev/null @@ -1,185 +0,0 @@ -use anyhow::Result; -use llama_cpp_bindings::TokenUsage; -use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::mtmd::MtmdBitmap; -use llama_cpp_bindings::mtmd::MtmdInputChunkType; -use llama_cpp_bindings::mtmd::MtmdInputChunks; -use llama_cpp_bindings::mtmd::MtmdInputText; -use llama_cpp_bindings::mtmd::mtmd_default_marker; -use llama_cpp_bindings_tests::test_model::fixtures_dir; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -const PROMPT_QUESTION: &str = "What animals do you see in this image?"; - -struct ExpectedChunkTotals { - text: u64, - image: u64, - audio: u64, -} - -fn sum_chunk_token_counts_by_type(chunks: &MtmdInputChunks) -> Result { - let mut totals = 
ExpectedChunkTotals { - text: 0, - image: 0, - audio: 0, - }; - for index in 0..chunks.len() { - let chunk = chunks - .get(index) - .ok_or_else(|| anyhow::anyhow!("chunk index {index} should exist"))?; - let n_tokens = u64::try_from(chunk.n_tokens())?; - match chunk.chunk_type()? { - MtmdInputChunkType::Text => { - totals.text = totals.text.saturating_add(n_tokens); - } - MtmdInputChunkType::Image => { - totals.image = totals.image.saturating_add(n_tokens); - } - MtmdInputChunkType::Audio => { - totals.audio = totals.audio.saturating_add(n_tokens); - } - } - } - Ok(totals) -} - -fn build_multimodal_chunks_and_eval_into_usage( - fixture: &LlamaFixture<'_>, -) -> Result<(TokenUsage, ExpectedChunkTotals)> { - let model = fixture.model; - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - - let image_path = fixtures_dir().join("llamas.jpg"); - let image_path_str = image_path - .to_str() - .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?; - let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?; - - let marker = mtmd_default_marker(); - let prompt = format!("{marker}{PROMPT_QUESTION}"); - - let input_text = MtmdInputText { - text: prompt, - add_special: false, - parse_special: true, - }; - - let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; - let expected = sum_chunk_token_counts_by_type(&chunks)?; - - let context_params = (*fixture.context_params).into_llama_context_params(); - let context = LlamaContext::from_model(model, fixture.backend, context_params)?; - - let mut classifier = model.sampled_token_classifier(); - classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?; - - Ok((classifier.into_usage(), expected)) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 4096, - n_batch = 512, - n_ubatch = 512, - mmproj_source = 
HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), -)] -fn prompt_tokens_match_text_chunk_total(fixture: &LlamaFixture<'_>) -> Result<()> { - let (usage, expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?; - - if usage.prompt_tokens != expected.text { - anyhow::bail!( - "prompt_tokens must equal sum of text-chunk n_tokens; expected {}, got {}", - expected.text, - usage.prompt_tokens - ); - } - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 4096, - n_batch = 512, - n_ubatch = 512, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), -)] -fn input_image_tokens_match_image_chunk_total(fixture: &LlamaFixture<'_>) -> Result<()> { - let (usage, expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?; - - if usage.input_image_tokens != expected.image { - anyhow::bail!( - "input_image_tokens must equal sum of image-chunk n_tokens; expected {}, got {}", - expected.image, - usage.input_image_tokens - ); - } - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 4096, - n_batch = 512, - n_ubatch = 512, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), -)] -fn input_audio_tokens_are_zero_for_image_only_input(fixture: &LlamaFixture<'_>) -> Result<()> { - let (usage, expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?; - - if expected.audio != 0 { - anyhow::bail!( - "fixture invariant: image-only multimodal input should produce zero audio chunk tokens, got {}", - expected.audio - ); - } - if usage.input_audio_tokens != 0 { - anyhow::bail!( - "input_audio_tokens must be zero when no audio chunks are evaluated; got {}", - usage.input_audio_tokens - ); - } - - Ok(()) -} - -#[llama_test( - model_source = 
HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 4096, - n_batch = 512, - n_ubatch = 512, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), -)] -fn completion_tokens_are_zero_after_eval_before_generation( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let (usage, _expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?; - - if usage.completion_tokens() != 0 { - anyhow::bail!( - "completion_tokens must be zero immediately after eval (no generation has occurred); got {}", - usage.completion_tokens() - ); - } - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/gemma4_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs b/llama-cpp-bindings-tests/tests/gemma4_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs deleted file mode 100644 index e20b99a2..00000000 --- a/llama-cpp-bindings-tests/tests/gemma4_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs +++ /dev/null @@ -1,115 +0,0 @@ -use anyhow::Result; -use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::llama_batch::LlamaBatch; -use llama_cpp_bindings::model::AddBos; -use llama_cpp_bindings::sampling::LlamaSampler; -use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -const MAX_GENERATED_TOKENS: i32 = 200; - -const GEMMA4_THINKING_DISABLED_PROMPT: &str = "\ -user\nReply with the single word: four. 
Do not explain.\n\ -model\n<|channel>thought\n\n"; - -const FORBIDDEN_MARKERS: &[&str] = &["<|channel>thought", ""]; - -#[llama_test( - model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 8192, - n_batch = 2048, - n_ubatch = 512, -)] -fn gemma4_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - - let mut classifier = model.sampled_token_classifier(); - let prompt_tokens = model.str_to_token(GEMMA4_THINKING_DISABLED_PROMPT, AddBos::Never)?; - let prompt_token_count = u64::try_from(prompt_tokens.len())?; - - let mut batch = LlamaBatch::new(2048, 1)?; - classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; - - let mut context = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - context.decode(&mut batch)?; - - let promoted = classifier.commit_prompt_tokens(); - assert_eq!(promoted, prompt_token_count); - - let mut sampler = LlamaSampler::greedy(); - let initial_position = batch.n_tokens(); - let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut context, - batch: &mut batch, - initial_position, - max_generated_tokens: MAX_GENERATED_TOKENS, - } - .run()?; - - let usage = classifier.usage(); - - assert!( - !outcome.generated_raw.is_empty(), - "Gemma 4 must generate at least one token" - ); - assert_eq!( - outcome.observed_reasoning, 0, - "Gemma 4 thinking-disabled: classifier must not emit any Reasoning token \ - when the prompt closes the thought channel before generation begins; \ - generated={:?}", - outcome.generated_raw - ); - assert_eq!( - outcome.observed_undeterminable, 0, - "Gemma 4 thinking-disabled: prompt-token replay must move section to Content \ - before generation, so no 
Undeterminable tokens may be emitted; \ - generated={:?}", - outcome.generated_raw - ); - assert_eq!( - usage.reasoning_tokens, 0, - "Gemma 4 thinking-disabled: usage.reasoning_tokens must be zero; usage={usage:?}" - ); - assert_eq!( - usage.undeterminable_tokens, 0, - "Gemma 4 thinking-disabled: usage.undeterminable_tokens must be zero; usage={usage:?}" - ); - assert!( - outcome.observed_content > 0, - "Gemma 4 thinking-disabled: classifier must emit at least one Content token" - ); - assert_eq!( - usage.completion_tokens(), - outcome.observed_content, - "Gemma 4 thinking-disabled: completion tokens must equal observed Content tokens" - ); - - for forbidden in FORBIDDEN_MARKERS { - assert!( - !outcome.content_stream.contains(forbidden), - "Gemma 4 thinking-disabled: content_stream leaked marker {forbidden:?}; \ - content_stream={:?}", - outcome.content_stream - ); - } - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/gemma4_classifier_emits_reasoning.rs b/llama-cpp-bindings-tests/tests/gemma4_classifier_emits_reasoning.rs deleted file mode 100644 index 6a7aaba0..00000000 --- a/llama-cpp-bindings-tests/tests/gemma4_classifier_emits_reasoning.rs +++ /dev/null @@ -1,124 +0,0 @@ -use anyhow::Result; -use anyhow::bail; -use llama_cpp_bindings::ChatMessageParseOutcome; -use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::llama_batch::LlamaBatch; -use llama_cpp_bindings::model::AddBos; -use llama_cpp_bindings::sampling::LlamaSampler; -use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -const MAX_GENERATED_TOKENS: i32 = 1500; - -const GEMMA4_THINKING_PROMPT: &str = "\ -user\nReply with the single word: four. 
Do not explain.\n\ -model\n<|channel>thought\n"; - -const FORBIDDEN_MARKERS: &[&str] = &["<|channel>thought", ""]; - -#[llama_test( - model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 8192, - n_batch = 2048, - n_ubatch = 512, -)] -fn gemma4_classifier_emits_reasoning_for_thinking_prompt(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - - let mut classifier = model.sampled_token_classifier(); - let prompt_tokens = model.str_to_token(GEMMA4_THINKING_PROMPT, AddBos::Never)?; - let prompt_token_count = u64::try_from(prompt_tokens.len())?; - - let mut batch = LlamaBatch::new(2048, 1)?; - classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; - - let mut context = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - context.decode(&mut batch)?; - - let promoted = classifier.commit_prompt_tokens(); - assert_eq!(promoted, prompt_token_count); - - let mut sampler = LlamaSampler::greedy(); - let initial_position = batch.n_tokens(); - let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut context, - batch: &mut batch, - initial_position, - max_generated_tokens: MAX_GENERATED_TOKENS, - } - .run()?; - - let usage = classifier.usage(); - let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?; - let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else { - bail!("Gemma 4 chat template must be recognised by the parser; got Unrecognized"); - }; - - assert!( - !outcome.generated_raw.is_empty(), - "Gemma 4 must generate at least one token" - ); - assert!( - outcome.observed_reasoning > 0, - "Gemma 4 classifier must emit at least one Reasoning token when the model \ - emits a `<|channel>thought` block; outcome={outcome:?}", - ); - 
assert!( - usage.reasoning_tokens > 0, - "Gemma 4 usage.reasoning_tokens must be non-zero when the model emits a \ - reasoning block; usage was {usage:?}" - ); - assert_eq!( - outcome.observed_undeterminable, 0, - "Gemma 4: classifier must not emit Undeterminable when the model emits a \ - detected `<|channel>thought` marker; outcome={outcome:?}" - ); - assert_eq!( - usage.undeterminable_tokens, 0, - "Gemma 4: usage.undeterminable_tokens must be zero; usage={usage:?}" - ); - assert_eq!( - usage.completion_tokens(), - outcome.observed_content + outcome.observed_reasoning, - "Gemma 4: completion tokens must equal observed Content + Reasoning" - ); - assert!( - !parsed.reasoning_content.is_empty(), - "Gemma 4 must close its reasoning block within {MAX_GENERATED_TOKENS} tokens; \ - increase the budget or pick a more direct prompt. generated={:?}", - outcome.generated_raw, - ); - - for forbidden in FORBIDDEN_MARKERS { - assert!( - !outcome.reasoning_stream.contains(forbidden), - "Gemma 4: reasoning_stream leaked marker {forbidden:?}; \ - reasoning_stream={:?}", - outcome.reasoning_stream - ); - assert!( - !outcome.content_stream.contains(forbidden), - "Gemma 4: content_stream leaked marker {forbidden:?}; \ - content_stream={:?}", - outcome.content_stream - ); - } - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/gemma4_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs b/llama-cpp-bindings-tests/tests/gemma4_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs deleted file mode 100644 index e810ca3e..00000000 --- a/llama-cpp-bindings-tests/tests/gemma4_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs +++ /dev/null @@ -1,101 +0,0 @@ -use anyhow::Result; -use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::llama_batch::LlamaBatch; -use llama_cpp_bindings::mtmd::MtmdBitmap; -use llama_cpp_bindings::mtmd::MtmdInputText; -use llama_cpp_bindings::mtmd::mtmd_default_marker; -use 
llama_cpp_bindings::sampling::LlamaSampler; -use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; -use llama_cpp_bindings_tests::test_model::fixtures_dir; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -const MAX_GENERATED_TOKENS: i32 = 200; - -#[llama_test( - model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 8192, - n_batch = 512, - n_ubatch = 512, - mmproj_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "mmproj-F16.gguf"), -)] -fn gemma4_classifier_emits_reasoning_for_multimodal_thinking_prompt( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - - let mut context = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let image_path = fixtures_dir().join("llamas.jpg"); - let image_path_str = image_path - .to_str() - .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?; - let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?; - - let marker = mtmd_default_marker(); - let prompt = format!( - "user\n{marker}What animals do you see in this image?\nmodel\n<|channel>thought\n" - ); - - let input_text = MtmdInputText { - text: prompt, - add_special: false, - parse_special: true, - }; - - let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; - - let mut classifier = model.sampled_token_classifier(); - let n_past = classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?; - - let mut sampler = LlamaSampler::chain_simple([ - LlamaSampler::penalties(64, 1.1, 0.0, 0.0), - LlamaSampler::top_k(40), - LlamaSampler::top_p(0.9, 1), - LlamaSampler::min_p(0.05, 1), - LlamaSampler::temp(0.7), - 
LlamaSampler::dist(0x00C0_FFEE), - ]); - - let mut batch = LlamaBatch::new(2048, 1)?; - let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut context, - batch: &mut batch, - initial_position: n_past, - max_generated_tokens: MAX_GENERATED_TOKENS, - } - .run()?; - - let usage = classifier.usage(); - - if outcome.observed_reasoning == 0 { - anyhow::bail!( - "Gemma 4 multimodal + thinking: classifier must emit at least one Reasoning token \ - when the prompt opens a `<|channel>thought` block; outcome={outcome:?}" - ); - } - if usage.reasoning_tokens == 0 { - anyhow::bail!( - "Gemma 4 multimodal + thinking: usage.reasoning_tokens must be non-zero; usage={usage:?}" - ); - } - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/gemma4_parses_tool_call_payload.rs b/llama-cpp-bindings-tests/tests/gemma4_parses_tool_call_payload.rs deleted file mode 100644 index 2f3d3eaa..00000000 --- a/llama-cpp-bindings-tests/tests/gemma4_parses_tool_call_payload.rs +++ /dev/null @@ -1,68 +0,0 @@ -use anyhow::Result; -use anyhow::bail; -use llama_cpp_bindings::ChatMessageParseOutcome; -use llama_cpp_bindings::ToolCallArguments; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -const TOOLS_JSON: &str = r#"[ - { - "type": "function", - "function": { - "name": "get_weather", - "description": "Get the current weather for a location", - "parameters": { - "type": "object", - "properties": { - "location": {"type": "string", "description": "The city name"} - }, - "required": ["location"] - } - } - } -]"#; - -const GEMMA4_PAIRED_QUOTE_PAYLOAD: &str = - "<|tool_call>call:get_weather{location:<|\"|>Paris<|\"|>}"; - -#[llama_test( - model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, 
-)] -fn gemma4_parses_tool_call_payload(fixture: &LlamaFixture<'_>) -> Result<()> { - let outcome = - fixture - .model - .parse_chat_message(TOOLS_JSON, GEMMA4_PAIRED_QUOTE_PAYLOAD, false)?; - - let ChatMessageParseOutcome::Recognized(parsed) = outcome else { - bail!("expected Recognized for Gemma 4 PairedQuote on a Gemma-4 model; got Unrecognized"); - }; - assert_eq!( - parsed.tool_calls.len(), - 1, - "expected one tool call; got {:?}", - parsed.tool_calls - ); - assert_eq!(parsed.tool_calls[0].name, "get_weather"); - let location = match &parsed.tool_calls[0].arguments { - ToolCallArguments::ValidJson(value) => value - .get("location") - .and_then(|v| v.as_str()) - .map(str::to_owned), - ToolCallArguments::InvalidJson(raw) => { - bail!("expected ValidJson, got InvalidJson: {raw}"); - } - }; - assert_eq!(location.as_deref(), Some("Paris")); - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/gemma4_template_override_returns_full_markers.rs b/llama-cpp-bindings-tests/tests/gemma4_template_override_returns_full_markers.rs deleted file mode 100644 index dc8099d7..00000000 --- a/llama-cpp-bindings-tests/tests/gemma4_template_override_returns_full_markers.rs +++ /dev/null @@ -1,50 +0,0 @@ -#![expect( - clippy::unnecessary_wraps, - reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" -)] - -use anyhow::Result; -use llama_cpp_bindings::ToolCallArgsShape; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -#[llama_test( - model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn gemma4_template_override_returns_full_markers(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let template = model - .chat_template(None) - .expect("Gemma 4 chat 
template must be present"); - let template_str = template.to_str().expect("template must be valid UTF-8"); - assert!( - template_str.contains("<|tool_call>call:"), - "Gemma 4 chat template must contain '<|tool_call>call:' fingerprint; \ - template starts with: {:?}", - &template_str[..template_str.len().min(200)], - ); - - let markers = model - .tool_call_markers() - .expect("Gemma 4 must produce ToolCallMarkers via override registry"); - - assert_eq!(markers.open, "<|tool_call>call:"); - assert_eq!(markers.close, "}"); - let ToolCallArgsShape::PairedQuote(shape) = markers.args_shape else { - panic!("expected PairedQuote variant, got {:?}", markers.args_shape); - }; - assert_eq!(shape.name_args_separator, "{"); - assert_eq!(shape.value_quote.open, "<|\"|>"); - assert_eq!(shape.value_quote.close, "<|\"|>"); - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/glm47_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs b/llama-cpp-bindings-tests/tests/glm47_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs deleted file mode 100644 index 7b614ef9..00000000 --- a/llama-cpp-bindings-tests/tests/glm47_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs +++ /dev/null @@ -1,93 +0,0 @@ -use anyhow::Result; -use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::llama_batch::LlamaBatch; -use llama_cpp_bindings::model::AddBos; -use llama_cpp_bindings::sampling::LlamaSampler; -use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -const MAX_GENERATED_TOKENS: i32 = 200; - -const GLM47_THINKING_DISABLED_PROMPT: &str = "\ -<|user|> -What is 2 + 2? 
-<|assistant|> - - -"; - -const FORBIDDEN_MARKERS: &[&str] = &["", ""]; - -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 8192, - n_batch = 2048, - n_ubatch = 512, -)] -fn glm47_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - - let mut classifier = model.sampled_token_classifier(); - let prompt_tokens = model.str_to_token(GLM47_THINKING_DISABLED_PROMPT, AddBos::Never)?; - let prompt_token_count = u64::try_from(prompt_tokens.len())?; - - let mut batch = LlamaBatch::new(2048, 1)?; - classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; - - let mut context = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - context.decode(&mut batch)?; - - let promoted = classifier.commit_prompt_tokens(); - assert_eq!(promoted, prompt_token_count); - - let mut sampler = LlamaSampler::chain_simple([ - LlamaSampler::penalties(64, 1.1, 0.0, 0.0), - LlamaSampler::top_k(40), - LlamaSampler::top_p(0.9, 1), - LlamaSampler::min_p(0.05, 1), - LlamaSampler::temp(0.7), - LlamaSampler::dist(0x00C0_FFEE), - ]); - let initial_position = batch.n_tokens(); - let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut context, - batch: &mut batch, - initial_position, - max_generated_tokens: MAX_GENERATED_TOKENS, - } - .run()?; - - let usage = classifier.usage(); - - assert!(!outcome.generated_raw.is_empty()); - assert_eq!(outcome.observed_reasoning, 0); - assert_eq!(outcome.observed_undeterminable, 0); - assert_eq!(usage.reasoning_tokens, 0); - assert_eq!(usage.undeterminable_tokens, 0); - assert!(outcome.observed_content > 0); - assert_eq!(usage.completion_tokens(), outcome.observed_content); - - for 
forbidden in FORBIDDEN_MARKERS { - assert!(!outcome.content_stream.contains(forbidden)); - } - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/glm47_classifier_emits_reasoning.rs b/llama-cpp-bindings-tests/tests/glm47_classifier_emits_reasoning.rs deleted file mode 100644 index d4677a14..00000000 --- a/llama-cpp-bindings-tests/tests/glm47_classifier_emits_reasoning.rs +++ /dev/null @@ -1,111 +0,0 @@ -use anyhow::Result; -use anyhow::bail; -use llama_cpp_bindings::ChatMessageParseOutcome; -use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::llama_batch::LlamaBatch; -use llama_cpp_bindings::model::AddBos; -use llama_cpp_bindings::sampling::LlamaSampler; -use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -const MAX_GENERATED_TOKENS: i32 = 1500; - -const GLM47_THINKING_PROMPT: &str = "\ -<|user|> -What is 2 + 2? 
-<|assistant|> - -"; - -const FORBIDDEN_MARKERS: &[&str] = &["", ""]; - -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 8192, - n_batch = 2048, - n_ubatch = 512, -)] -fn glm47_classifier_emits_reasoning_for_thinking_enabled_prompt( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - - let mut classifier = model.sampled_token_classifier(); - let prompt_tokens = model.str_to_token(GLM47_THINKING_PROMPT, AddBos::Never)?; - let prompt_token_count = u64::try_from(prompt_tokens.len())?; - - let mut batch = LlamaBatch::new(2048, 1)?; - classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; - - let mut context = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - context.decode(&mut batch)?; - - let promoted = classifier.commit_prompt_tokens(); - assert_eq!(promoted, prompt_token_count); - - let mut sampler = LlamaSampler::chain_simple([ - LlamaSampler::penalties(64, 1.1, 0.0, 0.0), - LlamaSampler::top_k(40), - LlamaSampler::top_p(0.9, 1), - LlamaSampler::min_p(0.05, 1), - LlamaSampler::temp(0.7), - LlamaSampler::dist(0x00C0_FFEE), - ]); - let initial_position = batch.n_tokens(); - let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut context, - batch: &mut batch, - initial_position, - max_generated_tokens: MAX_GENERATED_TOKENS, - } - .run()?; - - let usage = classifier.usage(); - let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?; - let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else { - bail!("GLM-4.7 chat template must be recognised by the parser; got Unrecognized"); - }; - - assert!(!outcome.generated_raw.is_empty()); - assert!(outcome.observed_reasoning > 0); - 
assert!(usage.reasoning_tokens > 0); - assert_eq!(outcome.observed_undeterminable, 0); - assert_eq!(usage.undeterminable_tokens, 0); - assert_eq!( - usage.completion_tokens(), - outcome.observed_content + outcome.observed_reasoning - ); - - if parsed.reasoning_content.is_empty() { - eprintln!( - "GLM-4.7 didn't close its reasoning block within {MAX_GENERATED_TOKENS} tokens — \ - skipping strict parser-equality assertions" - ); - } else { - assert_eq!(outcome.reasoning_stream, parsed.reasoning_content); - assert_eq!(outcome.content_stream, parsed.content); - } - - for forbidden in FORBIDDEN_MARKERS { - assert!(!outcome.reasoning_stream.contains(forbidden)); - assert!(!outcome.content_stream.contains(forbidden)); - } - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/glm47_parses_tool_call_payload.rs b/llama-cpp-bindings-tests/tests/glm47_parses_tool_call_payload.rs deleted file mode 100644 index 8f31901e..00000000 --- a/llama-cpp-bindings-tests/tests/glm47_parses_tool_call_payload.rs +++ /dev/null @@ -1,66 +0,0 @@ -use anyhow::Result; -use anyhow::bail; -use llama_cpp_bindings::ChatMessageParseOutcome; -use llama_cpp_bindings::ToolCallArguments; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -const TOOLS_JSON: &str = r#"[ - { - "type": "function", - "function": { - "name": "get_weather", - "description": "Get the current weather for a location", - "parameters": { - "type": "object", - "properties": { - "location": {"type": "string", "description": "The city name"} - }, - "required": ["location"] - } - } - } -]"#; - -const GLM47_KEY_VALUE_PAYLOAD: &str = "get_weather\ -location\ -Paris\ -"; - -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn glm47_parses_tool_call_payload(fixture: 
&LlamaFixture<'_>) -> Result<()> { - let outcome = fixture - .model - .parse_chat_message(TOOLS_JSON, GLM47_KEY_VALUE_PAYLOAD, false)?; - - let ChatMessageParseOutcome::Recognized(parsed) = outcome else { - bail!( - "expected Recognized for GLM-4.7 key-value tags on a GLM-4.7-Flash model; got Unrecognized" - ); - }; - assert_eq!(parsed.tool_calls.len(), 1); - assert_eq!(parsed.tool_calls[0].name, "get_weather"); - let location = match &parsed.tool_calls[0].arguments { - ToolCallArguments::ValidJson(value) => value - .get("location") - .and_then(|v| v.as_str()) - .map(str::to_owned), - ToolCallArguments::InvalidJson(raw) => { - bail!("expected ValidJson, got InvalidJson: {raw}"); - } - }; - assert_eq!(location.as_deref(), Some("Paris")); - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/glm47_template_override_returns_full_markers.rs b/llama-cpp-bindings-tests/tests/glm47_template_override_returns_full_markers.rs deleted file mode 100644 index 491c46c4..00000000 --- a/llama-cpp-bindings-tests/tests/glm47_template_override_returns_full_markers.rs +++ /dev/null @@ -1,49 +0,0 @@ -#![expect( - clippy::unnecessary_wraps, - reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" -)] - -use anyhow::Result; -use llama_cpp_bindings::ToolCallArgsShape; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn glm47_template_override_returns_full_markers(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let template = model - .chat_template(None) - .expect("GLM-4.7 chat template must be present"); - let template_str = template.to_str().expect("template must be valid UTF-8"); - 
assert!(template_str.contains("")); - - let markers = model - .tool_call_markers() - .expect("GLM-4.7 must produce ToolCallMarkers via override registry"); - - assert_eq!(markers.open, ""); - assert_eq!(markers.close, ""); - let ToolCallArgsShape::KeyValueXmlTags(shape) = markers.args_shape else { - panic!( - "expected KeyValueXmlTags variant, got {:?}", - markers.args_shape - ); - }; - assert_eq!(shape.key_open, ""); - assert_eq!(shape.key_close, ""); - assert_eq!(shape.value_open, ""); - assert_eq!(shape.value_close, ""); - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/ingest_prompt_chunk.rs b/llama-cpp-bindings-tests/tests/ingest_prompt_chunk.rs deleted file mode 100644 index 24045f7c..00000000 --- a/llama-cpp-bindings-tests/tests/ingest_prompt_chunk.rs +++ /dev/null @@ -1,181 +0,0 @@ -use anyhow::Result; -use llama_cpp_bindings::ingest_prompt_chunk::ingest_prompt_chunk; -use llama_cpp_bindings::mtmd::MtmdBitmap; -use llama_cpp_bindings::mtmd::MtmdInputChunkType; -use llama_cpp_bindings::mtmd::MtmdInputText; -use llama_cpp_bindings::mtmd::mtmd_default_marker; -use llama_cpp_bindings_tests::test_model::fixtures_dir; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), -)] -fn text_chunk_records_prompt_tokens(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - - let input_text = MtmdInputText { - text: "hello world".to_owned(), - add_special: false, - parse_special: false, - }; - let chunks = mtmd_ctx.tokenize(input_text, &[])?; - - let text_chunk = 
(0..chunks.len()) - .filter_map(|index| chunks.get(index)) - .find(|chunk| chunk.chunk_type() == Ok(MtmdInputChunkType::Text)) - .ok_or_else(|| { - anyhow::anyhow!("text-only tokenization should produce at least one text chunk") - })?; - - let n_tokens = u64::try_from(text_chunk.n_tokens())?; - - let mut classifier = model.sampled_token_classifier(); - - ingest_prompt_chunk(&mut classifier, &text_chunk)?; - - let usage = classifier.usage(); - if usage.prompt_tokens != n_tokens { - anyhow::bail!( - "text chunk must record n_tokens as prompt_tokens; expected {n_tokens}, got {}", - usage.prompt_tokens - ); - } - if usage.input_image_tokens != 0 { - anyhow::bail!( - "text chunk must not bump input_image_tokens; got {}", - usage.input_image_tokens - ); - } - if usage.input_audio_tokens != 0 { - anyhow::bail!( - "text chunk must not bump input_audio_tokens; got {}", - usage.input_audio_tokens - ); - } - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), -)] -fn image_chunk_records_input_image_tokens_only(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - - let image_path = fixtures_dir().join("llamas.jpg"); - let image_path_str = image_path - .to_str() - .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?; - let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?; - - let marker = mtmd_default_marker(); - let input_text = MtmdInputText { - text: marker.to_owned(), - add_special: false, - parse_special: true, - }; - let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; - - let image_chunk = (0..chunks.len()) - .filter_map(|index| chunks.get(index)) - .find(|chunk| chunk.chunk_type() == 
Ok(MtmdInputChunkType::Image)) - .ok_or_else(|| anyhow::anyhow!("multimodal tokenization should produce an image chunk"))?; - - let n_tokens = u64::try_from(image_chunk.n_tokens())?; - if n_tokens == 0 { - anyhow::bail!("image chunk should report at least one token"); - } - - let mut classifier = model.sampled_token_classifier(); - - ingest_prompt_chunk(&mut classifier, &image_chunk)?; - - let usage = classifier.usage(); - if usage.input_image_tokens != n_tokens { - anyhow::bail!( - "image chunk must record n_tokens as input_image_tokens; expected {n_tokens}, got {}", - usage.input_image_tokens - ); - } - if usage.prompt_tokens != 0 { - anyhow::bail!( - "image chunk must not bump prompt_tokens; got {}", - usage.prompt_tokens - ); - } - if usage.input_audio_tokens != 0 { - anyhow::bail!( - "image chunk must not bump input_audio_tokens; got {}", - usage.input_audio_tokens - ); - } - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), -)] -fn text_chunk_drives_marker_state_machine_to_reasoning(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - - let input_text = MtmdInputText { - text: "<|im_start|>user\nHi<|im_end|>\n<|im_start|>assistant\n\n".to_owned(), - add_special: false, - parse_special: true, - }; - let chunks = mtmd_ctx.tokenize(input_text, &[])?; - - let mut classifier = model.sampled_token_classifier(); - - for index in 0..chunks.len() { - let chunk = chunks - .get(index) - .ok_or_else(|| anyhow::anyhow!("chunk index {index} must exist"))?; - ingest_prompt_chunk(&mut classifier, &chunk)?; - } - - if classifier.current_section() != llama_cpp_bindings::SampledTokenSection::Reasoning { - 
anyhow::bail!( - "text chunk replay must transition the classifier section to Reasoning when the \ - prompt opens a `` block; got {:?}", - classifier.current_section() - ); - } - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/kv_cache_and_session.rs b/llama-cpp-bindings-tests/tests/kv_cache_and_session.rs new file mode 100644 index 00000000..de316e42 --- /dev/null +++ b/llama-cpp-bindings-tests/tests/kv_cache_and_session.rs @@ -0,0 +1,2836 @@ +use llama_cpp_test_harness::llama_tests_main; + +mod model_context_creation { + #![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" + )] + + use anyhow::Result; + use llama_cpp_bindings::context::LlamaContext; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 256, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 256, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 256, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 256, + n_batch = 128, + n_ubatch = 64, + )] + fn new_context_returns_valid_context(fixture: &LlamaFixture<'_>) -> Result<()> { + let context = LlamaContext::from_model( + fixture.model, + fixture.backend, + 
(*fixture.context_params).into_llama_context_params(), + )?; + + assert!(context.n_ctx() > 0); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 4294967295, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 4294967295, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 4294967295, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 4294967295, + n_batch = 128, + n_ubatch = 64, + )] + fn new_context_with_huge_ctx_returns_null_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let result = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + ); + + assert!(result.is_err()); + Ok(()) + } +} + +mod context { + use std::ptr::NonNull; + use std::sync::Arc; + use std::sync::atomic::AtomicBool; + + use anyhow::Result; + use llama_cpp_bindings::DecodeError; + use llama_cpp_bindings::LogitsError; + use llama_cpp_bindings::context::LlamaContext; + use llama_cpp_bindings::llama_batch::LlamaBatch; + use llama_cpp_bindings::model::AddBos; + use llama_cpp_bindings::model::LlamaLoraAdapter; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + // ========================================================================================= + // Group A: default Qwen model, embeddings=false. Most context tests fall here. 
+ // ========================================================================================= + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + )] + fn context_creation_and_properties(fixture: &LlamaFixture<'_>) -> Result<()> { + let context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + assert!(context.n_ctx() > 0); + assert!(context.n_batch() > 0); + assert!(context.n_ubatch() > 0); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + )] + fn decode_and_get_logits(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + + let decode_result = context.decode(&mut batch); + assert!(decode_result.is_ok()); + + let logits = context.get_logits()?; + assert!(!logits.is_empty()); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + )] + fn timings_work(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + context.reset_timings(); + let timings = context.timings(); + assert!(timings.t_start_ms() >= 0.0); + + Ok(()) + } + + #[llama_test( + 
model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + )] + fn token_data_array_has_entries_after_decode(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + + let token_data_array = context.token_data_array()?; + + assert!(!token_data_array.data.is_empty()); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + )] + fn get_logits_ith_returns_valid_slice(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; + let last_index = i32::try_from(tokens.len() - 1)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + + let logits = context.get_logits_ith(last_index)?; + + assert_eq!(logits.len(), usize::try_from(fixture.model.n_vocab())?); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + )] + fn token_data_array_ith_returns_valid_data(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + 
(*fixture.context_params).into_llama_context_params(), + )?; + let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; + let last_index = i32::try_from(tokens.len() - 1)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + + let token_data_array = context.token_data_array_ith(last_index)?; + + assert_eq!( + token_data_array.data.len(), + usize::try_from(fixture.model.n_vocab())? + ); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + )] + fn embeddings_ith_returns_error_when_embeddings_disabled( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let result = context.embeddings_ith(0); + + assert!(result.is_err()); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + )] + fn embeddings_seq_ith_returns_error_when_embeddings_disabled( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let result = context.embeddings_seq_ith(0); + + assert!(result.is_err()); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + )] + fn candidates_returns_n_vocab_entries(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + 
fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + + let count = context.candidates()?.count(); + + assert_eq!(count, usize::try_from(fixture.model.n_vocab())?); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + )] + fn debug_format_contains_struct_name(fixture: &LlamaFixture<'_>) -> Result<()> { + let context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let debug_output = format!("{context:?}"); + + assert!(debug_output.contains("LlamaContext")); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + )] + fn candidates_ith_returns_n_vocab_entries(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; + let last_index = i32::try_from(tokens.len() - 1)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + + let count = context.candidates_ith(last_index)?.count(); + + assert_eq!(count, usize::try_from(fixture.model.n_vocab())?); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + 
n_ubatch = 512, + )] + fn lora_adapter_remove_succeeds_with_no_adapters(fixture: &LlamaFixture<'_>) -> Result<()> { + let context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let mut adapter = LlamaLoraAdapter { + lora_adapter: NonNull::dangling(), + }; + + let result = context.lora_adapter_remove(&mut adapter); + + assert!(result.is_ok()); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + )] + fn encode_on_non_encoder_model_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + + let result = context.encode(&mut batch); + + assert!(result.is_err()); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + )] + fn lora_adapter_set_with_dangling_pointer_succeeds_or_errors( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let mut adapter = LlamaLoraAdapter { + lora_adapter: NonNull::dangling(), + }; + + let result = context.lora_adapter_set(&mut adapter, 1.0); + + assert!(result.is_ok()); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + 
n_batch = 2048, + n_ubatch = 512, + embeddings = true, + )] + fn embeddings_seq_ith_returns_null_embedding_error_for_invalid_seq( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + + let result = context.embeddings_seq_ith(999); + + assert!(result.is_err()); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + )] + fn decode_empty_batch_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let mut batch = LlamaBatch::new(512, 1)?; + + let result = context.decode(&mut batch); + + assert!(result.is_err()); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + )] + fn set_abort_flag_aborts_decode(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let abort_flag = Arc::new(AtomicBool::new(true)); + context.set_abort_flag(abort_flag); + + let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + + let result = context.decode(&mut batch); + + assert_eq!(result, Err(DecodeError::Aborted)); + + Ok(()) + } + + 
#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + )] + fn set_abort_flag_false_allows_decode(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let abort_flag = Arc::new(AtomicBool::new(false)); + context.set_abort_flag(abort_flag); + + let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + + let result = context.decode(&mut batch); + + assert!(result.is_ok()); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + )] + fn clear_abort_callback_allows_decode_with_flag_true(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let abort_flag = Arc::new(AtomicBool::new(true)); + context.set_abort_flag(abort_flag); + context.clear_abort_callback(); + + let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + + let result = context.decode(&mut batch); + + assert!(result.is_ok()); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + )] + fn synchronize_completes_without_panic(fixture: &LlamaFixture<'_>) -> Result<()> { + let context = LlamaContext::from_model( + fixture.model, + 
fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + context.synchronize(); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + )] + fn detach_threadpool_completes_without_panic(fixture: &LlamaFixture<'_>) -> Result<()> { + let context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + context.detach_threadpool(); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + )] + fn get_logits_ith_returns_token_not_initialized_for_unknown_index( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let result = context.get_logits_ith(7); + + assert!(matches!(result, Err(LogitsError::TokenNotInitialized(7)))); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 64, + n_batch = 2048, + n_ubatch = 512, + )] + fn get_logits_ith_returns_token_index_exceeds_context_for_huge_index( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let huge_index = i32::try_from(context.n_ctx())?; + context.mark_logits_initialized(huge_index); + let result = context.get_logits_ith(huge_index); + + assert!(matches!( + result, + Err(LogitsError::TokenIndexExceedsContext { .. 
}) + )); + + Ok(()) + } +} + +mod context_kv_cache { + use std::num::NonZeroU8; + + use anyhow::Result; + use llama_cpp_bindings::context::LlamaContext; + use llama_cpp_bindings::context::kv_cache::KvCacheConversionError; + use llama_cpp_bindings::error::KvCacheSeqAddError; + use llama_cpp_bindings::error::KvCacheSeqDivError; + use llama_cpp_bindings::llama_batch::LlamaBatch; + use llama_cpp_bindings::model::AddBos; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + fn build_context<'context>( + fixture: &'context LlamaFixture<'_>, + ) -> Result> { + Ok(LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?) + } + + fn decode_hello_world( + fixture: &LlamaFixture<'_>, + context: &mut LlamaContext<'_>, + ) -> Result<()> { + let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn 
clear_kv_cache_resets_positions(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; + + decode_hello_world(fixture, &mut context)?; + + context.clear_kv_cache(); + assert_eq!(context.kv_cache_seq_pos_max(0), -1); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn kv_cache_seq_pos_max_is_non_negative_after_decode(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; + + decode_hello_world(fixture, &mut context)?; + + assert!(context.kv_cache_seq_pos_max(0) >= 0); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", 
"Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn clear_kv_cache_seq_with_range(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; + + decode_hello_world(fixture, &mut context)?; + + let result = context.clear_kv_cache_seq(Some(0), Some(0), Some(1)); + assert!(result.is_ok()); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn copy_kv_cache_seq_succeeds(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; + + decode_hello_world(fixture, &mut context)?; + + let result = context.copy_kv_cache_seq(0, 1, None, None); + assert!(result.is_ok()); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + 
n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn copy_cache_executes_without_crash(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; + + decode_hello_world(fixture, &mut context)?; + + let pos_max = context.kv_cache_seq_pos_max(0); + context.copy_cache(0, 1, pos_max + 1); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn kv_cache_seq_add_returns_error_for_mrope_model(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; + + decode_hello_world(fixture, &mut context)?; + + let result = context.kv_cache_seq_add(0, Some(0), None, 1); + + assert!(matches!( + result.unwrap_err(), + KvCacheSeqAddError::IncompatibleRopeType, + )); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + 
n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn kv_cache_seq_div_returns_error_for_mrope_model(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; + + decode_hello_world(fixture, &mut context)?; + + let divisor = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?; + let result = context.kv_cache_seq_div(0, Some(0), None, divisor); + + assert!(matches!( + result.unwrap_err(), + KvCacheSeqDivError::IncompatibleRopeType, + )); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn kv_cache_seq_keep_retains_specified_sequence(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; + + decode_hello_world(fixture, &mut context)?; + + context.kv_cache_seq_keep(0); + + assert!(context.kv_cache_seq_pos_max(0) >= 0); + + 
Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn copy_kv_cache_seq_with_explicit_range(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; + + decode_hello_world(fixture, &mut context)?; + + let result = context.copy_kv_cache_seq(0, 2, Some(0), Some(1)); + + assert!(result.is_ok()); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = 
HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn kv_cache_seq_pos_max_returns_negative_one_for_unused_seq( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let context = build_context(fixture)?; + + let result = context.kv_cache_seq_pos_max(999); + + assert_eq!(result, -1); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn copy_kv_cache_seq_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; + + let result = context.copy_kv_cache_seq(0, 1, Some(u32::MAX), None); + + assert!(matches!( + result.unwrap_err(), + KvCacheConversionError::P0TooLarge(_), + )); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = 
HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn copy_kv_cache_seq_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; + + let result = context.copy_kv_cache_seq(0, 1, Some(0), Some(u32::MAX)); + + assert!(matches!( + result.unwrap_err(), + KvCacheConversionError::P1TooLarge(_), + )); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn clear_kv_cache_seq_rejects_src_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut 
context = build_context(fixture)?; + + let result = context.clear_kv_cache_seq(Some(u32::MAX), None, None); + + assert!(matches!( + result.unwrap_err(), + KvCacheConversionError::SeqIdTooLarge(_), + )); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn clear_kv_cache_seq_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; + + let result = context.clear_kv_cache_seq(Some(0), Some(u32::MAX), None); + + assert!(matches!( + result.unwrap_err(), + KvCacheConversionError::P0TooLarge(_), + )); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = 
HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn clear_kv_cache_seq_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; + + let result = context.clear_kv_cache_seq(Some(0), Some(0), Some(u32::MAX)); + + assert!(matches!( + result.unwrap_err(), + KvCacheConversionError::P1TooLarge(_), + )); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn kv_cache_seq_add_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; + + let result = context.kv_cache_seq_add(0, Some(u32::MAX), None, 1); + + assert!(matches!( + result.unwrap_err(), + KvCacheSeqAddError::P0TooLarge(_), + )); + + Ok(()) + } + + 
#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn kv_cache_seq_add_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; + + let result = context.kv_cache_seq_add(0, Some(0), Some(u32::MAX), 1); + + assert!(matches!( + result.unwrap_err(), + KvCacheSeqAddError::P1TooLarge(_), + )); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = 
HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn kv_cache_seq_div_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; + + let divisor = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?; + let result = context.kv_cache_seq_div(0, Some(u32::MAX), None, divisor); + + assert!(matches!( + result.unwrap_err(), + KvCacheSeqDivError::P0TooLarge(_), + )); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn kv_cache_seq_div_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; + + let divisor = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?; + let result = context.kv_cache_seq_div(0, Some(0), Some(u32::MAX), divisor); + + assert!(matches!( + result.unwrap_err(), + KvCacheSeqDivError::P1TooLarge(_), + )); + + Ok(()) + } +} + +mod context_session { + use anyhow::Result; + use 
llama_cpp_bindings::context::LlamaContext; + use llama_cpp_bindings::llama_batch::LlamaBatch; + use llama_cpp_bindings::model::AddBos; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + fn build_context<'context>( + fixture: &'context LlamaFixture<'_>, + ) -> Result> { + Ok(LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn save_and_load_session_file(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; + + let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + + let session_path = std::env::temp_dir().join("llama_test_session.bin"); + context.state_save_file(&session_path, &tokens)?; + + let loaded_tokens = context.state_load_file(&session_path, 512)?; + assert_eq!(loaded_tokens, tokens); + + std::fs::remove_file(&session_path)?; + + Ok(()) + } + + #[llama_test( + 
model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn get_state_size_is_positive(fixture: &LlamaFixture<'_>) -> Result<()> { + let context = build_context(fixture)?; + + assert!(context.get_state_size() > 0); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, 
+ )] + fn state_seq_save_and_load_file_roundtrip(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; + + let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + + let session_path = std::env::temp_dir().join("llama_test_seq_state.bin"); + let bytes_written = context.state_seq_save_file(&session_path, 0, &tokens)?; + assert!(bytes_written > 0); + + let (loaded_tokens, bytes_read) = context.state_seq_load_file(&session_path, 0, 512)?; + assert_eq!(loaded_tokens, tokens); + assert!(bytes_read > 0); + + std::fs::remove_file(&session_path)?; + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn copy_state_data_and_set_state_data_roundtrip(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; + + let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + + let 
state_size = context.get_state_size(); + let mut state_data = vec![0u8; state_size]; + let bytes_copied = unsafe { context.copy_state_data(&mut state_data) }; + assert!(bytes_copied > 0); + + let bytes_read = unsafe { context.set_state_data(&state_data) }; + assert!(bytes_read > 0); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn state_load_file_with_nonexistent_file_returns_error( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let mut context = build_context(fixture)?; + + let result = context.state_load_file("/nonexistent/session.bin", 512); + + assert!(result.is_err()); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + 
model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn state_seq_load_file_with_nonexistent_file_returns_error( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let mut context = build_context(fixture)?; + + let result = context.state_seq_load_file("/nonexistent/seq_state.bin", 0, 512); + + assert!(result.is_err()); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn state_save_file_to_invalid_directory_returns_failed_to_save( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let context = build_context(fixture)?; + + let result = context.state_save_file("/nonexistent_dir/session.bin", &[]); + + assert!(result.is_err()); + + Ok(()) + } + + #[llama_test( + model_source = 
HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn state_seq_save_file_to_invalid_directory_returns_failed_to_save( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let context = build_context(fixture)?; + + let result = context.state_seq_save_file("/nonexistent_dir/seq_state.bin", 0, &[]); + + assert!(result.is_err()); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + 
n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn state_load_file_with_zero_max_tokens_returns_error( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let mut context = build_context(fixture)?; + + let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + + let session_path = std::env::temp_dir().join("llama_test_session_zero_max.bin"); + context.state_save_file(&session_path, &tokens)?; + + let result = context.state_load_file(&session_path, 0); + + assert!(result.is_err()); + let _ = std::fs::remove_file(&session_path); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn state_seq_load_file_with_zero_max_tokens_returns_error( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let mut context = build_context(fixture)?; + + let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + 
context.decode(&mut batch)?; + + let session_path = std::env::temp_dir().join("llama_test_seq_state_zero_max.bin"); + context.state_seq_save_file(&session_path, 0, &tokens)?; + + let result = context.state_seq_load_file(&session_path, 0, 0); + + assert!(result.is_err()); + let _ = std::fs::remove_file(&session_path); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn state_load_file_with_insufficient_max_tokens_returns_length_error( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let mut context = build_context(fixture)?; + + let tokens = fixture.model.str_to_token( + "Hello world this is a longer string for more tokens", + AddBos::Always, + )?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + + let session_path = std::env::temp_dir().join("llama_test_session_insuf.bin"); + context.state_save_file(&session_path, &tokens)?; + + let result = context.state_load_file(&session_path, 1); + + assert!(result.is_err()); + let _ = std::fs::remove_file(&session_path); + + Ok(()) + } + + #[llama_test( + model_source = 
HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn state_seq_load_file_with_insufficient_max_tokens_returns_length_error( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let mut context = build_context(fixture)?; + + let tokens = fixture.model.str_to_token( + "Hello world this is a longer string for more tokens", + AddBos::Always, + )?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + + let session_path = std::env::temp_dir().join("llama_test_seq_state_insuf.bin"); + context.state_seq_save_file(&session_path, 0, &tokens)?; + + let result = context.state_seq_load_file(&session_path, 0, 1); + + assert!(result.is_err()); + let _ = std::fs::remove_file(&session_path); + + Ok(()) + } + + #[cfg(unix)] + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 
999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn state_save_file_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + use std::ffi::OsStr; + use std::os::unix::ffi::OsStrExt; + + let context = build_context(fixture)?; + + let non_utf8_path = std::path::Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.bin")); + let result = context.state_save_file(non_utf8_path, &[]); + + assert!(result.is_err()); + + Ok(()) + } + + #[cfg(unix)] + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn state_load_file_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + use 
std::ffi::OsStr; + use std::os::unix::ffi::OsStrExt; + + let mut context = build_context(fixture)?; + + let non_utf8_path = std::path::Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.bin")); + let result = context.state_load_file(non_utf8_path, 512); + + assert!(result.is_err()); + + Ok(()) + } + + #[cfg(unix)] + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn state_seq_save_file_with_non_utf8_path_returns_error( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + use std::ffi::OsStr; + use std::os::unix::ffi::OsStrExt; + + let context = build_context(fixture)?; + + let non_utf8_path = std::path::Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.bin")); + let result = context.state_seq_save_file(non_utf8_path, 0, &[]); + + assert!(result.is_err()); + + Ok(()) + } + + #[cfg(unix)] + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", 
"GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn state_seq_load_file_with_non_utf8_path_returns_error( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + use std::ffi::OsStr; + use std::os::unix::ffi::OsStrExt; + + let mut context = build_context(fixture)?; + + let non_utf8_path = std::path::Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.bin")); + let result = context.state_seq_load_file(non_utf8_path, 0, 512); + + assert!(result.is_err()); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn state_save_file_with_null_byte_in_path_returns_error( + 
fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let context = build_context(fixture)?; + + let path_with_null = std::path::Path::new("/tmp/foo\0bar.bin"); + let result = context.state_save_file(path_with_null, &[]); + + assert!(result.is_err()); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn state_load_file_with_null_byte_in_path_returns_error( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let mut context = build_context(fixture)?; + + let path_with_null = std::path::Path::new("/tmp/foo\0bar.bin"); + let result = context.state_load_file(path_with_null, 512); + + assert!(result.is_err()); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + 
#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn state_seq_save_file_with_null_byte_in_path_returns_error( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let context = build_context(fixture)?; + + let path_with_null = std::path::Path::new("/tmp/foo\0bar.bin"); + let result = context.state_seq_save_file(path_with_null, 0, &[]); + + assert!(result.is_err()); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn state_seq_load_file_with_null_byte_in_path_returns_error( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let mut context = build_context(fixture)?; + + let path_with_null = std::path::Path::new("/tmp/foo\0bar.bin"); + let result = context.state_seq_load_file(path_with_null, 0, 
512); + + assert!(result.is_err()); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn state_seq_get_size_ext_returns_size_for_decoded_sequence( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + use llama_cpp_bindings::context::llama_state_seq_flags::LlamaStateSeqFlags; + + let mut context = build_context(fixture)?; + + let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + + let flags = LlamaStateSeqFlags::empty(); + let size = context.state_seq_get_size_ext(0, &flags); + + assert!(size > 0); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, 
+ n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn state_seq_get_data_ext_and_set_data_ext_round_trip( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + use llama_cpp_bindings::context::llama_state_seq_flags::LlamaStateSeqFlags; + + let mut context = build_context(fixture)?; + + let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + + let flags = LlamaStateSeqFlags::empty(); + let size = context.state_seq_get_size_ext(0, &flags); + let mut buffer = vec![0u8; size]; + let bytes_written = unsafe { context.state_seq_get_data_ext(&mut buffer, 0, &flags) }; + + assert!(bytes_written > 0); + + let bytes_read = unsafe { context.state_seq_set_data_ext(&buffer, 0, &flags) }; + + assert!(bytes_read > 0); + + Ok(()) + } +} + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/llguidance.rs b/llama-cpp-bindings-tests/tests/llguidance.rs deleted file mode 100644 index 74bd229a..00000000 --- a/llama-cpp-bindings-tests/tests/llguidance.rs +++ /dev/null @@ -1,686 +0,0 @@ -#![expect( - clippy::unnecessary_wraps, - reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" -)] - -use std::ffi::CStr; -use std::sync::Arc; - -use anyhow::Result; -use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::llama_batch::LlamaBatch; -use llama_cpp_bindings::llguidance_sampler::create_llg_sampler; -use llama_cpp_bindings::model::AddBos; -use 
llama_cpp_bindings::sampling::LlamaSampler; -use llama_cpp_bindings::token::LlamaToken; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -const JSON_SCHEMA: &str = - r#"{"type":"object","properties":{"answer":{"type":"string"}},"required":["answer"]}"#; -const REGEX_GRAMMAR: &str = r"yes|no"; -const LARK_GRAMMAR: &str = r#"start: "yes" | "no""#; - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn creates_sampler_with_valid_json_schema(fixture: &LlamaFixture<'_>) -> Result<()> { - let sampler = create_llg_sampler(fixture.model, "json", JSON_SCHEMA)?; - - assert!(!sampler.sampler.is_null()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - 
n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn creates_sampler_with_valid_regex_grammar(fixture: &LlamaFixture<'_>) -> Result<()> { - let sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?; - - assert!(!sampler.sampler.is_null()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn creates_sampler_with_valid_lark_grammar(fixture: &LlamaFixture<'_>) -> Result<()> { - let sampler = create_llg_sampler(fixture.model, "lark", LARK_GRAMMAR)?; - - assert!(!sampler.sampler.is_null()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - 
use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn returns_error_for_unknown_grammar_kind(fixture: &LlamaFixture<'_>) -> Result<()> { - let result = create_llg_sampler(fixture.model, "not_a_real_kind", "anything"); - - assert!(result.is_err()); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn returns_error_for_malformed_json_schema(fixture: &LlamaFixture<'_>) -> Result<()> { - let result = 
create_llg_sampler(fixture.model, "json", "{this is not valid json"); - - assert!(result.is_err()); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn returns_error_for_malformed_regex(fixture: &LlamaFixture<'_>) -> Result<()> { - let result = create_llg_sampler(fixture.model, "regex", "[invalid"); - - assert!(result.is_err()); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", 
"Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn name_callback_returns_llguidance(fixture: &LlamaFixture<'_>) -> Result<()> { - let sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?; - - let name_ptr = unsafe { llama_cpp_bindings_sys::llama_sampler_name(sampler.sampler) }; - assert!(!name_ptr.is_null()); - let name = unsafe { CStr::from_ptr(name_ptr) }.to_str()?; - - assert_eq!(name, "llguidance"); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn clone_via_ffi_creates_independent_sampler(fixture: &LlamaFixture<'_>) -> Result<()> { - let sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?; - - let cloned = unsafe { llama_cpp_bindings_sys::llama_sampler_clone(sampler.sampler) }; - - assert!(!cloned.is_null()); - - unsafe { llama_cpp_bindings_sys::llama_sampler_free(cloned) }; - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = 
true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn samples_token_constrained_by_grammar(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - let mut context = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let prompt = "Answer yes or no:"; - let tokens = model.str_to_token(prompt, AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - - let llg_sampler = create_llg_sampler(model, "regex", REGEX_GRAMMAR)?; - let mut chain = LlamaSampler::chain_simple([llg_sampler, LlamaSampler::greedy()]); - - let token = chain.sample(&context, batch.n_tokens() - 1)?; - chain.accept(token)?; - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source 
= HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn accept_invalid_token_id_does_not_panic(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?; - - let huge_token = LlamaToken(i32::MAX - 1); - let _ = sampler.accept(huge_token); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn approximate_tok_env_returns_same_arc_across_calls(fixture: &LlamaFixture<'_>) -> Result<()> { - let first = fixture.model.approximate_tok_env(); - let second = fixture.model.approximate_tok_env(); - - assert!(Arc::ptr_eq(&first, &second)); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - 
n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn approximate_tok_env_drives_consistent_grammar_constraint( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let first = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?; - let second = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?; - - assert!(!first.sampler.is_null()); - assert!(!second.sampler.is_null()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - 
n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn apply_through_chain_during_sample_does_not_panic(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - let mut context = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let tokens = model.str_to_token("Answer:", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - - let llg_sampler = create_llg_sampler(model, "regex", REGEX_GRAMMAR)?; - let mut chain = LlamaSampler::chain_simple([llg_sampler, LlamaSampler::greedy()]); - let _ = chain.sample(&context, batch.n_tokens() - 1); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn reset_clears_sampler_state(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?; - let huge_token = LlamaToken(i32::MAX - 1); - let _ = sampler.accept(huge_token); - sampler.reset(); - let after = sampler.accept(LlamaToken(0)); - assert!( - after.is_ok() || 
after.is_err(), - "after reset, sampler.accept must return Ok or Err (not panic)" - ); - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/mistral3_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs b/llama-cpp-bindings-tests/tests/mistral3_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs deleted file mode 100644 index 6ae1d9cd..00000000 --- a/llama-cpp-bindings-tests/tests/mistral3_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs +++ /dev/null @@ -1,81 +0,0 @@ -use anyhow::Result; -use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::llama_batch::LlamaBatch; -use llama_cpp_bindings::model::AddBos; -use llama_cpp_bindings::sampling::LlamaSampler; -use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -const MAX_GENERATED_TOKENS: i32 = 200; - -const MISTRAL3_THINKING_DISABLED_PROMPT: &str = "\ -[INST]Reply with the single word: four. 
Do not explain.[/INST][THINK][/THINK]"; - -const FORBIDDEN_MARKERS: &[&str] = &["[THINK]", "[/THINK]"]; - -#[llama_test( - model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 8192, - n_batch = 2048, - n_ubatch = 512, -)] -fn mistral3_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - - let mut classifier = model.sampled_token_classifier(); - let prompt_tokens = model.str_to_token(MISTRAL3_THINKING_DISABLED_PROMPT, AddBos::Always)?; - let prompt_token_count = u64::try_from(prompt_tokens.len())?; - - let mut batch = LlamaBatch::new(2048, 1)?; - classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; - - let mut context = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - context.decode(&mut batch)?; - - let promoted = classifier.commit_prompt_tokens(); - assert_eq!(promoted, prompt_token_count); - - let mut sampler = LlamaSampler::greedy(); - let initial_position = batch.n_tokens(); - let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut context, - batch: &mut batch, - initial_position, - max_generated_tokens: MAX_GENERATED_TOKENS, - } - .run()?; - - let usage = classifier.usage(); - - assert!(!outcome.generated_raw.is_empty()); - assert_eq!(outcome.observed_reasoning, 0); - assert_eq!(outcome.observed_undeterminable, 0); - assert_eq!(usage.reasoning_tokens, 0); - assert_eq!(usage.undeterminable_tokens, 0); - assert!(outcome.observed_content > 0); - assert_eq!(usage.completion_tokens(), outcome.observed_content); - - for forbidden in FORBIDDEN_MARKERS { - assert!(!outcome.content_stream.contains(forbidden)); - } - - Ok(()) -} - -llama_tests_main!(); diff 
--git a/llama-cpp-bindings-tests/tests/mistral3_classifier_emits_reasoning.rs b/llama-cpp-bindings-tests/tests/mistral3_classifier_emits_reasoning.rs deleted file mode 100644 index 296ad348..00000000 --- a/llama-cpp-bindings-tests/tests/mistral3_classifier_emits_reasoning.rs +++ /dev/null @@ -1,101 +0,0 @@ -use anyhow::Result; -use anyhow::bail; -use llama_cpp_bindings::ChatMessageParseOutcome; -use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::llama_batch::LlamaBatch; -use llama_cpp_bindings::model::AddBos; -use llama_cpp_bindings::sampling::LlamaSampler; -use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -const MAX_GENERATED_TOKENS: i32 = 768; - -const MISTRAL3_THINKING_PROMPT: &str = "\ -[SYSTEM_PROMPT]# HOW YOU SHOULD THINK AND ANSWER\n\n\ -First draft your thinking process (inner monologue) until you arrive at a response. \ -Format your response using Markdown, and use LaTeX for any mathematical equations. \ -Write both your thoughts and the response in the same language as the input.\n\n\ -Your thinking process must follow the template below:\ -[THINK]Your thoughts or/and draft, like working through an exercise on scratch paper. \ -Be as casual and as long as you want until you are confident to generate the response \ -to the user.[/THINK]Here, provide a self-contained response.[/SYSTEM_PROMPT]\ -[INST]Reply with the single word: four. 
Do not explain.[/INST]"; - -const FORBIDDEN_MARKERS: &[&str] = &["[THINK]", "[/THINK]"]; - -#[llama_test( - model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 8192, - n_batch = 2048, - n_ubatch = 512, -)] -fn mistral3_classifier_emits_reasoning_for_thinking_prompt( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - - let mut classifier = model.sampled_token_classifier(); - let prompt_tokens = model.str_to_token(MISTRAL3_THINKING_PROMPT, AddBos::Always)?; - let prompt_token_count = u64::try_from(prompt_tokens.len())?; - - let mut batch = LlamaBatch::new(2048, 1)?; - classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; - - let mut context = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - context.decode(&mut batch)?; - - let promoted = classifier.commit_prompt_tokens(); - assert_eq!(promoted, prompt_token_count); - - let mut sampler = LlamaSampler::greedy(); - let initial_position = batch.n_tokens(); - let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut context, - batch: &mut batch, - initial_position, - max_generated_tokens: MAX_GENERATED_TOKENS, - } - .run()?; - - let usage = classifier.usage(); - let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?; - let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else { - bail!("Mistral 3 chat template must be recognised by the parser; got Unrecognized"); - }; - - assert!(!outcome.generated_raw.is_empty()); - assert!(outcome.observed_reasoning > 0); - assert!(usage.reasoning_tokens > 0); - assert_eq!(outcome.observed_undeterminable, 0); - assert_eq!(usage.undeterminable_tokens, 0); - assert_eq!( - usage.completion_tokens(), - 
outcome.observed_content + outcome.observed_reasoning, - ); - assert!(!parsed.reasoning_content.is_empty()); - assert_eq!(outcome.reasoning_stream, parsed.reasoning_content); - assert_eq!(outcome.content_stream, parsed.content); - - for forbidden in FORBIDDEN_MARKERS { - assert!(!outcome.reasoning_stream.contains(forbidden)); - assert!(!outcome.content_stream.contains(forbidden)); - } - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/mistral3_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs b/llama-cpp-bindings-tests/tests/mistral3_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs deleted file mode 100644 index abb5c39f..00000000 --- a/llama-cpp-bindings-tests/tests/mistral3_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs +++ /dev/null @@ -1,101 +0,0 @@ -use anyhow::Result; -use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::llama_batch::LlamaBatch; -use llama_cpp_bindings::mtmd::MtmdBitmap; -use llama_cpp_bindings::mtmd::MtmdInputText; -use llama_cpp_bindings::mtmd::mtmd_default_marker; -use llama_cpp_bindings::sampling::LlamaSampler; -use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; -use llama_cpp_bindings_tests::test_model::fixtures_dir; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -const MAX_GENERATED_TOKENS: i32 = 768; - -#[llama_test( - model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 4096, - n_batch = 512, - n_ubatch = 512, - mmproj_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "mmproj-F16.gguf"), -)] -fn mistral3_classifier_emits_reasoning_for_multimodal_thinking_prompt( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - let mtmd_ctx = 
fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - - let mut context = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let image_path = fixtures_dir().join("llamas.jpg"); - let image_path_str = image_path - .to_str() - .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?; - let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?; - - let marker = mtmd_default_marker(); - let prompt = format!( - "[SYSTEM_PROMPT]# HOW YOU SHOULD THINK AND ANSWER\n\n\ - First draft your thinking process (inner monologue) until you arrive at a response. \ - Format your response using Markdown, and use LaTeX for any mathematical equations. \ - Write both your thoughts and the response in the same language as the input.\n\n\ - Your thinking process must follow the template below:\ - [THINK]Your thoughts or/and draft, like working through an exercise on scratch paper. \ - Be as casual and as long as you want until you are confident to generate the response \ - to the user.[/THINK]Here, provide a self-contained response.[/SYSTEM_PROMPT]\ - [INST]{marker}What animals do you see in this image?[/INST]" - ); - - let input_text = MtmdInputText { - text: prompt, - add_special: true, - parse_special: true, - }; - - let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; - - let mut classifier = model.sampled_token_classifier(); - let n_past = classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?; - - let mut sampler = LlamaSampler::greedy(); - let mut batch = LlamaBatch::new(2048, 1)?; - let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut context, - batch: &mut batch, - initial_position: n_past, - max_generated_tokens: MAX_GENERATED_TOKENS, - } - .run()?; - - let usage = classifier.usage(); - - if outcome.observed_reasoning == 0 { - anyhow::bail!( - "Mistral 3 multimodal + thinking: classifier must 
emit at least one Reasoning token \ - when the model opens a `[THINK]` block; outcome={outcome:?}" - ); - } - if usage.reasoning_tokens == 0 { - anyhow::bail!( - "Mistral 3 multimodal + thinking: usage.reasoning_tokens must be non-zero; usage={usage:?}" - ); - } - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/mistral3_parses_tool_call_payload.rs b/llama-cpp-bindings-tests/tests/mistral3_parses_tool_call_payload.rs deleted file mode 100644 index b67e0765..00000000 --- a/llama-cpp-bindings-tests/tests/mistral3_parses_tool_call_payload.rs +++ /dev/null @@ -1,65 +0,0 @@ -use anyhow::Result; -use anyhow::bail; -use llama_cpp_bindings::ChatMessageParseOutcome; -use llama_cpp_bindings::ToolCallArguments; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -const TOOLS_JSON: &str = r#"[ - { - "type": "function", - "function": { - "name": "get_weather", - "description": "Get the current weather for a location", - "parameters": { - "type": "object", - "properties": { - "location": {"type": "string", "description": "The city name"} - }, - "required": ["location"] - } - } - } -]"#; - -const MISTRAL3_BRACKETED_JSON_PAYLOAD: &str = - r#"[TOOL_CALLS]get_weather[ARGS]{"location":"Paris"}"#; - -#[llama_test( - model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn mistral3_parses_tool_call_payload(fixture: &LlamaFixture<'_>) -> Result<()> { - let outcome = - fixture - .model - .parse_chat_message(TOOLS_JSON, MISTRAL3_BRACKETED_JSON_PAYLOAD, false)?; - - let ChatMessageParseOutcome::Recognized(parsed) = outcome else { - bail!( - "expected Recognized for Mistral 3 BracketedJson on a Mistral-3 model; got Unrecognized" - ); - }; - assert_eq!(parsed.tool_calls.len(), 1); - 
assert_eq!(parsed.tool_calls[0].name, "get_weather"); - let location = match &parsed.tool_calls[0].arguments { - ToolCallArguments::ValidJson(value) => value - .get("location") - .and_then(|v| v.as_str()) - .map(str::to_owned), - ToolCallArguments::InvalidJson(raw) => { - bail!("expected ValidJson, got InvalidJson: {raw}"); - } - }; - assert_eq!(location.as_deref(), Some("Paris")); - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/model_chat_template.rs b/llama-cpp-bindings-tests/tests/model_chat_template.rs deleted file mode 100644 index 88511471..00000000 --- a/llama-cpp-bindings-tests/tests/model_chat_template.rs +++ /dev/null @@ -1,194 +0,0 @@ -#![expect( - clippy::unnecessary_wraps, - reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" -)] - -use anyhow::Result; -use llama_cpp_bindings::ChatTemplateError; -use llama_cpp_bindings::model::LlamaChatMessage; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn 
chat_template_returns_non_empty(fixture: &LlamaFixture<'_>) -> Result<()> { - let template = fixture.model.chat_template(None); - assert!(template.is_ok()); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn apply_chat_template_produces_prompt(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let template = model.chat_template(None)?; - let message = LlamaChatMessage::new("user".to_string(), "hello".to_string())?; - let prompt = model.apply_chat_template(&template, &[message], true); - - assert!(prompt.is_ok()); - assert!(!prompt?.is_empty()); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = 
HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn apply_chat_template_buffer_resize_with_long_messages(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let template = model.chat_template(None)?; - let long_content = "a".repeat(2000); - let message = LlamaChatMessage::new("user".to_string(), long_content)?; - let prompt = model.apply_chat_template(&template, &[message], true); - - assert!(prompt.is_ok()); - assert!(!prompt?.is_empty()); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn chat_template_with_nonexistent_name_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { - let result = fixture - .model - .chat_template(Some("nonexistent_template_name_xyz")); - assert_eq!(result.unwrap_err(), 
ChatTemplateError::MissingTemplate); - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/model_context_creation.rs b/llama-cpp-bindings-tests/tests/model_context_creation.rs deleted file mode 100644 index 300027ec..00000000 --- a/llama-cpp-bindings-tests/tests/model_context_creation.rs +++ /dev/null @@ -1,106 +0,0 @@ -#![expect( - clippy::unnecessary_wraps, - reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" -)] - -use anyhow::Result; -use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 256, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 256, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 256, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 256, - n_batch = 128, - n_ubatch = 64, -)] -fn new_context_returns_valid_context(fixture: &LlamaFixture<'_>) -> Result<()> { - let context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - assert!(context.n_ctx() > 0); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", 
"DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 4294967295, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 4294967295, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 4294967295, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 4294967295, - n_batch = 128, - n_ubatch = 64, -)] -fn new_context_with_huge_ctx_returns_null_error(fixture: &LlamaFixture<'_>) -> Result<()> { - let result = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - ); - - assert!(result.is_err()); - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/model_helpers.rs b/llama-cpp-bindings-tests/tests/model_helpers.rs deleted file mode 100644 index 3efeae82..00000000 --- a/llama-cpp-bindings-tests/tests/model_helpers.rs +++ /dev/null @@ -1,103 +0,0 @@ -#![expect( - clippy::unnecessary_wraps, - reason = "every trial returns anyhow::Result<()> to match the LlamaTestFn signature" -)] - -use anyhow::Result; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 2048, - n_batch = 512, - n_ubatch = 128 -)] -fn debug_format_includes_struct_name_and_model_field(fixture: &LlamaFixture<'_>) -> Result<()> { 
- let formatted = format!("{:?}", fixture.model); - - assert!(formatted.contains("LlamaModel")); - assert!(formatted.contains("model")); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 2048, - n_batch = 512, - n_ubatch = 128 -)] -fn embedding_model_tool_call_markers_call_does_not_panic(fixture: &LlamaFixture<'_>) -> Result<()> { - let _markers = fixture.model.tool_call_markers(); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 2048, - n_batch = 512, - n_ubatch = 128 -)] -fn embedding_model_streaming_markers_returns_ok_for_a_model_without_tool_calls( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let _markers = fixture.model.streaming_markers()?; - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 2048, - n_batch = 512, - n_ubatch = 128 -)] -fn approximate_tok_env_is_cached_across_calls(fixture: &LlamaFixture<'_>) -> Result<()> { - let first = fixture.model.approximate_tok_env(); - let second = fixture.model.approximate_tok_env(); - - assert!(std::sync::Arc::ptr_eq(&first, &second)); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 2048, - n_batch = 512, - n_ubatch = 128 -)] -fn approximate_tok_env_falls_back_to_eos_when_eot_unavailable( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let env = fixture.model.approximate_tok_env(); - let env_again = fixture.model.approximate_tok_env(); - - assert!( - std::sync::Arc::ptr_eq(&env, &env_again), - "approximate_tok_env must return 
the same cached Arc for any model, including \ - the embedding model which lacks an EOT token (forcing the fallback-to-EOS path)" - ); - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/model_loading_errors.rs b/llama-cpp-bindings-tests/tests/model_loading_errors.rs index cd36eb46..6cf63144 100644 --- a/llama-cpp-bindings-tests/tests/model_loading_errors.rs +++ b/llama-cpp-bindings-tests/tests/model_loading_errors.rs @@ -4,8 +4,10 @@ )] use std::path::Path; +use std::path::PathBuf; use anyhow::Result; +use llama_cpp_bindings::LlamaLoraAdapterInitError; use llama_cpp_bindings::LlamaModelLoadError; use llama_cpp_bindings::model::LlamaModel; use llama_cpp_bindings::model::params::LlamaModelParams; @@ -169,4 +171,151 @@ fn load_model_with_non_utf8_path_returns_path_to_str_error( Ok(()) } +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn lora_adapter_init_with_invalid_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let result = fixture + .model + .lora_adapter_init("/nonexistent/path/lora.gguf"); + assert_eq!( + result.unwrap_err(), + 
LlamaLoraAdapterInitError::FileNotFound(PathBuf::from("/nonexistent/path/lora.gguf")) + ); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn lora_adapter_init_with_invalid_gguf_returns_unloadable( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let dummy_path = std::env::temp_dir().join("llama_test_dummy_lora.gguf"); + std::fs::write(&dummy_path, b"not a valid gguf")?; + + let result = fixture.model.lora_adapter_init(&dummy_path); + + assert_eq!(result.unwrap_err(), LlamaLoraAdapterInitError::Unloadable); + let _ = std::fs::remove_file(&dummy_path); + Ok(()) +} + +#[cfg(unix)] +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = 
HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn lora_adapter_init_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + use std::ffi::OsStr; + use std::os::unix::ffi::OsStrExt; + + let non_utf8_path = Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.gguf")); + let result = fixture.model.lora_adapter_init(non_utf8_path); + + assert_eq!( + result.unwrap_err(), + LlamaLoraAdapterInitError::PathToStrError(non_utf8_path.to_path_buf()) + ); + Ok(()) +} + llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/model_lora_adapter_errors.rs b/llama-cpp-bindings-tests/tests/model_lora_adapter_errors.rs deleted file mode 100644 index ae04dad8..00000000 --- a/llama-cpp-bindings-tests/tests/model_lora_adapter_errors.rs +++ /dev/null @@ -1,162 +0,0 @@ -#![expect( - clippy::unnecessary_wraps, - reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" -)] - -use std::path::PathBuf; - -use anyhow::Result; -use llama_cpp_bindings::LlamaLoraAdapterInitError; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] 
-#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn lora_adapter_init_with_invalid_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { - let result = fixture - .model - .lora_adapter_init("/nonexistent/path/lora.gguf"); - assert_eq!( - result.unwrap_err(), - LlamaLoraAdapterInitError::FileNotFound(PathBuf::from("/nonexistent/path/lora.gguf")) - ); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn lora_adapter_init_with_invalid_gguf_returns_unloadable( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let dummy_path = std::env::temp_dir().join("llama_test_dummy_lora.gguf"); - std::fs::write(&dummy_path, b"not a valid gguf")?; - - let result = fixture.model.lora_adapter_init(&dummy_path); - - 
assert_eq!(result.unwrap_err(), LlamaLoraAdapterInitError::Unloadable); - let _ = std::fs::remove_file(&dummy_path); - Ok(()) -} - -#[cfg(unix)] -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn lora_adapter_init_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { - use std::ffi::OsStr; - use std::os::unix::ffi::OsStrExt; - use std::path::Path; - - let non_utf8_path = Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.gguf")); - let result = fixture.model.lora_adapter_init(non_utf8_path); - - assert_eq!( - result.unwrap_err(), - LlamaLoraAdapterInitError::PathToStrError(non_utf8_path.to_path_buf()) - ); - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/model_metadata_kv.rs b/llama-cpp-bindings-tests/tests/model_metadata_kv.rs deleted file mode 100644 index 7d99b859..00000000 --- a/llama-cpp-bindings-tests/tests/model_metadata_kv.rs +++ /dev/null @@ -1,355 +0,0 @@ -#![expect( - clippy::unnecessary_wraps, - reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" -)] - -use anyhow::Result; -use 
llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn meta_count_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> { - assert!(fixture.model.meta_count() > 0); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - 
use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn meta_key_by_index_returns_valid_key(fixture: &LlamaFixture<'_>) -> Result<()> { - let key = fixture.model.meta_key_by_index(0)?; - assert!(!key.is_empty()); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn meta_val_str_by_index_returns_valid_value(fixture: &LlamaFixture<'_>) -> Result<()> { - let value = fixture.model.meta_val_str_by_index(0)?; - assert!(!value.is_empty()); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - 
n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn meta_key_by_index_out_of_range_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { - let result = fixture.model.meta_key_by_index(999_999); - assert!(result.is_err()); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn meta_val_str_by_index_out_of_range_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { - let result = fixture.model.meta_val_str_by_index(999_999); - assert!(result.is_err()); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - 
n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn meta_val_str_returns_value_for_known_key(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let first_key = model.meta_key_by_index(0)?; - let value = model.meta_val_str(&first_key)?; - assert!(!value.is_empty()); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn meta_val_str_with_long_value_triggers_buffer_resize(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let count = model.meta_count(); - - for index in 0..count { - let key = model.meta_key_by_index(index); - let value = model.meta_val_str_by_index(index); - assert!(key.is_ok()); - assert!(value.is_ok()); - } 
- Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn meta_val_str_with_null_byte_in_key_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { - let result = fixture.model.meta_val_str("key\0with_null"); - assert!(result.is_err()); - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/model_params.rs b/llama-cpp-bindings-tests/tests/model_params.rs deleted file mode 100644 index 6684625e..00000000 --- a/llama-cpp-bindings-tests/tests/model_params.rs +++ /dev/null @@ -1,78 +0,0 @@ -#![expect( - clippy::similar_names, - reason = "model_path_str and model_path_cstr are both genuinely needed; renaming would not improve clarity" -)] - -use std::ffi::CString; -use std::pin::pin; - -use anyhow::Result; -use llama_cpp_bindings::context::params::LlamaContextParams; -use llama_cpp_bindings::max_devices; -use llama_cpp_bindings::model::params::LlamaModelParams; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", 
"DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn fit_params_succeeds_with_test_model(fixture: &LlamaFixture<'_>) -> Result<()> { - let model_path_str = fixture - .model_path - .to_str() - .ok_or_else(|| anyhow::anyhow!("model path is not valid UTF-8"))?; - let model_path_cstr = CString::new(model_path_str)?; - - let mut params = pin!(LlamaModelParams::default()); - let mut context_params = LlamaContextParams::default(); - let mut margins = vec![0usize; max_devices()]; - - let result = params.as_mut().fit_params( - &model_path_cstr, - &mut context_params, - &mut margins, - 512, - llama_cpp_bindings_sys::GGML_LOG_LEVEL_NONE, - ); - - let fit = result.map_err(|fit_error| anyhow::anyhow!("fit_params failed: {fit_error:?}"))?; - assert!(fit.n_ctx > 0); - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/model_properties.rs b/llama-cpp-bindings-tests/tests/model_properties.rs deleted file mode 100644 index bd33ef6b..00000000 --- a/llama-cpp-bindings-tests/tests/model_properties.rs +++ /dev/null @@ -1,421 +0,0 @@ -#![expect( - clippy::unnecessary_wraps, - reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" -)] - -use anyhow::Result; 
-use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn model_loads_with_valid_metadata(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - - assert!(model.n_vocab() > 0); - assert!(model.n_embd() > 0); - assert!(model.n_params() > 0); - assert!(model.n_ctx_train()? 
> 0); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn n_layer_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> { - assert!(fixture.model.n_layer()? 
> 0); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn n_head_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> { - assert!(fixture.model.n_head()? 
> 0); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn n_head_kv_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> { - assert!(fixture.model.n_head_kv()? 
> 0); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn model_size_returns_nonzero(fixture: &LlamaFixture<'_>) -> Result<()> { - assert!(fixture.model.size() > 0); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn 
is_recurrent_returns_false_for_transformer(fixture: &LlamaFixture<'_>) -> Result<()> { - assert!(!fixture.model.is_recurrent()); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn is_hybrid_returns_false_for_non_hybrid_default_models(fixture: &LlamaFixture<'_>) -> Result<()> { - assert!( - !fixture.model.is_hybrid(), - "DeepSeek-R1-Distill-Llama-8B and GLM-4.7-Flash are pure transformers, not hybrid; got is_hybrid=true" - ); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn is_hybrid_returns_true_for_qwen_default_models(fixture: &LlamaFixture<'_>) -> Result<()> { - assert!( - fixture.model.is_hybrid(), - "Qwen 3.5 and Qwen 3.6 default GGUFs are reported as hybrid by llama.cpp; got is_hybrid=false" - ); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 
512, - n_batch = 128, - n_ubatch = 64, -)] -fn rope_type_returns_a_known_variant_for_rope_carrying_default_models( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - use llama_cpp_bindings::model::rope_type::RopeType; - let rope = fixture.model.rope_type(); - assert!( - matches!( - rope, - Some(RopeType::Norm | RopeType::NeoX | RopeType::MRope | RopeType::Vision) - ), - "rope_type must be a known variant for DeepSeek and GLM-4.7; got {rope:?}" - ); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn rope_type_returns_none_for_qwen_default_models(fixture: &LlamaFixture<'_>) -> Result<()> { - let rope = fixture.model.rope_type(); - assert!( - rope.is_none(), - "Qwen 3.5 and Qwen 3.6 default GGUFs do not expose a rope_type in their metadata; got {rope:?}" - ); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - 
n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn vocab_type_returns_a_known_variant(fixture: &LlamaFixture<'_>) -> Result<()> { - use llama_cpp_bindings::model::vocab_type::VocabType; - let vocab = fixture.model.vocab_type()?; - assert!( - matches!(vocab, VocabType::BPE | VocabType::SPM), - "vocab_type must be a known variant; got {vocab:?}" - ); - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/model_sampling.rs b/llama-cpp-bindings-tests/tests/model_sampling.rs deleted file mode 100644 index d6b40ba4..00000000 --- a/llama-cpp-bindings-tests/tests/model_sampling.rs +++ /dev/null @@ -1,452 +0,0 @@ -use anyhow::Result; -use llama_cpp_bindings::SampledToken; -use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::json_schema_to_grammar; -use llama_cpp_bindings::llama_batch::LlamaBatch; -use llama_cpp_bindings::model::AddBos; -use llama_cpp_bindings::sampling::LlamaSampler; -use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 256, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 256, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 256, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", 
"Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 256, - n_batch = 128, - n_ubatch = 64, -)] -fn sample_returns_result_and_succeeds_with_valid_index(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let mut context = LlamaContext::from_model( - model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let tokens = model.str_to_token("Hello", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - - batch.add_sequence(&tokens, 0, false)?; - - context.decode(&mut batch)?; - - let mut sampler = LlamaSampler::chain_simple([LlamaSampler::temp(0.8), LlamaSampler::greedy()]); - - let result = sampler.sample(&context, batch.n_tokens() - 1); - - assert!(result.is_ok()); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn grammar_sampler_constrains_output_to_yes_or_no(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let mut context = LlamaContext::from_model( - model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let prompt 
= "<|im_start|>user\nIs the sky blue? Answer yes or no.<|im_end|>\n<|im_start|>assistant\n\n\n\n\n"; - let tokens = model.str_to_token(prompt, AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - - batch.add_sequence(&tokens, 0, false)?; - - context.decode(&mut batch)?; - - let mut sampler = LlamaSampler::chain_simple([ - LlamaSampler::grammar(model, r"root ::= [Yy] [Ee] [Ss] | [Nn] [Oo]", "root")?, - LlamaSampler::temp(0.8), - LlamaSampler::greedy(), - ]); - - let mut classifier = model.sampled_token_classifier(); - let (raw_token, mut outcomes) = - classifier.sample(&mut sampler, &context, batch.n_tokens() - 1)?; - outcomes.extend(classifier.flush()); - - assert_eq!( - outcomes.len(), - 1, - "expected one finalised outcome after flush" - ); - let outcome = &outcomes[0]; - - let raw_as_sampled = SampledToken::Content(raw_token); - assert!( - !model.is_eog_token(&raw_as_sampled), - "Grammar sampler should not allow EOS as first token" - ); - - let piece = &outcome.raw_piece; - let first_char = piece - .chars() - .next() - .ok_or_else(|| anyhow::anyhow!("piece should have at least one character"))? 
- .to_lowercase() - .next() - .ok_or_else(|| anyhow::anyhow!("lowercase iterator should yield a character"))?; - - assert!( - first_char == 'y' || first_char == 'n', - "Grammar should constrain first token to start with y/n, got: '{piece}'" - ); - assert_eq!( - classifier.usage().completion_tokens(), - 1, - "exactly one completion token sampled" - ); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn json_schema_grammar_sampler_constrains_output_to_json(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let mut context = LlamaContext::from_model( - model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let prompt = "<|im_start|>user\nWhat is 2+2? 
Respond with a JSON object.<|im_end|>\n<|im_start|>assistant\n\n\n\n\n"; - let tokens = model.str_to_token(prompt, AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - - batch.add_sequence(&tokens, 0, false)?; - - context.decode(&mut batch)?; - - let grammar_str = json_schema_to_grammar( - r#"{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}"#, - )?; - - let mut sampler = LlamaSampler::chain_simple([ - LlamaSampler::grammar(model, &grammar_str, "root")?, - LlamaSampler::temp(0.8), - LlamaSampler::greedy(), - ]); - - let mut classifier = model.sampled_token_classifier(); - let (raw_token, mut outcomes) = - classifier.sample(&mut sampler, &context, batch.n_tokens() - 1)?; - outcomes.extend(classifier.flush()); - - assert_eq!( - outcomes.len(), - 1, - "expected one finalised outcome after flush" - ); - let outcome = &outcomes[0]; - - let raw_as_sampled = SampledToken::Content(raw_token); - assert!( - !model.is_eog_token(&raw_as_sampled), - "Grammar sampler should not allow EOS as first token" - ); - - let piece = &outcome.raw_piece; - - assert!( - piece.starts_with('{'), - "JSON schema grammar should constrain first token to start with '{{', got: '{piece}'" - ); - assert_eq!( - classifier.usage().completion_tokens(), - 1, - "exactly one completion token sampled" - ); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, 
- n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn sample_with_grammar_produces_constrained_output_in_loop( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let model = fixture.model; - let mut context = LlamaContext::from_model( - model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let prompt = "<|im_start|>user\nIs the sky blue? yes or no<|im_end|>\n<|im_start|>assistant\n\n\n\n\n"; - let tokens = model.str_to_token(prompt, AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - - let mut classifier = model.sampled_token_classifier(); - classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?; - - context.decode(&mut batch)?; - classifier.commit_prompt_tokens(); - - let mut sampler = LlamaSampler::chain_simple([ - LlamaSampler::grammar(model, r#"root ::= "yes" | "no""#, "root")?, - LlamaSampler::temp(0.8), - LlamaSampler::greedy(), - ]); - - let initial_position = batch.n_tokens(); - let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut context, - batch: &mut batch, - initial_position, - max_generated_tokens: 10, - } - .run()?; - - let lowercase = outcome.generated_raw.to_lowercase(); - assert!( - lowercase == "yes" || lowercase == "no", - "Grammar loop should produce 'yes' or 'no', got: '{}'", - outcome.generated_raw - ); - assert!( - outcome.eog_seen, - "loop must terminate via EOG once grammar accepts, not by exhausting the budget; outcome={outcome:?}" - ); - assert_eq!(outcome.observed_reasoning, 0); - assert_eq!(outcome.observed_undeterminable, 0); - assert_eq!(outcome.observed_tool_call, 0); - assert!(outcome.observed_content > 0); - - let usage = classifier.into_usage(); - assert_eq!(usage.completion_tokens(), 
outcome.observed_content); - assert_eq!(usage.reasoning_tokens, 0); - assert_eq!(usage.undeterminable_tokens, 0); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn sample_without_grammar_produces_multiple_tokens(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let mut context = LlamaContext::from_model( - model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let prompt = - "<|im_start|>user\nSay hello<|im_end|>\n<|im_start|>assistant\n\n\n\n\n"; - let tokens = model.str_to_token(prompt, AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - - batch.add_sequence(&tokens, 0, false)?; - - context.decode(&mut batch)?; - - let mut sampler = LlamaSampler::chain_simple([LlamaSampler::temp(0.8), LlamaSampler::greedy()]); - - let mut classifier = model.sampled_token_classifier(); - let mut sampled_count: u64 = 0; - - for (position, _) in (batch.n_tokens()..).zip(0..5) { - let (raw_token, _outcomes) = classifier.sample(&mut sampler, &context, -1)?; - let raw_as_sampled = SampledToken::Content(raw_token); - - if 
model.is_eog_token(&raw_as_sampled) { - break; - } - - sampled_count += 1; - - batch.clear(); - batch.add(&raw_as_sampled, position, &[0], true)?; - - context.decode(&mut batch)?; - } - - let _ = classifier.flush(); - - assert!( - sampled_count > 0, - "Should produce at least one token without grammar" - ); - let usage = classifier.into_usage(); - assert!( - usage.completion_tokens() >= sampled_count, - "completion_tokens ({}) must include the {sampled_count} non-EOG samples", - usage.completion_tokens() - ); - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/model_special_tokens.rs b/llama-cpp-bindings-tests/tests/model_special_tokens.rs deleted file mode 100644 index c719501b..00000000 --- a/llama-cpp-bindings-tests/tests/model_special_tokens.rs +++ /dev/null @@ -1,381 +0,0 @@ -#![expect( - clippy::unnecessary_wraps, - reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" -)] - -use anyhow::Result; -use llama_cpp_bindings::SampledToken; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, 
- use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn special_tokens_exist(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let bos = model.token_bos(); - let eos = model.token_eos(); - - assert_ne!(bos, eos); - assert!(model.is_eog_token(&SampledToken::Content(eos))); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn token_nl_returns_valid_token(fixture: &LlamaFixture<'_>) -> Result<()> { - let nl_token = fixture.model.token_nl(); - assert!(nl_token.0 >= 0); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, 
- use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn is_eog_token_classifies_reasoning_variant(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let eos = model.token_eos(); - assert!(model.is_eog_token(&SampledToken::Reasoning(eos))); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn is_eog_token_classifies_tool_call_variant(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let eos = model.token_eos(); - assert!(model.is_eog_token(&SampledToken::ToolCall(eos))); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = 
HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn is_eog_token_classifies_undeterminable_variant(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let eos = model.token_eos(); - assert!(model.is_eog_token(&SampledToken::Undeterminable(eos))); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn decode_start_token_returns_valid_token(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let token = model.decode_start_token(); - let n_vocab = model.n_vocab(); - assert!( - token.0 == -1 || 
(0..n_vocab).contains(&token.0), - "decode_start_token must be either -1 (no decoder-start defined) or a valid vocab index < {n_vocab}; got {token}" - ); - assert_eq!( - token, - model.decode_start_token(), - "decode_start_token must be deterministic across calls" - ); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn token_sep_returns_valid_token(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let token = model.token_sep(); - let n_vocab = model.n_vocab(); - assert!( - token.0 == -1 || (0..n_vocab).contains(&token.0), - "token_sep must be either -1 (no SEP token defined) or a valid vocab index < {n_vocab}; got {token}" - ); - assert_eq!( - token, - model.token_sep(), - "token_sep must be deterministic across calls" - ); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", 
"GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn token_attr_returns_attrs_for_bos(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let bos = model.token_bos(); - let attrs = model.token_attr(bos)?; - let bit_repr = format!("{:?}", *attrs); - assert!( - !bit_repr.is_empty(), - "token_attr(bos) must produce Debug output" - ); - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/model_str_to_token.rs b/llama-cpp-bindings-tests/tests/model_str_to_token.rs deleted file mode 100644 index ea8ebb9c..00000000 --- a/llama-cpp-bindings-tests/tests/model_str_to_token.rs +++ /dev/null @@ -1,210 +0,0 @@ -use anyhow::Result; -use llama_cpp_bindings::model::AddBos; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - 
use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn str_to_token_roundtrip(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let tokens = model.str_to_token("hello world", AddBos::Never)?; - assert!(!tokens.is_empty()); - let mut decoder = encoding_rs::UTF_8.new_decoder(); - let piece = model.token_to_piece( - &llama_cpp_bindings::SampledToken::Content(tokens[0]), - &mut decoder, - false, - None, - )?; - - assert!(!piece.is_empty()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn str_to_token_grows_buffer_when_initial_estimation_too_small( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let many_short_chars = "a b c d e f g h i j k l"; - let tokens = fixture - .model - .str_to_token(many_short_chars, AddBos::Always)?; - - assert!( - tokens.len() > 8, - "expected regrow; got {} tokens", - tokens.len() - ); - - Ok(()) -} - 
-#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn str_to_token_with_add_bos_never(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let tokens_with_bos = model.str_to_token("hello", AddBos::Always)?; - let tokens_without_bos = model.str_to_token("hello", AddBos::Never)?; - - assert!(tokens_with_bos.len() >= tokens_without_bos.len()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = 
HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn str_to_token_with_many_tokens_triggers_buffer_resize(fixture: &LlamaFixture<'_>) -> Result<()> { - use std::fmt::Write; - - let many_numbers = (0..2000).fold(String::new(), |mut accumulator, number| { - let _ = write!(accumulator, "{number} "); - accumulator - }); - - let tokens = fixture.model.str_to_token(&many_numbers, AddBos::Always)?; - - assert!(tokens.len() > many_numbers.len() / 2); - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/model_token_to_piece.rs b/llama-cpp-bindings-tests/tests/model_token_to_piece.rs deleted file mode 100644 index b86d391b..00000000 --- a/llama-cpp-bindings-tests/tests/model_token_to_piece.rs +++ /dev/null @@ -1,364 +0,0 @@ -#![expect( - clippy::unnecessary_wraps, - reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" -)] - -use std::num::NonZeroU16; - -use anyhow::Result; -use llama_cpp_bindings::SampledToken; -use llama_cpp_bindings::model::AddBos; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] 
-#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn token_to_piece_bytes_returns_bytes_for_known_token(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let tokens = model.str_to_token("hello", AddBos::Never)?; - let bytes = model.token_to_piece_bytes(tokens[0], 32, false, None)?; - - assert!(!bytes.is_empty()); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn token_to_piece_handles_large_token_requiring_buffer_resize( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let model = fixture.model; - let mut decoder = encoding_rs::UTF_8.new_decoder(); - - for (token, _) in model.tokens(true).take(200) { - let result = model.token_to_piece(&SampledToken::Content(token), &mut decoder, true, None); - assert!(result.is_ok()); - } - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = 
true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn token_to_piece_bytes_insufficient_buffer_returns_error( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let model = fixture.model; - let tokens = model.str_to_token("hello", AddBos::Never)?; - let result = model.token_to_piece_bytes(tokens[0], 1, false, None); - - assert!( - result - .unwrap_err() - .to_string() - .contains("Insufficient Buffer Space") - ); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - 
n_batch = 128, - n_ubatch = 64, -)] -fn token_to_piece_with_lstrip(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let mut decoder = encoding_rs::UTF_8.new_decoder(); - let tokens = model.str_to_token("hello", AddBos::Never)?; - let result = model.token_to_piece( - &SampledToken::Content(tokens[0]), - &mut decoder, - false, - NonZeroU16::new(1), - ); - - assert!(result.is_ok()); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn token_to_piece_decodes_reasoning_variant(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let mut decoder = encoding_rs::UTF_8.new_decoder(); - let tokens = model.str_to_token("hi", AddBos::Never)?; - - let piece = model.token_to_piece( - &SampledToken::Reasoning(tokens[0]), - &mut decoder, - true, - None, - )?; - - assert!(!piece.is_empty()); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - 
model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn token_to_piece_decodes_tool_call_variant(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let mut decoder = encoding_rs::UTF_8.new_decoder(); - let tokens = model.str_to_token("hi", AddBos::Never)?; - - let piece = - model.token_to_piece(&SampledToken::ToolCall(tokens[0]), &mut decoder, true, None)?; - - assert!(!piece.is_empty()); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn token_to_piece_decodes_undeterminable_variant(fixture: 
&LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let mut decoder = encoding_rs::UTF_8.new_decoder(); - let tokens = model.str_to_token("hi", AddBos::Never)?; - - let piece = model.token_to_piece( - &SampledToken::Undeterminable(tokens[0]), - &mut decoder, - true, - None, - )?; - - assert!(!piece.is_empty()); - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/model_tokens_iterator.rs b/llama-cpp-bindings-tests/tests/model_tokens_iterator.rs deleted file mode 100644 index 3f9ad9da..00000000 --- a/llama-cpp-bindings-tests/tests/model_tokens_iterator.rs +++ /dev/null @@ -1,109 +0,0 @@ -#![expect( - clippy::unnecessary_wraps, - reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" -)] - -use anyhow::Result; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn tokens_iterator_produces_valid_entries(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let mut count = 0; - - for 
(token, _piece_result) in model.tokens(false) { - assert!(token.0 >= 0); - count += 1; - - if count >= 100 { - break; - } - } - - assert_eq!(count, 100); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn n_vocab_matches_tokens_iterator_count(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let n_vocab = model.n_vocab(); - let count = model.tokens(false).count(); - - assert_eq!(count, usize::try_from(n_vocab)?); - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/mtmd_bitmap.rs b/llama-cpp-bindings-tests/tests/mtmd_bitmap.rs deleted file mode 100644 index 3c66f82f..00000000 --- a/llama-cpp-bindings-tests/tests/mtmd_bitmap.rs +++ /dev/null @@ -1,81 +0,0 @@ -#![expect( - clippy::unnecessary_wraps, - reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" -)] - -use anyhow::Result; -use llama_cpp_bindings::mtmd::MtmdBitmap; -use llama_cpp_bindings_tests::test_model; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -#[llama_test( 
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), -)] -fn from_buffer_creates_bitmap_from_image_bytes(fixture: &LlamaFixture<'_>) -> Result<()> { - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - - let fixtures = test_model::fixtures_dir(); - let image_path = fixtures.join("llamas.jpg"); - let image_bytes = std::fs::read(&image_path)?; - let bitmap = MtmdBitmap::from_buffer(mtmd_ctx, &image_bytes)?; - - assert!(bitmap.nx() > 0); - assert!(bitmap.ny() > 0); - assert!(!bitmap.is_audio()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), -)] -fn from_file_with_null_byte_in_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - let result = MtmdBitmap::from_file(mtmd_ctx, "path\0null"); - - assert!(result.is_err()); - - Ok(()) -} - 
-llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/mtmd_chunk_operations.rs b/llama-cpp-bindings-tests/tests/mtmd_chunk_operations.rs deleted file mode 100644 index 8a960774..00000000 --- a/llama-cpp-bindings-tests/tests/mtmd_chunk_operations.rs +++ /dev/null @@ -1,147 +0,0 @@ -use anyhow::Result; -use llama_cpp_bindings::mtmd::MtmdBitmap; -use llama_cpp_bindings::mtmd::MtmdInputChunkType; -use llama_cpp_bindings::mtmd::MtmdInputText; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), -)] -fn copy_creates_owned_duplicate(fixture: &LlamaFixture<'_>) -> Result<()> { - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - let image_data = vec![128u8; 64 * 64 * 3]; - let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; - let input_text = MtmdInputText { - text: "Hello <__media__>".to_string(), - add_special: true, - parse_special: true, - }; - let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; - let first_chunk = chunks - .get(0) - .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?; - let copied = first_chunk.copy()?; - - assert!(copied.owned); - assert_eq!(copied.n_tokens(), first_chunk.n_tokens()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - 
n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), -)] -fn encode_chunk_succeeds_for_image_chunk(fixture: &LlamaFixture<'_>) -> Result<()> { - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - let image_data = vec![128u8; 64 * 64 * 3]; - let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; - let input_text = MtmdInputText { - text: "Describe: <__media__>".to_string(), - add_special: true, - parse_special: true, - }; - let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; - - for chunk_index in 0..chunks.len() { - let chunk = chunks - .get(chunk_index) - .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?; - if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) { - let result = mtmd_ctx.encode_chunk(&chunk); - assert!(result.is_ok()); - return Ok(()); - } - } - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), -)] -fn decode_use_non_causal_returns_bool_for_image_chunk(fixture: &LlamaFixture<'_>) -> 
Result<()> { - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - let image_data = vec![128u8; 64 * 64 * 3]; - let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; - let input_text = MtmdInputText { - text: "Describe: <__media__>".to_string(), - add_special: true, - parse_special: true, - }; - let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; - for chunk_index in 0..chunks.len() { - let chunk = chunks - .get(chunk_index) - .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?; - if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) { - let value = mtmd_ctx.decode_use_non_causal(&chunk); - let printed = format!("{value:?}"); - assert!( - !printed.is_empty(), - "decode_use_non_causal must return a Debug-printable bool" - ); - return Ok(()); - } - } - anyhow::bail!("tokenization should produce at least one Image chunk"); -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/mtmd_chunk_structure.rs b/llama-cpp-bindings-tests/tests/mtmd_chunk_structure.rs deleted file mode 100644 index 1114af3c..00000000 --- a/llama-cpp-bindings-tests/tests/mtmd_chunk_structure.rs +++ /dev/null @@ -1,242 +0,0 @@ -use anyhow::Result; -use llama_cpp_bindings::mtmd::MtmdBitmap; -use llama_cpp_bindings::mtmd::MtmdInputChunkType; -use llama_cpp_bindings::mtmd::MtmdInputText; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -fn tokenize_synthetic( - fixture: &LlamaFixture<'_>, - prompt: &str, -) -> Result { - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - let image_data = vec![128u8; 64 * 64 * 3]; - let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; - let input_text = MtmdInputText { - text: prompt.to_owned(), - add_special: true, - parse_special: true, - }; - Ok(mtmd_ctx.tokenize(input_text, &[&bitmap])?) 
-} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), -)] -fn text_chunk_has_text_type(fixture: &LlamaFixture<'_>) -> Result<()> { - let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?; - let first_chunk = chunks - .get(0) - .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?; - assert_eq!(first_chunk.chunk_type()?, MtmdInputChunkType::Text); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), -)] -fn text_chunk_returns_text_tokens(fixture: &LlamaFixture<'_>) -> Result<()> { - let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?; - let first_chunk = chunks - .get(0) - .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?; - let tokens = first_chunk.text_tokens(); - assert!(tokens.is_some()); - assert!(!tokens.expect("tokens should be some").is_empty()); - Ok(()) -} - -#[llama_test( - model_source = 
HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), -)] -fn chunk_n_tokens_is_positive(fixture: &LlamaFixture<'_>) -> Result<()> { - let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?; - let first_chunk = chunks - .get(0) - .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?; - assert!(first_chunk.n_tokens() > 0); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), -)] -fn chunk_n_positions_is_positive(fixture: &LlamaFixture<'_>) -> Result<()> { - let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?; - let first_chunk = chunks - .get(0) - .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?; - assert!(first_chunk.n_positions() > 0); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - 
mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), -)] -fn text_chunk_id_returns_none(fixture: &LlamaFixture<'_>) -> Result<()> { - let chunks = tokenize_synthetic(fixture, "Hello <__media__>")?; - let first_chunk = chunks - .get(0) - .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?; - assert_eq!(first_chunk.chunk_type()?, MtmdInputChunkType::Text); - assert!(first_chunk.id().is_none()); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), -)] -fn image_chunk_returns_none_for_text_tokens(fixture: &LlamaFixture<'_>) -> Result<()> { - let chunks = tokenize_synthetic(fixture, "Hello <__media__>")?; - for chunk_index in 0..chunks.len() { - let chunk = chunks - .get(chunk_index) - .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?; - if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) { - assert!(chunk.text_tokens().is_none()); - return Ok(()); - } - } - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - 
n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), -)] -fn image_chunk_id_returns_some(fixture: &LlamaFixture<'_>) -> Result<()> { - let chunks = tokenize_synthetic(fixture, "Hello <__media__>")?; - for chunk_index in 0..chunks.len() { - let chunk = chunks - .get(chunk_index) - .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?; - if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) { - assert!(chunk.id().is_some()); - return Ok(()); - } - } - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/mtmd_context.rs b/llama-cpp-bindings-tests/tests/mtmd_context.rs deleted file mode 100644 index 8595eb2b..00000000 --- a/llama-cpp-bindings-tests/tests/mtmd_context.rs +++ /dev/null @@ -1,162 +0,0 @@ -#![expect( - clippy::unnecessary_wraps, - reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" -)] - -use anyhow::Result; -use llama_cpp_bindings::mtmd::MtmdContext; -use llama_cpp_bindings::mtmd::MtmdContextParams; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, 
- n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), -)] -fn init_and_supports_vision(fixture: &LlamaFixture<'_>) -> Result<()> { - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - assert!(mtmd_ctx.support_vision()); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), -)] -fn init_from_file_with_null_byte_in_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { - let mtmd_params = MtmdContextParams::default(); - let result = MtmdContext::init_from_file("path\0null", fixture.model, &mtmd_params); - - assert!(result.is_err()); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), -)] -fn decode_use_mrope_is_true_for_qwen_vision(fixture: &LlamaFixture<'_>) -> Result<()> { - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - assert!( - 
mtmd_ctx.decode_use_mrope(), - "Qwen 3.5 / 3.6 mmproj uses mrope; decode_use_mrope must return true" - ); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), -)] -fn support_audio_is_false_for_vision_only_mmproj(fixture: &LlamaFixture<'_>) -> Result<()> { - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - assert!( - !mtmd_ctx.support_audio(), - "Qwen 3.5 / 3.6 mmproj is vision-only; support_audio must return false" - ); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), -)] -fn get_audio_sample_rate_is_none_for_vision_only_mmproj(fixture: &LlamaFixture<'_>) -> Result<()> { - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - assert!( - mtmd_ctx.get_audio_sample_rate().is_none(), - "Qwen 3.5 / 3.6 mmproj has no audio; get_audio_sample_rate must return None" - ); - Ok(()) -} - 
-llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/mtmd_evaluation.rs b/llama-cpp-bindings-tests/tests/mtmd_evaluation.rs deleted file mode 100644 index b6f30f1c..00000000 --- a/llama-cpp-bindings-tests/tests/mtmd_evaluation.rs +++ /dev/null @@ -1,236 +0,0 @@ -#![expect( - clippy::unnecessary_wraps, - reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" -)] - -use anyhow::Result; -use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::mtmd::MtmdBitmap; -use llama_cpp_bindings::mtmd::MtmdEvalError; -use llama_cpp_bindings::mtmd::MtmdInputChunks; -use llama_cpp_bindings::mtmd::MtmdInputText; -use llama_cpp_bindings_tests::test_model; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -fn eval_synthetic_bitmap(fixture: &LlamaFixture<'_>, width: u32, height: u32) -> Result<()> { - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - let image_data = vec![128u8; (width as usize) * (height as usize) * 3]; - let bitmap = MtmdBitmap::from_image_data(width, height, &image_data)?; - let input_text = MtmdInputText { - text: "Describe: <__media__>".to_string(), - add_special: true, - parse_special: true, - }; - let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; - let n_positions = chunks.total_positions(); - let required_n_ctx = u32::try_from(n_positions + 256)?; - if fixture.context_params.n_ctx < required_n_ctx { - anyhow::bail!( - "fixture n_ctx ({}) below required ({}) for {}x{} image", - fixture.context_params.n_ctx, - required_n_ctx, - width, - height, - ); - } - - let llama_ctx = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let n_batch = i32::try_from(llama_ctx.n_batch())?; - chunks.eval_chunks(mtmd_ctx, &llama_ctx, 0, 0, n_batch, false)?; - Ok(()) -} - -#[llama_test( - model_source = 
HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 64, - n_batch = 64, - n_ubatch = 32, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 64, - n_batch = 64, - n_ubatch = 32, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), -)] -fn eval_chunks_returns_batch_size_exceeds_context_limit_for_huge_batch( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - let llama_ctx = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let chunks = MtmdInputChunks::new()?; - let huge_batch = i32::try_from(llama_ctx.n_batch() + 1)?; - - let result = chunks.eval_chunks(mtmd_ctx, &llama_ctx, 0, 0, huge_batch, false); - - assert!(matches!( - result, - Err(MtmdEvalError::BatchSizeExceedsContextLimit { .. 
}) - )); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 8192, - n_batch = 512, - n_ubatch = 512, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 8192, - n_batch = 512, - n_ubatch = 512, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), -)] -fn eval_chunks_with_standard_image(fixture: &LlamaFixture<'_>) -> Result<()> { - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - - let fixtures = test_model::fixtures_dir(); - let image_path = fixtures.join("llamas.jpg"); - let image_path_str = image_path - .to_str() - .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?; - let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?; - let input_text = MtmdInputText { - text: "What is in this image? 
<__media__>".to_string(), - add_special: true, - parse_special: true, - }; - let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; - let n_positions = chunks.total_positions(); - let required_n_ctx = u32::try_from(n_positions + 256)?; - assert!( - fixture.context_params.n_ctx >= required_n_ctx, - "fixture n_ctx ({}) below required ({}); update the attribute literal", - fixture.context_params.n_ctx, - required_n_ctx, - ); - - let llama_ctx = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let n_batch = i32::try_from(llama_ctx.n_batch())?; - let result = chunks.eval_chunks(mtmd_ctx, &llama_ctx, 0, 0, n_batch, false); - - assert!(result.is_ok()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 8192, - n_batch = 512, - n_ubatch = 512, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 8192, - n_batch = 512, - n_ubatch = 512, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), -)] -fn eval_chunks_with_varied_dimensions(fixture: &LlamaFixture<'_>) -> Result<()> { - let test_dimensions: [(u32, u32); 4] = [(224, 224), (512, 512), (100, 500), (337, 421)]; - - for (width, height) in test_dimensions { - let result = eval_synthetic_bitmap(fixture, width, height); - assert!( - result.is_ok(), - "dimension {width}x{height} should succeed: {result:?}" - ); - } - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 8192, - n_batch = 512, - n_ubatch = 512, - mmproj_source = 
HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 8192, - n_batch = 512, - n_ubatch = 512, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), -)] -fn eval_chunks_with_extreme_dimensions_does_not_crash(fixture: &LlamaFixture<'_>) -> Result<()> { - let extreme_dimensions: [(u32, u32); 6] = [ - (1, 1), - (7, 13), - (3, 1000), - (1000, 3), - (1920, 1080), - (4096, 4096), - ]; - - let mut any_reached_eval = false; - - for (width, height) in extreme_dimensions { - match eval_synthetic_bitmap(fixture, width, height) { - Ok(()) => any_reached_eval = true, - Err(error) => eprintln!(" {width}x{height} failed: {error}"), - } - } - - assert!( - any_reached_eval, - "at least one extreme dimension should reach eval_chunks" - ); - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/mtmd_tokenization.rs b/llama-cpp-bindings-tests/tests/mtmd_tokenization.rs deleted file mode 100644 index ae5f32c3..00000000 --- a/llama-cpp-bindings-tests/tests/mtmd_tokenization.rs +++ /dev/null @@ -1,121 +0,0 @@ -#![expect( - clippy::unnecessary_wraps, - reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" -)] - -use anyhow::Result; -use llama_cpp_bindings::mtmd::MtmdBitmap; -use llama_cpp_bindings::mtmd::MtmdInputText; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), -)] -#[llama_test( - model_source = 
HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), -)] -fn tokenize_text_with_image(fixture: &LlamaFixture<'_>) -> Result<()> { - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - let image_data = vec![128u8; 64 * 64 * 3]; - let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; - let input_text = MtmdInputText { - text: "Describe this image: <__media__>".to_string(), - add_special: true, - parse_special: true, - }; - let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; - - assert!(!chunks.is_empty()); - assert!(chunks.total_tokens() > 0); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), -)] -fn tokenize_bitmap_count_mismatch_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - let input_text = MtmdInputText { - text: "No media markers here".to_string(), - add_special: true, - parse_special: true, - }; - let image_data = vec![128u8; 64 * 64 * 3]; - let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; - let result = mtmd_ctx.tokenize(input_text, &[&bitmap]); - assert!(result.is_err()); - Ok(()) -} - -#[llama_test( - model_source = 
HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), -)] -fn tokenize_with_null_byte_in_text_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - let input_text = MtmdInputText { - text: "text\0null".to_string(), - add_special: true, - parse_special: true, - }; - let result = mtmd_ctx.tokenize(input_text, &[]); - assert!(result.is_err()); - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/multimodal.rs b/llama-cpp-bindings-tests/tests/multimodal.rs deleted file mode 100644 index c1108c4d..00000000 --- a/llama-cpp-bindings-tests/tests/multimodal.rs +++ /dev/null @@ -1,212 +0,0 @@ -use anyhow::{Context, Result}; -use llama_cpp_bindings::SampledTokenClassifier; -use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::llama_batch::LlamaBatch; -use llama_cpp_bindings::model::{LlamaChatMessage, LlamaModel}; -use llama_cpp_bindings::mtmd::{MtmdBitmap, MtmdInputChunkType, MtmdInputChunks, MtmdInputText}; -use llama_cpp_bindings::sampled_token::SampledToken; -use llama_cpp_bindings::sampling::LlamaSampler; -use llama_cpp_bindings_sys::llama_pos; -use llama_cpp_bindings_tests::test_model; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -struct ChunkTokenBreakdown { - text: u64, - image: u64, - audio: u64, -} - -fn count_chunk_tokens_by_type(chunks: 
&MtmdInputChunks) -> Result { - let mut breakdown = ChunkTokenBreakdown { - text: 0, - image: 0, - audio: 0, - }; - for index in 0..chunks.len() { - let chunk = chunks - .get(index) - .with_context(|| format!("chunk index {index} is missing"))?; - let n_tokens = u64::try_from(chunk.n_tokens())?; - match chunk.chunk_type()? { - MtmdInputChunkType::Text => breakdown.text += n_tokens, - MtmdInputChunkType::Image => breakdown.image += n_tokens, - MtmdInputChunkType::Audio => breakdown.audio += n_tokens, - } - } - - Ok(breakdown) -} - -fn build_user_prompt_with_image_marker(model: &LlamaModel, question: &str) -> Result { - let marker = llama_cpp_bindings::mtmd::mtmd_default_marker(); - let user_content = format!("{marker}{question}"); - let chat_template = model.chat_template(None)?; - let messages = [LlamaChatMessage::new("user".to_string(), user_content)?]; - - Ok(model.apply_chat_template(&chat_template, &messages, true)?) -} - -struct SamplingTotals { - generated: String, - observed_content: u64, - observed_reasoning: u64, -} - -fn drive_sampling_loop( - classifier: &mut SampledTokenClassifier, - model: &LlamaModel, - ctx: &mut LlamaContext, - starting_position: llama_pos, - max_tokens: usize, -) -> Result { - let mut sampler = LlamaSampler::greedy(); - let mut totals = SamplingTotals { - generated: String::new(), - observed_content: 0, - observed_reasoning: 0, - }; - let mut batch = LlamaBatch::new(512, 1)?; - - for (current_position, _) in (starting_position..).zip(0..max_tokens) { - let (raw_token, outcomes) = classifier.sample(&mut sampler, ctx, -1)?; - for outcome in &outcomes { - totals.generated.push_str(&outcome.raw_piece); - match outcome.sampled_token { - SampledToken::Content(_) => totals.observed_content += 1, - SampledToken::Reasoning(_) => totals.observed_reasoning += 1, - SampledToken::ToolCall(_) | SampledToken::Undeterminable(_) => {} - } - } - - let raw_as_sampled = SampledToken::Content(raw_token); - if model.is_eog_token(&raw_as_sampled) { - 
break; - } - - batch.clear(); - batch.add(&raw_as_sampled, current_position, &[0], true)?; - - ctx.decode(&mut batch) - .with_context(|| "failed to decode generated token")?; - } - - for outcome in classifier.flush() { - totals.generated.push_str(&outcome.raw_piece); - match outcome.sampled_token { - SampledToken::Content(_) => totals.observed_content += 1, - SampledToken::Reasoning(_) => totals.observed_reasoning += 1, - SampledToken::ToolCall(_) | SampledToken::Undeterminable(_) => {} - } - } - - Ok(totals) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 4096, - n_batch = 512, - n_ubatch = 512, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), -)] -fn multimodal_vision_inference_produces_output(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - - let mut ctx = LlamaContext::from_model( - model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - ) - .with_context(|| "unable to create llama context")?; - - assert!( - mtmd_ctx.support_vision(), - "model should support vision input" - ); - - let image_path = test_model::fixtures_dir().join("llamas.jpg"); - let image_path_str = image_path - .to_str() - .with_context(|| "image path is not valid UTF-8")?; - let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str) - .with_context(|| "failed to load image from file")?; - - let formatted_prompt = - build_user_prompt_with_image_marker(model, "What animals do you see in this image?")?; - - let input_text = MtmdInputText { - text: formatted_prompt, - add_special: false, - parse_special: true, - }; - - let chunks = mtmd_ctx - .tokenize(input_text, &[&bitmap]) - .with_context(|| "failed to tokenize multimodal input")?; - - assert!( - !chunks.is_empty(), - "tokenization should 
produce at least one chunk" - ); - - let expected = count_chunk_tokens_by_type(&chunks)?; - - eprintln!( - "tokenized into {} chunks, text {} image {} audio {}", - chunks.len(), - expected.text, - expected.image, - expected.audio - ); - - assert!( - expected.image > 0, - "vision input must produce at least one image chunk" - ); - - let mut classifier = model.sampled_token_classifier(); - let n_past = classifier - .eval_multimodal_chunks(&chunks, mtmd_ctx, &ctx, 0, 0, 512, true) - .with_context(|| "failed to evaluate chunks")?; - - eprintln!("evaluated chunks, n_past = {n_past}"); - - { - let usage = classifier.usage(); - assert_eq!(usage.prompt_tokens, expected.text); - assert_eq!(usage.input_image_tokens, expected.image); - assert_eq!(usage.input_audio_tokens, expected.audio); - } - - let totals = drive_sampling_loop(&mut classifier, model, &mut ctx, n_past, 512)?; - - eprintln!("generated text: {}", totals.generated); - - assert!( - !totals.generated.is_empty(), - "model should generate at least one token from image input" - ); - - let usage = classifier.into_usage(); - assert_eq!(usage.prompt_tokens, expected.text); - assert_eq!(usage.input_image_tokens, expected.image); - assert_eq!(usage.input_audio_tokens, expected.audio); - assert_eq!(usage.content_tokens, totals.observed_content); - assert_eq!(usage.reasoning_tokens, totals.observed_reasoning); - assert_eq!( - usage.completion_tokens(), - totals.observed_content + totals.observed_reasoning - ); - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/multimodal_vision.rs b/llama-cpp-bindings-tests/tests/multimodal_vision.rs new file mode 100644 index 00000000..7e596be6 --- /dev/null +++ b/llama-cpp-bindings-tests/tests/multimodal_vision.rs @@ -0,0 +1,2001 @@ +use llama_cpp_test_harness::llama_tests_main; + +mod mtmd_bitmap { + #![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" + )] + + use 
anyhow::Result; + use llama_cpp_bindings::mtmd::MtmdBitmap; + use llama_cpp_bindings_tests::test_model; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), + )] + fn from_buffer_creates_bitmap_from_image_bytes(fixture: &LlamaFixture<'_>) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + + let fixtures = test_model::fixtures_dir(); + let image_path = fixtures.join("llamas.jpg"); + let image_bytes = std::fs::read(&image_path)?; + let bitmap = MtmdBitmap::from_buffer(mtmd_ctx, &image_bytes)?; + + assert!(bitmap.nx() > 0); + assert!(bitmap.ny() > 0); + assert!(!bitmap.is_audio()); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), + )] + fn from_file_with_null_byte_in_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> 
{ + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + let result = MtmdBitmap::from_file(mtmd_ctx, "path\0null"); + + assert!(result.is_err()); + + Ok(()) + } +} + +mod mtmd_chunk_operations { + use anyhow::Result; + use llama_cpp_bindings::mtmd::MtmdBitmap; + use llama_cpp_bindings::mtmd::MtmdInputChunkType; + use llama_cpp_bindings::mtmd::MtmdInputText; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), + )] + fn copy_creates_owned_duplicate(fixture: &LlamaFixture<'_>) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + let image_data = vec![128u8; 64 * 64 * 3]; + let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; + let input_text = MtmdInputText { + text: "Hello <__media__>".to_string(), + add_special: true, + parse_special: true, + }; + let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; + let first_chunk = chunks + .get(0) + .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?; + let copied = first_chunk.copy()?; + + assert!(copied.owned); + assert_eq!(copied.n_tokens(), first_chunk.n_tokens()); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 
64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), + )] + fn encode_chunk_succeeds_for_image_chunk(fixture: &LlamaFixture<'_>) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + let image_data = vec![128u8; 64 * 64 * 3]; + let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; + let input_text = MtmdInputText { + text: "Describe: <__media__>".to_string(), + add_special: true, + parse_special: true, + }; + let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; + + for chunk_index in 0..chunks.len() { + let chunk = chunks + .get(chunk_index) + .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?; + if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) { + let result = mtmd_ctx.encode_chunk(&chunk); + assert!(result.is_ok()); + return Ok(()); + } + } + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), + )] + fn decode_use_non_causal_returns_bool_for_image_chunk( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in 
attribute"); + let image_data = vec![128u8; 64 * 64 * 3]; + let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; + let input_text = MtmdInputText { + text: "Describe: <__media__>".to_string(), + add_special: true, + parse_special: true, + }; + let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; + for chunk_index in 0..chunks.len() { + let chunk = chunks + .get(chunk_index) + .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?; + if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) { + let value = mtmd_ctx.decode_use_non_causal(&chunk); + let printed = format!("{value:?}"); + assert!( + !printed.is_empty(), + "decode_use_non_causal must return a Debug-printable bool" + ); + return Ok(()); + } + } + anyhow::bail!("tokenization should produce at least one Image chunk"); + } +} + +mod mtmd_chunk_structure { + use anyhow::Result; + use llama_cpp_bindings::mtmd::MtmdBitmap; + use llama_cpp_bindings::mtmd::MtmdInputChunkType; + use llama_cpp_bindings::mtmd::MtmdInputText; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + fn tokenize_synthetic( + fixture: &LlamaFixture<'_>, + prompt: &str, + ) -> Result { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + let image_data = vec![128u8; 64 * 64 * 3]; + let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; + let input_text = MtmdInputText { + text: prompt.to_owned(), + add_special: true, + parse_special: true, + }; + Ok(mtmd_ctx.tokenize(input_text, &[&bitmap])?) 
+ } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), + )] + fn text_chunk_has_text_type(fixture: &LlamaFixture<'_>) -> Result<()> { + let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?; + let first_chunk = chunks + .get(0) + .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?; + assert_eq!(first_chunk.chunk_type()?, MtmdInputChunkType::Text); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), + )] + fn text_chunk_returns_text_tokens(fixture: &LlamaFixture<'_>) -> Result<()> { + let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?; + let first_chunk = chunks + .get(0) + .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?; + let tokens = first_chunk.text_tokens(); + assert!(tokens.is_some()); + assert!(!tokens.expect("tokens should be some").is_empty()); + Ok(()) + } + + #[llama_test( + model_source = 
HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), + )] + fn chunk_n_tokens_is_positive(fixture: &LlamaFixture<'_>) -> Result<()> { + let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?; + let first_chunk = chunks + .get(0) + .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?; + assert!(first_chunk.n_tokens() > 0); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), + )] + fn chunk_n_positions_is_positive(fixture: &LlamaFixture<'_>) -> Result<()> { + let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?; + let first_chunk = chunks + .get(0) + .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?; + assert!(first_chunk.n_positions() > 0); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, 
+ mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), + )] + fn text_chunk_id_returns_none(fixture: &LlamaFixture<'_>) -> Result<()> { + let chunks = tokenize_synthetic(fixture, "Hello <__media__>")?; + let first_chunk = chunks + .get(0) + .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?; + assert_eq!(first_chunk.chunk_type()?, MtmdInputChunkType::Text); + assert!(first_chunk.id().is_none()); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), + )] + fn image_chunk_returns_none_for_text_tokens(fixture: &LlamaFixture<'_>) -> Result<()> { + let chunks = tokenize_synthetic(fixture, "Hello <__media__>")?; + for chunk_index in 0..chunks.len() { + let chunk = chunks + .get(chunk_index) + .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?; + if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) { + assert!(chunk.text_tokens().is_none()); + return Ok(()); + } + } + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + 
n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), + )] + fn image_chunk_id_returns_some(fixture: &LlamaFixture<'_>) -> Result<()> { + let chunks = tokenize_synthetic(fixture, "Hello <__media__>")?; + for chunk_index in 0..chunks.len() { + let chunk = chunks + .get(chunk_index) + .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?; + if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) { + assert!(chunk.id().is_some()); + return Ok(()); + } + } + Ok(()) + } +} + +mod mtmd_context { + #![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" + )] + + use anyhow::Result; + use llama_cpp_bindings::mtmd::MtmdContext; + use llama_cpp_bindings::mtmd::MtmdContextParams; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), + )] + fn init_and_supports_vision(fixture: &LlamaFixture<'_>) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in 
attribute"); + assert!(mtmd_ctx.support_vision()); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), + )] + fn init_from_file_with_null_byte_in_path_returns_error( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let mtmd_params = MtmdContextParams::default(); + let result = MtmdContext::init_from_file("path\0null", fixture.model, &mtmd_params); + + assert!(result.is_err()); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), + )] + fn decode_use_mrope_is_true_for_qwen_vision(fixture: &LlamaFixture<'_>) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + assert!( + mtmd_ctx.decode_use_mrope(), + "Qwen 3.5 / 3.6 mmproj uses mrope; decode_use_mrope must return true" + ); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + 
n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), + )] + fn support_audio_is_false_for_vision_only_mmproj(fixture: &LlamaFixture<'_>) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + assert!( + !mtmd_ctx.support_audio(), + "Qwen 3.5 / 3.6 mmproj is vision-only; support_audio must return false" + ); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), + )] + fn get_audio_sample_rate_is_none_for_vision_only_mmproj( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + assert!( + mtmd_ctx.get_audio_sample_rate().is_none(), + "Qwen 3.5 / 3.6 mmproj has no audio; get_audio_sample_rate must return None" + ); + Ok(()) + } +} + +mod mtmd_evaluation { + #![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" + )] + + use anyhow::Result; + use 
llama_cpp_bindings::context::LlamaContext; + use llama_cpp_bindings::mtmd::MtmdBitmap; + use llama_cpp_bindings::mtmd::MtmdEvalError; + use llama_cpp_bindings::mtmd::MtmdInputChunks; + use llama_cpp_bindings::mtmd::MtmdInputText; + use llama_cpp_bindings_tests::test_model; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + fn eval_synthetic_bitmap(fixture: &LlamaFixture<'_>, width: u32, height: u32) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + let image_data = vec![128u8; (width as usize) * (height as usize) * 3]; + let bitmap = MtmdBitmap::from_image_data(width, height, &image_data)?; + let input_text = MtmdInputText { + text: "Describe: <__media__>".to_string(), + add_special: true, + parse_special: true, + }; + let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; + let n_positions = chunks.total_positions(); + let required_n_ctx = u32::try_from(n_positions + 256)?; + if fixture.context_params.n_ctx < required_n_ctx { + anyhow::bail!( + "fixture n_ctx ({}) below required ({}) for {}x{} image", + fixture.context_params.n_ctx, + required_n_ctx, + width, + height, + ); + } + + let llama_ctx = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let n_batch = i32::try_from(llama_ctx.n_batch())?; + chunks.eval_chunks(mtmd_ctx, &llama_ctx, 0, 0, n_batch, false)?; + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 64, + n_batch = 64, + n_ubatch = 32, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 64, + n_batch = 64, + n_ubatch = 32, + 
mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), + )] + fn eval_chunks_returns_batch_size_exceeds_context_limit_for_huge_batch( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + let llama_ctx = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let chunks = MtmdInputChunks::new()?; + let huge_batch = i32::try_from(llama_ctx.n_batch() + 1)?; + + let result = chunks.eval_chunks(mtmd_ctx, &llama_ctx, 0, 0, huge_batch, false); + + assert!(matches!( + result, + Err(MtmdEvalError::BatchSizeExceedsContextLimit { .. }) + )); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), + )] + fn eval_chunks_with_standard_image(fixture: &LlamaFixture<'_>) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + + let fixtures = test_model::fixtures_dir(); + let image_path = fixtures.join("llamas.jpg"); + let image_path_str = image_path + .to_str() + .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?; + let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?; + let input_text = MtmdInputText { + text: "What is in this image? 
<__media__>".to_string(), + add_special: true, + parse_special: true, + }; + let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; + let n_positions = chunks.total_positions(); + let required_n_ctx = u32::try_from(n_positions + 256)?; + assert!( + fixture.context_params.n_ctx >= required_n_ctx, + "fixture n_ctx ({}) below required ({}); update the attribute literal", + fixture.context_params.n_ctx, + required_n_ctx, + ); + + let llama_ctx = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let n_batch = i32::try_from(llama_ctx.n_batch())?; + let result = chunks.eval_chunks(mtmd_ctx, &llama_ctx, 0, 0, n_batch, false); + + assert!(result.is_ok()); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), + )] + fn eval_chunks_with_varied_dimensions(fixture: &LlamaFixture<'_>) -> Result<()> { + let test_dimensions: [(u32, u32); 4] = [(224, 224), (512, 512), (100, 500), (337, 421)]; + + for (width, height) in test_dimensions { + let result = eval_synthetic_bitmap(fixture, width, height); + assert!( + result.is_ok(), + "dimension {width}x{height} should succeed: {result:?}" + ); + } + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 512, + n_ubatch = 512, + mmproj_source = 
HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), + )] + fn eval_chunks_with_extreme_dimensions_does_not_crash( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let extreme_dimensions: [(u32, u32); 6] = [ + (1, 1), + (7, 13), + (3, 1000), + (1000, 3), + (1920, 1080), + (4096, 4096), + ]; + + let mut any_reached_eval = false; + + for (width, height) in extreme_dimensions { + match eval_synthetic_bitmap(fixture, width, height) { + Ok(()) => any_reached_eval = true, + Err(error) => eprintln!(" {width}x{height} failed: {error}"), + } + } + + assert!( + any_reached_eval, + "at least one extreme dimension should reach eval_chunks" + ); + + Ok(()) + } +} + +mod mtmd_tokenization { + #![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" + )] + + use anyhow::Result; + use llama_cpp_bindings::mtmd::MtmdBitmap; + use llama_cpp_bindings::mtmd::MtmdInputText; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), + )] + fn 
tokenize_text_with_image(fixture: &LlamaFixture<'_>) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + let image_data = vec![128u8; 64 * 64 * 3]; + let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; + let input_text = MtmdInputText { + text: "Describe this image: <__media__>".to_string(), + add_special: true, + parse_special: true, + }; + let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; + + assert!(!chunks.is_empty()); + assert!(chunks.total_tokens() > 0); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), + )] + fn tokenize_bitmap_count_mismatch_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + let input_text = MtmdInputText { + text: "No media markers here".to_string(), + add_special: true, + parse_special: true, + }; + let image_data = vec![128u8; 64 * 64 * 3]; + let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; + let result = mtmd_ctx.tokenize(input_text, &[&bitmap]); + assert!(result.is_err()); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), + )] + #[llama_test( 
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), + )] + fn tokenize_with_null_byte_in_text_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + let input_text = MtmdInputText { + text: "text\0null".to_string(), + add_special: true, + parse_special: true, + }; + let result = mtmd_ctx.tokenize(input_text, &[]); + assert!(result.is_err()); + Ok(()) + } +} + +mod multimodal { + use anyhow::{Context, Result}; + use llama_cpp_bindings::SampledTokenClassifier; + use llama_cpp_bindings::context::LlamaContext; + use llama_cpp_bindings::llama_batch::LlamaBatch; + use llama_cpp_bindings::model::{LlamaChatMessage, LlamaModel}; + use llama_cpp_bindings::mtmd::{ + MtmdBitmap, MtmdInputChunkType, MtmdInputChunks, MtmdInputText, + }; + use llama_cpp_bindings::sampled_token::SampledToken; + use llama_cpp_bindings::sampling::LlamaSampler; + use llama_cpp_bindings_sys::llama_pos; + use llama_cpp_bindings_tests::test_model; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + struct ChunkTokenBreakdown { + text: u64, + image: u64, + audio: u64, + } + + fn count_chunk_tokens_by_type(chunks: &MtmdInputChunks) -> Result { + let mut breakdown = ChunkTokenBreakdown { + text: 0, + image: 0, + audio: 0, + }; + for index in 0..chunks.len() { + let chunk = chunks + .get(index) + .with_context(|| format!("chunk index {index} is missing"))?; + let n_tokens = u64::try_from(chunk.n_tokens())?; + match chunk.chunk_type()? 
{ + MtmdInputChunkType::Text => breakdown.text += n_tokens, + MtmdInputChunkType::Image => breakdown.image += n_tokens, + MtmdInputChunkType::Audio => breakdown.audio += n_tokens, + } + } + + Ok(breakdown) + } + + fn build_user_prompt_with_image_marker(model: &LlamaModel, question: &str) -> Result { + let marker = llama_cpp_bindings::mtmd::mtmd_default_marker(); + let user_content = format!("{marker}{question}"); + let chat_template = model.chat_template(None)?; + let messages = [LlamaChatMessage::new("user".to_string(), user_content)?]; + + Ok(model.apply_chat_template(&chat_template, &messages, true)?) + } + + struct SamplingTotals { + generated: String, + observed_content: u64, + observed_reasoning: u64, + } + + fn drive_sampling_loop( + classifier: &mut SampledTokenClassifier, + model: &LlamaModel, + ctx: &mut LlamaContext, + starting_position: llama_pos, + max_tokens: usize, + ) -> Result { + let mut sampler = LlamaSampler::greedy(); + let mut totals = SamplingTotals { + generated: String::new(), + observed_content: 0, + observed_reasoning: 0, + }; + let mut batch = LlamaBatch::new(512, 1)?; + + for (current_position, _) in (starting_position..).zip(0..max_tokens) { + let (raw_token, outcomes) = classifier.sample(&mut sampler, ctx, -1)?; + for outcome in &outcomes { + totals.generated.push_str(&outcome.raw_piece); + match outcome.sampled_token { + SampledToken::Content(_) => totals.observed_content += 1, + SampledToken::Reasoning(_) => totals.observed_reasoning += 1, + SampledToken::ToolCall(_) | SampledToken::Undeterminable(_) => {} + } + } + + let raw_as_sampled = SampledToken::Content(raw_token); + if model.is_eog_token(&raw_as_sampled) { + break; + } + + batch.clear(); + batch.add(&raw_as_sampled, current_position, &[0], true)?; + + ctx.decode(&mut batch) + .with_context(|| "failed to decode generated token")?; + } + + for outcome in classifier.flush() { + totals.generated.push_str(&outcome.raw_piece); + match outcome.sampled_token { + 
SampledToken::Content(_) => totals.observed_content += 1, + SampledToken::Reasoning(_) => totals.observed_reasoning += 1, + SampledToken::ToolCall(_) | SampledToken::Undeterminable(_) => {} + } + } + + Ok(totals) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 4096, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), + )] + fn multimodal_vision_inference_produces_output(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + + let mut ctx = LlamaContext::from_model( + model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + ) + .with_context(|| "unable to create llama context")?; + + assert!( + mtmd_ctx.support_vision(), + "model should support vision input" + ); + + let image_path = test_model::fixtures_dir().join("llamas.jpg"); + let image_path_str = image_path + .to_str() + .with_context(|| "image path is not valid UTF-8")?; + let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str) + .with_context(|| "failed to load image from file")?; + + let formatted_prompt = + build_user_prompt_with_image_marker(model, "What animals do you see in this image?")?; + + let input_text = MtmdInputText { + text: formatted_prompt, + add_special: false, + parse_special: true, + }; + + let chunks = mtmd_ctx + .tokenize(input_text, &[&bitmap]) + .with_context(|| "failed to tokenize multimodal input")?; + + assert!( + !chunks.is_empty(), + "tokenization should produce at least one chunk" + ); + + let expected = count_chunk_tokens_by_type(&chunks)?; + + eprintln!( + "tokenized into {} chunks, text {} image {} audio {}", + chunks.len(), + expected.text, + expected.image, + expected.audio + ); + + assert!( + expected.image > 0, + "vision input must produce at 
least one image chunk" + ); + + let mut classifier = model.sampled_token_classifier(); + let n_past = classifier + .eval_multimodal_chunks(&chunks, mtmd_ctx, &ctx, 0, 0, 512, true) + .with_context(|| "failed to evaluate chunks")?; + + eprintln!("evaluated chunks, n_past = {n_past}"); + + { + let usage = classifier.usage(); + assert_eq!(usage.prompt_tokens, expected.text); + assert_eq!(usage.input_image_tokens, expected.image); + assert_eq!(usage.input_audio_tokens, expected.audio); + } + + let totals = drive_sampling_loop(&mut classifier, model, &mut ctx, n_past, 512)?; + + eprintln!("generated text: {}", totals.generated); + + assert!( + !totals.generated.is_empty(), + "model should generate at least one token from image input" + ); + + let usage = classifier.into_usage(); + assert_eq!(usage.prompt_tokens, expected.text); + assert_eq!(usage.input_image_tokens, expected.image); + assert_eq!(usage.input_audio_tokens, expected.audio); + assert_eq!(usage.content_tokens, totals.observed_content); + assert_eq!(usage.reasoning_tokens, totals.observed_reasoning); + assert_eq!( + usage.completion_tokens(), + totals.observed_content + totals.observed_reasoning + ); + + Ok(()) + } +} + +mod eval_multimodal_chunks_records_exact_token_counts { + use anyhow::Result; + use llama_cpp_bindings::TokenUsage; + use llama_cpp_bindings::context::LlamaContext; + use llama_cpp_bindings::mtmd::MtmdBitmap; + use llama_cpp_bindings::mtmd::MtmdInputChunkType; + use llama_cpp_bindings::mtmd::MtmdInputChunks; + use llama_cpp_bindings::mtmd::MtmdInputText; + use llama_cpp_bindings::mtmd::mtmd_default_marker; + use llama_cpp_bindings_tests::test_model::fixtures_dir; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + const PROMPT_QUESTION: &str = "What animals do you see in this image?"; + + struct ExpectedChunkTotals { + text: u64, + image: u64, + audio: u64, + } + + fn sum_chunk_token_counts_by_type(chunks: &MtmdInputChunks) -> Result { + let mut totals = 
ExpectedChunkTotals { + text: 0, + image: 0, + audio: 0, + }; + for index in 0..chunks.len() { + let chunk = chunks + .get(index) + .ok_or_else(|| anyhow::anyhow!("chunk index {index} should exist"))?; + let n_tokens = u64::try_from(chunk.n_tokens())?; + match chunk.chunk_type()? { + MtmdInputChunkType::Text => { + totals.text = totals.text.saturating_add(n_tokens); + } + MtmdInputChunkType::Image => { + totals.image = totals.image.saturating_add(n_tokens); + } + MtmdInputChunkType::Audio => { + totals.audio = totals.audio.saturating_add(n_tokens); + } + } + } + Ok(totals) + } + + fn build_multimodal_chunks_and_eval_into_usage( + fixture: &LlamaFixture<'_>, + ) -> Result<(TokenUsage, ExpectedChunkTotals)> { + let model = fixture.model; + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + + let image_path = fixtures_dir().join("llamas.jpg"); + let image_path_str = image_path + .to_str() + .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?; + let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?; + + let marker = mtmd_default_marker(); + let prompt = format!("{marker}{PROMPT_QUESTION}"); + + let input_text = MtmdInputText { + text: prompt, + add_special: false, + parse_special: true, + }; + + let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; + let expected = sum_chunk_token_counts_by_type(&chunks)?; + + let context_params = (*fixture.context_params).into_llama_context_params(); + let context = LlamaContext::from_model(model, fixture.backend, context_params)?; + + let mut classifier = model.sampled_token_classifier(); + classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?; + + Ok((classifier.into_usage(), expected)) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 4096, + n_batch = 512, + n_ubatch = 512, + mmproj_source = 
HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), + )] + fn prompt_tokens_match_text_chunk_total(fixture: &LlamaFixture<'_>) -> Result<()> { + let (usage, expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?; + + if usage.prompt_tokens != expected.text { + anyhow::bail!( + "prompt_tokens must equal sum of text-chunk n_tokens; expected {}, got {}", + expected.text, + usage.prompt_tokens + ); + } + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 4096, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), + )] + fn input_image_tokens_match_image_chunk_total(fixture: &LlamaFixture<'_>) -> Result<()> { + let (usage, expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?; + + if usage.input_image_tokens != expected.image { + anyhow::bail!( + "input_image_tokens must equal sum of image-chunk n_tokens; expected {}, got {}", + expected.image, + usage.input_image_tokens + ); + } + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 4096, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), + )] + fn input_audio_tokens_are_zero_for_image_only_input(fixture: &LlamaFixture<'_>) -> Result<()> { + let (usage, expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?; + + if expected.audio != 0 { + anyhow::bail!( + "fixture invariant: image-only multimodal input should produce zero audio chunk tokens, got {}", + expected.audio + ); + } + if usage.input_audio_tokens != 0 { + anyhow::bail!( + "input_audio_tokens must be zero when no audio chunks are evaluated; got {}", + usage.input_audio_tokens + ); + } + + Ok(()) + } + + #[llama_test( + 
model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 4096, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), + )] + fn completion_tokens_are_zero_after_eval_before_generation( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let (usage, _expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?; + + if usage.completion_tokens() != 0 { + anyhow::bail!( + "completion_tokens must be zero immediately after eval (no generation has occurred); got {}", + usage.completion_tokens() + ); + } + + Ok(()) + } +} + +mod ingest_prompt_chunk { + use anyhow::Result; + use llama_cpp_bindings::ingest_prompt_chunk::ingest_prompt_chunk; + use llama_cpp_bindings::mtmd::MtmdBitmap; + use llama_cpp_bindings::mtmd::MtmdInputChunkType; + use llama_cpp_bindings::mtmd::MtmdInputText; + use llama_cpp_bindings::mtmd::mtmd_default_marker; + use llama_cpp_bindings_tests::test_model::fixtures_dir; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), + )] + fn text_chunk_records_prompt_tokens(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + + let input_text = MtmdInputText { + text: "hello world".to_owned(), + add_special: false, + parse_special: false, + }; + let chunks = mtmd_ctx.tokenize(input_text, &[])?; + + let text_chunk = (0..chunks.len()) + .filter_map(|index| chunks.get(index)) + .find(|chunk| chunk.chunk_type() == Ok(MtmdInputChunkType::Text)) + .ok_or_else(|| { + 
anyhow::anyhow!("text-only tokenization should produce at least one text chunk") + })?; + + let n_tokens = u64::try_from(text_chunk.n_tokens())?; + + let mut classifier = model.sampled_token_classifier(); + + ingest_prompt_chunk(&mut classifier, &text_chunk)?; + + let usage = classifier.usage(); + if usage.prompt_tokens != n_tokens { + anyhow::bail!( + "text chunk must record n_tokens as prompt_tokens; expected {n_tokens}, got {}", + usage.prompt_tokens + ); + } + if usage.input_image_tokens != 0 { + anyhow::bail!( + "text chunk must not bump input_image_tokens; got {}", + usage.input_image_tokens + ); + } + if usage.input_audio_tokens != 0 { + anyhow::bail!( + "text chunk must not bump input_audio_tokens; got {}", + usage.input_audio_tokens + ); + } + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), + )] + fn image_chunk_records_input_image_tokens_only(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + + let image_path = fixtures_dir().join("llamas.jpg"); + let image_path_str = image_path + .to_str() + .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?; + let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?; + + let marker = mtmd_default_marker(); + let input_text = MtmdInputText { + text: marker.to_owned(), + add_special: false, + parse_special: true, + }; + let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; + + let image_chunk = (0..chunks.len()) + .filter_map(|index| chunks.get(index)) + .find(|chunk| chunk.chunk_type() == Ok(MtmdInputChunkType::Image)) + .ok_or_else(|| { + anyhow::anyhow!("multimodal tokenization should produce an image chunk") + })?; + + let 
n_tokens = u64::try_from(image_chunk.n_tokens())?; + if n_tokens == 0 { + anyhow::bail!("image chunk should report at least one token"); + } + + let mut classifier = model.sampled_token_classifier(); + + ingest_prompt_chunk(&mut classifier, &image_chunk)?; + + let usage = classifier.usage(); + if usage.input_image_tokens != n_tokens { + anyhow::bail!( + "image chunk must record n_tokens as input_image_tokens; expected {n_tokens}, got {}", + usage.input_image_tokens + ); + } + if usage.prompt_tokens != 0 { + anyhow::bail!( + "image chunk must not bump prompt_tokens; got {}", + usage.prompt_tokens + ); + } + if usage.input_audio_tokens != 0 { + anyhow::bail!( + "image chunk must not bump input_audio_tokens; got {}", + usage.input_audio_tokens + ); + } + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), + )] + fn text_chunk_drives_marker_state_machine_to_reasoning( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let model = fixture.model; + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + + let input_text = MtmdInputText { + text: "<|im_start|>user\nHi<|im_end|>\n<|im_start|>assistant\n\n".to_owned(), + add_special: false, + parse_special: true, + }; + let chunks = mtmd_ctx.tokenize(input_text, &[])?; + + let mut classifier = model.sampled_token_classifier(); + + for index in 0..chunks.len() { + let chunk = chunks + .get(index) + .ok_or_else(|| anyhow::anyhow!("chunk index {index} must exist"))?; + ingest_prompt_chunk(&mut classifier, &chunk)?; + } + + if classifier.current_section() != llama_cpp_bindings::SampledTokenSection::Reasoning { + anyhow::bail!( + "text chunk replay must transition the classifier section to Reasoning when the \ + prompt opens a `` block; got 
{:?}", + classifier.current_section() + ); + } + + Ok(()) + } +} + +mod gemma4_classifier_emits_reasoning_for_multimodal_thinking_prompt { + use anyhow::Result; + use llama_cpp_bindings::context::LlamaContext; + use llama_cpp_bindings::llama_batch::LlamaBatch; + use llama_cpp_bindings::mtmd::MtmdBitmap; + use llama_cpp_bindings::mtmd::MtmdInputText; + use llama_cpp_bindings::mtmd::mtmd_default_marker; + use llama_cpp_bindings::sampling::LlamaSampler; + use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; + use llama_cpp_bindings_tests::test_model::fixtures_dir; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + const MAX_GENERATED_TOKENS: i32 = 200; + + #[llama_test( + model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "mmproj-F16.gguf"), + )] + fn gemma4_classifier_emits_reasoning_for_multimodal_thinking_prompt( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let image_path = fixtures_dir().join("llamas.jpg"); + let image_path_str = image_path + .to_str() + .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?; + let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?; + + let marker = mtmd_default_marker(); + let prompt = format!( + "user\n{marker}What animals do you see in this image?\nmodel\n<|channel>thought\n" + ); + + let input_text = MtmdInputText { + text: prompt, + add_special: false, + parse_special: true, + }; + + let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; + + let mut 
classifier = model.sampled_token_classifier(); + let n_past = + classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?; + + let mut sampler = LlamaSampler::chain_simple([ + LlamaSampler::penalties(64, 1.1, 0.0, 0.0), + LlamaSampler::top_k(40), + LlamaSampler::top_p(0.9, 1), + LlamaSampler::min_p(0.05, 1), + LlamaSampler::temp(0.7), + LlamaSampler::dist(0x00C0_FFEE), + ]); + + let mut batch = LlamaBatch::new(2048, 1)?; + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position: n_past, + max_generated_tokens: MAX_GENERATED_TOKENS, + } + .run()?; + + let usage = classifier.usage(); + + if outcome.observed_reasoning == 0 { + anyhow::bail!( + "Gemma 4 multimodal + thinking: classifier must emit at least one Reasoning token \ + when the prompt opens a `<|channel>thought` block; outcome={outcome:?}" + ); + } + if usage.reasoning_tokens == 0 { + anyhow::bail!( + "Gemma 4 multimodal + thinking: usage.reasoning_tokens must be non-zero; usage={usage:?}" + ); + } + + Ok(()) + } +} + +mod mistral3_classifier_emits_reasoning_for_multimodal_thinking_prompt { + use anyhow::Result; + use llama_cpp_bindings::context::LlamaContext; + use llama_cpp_bindings::llama_batch::LlamaBatch; + use llama_cpp_bindings::mtmd::MtmdBitmap; + use llama_cpp_bindings::mtmd::MtmdInputText; + use llama_cpp_bindings::mtmd::mtmd_default_marker; + use llama_cpp_bindings::sampling::LlamaSampler; + use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; + use llama_cpp_bindings_tests::test_model::fixtures_dir; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + const MAX_GENERATED_TOKENS: i32 = 768; + + #[llama_test( + model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 4096, + 
n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "mmproj-F16.gguf"), + )] + fn mistral3_classifier_emits_reasoning_for_multimodal_thinking_prompt( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let image_path = fixtures_dir().join("llamas.jpg"); + let image_path_str = image_path + .to_str() + .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?; + let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?; + + let marker = mtmd_default_marker(); + let prompt = format!( + "[SYSTEM_PROMPT]# HOW YOU SHOULD THINK AND ANSWER\n\n\ + First draft your thinking process (inner monologue) until you arrive at a response. \ + Format your response using Markdown, and use LaTeX for any mathematical equations. \ + Write both your thoughts and the response in the same language as the input.\n\n\ + Your thinking process must follow the template below:\ + [THINK]Your thoughts or/and draft, like working through an exercise on scratch paper. 
\ + Be as casual and as long as you want until you are confident to generate the response \ + to the user.[/THINK]Here, provide a self-contained response.[/SYSTEM_PROMPT]\ + [INST]{marker}What animals do you see in this image?[/INST]" + ); + + let input_text = MtmdInputText { + text: prompt, + add_special: true, + parse_special: true, + }; + + let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; + + let mut classifier = model.sampled_token_classifier(); + let n_past = + classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?; + + let mut sampler = LlamaSampler::greedy(); + let mut batch = LlamaBatch::new(2048, 1)?; + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position: n_past, + max_generated_tokens: MAX_GENERATED_TOKENS, + } + .run()?; + + let usage = classifier.usage(); + + if outcome.observed_reasoning == 0 { + anyhow::bail!( + "Mistral 3 multimodal + thinking: classifier must emit at least one Reasoning token \ + when the model opens a `[THINK]` block; outcome={outcome:?}" + ); + } + if usage.reasoning_tokens == 0 { + anyhow::bail!( + "Mistral 3 multimodal + thinking: usage.reasoning_tokens must be non-zero; usage={usage:?}" + ); + } + + Ok(()) + } +} + +mod qwen35_classifier_emits_reasoning_for_multimodal_thinking_prompt { + use anyhow::Result; + use llama_cpp_bindings::context::LlamaContext; + use llama_cpp_bindings::llama_batch::LlamaBatch; + use llama_cpp_bindings::mtmd::MtmdBitmap; + use llama_cpp_bindings::mtmd::MtmdInputText; + use llama_cpp_bindings::mtmd::mtmd_default_marker; + use llama_cpp_bindings::sampling::LlamaSampler; + use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; + use llama_cpp_bindings_tests::test_model::fixtures_dir; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + const MAX_GENERATED_TOKENS: i32 = 200; + + #[llama_test( + model_source 
= HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 4096, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 4096, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), + )] + fn qwen35_classifier_emits_reasoning_for_multimodal_thinking_prompt( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let image_path = fixtures_dir().join("llamas.jpg"); + let image_path_str = image_path + .to_str() + .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?; + let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?; + + let marker = mtmd_default_marker(); + let prompt = format!( + "<|im_start|>user\n{marker}What animals do you see in this image?<|im_end|>\n<|im_start|>assistant\n\n" + ); + + let input_text = MtmdInputText { + text: prompt, + add_special: false, + parse_special: true, + }; + + let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; + + let mut classifier = model.sampled_token_classifier(); + let n_past = + classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?; + + let mut sampler = LlamaSampler::chain_simple([ + LlamaSampler::penalties(64, 1.1, 0.0, 0.0), + LlamaSampler::top_k(40), + LlamaSampler::top_p(0.9, 1), + LlamaSampler::min_p(0.05, 1), + LlamaSampler::temp(0.7), + LlamaSampler::dist(0x00C0_FFEE), + ]); + + let mut batch = 
LlamaBatch::new(2048, 1)?; + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position: n_past, + max_generated_tokens: MAX_GENERATED_TOKENS, + } + .run()?; + + let usage = classifier.usage(); + + if outcome.observed_reasoning == 0 { + anyhow::bail!( + "Qwen 3.5 multimodal + thinking: classifier must emit at least one Reasoning token \ + when the prompt opens a `` block; outcome={outcome:?}" + ); + } + if usage.reasoning_tokens == 0 { + anyhow::bail!( + "Qwen 3.5 multimodal + thinking: usage.reasoning_tokens must be non-zero; usage={usage:?}" + ); + } + + Ok(()) + } +} + +mod qwen36_classifier_emits_reasoning_for_multimodal_thinking_prompt { + use anyhow::Result; + use llama_cpp_bindings::context::LlamaContext; + use llama_cpp_bindings::llama_batch::LlamaBatch; + use llama_cpp_bindings::mtmd::MtmdBitmap; + use llama_cpp_bindings::mtmd::MtmdInputText; + use llama_cpp_bindings::mtmd::mtmd_default_marker; + use llama_cpp_bindings::sampling::LlamaSampler; + use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; + use llama_cpp_bindings_tests::test_model::fixtures_dir; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + const MAX_GENERATED_TOKENS: i32 = 200; + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), + )] + fn qwen36_classifier_emits_reasoning_for_multimodal_thinking_prompt( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + + let mut context = LlamaContext::from_model( + model, + backend, + 
(*fixture.context_params).into_llama_context_params(), + )?; + + let image_path = fixtures_dir().join("llamas.jpg"); + let image_path_str = image_path + .to_str() + .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?; + let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?; + + let marker = mtmd_default_marker(); + let prompt = format!( + "<|im_start|>user\n{marker}What animals do you see in this image?<|im_end|>\n<|im_start|>assistant\n\n" + ); + + let input_text = MtmdInputText { + text: prompt, + add_special: false, + parse_special: true, + }; + + let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; + + let mut classifier = model.sampled_token_classifier(); + let n_past = + classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?; + + let mut sampler = LlamaSampler::chain_simple([ + LlamaSampler::penalties(64, 1.1, 0.0, 0.0), + LlamaSampler::top_k(40), + LlamaSampler::top_p(0.9, 1), + LlamaSampler::min_p(0.05, 1), + LlamaSampler::temp(0.7), + LlamaSampler::dist(0x00C0_FFEE), + ]); + + let mut batch = LlamaBatch::new(2048, 1)?; + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position: n_past, + max_generated_tokens: MAX_GENERATED_TOKENS, + } + .run()?; + + let usage = classifier.usage(); + + if outcome.observed_reasoning == 0 { + anyhow::bail!( + "Qwen 3.6 multimodal + thinking: classifier must emit at least one Reasoning token; outcome={outcome:?}" + ); + } + if usage.reasoning_tokens == 0 { + anyhow::bail!( + "Qwen 3.6 multimodal + thinking: usage.reasoning_tokens must be non-zero; usage={usage:?}" + ); + } + + Ok(()) + } +} + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/parse_chat_message.rs b/llama-cpp-bindings-tests/tests/parse_chat_message.rs deleted file mode 100644 index d23fe1c2..00000000 --- a/llama-cpp-bindings-tests/tests/parse_chat_message.rs +++ /dev/null @@ -1,368 +0,0 
@@ -#![expect( - clippy::unnecessary_wraps, - reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" -)] - -use anyhow::Result; -use anyhow::bail; -use llama_cpp_bindings::ChatMessageParseOutcome; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn parses_pure_content_response(fixture: &LlamaFixture<'_>) -> Result<()> { - let outcome = fixture - .model - .parse_chat_message("[]", "hello world", false)?; - - let ChatMessageParseOutcome::Recognized(parsed) = outcome else { - bail!("expected Recognized for plain content; got Unrecognized"); - }; - assert!(parsed.tool_calls.is_empty()); - assert!(!parsed.is_empty()); - assert!(parsed.content.contains("hello world")); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - 
model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn parses_reasoning_section_into_reasoning_content(fixture: &LlamaFixture<'_>) -> Result<()> { - let input = "step one, step two\n\nactual response"; - let outcome = fixture.model.parse_chat_message("[]", input, false)?; - - let ChatMessageParseOutcome::Recognized(parsed) = outcome else { - bail!("expected Recognized for reasoning section; got Unrecognized"); - }; - assert!( - parsed.reasoning_content.contains("step") || parsed.content.contains("step"), - "neither content nor reasoning contains 'step'; content={:?} reasoning={:?}", - parsed.content, - parsed.reasoning_content - ); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = 
HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn parses_empty_input_yields_empty_message(fixture: &LlamaFixture<'_>) -> Result<()> { - let outcome = fixture.model.parse_chat_message("[]", "", false)?; - - let ChatMessageParseOutcome::Recognized(parsed) = outcome else { - bail!("expected Recognized for empty input; got Unrecognized"); - }; - assert!(parsed.tool_calls.is_empty()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn parses_malformed_tools_json_returns_tools_json_invalid_error( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let result = fixture - .model - .parse_chat_message("not_a_json[}", "hello", false); - - assert!(matches!( - result, - Err(llama_cpp_bindings::ParseChatMessageError::ToolsJsonInvalid( - _ - )) - )); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, 
- n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn parses_non_array_tools_json_returns_tools_json_not_array_error( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let result = fixture - .model - .parse_chat_message("{\"foo\": 1}", "hello", false); - - assert!(matches!( - result, - Err(llama_cpp_bindings::ParseChatMessageError::ToolsJsonNotArray) - )); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn 
parses_with_tools_null_byte_returns_tools_json_invalid_error( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let result = fixture - .model - .parse_chat_message("[]\0extra", "hello", false); - - assert!(matches!( - result, - Err(llama_cpp_bindings::ParseChatMessageError::ToolsJsonInvalid( - _ - )) - )); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn parses_with_input_null_byte_returns_tools_serialization_error( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let result = fixture - .model - .parse_chat_message("[]", "hello\0world", false); - - assert!(matches!( - result, - Err(llama_cpp_bindings::ParseChatMessageError::ToolsSerialization(_)) - )); - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/qwen35_chat_inference_emits_reasoning_when_template_auto_opens.rs b/llama-cpp-bindings-tests/tests/qwen35_chat_inference_emits_reasoning_when_template_auto_opens.rs deleted file mode 100644 index 260dd0f6..00000000 --- a/llama-cpp-bindings-tests/tests/qwen35_chat_inference_emits_reasoning_when_template_auto_opens.rs +++ /dev/null @@ -1,87 +0,0 @@ -use anyhow::Result; -use 
anyhow::bail; -use llama_cpp_bindings::ChatMessageParseOutcome; -use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::llama_batch::LlamaBatch; -use llama_cpp_bindings::model::AddBos; -use llama_cpp_bindings::model::LlamaChatMessage; -use llama_cpp_bindings::sampling::LlamaSampler; -use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 2048, - n_batch = 512, - n_ubatch = 128, -)] -fn qwen35_chat_inference_emits_reasoning_when_template_auto_opens( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - - let mut context = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let chat_template = model.chat_template(None)?; - let messages = vec![LlamaChatMessage::new( - "user".to_owned(), - "Hello! 
How are you?".to_owned(), - )?]; - let prompt = model.apply_chat_template(&chat_template, &messages, true)?; - - let mut classifier = model.sampled_token_classifier(); - let tokens = model.str_to_token(&prompt, AddBos::Always)?; - let prompt_token_count = u64::try_from(tokens.len())?; - - let mut batch = LlamaBatch::new(512, 1)?; - classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?; - - context.decode(&mut batch)?; - - let promoted = classifier.commit_prompt_tokens(); - assert_eq!(promoted, prompt_token_count); - - let mut sampler = LlamaSampler::greedy(); - let initial_position = batch.n_tokens(); - let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut context, - batch: &mut batch, - initial_position, - max_generated_tokens: 1024, - } - .run()?; - - assert!(!outcome.generated_raw.is_empty()); - assert!(outcome.observed_reasoning > 0); - assert!(outcome.observed_content > 0); - assert_eq!(outcome.observed_undeterminable, 0); - assert_eq!(outcome.observed_tool_call, 0); - - let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?; - let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else { - bail!("Qwen3.5 chat template must be recognised by the parser; got Unrecognized"); - }; - assert!(!parsed.content.is_empty()); - - let usage = classifier.into_usage(); - assert_eq!(usage.prompt_tokens, prompt_token_count); - assert_eq!(usage.reasoning_tokens, outcome.observed_reasoning); - assert_eq!(usage.undeterminable_tokens, 0); - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/qwen35_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs b/llama-cpp-bindings-tests/tests/qwen35_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs deleted file mode 100644 index df0a9b80..00000000 --- a/llama-cpp-bindings-tests/tests/qwen35_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs +++ 
/dev/null @@ -1,95 +0,0 @@ -use anyhow::Result; -use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::llama_batch::LlamaBatch; -use llama_cpp_bindings::model::AddBos; -use llama_cpp_bindings::sampling::LlamaSampler; -use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -const MAX_GENERATED_TOKENS: i32 = 200; - -const QWEN35_THINKING_DISABLED_PROMPT: &str = "\ -<|im_start|>user -What is 2 + 2?<|im_end|> -<|im_start|>assistant - - - - -"; - -const FORBIDDEN_MARKERS: &[&str] = &["", ""]; - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 8192, - n_batch = 2048, - n_ubatch = 512, -)] -fn qwen35_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - - let mut classifier = model.sampled_token_classifier(); - let prompt_tokens = model.str_to_token(QWEN35_THINKING_DISABLED_PROMPT, AddBos::Never)?; - let prompt_token_count = u64::try_from(prompt_tokens.len())?; - - let mut batch = LlamaBatch::new(2048, 1)?; - classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; - - let mut context = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - context.decode(&mut batch)?; - - let promoted = classifier.commit_prompt_tokens(); - assert_eq!(promoted, prompt_token_count); - - let mut sampler = LlamaSampler::chain_simple([ - LlamaSampler::penalties(64, 1.1, 0.0, 0.0), - LlamaSampler::top_k(40), - LlamaSampler::top_p(0.9, 1), - LlamaSampler::min_p(0.05, 1), - LlamaSampler::temp(0.7), - LlamaSampler::dist(0x00C0_FFEE), - ]); - let initial_position = batch.n_tokens(); - let outcome = 
ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut context, - batch: &mut batch, - initial_position, - max_generated_tokens: MAX_GENERATED_TOKENS, - } - .run()?; - - let usage = classifier.usage(); - - assert!(!outcome.generated_raw.is_empty()); - assert_eq!(outcome.observed_reasoning, 0); - assert_eq!(outcome.observed_undeterminable, 0); - assert_eq!(usage.reasoning_tokens, 0); - assert_eq!(usage.undeterminable_tokens, 0); - assert!(outcome.observed_content > 0); - assert_eq!(usage.completion_tokens(), outcome.observed_content); - - for forbidden in FORBIDDEN_MARKERS { - assert!(!outcome.content_stream.contains(forbidden)); - } - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/qwen35_classifier_emits_reasoning.rs b/llama-cpp-bindings-tests/tests/qwen35_classifier_emits_reasoning.rs deleted file mode 100644 index f9c98932..00000000 --- a/llama-cpp-bindings-tests/tests/qwen35_classifier_emits_reasoning.rs +++ /dev/null @@ -1,111 +0,0 @@ -use anyhow::Result; -use anyhow::bail; -use llama_cpp_bindings::ChatMessageParseOutcome; -use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::llama_batch::LlamaBatch; -use llama_cpp_bindings::model::AddBos; -use llama_cpp_bindings::sampling::LlamaSampler; -use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -const MAX_GENERATED_TOKENS: i32 = 1500; - -const QWEN35_THINKING_PROMPT: &str = "\ -<|im_start|>user -What is 2 + 2?<|im_end|> -<|im_start|>assistant - -"; - -const FORBIDDEN_MARKERS: &[&str] = &["", ""]; - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 8192, - n_batch = 2048, - n_ubatch = 512, -)] -fn 
qwen35_classifier_emits_reasoning_for_thinking_enabled_prompt( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - - let mut classifier = model.sampled_token_classifier(); - let prompt_tokens = model.str_to_token(QWEN35_THINKING_PROMPT, AddBos::Never)?; - let prompt_token_count = u64::try_from(prompt_tokens.len())?; - - let mut batch = LlamaBatch::new(2048, 1)?; - classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; - - let mut context = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - context.decode(&mut batch)?; - - let promoted = classifier.commit_prompt_tokens(); - assert_eq!(promoted, prompt_token_count); - - let mut sampler = LlamaSampler::chain_simple([ - LlamaSampler::penalties(64, 1.1, 0.0, 0.0), - LlamaSampler::top_k(40), - LlamaSampler::top_p(0.9, 1), - LlamaSampler::min_p(0.05, 1), - LlamaSampler::temp(0.7), - LlamaSampler::dist(0x00C0_FFEE), - ]); - let initial_position = batch.n_tokens(); - let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut context, - batch: &mut batch, - initial_position, - max_generated_tokens: MAX_GENERATED_TOKENS, - } - .run()?; - - let usage = classifier.usage(); - let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?; - let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else { - bail!("Qwen3.5 chat template must be recognised by the parser; got Unrecognized"); - }; - - assert!(!outcome.generated_raw.is_empty()); - assert!(outcome.observed_reasoning > 0); - assert!(usage.reasoning_tokens > 0); - assert_eq!(outcome.observed_undeterminable, 0); - assert_eq!(usage.undeterminable_tokens, 0); - assert_eq!( - usage.completion_tokens(), - outcome.observed_content + outcome.observed_reasoning, - ); - - if parsed.reasoning_content.is_empty() { - eprintln!( - "Qwen3.5 didn't close its 
reasoning block within {MAX_GENERATED_TOKENS} tokens — \ - skipping strict parser-equality assertions" - ); - } else { - assert_eq!(outcome.reasoning_stream, parsed.reasoning_content); - assert_eq!(outcome.content_stream, parsed.content); - } - - for forbidden in FORBIDDEN_MARKERS { - assert!(!outcome.reasoning_stream.contains(forbidden)); - assert!(!outcome.content_stream.contains(forbidden)); - } - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/qwen35_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs b/llama-cpp-bindings-tests/tests/qwen35_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs deleted file mode 100644 index 414fde9a..00000000 --- a/llama-cpp-bindings-tests/tests/qwen35_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs +++ /dev/null @@ -1,111 +0,0 @@ -use anyhow::Result; -use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::llama_batch::LlamaBatch; -use llama_cpp_bindings::mtmd::MtmdBitmap; -use llama_cpp_bindings::mtmd::MtmdInputText; -use llama_cpp_bindings::mtmd::mtmd_default_marker; -use llama_cpp_bindings::sampling::LlamaSampler; -use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; -use llama_cpp_bindings_tests::test_model::fixtures_dir; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -const MAX_GENERATED_TOKENS: i32 = 200; - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 4096, - n_batch = 512, - n_ubatch = 512, - mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 4096, - n_batch = 512, - n_ubatch = 512, - mmproj_source = 
HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), -)] -fn qwen35_classifier_emits_reasoning_for_multimodal_thinking_prompt( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - - let mut context = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let image_path = fixtures_dir().join("llamas.jpg"); - let image_path_str = image_path - .to_str() - .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?; - let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?; - - let marker = mtmd_default_marker(); - let prompt = format!( - "<|im_start|>user\n{marker}What animals do you see in this image?<|im_end|>\n<|im_start|>assistant\n\n" - ); - - let input_text = MtmdInputText { - text: prompt, - add_special: false, - parse_special: true, - }; - - let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; - - let mut classifier = model.sampled_token_classifier(); - let n_past = classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?; - - let mut sampler = LlamaSampler::chain_simple([ - LlamaSampler::penalties(64, 1.1, 0.0, 0.0), - LlamaSampler::top_k(40), - LlamaSampler::top_p(0.9, 1), - LlamaSampler::min_p(0.05, 1), - LlamaSampler::temp(0.7), - LlamaSampler::dist(0x00C0_FFEE), - ]); - - let mut batch = LlamaBatch::new(2048, 1)?; - let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut context, - batch: &mut batch, - initial_position: n_past, - max_generated_tokens: MAX_GENERATED_TOKENS, - } - .run()?; - - let usage = classifier.usage(); - - if outcome.observed_reasoning == 0 { - anyhow::bail!( - "Qwen 3.5 multimodal + thinking: classifier must emit at least one Reasoning token \ - when the prompt opens a `` block; outcome={outcome:?}" - ); - } - if 
usage.reasoning_tokens == 0 { - anyhow::bail!( - "Qwen 3.5 multimodal + thinking: usage.reasoning_tokens must be non-zero; usage={usage:?}" - ); - } - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/qwen35_parses_constrained_schema_payload.rs b/llama-cpp-bindings-tests/tests/qwen35_parses_constrained_schema_payload.rs deleted file mode 100644 index f517a4e7..00000000 --- a/llama-cpp-bindings-tests/tests/qwen35_parses_constrained_schema_payload.rs +++ /dev/null @@ -1,104 +0,0 @@ -use anyhow::Result; -use anyhow::bail; -use llama_cpp_bindings::ChatMessageParseOutcome; -use llama_cpp_bindings::ToolCallArguments; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; -use serde_json::Value; -use serde_json::json; - -const NEGOTIATE_WITH_CAT_TOOLS_JSON: &str = r#"[ - { - "type": "function", - "function": { - "name": "negotiate_with_cat", - "description": "Attempt to negotiate with a cat. Outcomes are not guaranteed and may include the silent treatment.", - "parameters": { - "type": "object", - "properties": { - "topic": { - "type": "string", - "description": "What you are trying to negotiate, e.g. 
'get off the keyboard' or 'stop knocking things off the table'" - }, - "bribe": { - "type": "string", - "enum": ["tuna", "salmon", "treats", "ear_scritches", "cardboard_box", "none"], - "description": "What you are offering in exchange" - }, - "desperation_level": { - "type": "integer", - "description": "How desperate you are, on a scale from 1 (mildly annoyed human) to 10 (it is 3am)", - "minimum": 1, - "maximum": 10 - } - }, - "required": ["topic"], - "additionalProperties": false - } - } - } -]"#; - -const NEGOTIATE_WITH_CAT_INPUT: &str = "\n\ -\n\ -\n\ -tuna\n\ -\n\ -\n\ -8\n\ -\n\ -\n\ -get off the keyboard\n\ -\n\ -\n\ -"; - -fn arguments_as_json(arguments: &ToolCallArguments) -> Result<&Value> { - match arguments { - ToolCallArguments::ValidJson(value) => Ok(value), - ToolCallArguments::InvalidJson(raw) => { - bail!("expected ValidJson arguments, got InvalidJson: {raw}") - } - } -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn qwen35_parses_constrained_schema_payload(fixture: &LlamaFixture<'_>) -> Result<()> { - let outcome = fixture.model.parse_chat_message( - NEGOTIATE_WITH_CAT_TOOLS_JSON, - NEGOTIATE_WITH_CAT_INPUT, - false, - )?; - - let ChatMessageParseOutcome::Recognized(parsed) = outcome else { - bail!( - "Qwen 3.5's tool-call payload must be parsed by the wrapper-side duck-type pass; \ - got Unrecognized" - ); - }; - - assert_eq!(parsed.tool_calls.len(), 1); - assert_eq!(parsed.tool_calls[0].name, "negotiate_with_cat"); - assert_eq!(parsed.tool_calls[0].id, "call_0"); - assert_eq!( - arguments_as_json(&parsed.tool_calls[0].arguments)?, - &json!({ - "bribe": "tuna", - "desperation_level": 8, - "topic": "get off the keyboard", - }), - ); - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/qwen35_parses_tool_call_payload.rs 
b/llama-cpp-bindings-tests/tests/qwen35_parses_tool_call_payload.rs deleted file mode 100644 index 2fe2b89c..00000000 --- a/llama-cpp-bindings-tests/tests/qwen35_parses_tool_call_payload.rs +++ /dev/null @@ -1,134 +0,0 @@ -use anyhow::Result; -use anyhow::bail; -use llama_cpp_bindings::ChatMessageParseOutcome; -use llama_cpp_bindings::ToolCallArguments; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -const TOOLS_JSON: &str = r#"[ - { - "type": "function", - "function": { - "name": "get_weather", - "description": "Get the current weather for a location", - "parameters": { - "type": "object", - "properties": { - "location": {"type": "string", "description": "The city name"} - }, - "required": ["location"] - } - } - } -]"#; - -const QWEN_XML_PAYLOAD: &str = "\n\ -\n\ -\n\ -Paris\n\ -\n\ -\n\ -"; - -const PARTIAL_QWEN_XML_PAYLOAD: &str = "\n\n) -> Result<()> { - let outcome = fixture - .model - .parse_chat_message(TOOLS_JSON, QWEN_XML_PAYLOAD, false)?; - - let ChatMessageParseOutcome::Recognized(parsed) = outcome else { - bail!("expected Recognized for Qwen XML on a Qwen-3.5 model; got Unrecognized"); - }; - assert_eq!(parsed.tool_calls.len(), 1); - assert_eq!(parsed.tool_calls[0].name, "get_weather"); - let location = match &parsed.tool_calls[0].arguments { - ToolCallArguments::ValidJson(value) => value - .get("location") - .and_then(|v| v.as_str()) - .map(str::to_owned), - ToolCallArguments::InvalidJson(raw) => { - bail!("expected ValidJson, got InvalidJson: {raw}"); - } - }; - assert_eq!(location.as_deref(), Some("Paris")); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn qwen35_parses_partial_tool_call_returns_pending_state(fixture: &LlamaFixture<'_>) -> Result<()> { - let outcome = fixture - 
.model - .parse_chat_message(TOOLS_JSON, PARTIAL_QWEN_XML_PAYLOAD, true)?; - - let ChatMessageParseOutcome::Recognized(parsed) = outcome else { - bail!("expected Recognized for partial Qwen XML on a Qwen-3.5 model; got Unrecognized"); - }; - assert!(parsed.tool_calls.is_empty() || parsed.tool_calls.len() == 1); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn qwen35_parses_multiple_tool_calls(fixture: &LlamaFixture<'_>) -> Result<()> { - let outcome = fixture - .model - .parse_chat_message(TOOLS_JSON, TWO_QWEN_XML_PAYLOADS, false)?; - - let ChatMessageParseOutcome::Recognized(parsed) = outcome else { - bail!( - "expected Recognized for two Qwen XML payloads on a Qwen-3.5 model; got Unrecognized" - ); - }; - assert!( - !parsed.tool_calls.is_empty(), - "expected at least one tool call; got {:?}", - parsed.tool_calls - ); - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/qwen35_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested.rs b/llama-cpp-bindings-tests/tests/qwen35_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested.rs deleted file mode 100644 index 96b76cf5..00000000 --- a/llama-cpp-bindings-tests/tests/qwen35_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested.rs +++ /dev/null @@ -1,58 +0,0 @@ -use anyhow::Result; -use anyhow::bail; -use llama_cpp_bindings::ChatMessageParseOutcome; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -const TOOLS_JSON: &str = r#"[ - { - "type": "function", - "function": { - "name": "get_weather", - "description": "Get the current weather for a location", - "parameters": { - "type": "object", - "properties": { - "location": {"type": "string", "description": 
"The city name"} - }, - "required": ["location"] - } - } - } -]"#; - -const PLAIN_CONTENT: &str = "Sorry, I cannot help with that."; - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn qwen35_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let outcome = fixture - .model - .parse_chat_message(TOOLS_JSON, PLAIN_CONTENT, false)?; - - let ChatMessageParseOutcome::Recognized(parsed) = outcome else { - bail!( - "Qwen 3.5 with tools requested + plain content must produce Recognized (with empty \ - tool_calls); got Unrecognized" - ); - }; - assert!( - parsed.tool_calls.is_empty(), - "expected no tool calls; got {:?}", - parsed.tool_calls - ); - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/qwen36_chat_inference_emits_reasoning_when_template_auto_opens.rs b/llama-cpp-bindings-tests/tests/qwen36_chat_inference_emits_reasoning_when_template_auto_opens.rs deleted file mode 100644 index 233cef95..00000000 --- a/llama-cpp-bindings-tests/tests/qwen36_chat_inference_emits_reasoning_when_template_auto_opens.rs +++ /dev/null @@ -1,87 +0,0 @@ -use anyhow::Result; -use anyhow::bail; -use llama_cpp_bindings::ChatMessageParseOutcome; -use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::llama_batch::LlamaBatch; -use llama_cpp_bindings::model::AddBos; -use llama_cpp_bindings::model::LlamaChatMessage; -use llama_cpp_bindings::sampling::LlamaSampler; -use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - 
use_mmap = true, - use_mlock = false, - n_ctx = 2048, - n_batch = 512, - n_ubatch = 128, -)] -fn qwen36_chat_inference_emits_reasoning_when_template_auto_opens( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - - let mut context = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let chat_template = model.chat_template(None)?; - let messages = vec![LlamaChatMessage::new( - "user".to_owned(), - "Hello! How are you?".to_owned(), - )?]; - let prompt = model.apply_chat_template(&chat_template, &messages, true)?; - - let mut classifier = model.sampled_token_classifier(); - let tokens = model.str_to_token(&prompt, AddBos::Always)?; - let prompt_token_count = u64::try_from(tokens.len())?; - - let mut batch = LlamaBatch::new(512, 1)?; - classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?; - - context.decode(&mut batch)?; - - let promoted = classifier.commit_prompt_tokens(); - assert_eq!(promoted, prompt_token_count); - - let mut sampler = LlamaSampler::greedy(); - let initial_position = batch.n_tokens(); - let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut context, - batch: &mut batch, - initial_position, - max_generated_tokens: 1024, - } - .run()?; - - assert!(!outcome.generated_raw.is_empty()); - assert!(outcome.observed_reasoning > 0); - assert!(outcome.observed_content > 0); - assert_eq!(outcome.observed_undeterminable, 0); - assert_eq!(outcome.observed_tool_call, 0); - - let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?; - let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else { - bail!("Qwen3.6 chat template must be recognised by the parser; got Unrecognized"); - }; - assert!(!parsed.content.is_empty()); - - let usage = classifier.into_usage(); - assert_eq!(usage.prompt_tokens, prompt_token_count); - 
assert_eq!(usage.reasoning_tokens, outcome.observed_reasoning); - assert_eq!(usage.undeterminable_tokens, 0); - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/qwen36_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs b/llama-cpp-bindings-tests/tests/qwen36_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs deleted file mode 100644 index 2b57fa17..00000000 --- a/llama-cpp-bindings-tests/tests/qwen36_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs +++ /dev/null @@ -1,95 +0,0 @@ -use anyhow::Result; -use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::llama_batch::LlamaBatch; -use llama_cpp_bindings::model::AddBos; -use llama_cpp_bindings::sampling::LlamaSampler; -use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -const MAX_GENERATED_TOKENS: i32 = 200; - -const QWEN36_THINKING_DISABLED_PROMPT: &str = "\ -<|im_start|>user -What is 2 + 2?<|im_end|> -<|im_start|>assistant - - - - -"; - -const FORBIDDEN_MARKERS: &[&str] = &["", ""]; - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 8192, - n_batch = 2048, - n_ubatch = 512, -)] -fn qwen36_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - - let mut classifier = model.sampled_token_classifier(); - let prompt_tokens = model.str_to_token(QWEN36_THINKING_DISABLED_PROMPT, AddBos::Never)?; - let prompt_token_count = u64::try_from(prompt_tokens.len())?; - - let mut batch = LlamaBatch::new(2048, 1)?; - classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; - - let mut context = 
LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - context.decode(&mut batch)?; - - let promoted = classifier.commit_prompt_tokens(); - assert_eq!(promoted, prompt_token_count); - - let mut sampler = LlamaSampler::chain_simple([ - LlamaSampler::penalties(64, 1.1, 0.0, 0.0), - LlamaSampler::top_k(40), - LlamaSampler::top_p(0.9, 1), - LlamaSampler::min_p(0.05, 1), - LlamaSampler::temp(0.7), - LlamaSampler::dist(0x00C0_FFEE), - ]); - let initial_position = batch.n_tokens(); - let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut context, - batch: &mut batch, - initial_position, - max_generated_tokens: MAX_GENERATED_TOKENS, - } - .run()?; - - let usage = classifier.usage(); - - assert!(!outcome.generated_raw.is_empty()); - assert_eq!(outcome.observed_reasoning, 0); - assert_eq!(outcome.observed_undeterminable, 0); - assert_eq!(usage.reasoning_tokens, 0); - assert_eq!(usage.undeterminable_tokens, 0); - assert!(outcome.observed_content > 0); - assert_eq!(usage.completion_tokens(), outcome.observed_content); - - for forbidden in FORBIDDEN_MARKERS { - assert!(!outcome.content_stream.contains(forbidden)); - } - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/qwen36_classifier_emits_reasoning.rs b/llama-cpp-bindings-tests/tests/qwen36_classifier_emits_reasoning.rs deleted file mode 100644 index c9c16a64..00000000 --- a/llama-cpp-bindings-tests/tests/qwen36_classifier_emits_reasoning.rs +++ /dev/null @@ -1,108 +0,0 @@ -use anyhow::Result; -use anyhow::bail; -use llama_cpp_bindings::ChatMessageParseOutcome; -use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::llama_batch::LlamaBatch; -use llama_cpp_bindings::model::AddBos; -use llama_cpp_bindings::sampling::LlamaSampler; -use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; -use llama_cpp_test_harness::LlamaFixture; -use 
llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -const MAX_GENERATED_TOKENS: i32 = 1500; - -const QWEN36_THINKING_PROMPT: &str = "\ -<|im_start|>user -What is 2 + 2?<|im_end|> -<|im_start|>assistant - -"; - -const FORBIDDEN_MARKERS: &[&str] = &["", ""]; - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 8192, - n_batch = 2048, - n_ubatch = 512, -)] -fn qwen36_classifier_emits_reasoning_for_thinking_enabled_prompt( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - - let mut classifier = model.sampled_token_classifier(); - let prompt_tokens = model.str_to_token(QWEN36_THINKING_PROMPT, AddBos::Never)?; - let prompt_token_count = u64::try_from(prompt_tokens.len())?; - - let mut batch = LlamaBatch::new(2048, 1)?; - classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; - - let mut context = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - context.decode(&mut batch)?; - - let promoted = classifier.commit_prompt_tokens(); - assert_eq!(promoted, prompt_token_count); - - let mut sampler = LlamaSampler::chain_simple([ - LlamaSampler::penalties(64, 1.1, 0.0, 0.0), - LlamaSampler::top_k(40), - LlamaSampler::top_p(0.9, 1), - LlamaSampler::min_p(0.05, 1), - LlamaSampler::temp(0.7), - LlamaSampler::dist(0x00C0_FFEE), - ]); - let initial_position = batch.n_tokens(); - let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut context, - batch: &mut batch, - initial_position, - max_generated_tokens: MAX_GENERATED_TOKENS, - } - .run()?; - - let usage = classifier.usage(); - let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, true)?; - let ChatMessageParseOutcome::Recognized(parsed) = 
parse_outcome else { - bail!("Qwen3.6 chat template must be recognised by the parser; got Unrecognized"); - }; - - assert!(!outcome.generated_raw.is_empty()); - assert!(outcome.observed_reasoning > 0); - assert!(usage.reasoning_tokens > 0); - assert_eq!(outcome.observed_undeterminable, 0); - assert_eq!(usage.undeterminable_tokens, 0); - assert_eq!( - usage.completion_tokens(), - outcome.observed_content + outcome.observed_reasoning, - ); - - if parsed.reasoning_content.is_empty() { - eprintln!("Qwen3.6 parser returned empty reasoning_content — relying on FORBIDDEN_MARKERS"); - } else { - assert_eq!(outcome.reasoning_stream, parsed.reasoning_content); - assert_eq!(outcome.content_stream, parsed.content); - } - - for forbidden in FORBIDDEN_MARKERS { - assert!(!outcome.reasoning_stream.contains(forbidden)); - assert!(!outcome.content_stream.contains(forbidden)); - } - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/qwen36_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs b/llama-cpp-bindings-tests/tests/qwen36_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs deleted file mode 100644 index cf43adfd..00000000 --- a/llama-cpp-bindings-tests/tests/qwen36_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs +++ /dev/null @@ -1,100 +0,0 @@ -use anyhow::Result; -use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::llama_batch::LlamaBatch; -use llama_cpp_bindings::mtmd::MtmdBitmap; -use llama_cpp_bindings::mtmd::MtmdInputText; -use llama_cpp_bindings::mtmd::mtmd_default_marker; -use llama_cpp_bindings::sampling::LlamaSampler; -use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; -use llama_cpp_bindings_tests::test_model::fixtures_dir; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -const MAX_GENERATED_TOKENS: i32 = 200; - -#[llama_test( - model_source = 
HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 8192, - n_batch = 512, - n_ubatch = 512, - mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), -)] -fn qwen36_classifier_emits_reasoning_for_multimodal_thinking_prompt( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - let mtmd_ctx = fixture - .mtmd_context - .expect("mmproj_file declared in attribute"); - - let mut context = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let image_path = fixtures_dir().join("llamas.jpg"); - let image_path_str = image_path - .to_str() - .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?; - let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?; - - let marker = mtmd_default_marker(); - let prompt = format!( - "<|im_start|>user\n{marker}What animals do you see in this image?<|im_end|>\n<|im_start|>assistant\n\n" - ); - - let input_text = MtmdInputText { - text: prompt, - add_special: false, - parse_special: true, - }; - - let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; - - let mut classifier = model.sampled_token_classifier(); - let n_past = classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?; - - let mut sampler = LlamaSampler::chain_simple([ - LlamaSampler::penalties(64, 1.1, 0.0, 0.0), - LlamaSampler::top_k(40), - LlamaSampler::top_p(0.9, 1), - LlamaSampler::min_p(0.05, 1), - LlamaSampler::temp(0.7), - LlamaSampler::dist(0x00C0_FFEE), - ]); - - let mut batch = LlamaBatch::new(2048, 1)?; - let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut context, - batch: &mut batch, - initial_position: n_past, - max_generated_tokens: MAX_GENERATED_TOKENS, - } - .run()?; - - let usage = classifier.usage(); - - if 
outcome.observed_reasoning == 0 { - anyhow::bail!( - "Qwen 3.6 multimodal + thinking: classifier must emit at least one Reasoning token; outcome={outcome:?}" - ); - } - if usage.reasoning_tokens == 0 { - anyhow::bail!( - "Qwen 3.6 multimodal + thinking: usage.reasoning_tokens must be non-zero; usage={usage:?}" - ); - } - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/reasoning_markers_and_tool_calls.rs b/llama-cpp-bindings-tests/tests/reasoning_markers_and_tool_calls.rs new file mode 100644 index 00000000..a5aac3d4 --- /dev/null +++ b/llama-cpp-bindings-tests/tests/reasoning_markers_and_tool_calls.rs @@ -0,0 +1,2484 @@ +use llama_cpp_test_harness::llama_tests_main; + +mod deepseek_r1_8b_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt { + use anyhow::Result; + use llama_cpp_bindings::context::LlamaContext; + use llama_cpp_bindings::llama_batch::LlamaBatch; + use llama_cpp_bindings::model::AddBos; + use llama_cpp_bindings::sampling::LlamaSampler; + use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + const MAX_GENERATED_TOKENS: i32 = 200; + + const DEEPSEEK_R1_8B_THINKING_DISABLED_PROMPT: &str = "\ + <|User|>What is 2 + 2?<|Assistant|> + + + + "; + + const FORBIDDEN_MARKERS: &[&str] = &["", ""]; + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 2048, + n_ubatch = 512, + )] + fn deepseek_r1_8b_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; + + let mut classifier = model.sampled_token_classifier(); + let prompt_tokens = + model.str_to_token(DEEPSEEK_R1_8B_THINKING_DISABLED_PROMPT, AddBos::Never)?; + let 
prompt_token_count = u64::try_from(prompt_tokens.len())?; + + let mut batch = LlamaBatch::new(2048, 1)?; + classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; + + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + context.decode(&mut batch)?; + + let promoted = classifier.commit_prompt_tokens(); + assert_eq!(promoted, prompt_token_count); + + let mut sampler = LlamaSampler::chain_simple([ + LlamaSampler::penalties(64, 1.1, 0.0, 0.0), + LlamaSampler::top_k(40), + LlamaSampler::top_p(0.9, 1), + LlamaSampler::min_p(0.05, 1), + LlamaSampler::temp(0.7), + LlamaSampler::dist(0x00C0_FFEE), + ]); + let initial_position = batch.n_tokens(); + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position, + max_generated_tokens: MAX_GENERATED_TOKENS, + } + .run()?; + + let usage = classifier.usage(); + + assert!( + !outcome.generated_raw.is_empty(), + "DeepSeek-R1-8B: must generate at least one token" + ); + assert_eq!( + outcome.observed_reasoning, 0, + "DeepSeek-R1-8B thinking-disabled: classifier must not emit any Reasoning token \ + when the prompt closes the think block before generation begins; \ + generated={:?}", + outcome.generated_raw + ); + assert_eq!( + outcome.observed_undeterminable, 0, + "DeepSeek-R1-8B thinking-disabled: prompt-token replay must move section to Content \ + before generation, so no Undeterminable tokens may be emitted; \ + generated={:?}", + outcome.generated_raw + ); + assert_eq!( + usage.reasoning_tokens, 0, + "DeepSeek-R1-8B thinking-disabled: usage.reasoning_tokens must be zero; usage={usage:?}" + ); + assert_eq!( + usage.undeterminable_tokens, 0, + "DeepSeek-R1-8B thinking-disabled: usage.undeterminable_tokens must be zero; usage={usage:?}" + ); + assert!( + outcome.observed_content > 0, + "DeepSeek-R1-8B thinking-disabled: 
classifier must emit at least one Content token" + ); + assert_eq!( + usage.completion_tokens(), + outcome.observed_content, + "DeepSeek-R1-8B thinking-disabled: completion tokens must equal observed Content tokens" + ); + + for forbidden in FORBIDDEN_MARKERS { + assert!( + !outcome.content_stream.contains(forbidden), + "DeepSeek-R1-8B thinking-disabled: content_stream leaked marker {forbidden:?}; \ + content_stream={:?}", + outcome.content_stream + ); + } + + Ok(()) + } +} + +mod deepseek_r1_8b_classifier_emits_reasoning { + use anyhow::Result; + use anyhow::bail; + use llama_cpp_bindings::ChatMessageParseOutcome; + use llama_cpp_bindings::context::LlamaContext; + use llama_cpp_bindings::llama_batch::LlamaBatch; + use llama_cpp_bindings::model::AddBos; + use llama_cpp_bindings::sampling::LlamaSampler; + use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + const MAX_GENERATED_TOKENS: i32 = 1500; + + // DeepSeek-R1-Distill-Llama-8B uses `...` reasoning markers + // and full-width-bar role tokens `<|User|>` / `<|Assistant|>` (U+FF5C, + // not ASCII `|`). The chat template's `add_generation_prompt` ALWAYS appends + // `<|Assistant|>\n` — DeepSeek-R1 is a pure reasoner with no + // thinking-disabled mode — so the model resumes generation already inside + // the reasoning block. 
+ const DEEPSEEK_R1_8B_THINKING_PROMPT: &str = "\ + <|User|>What is 2 + 2?<|Assistant|> + "; + + const FORBIDDEN_MARKERS: &[&str] = &["", ""]; + + #[expect( + clippy::too_many_lines, + reason = "test asserts many distinct properties of DeepSeek-R1-8B reasoning output; shortening messages or splitting the body would reduce diagnostic signal at failure time" + )] + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 2048, + n_ubatch = 512, + )] + fn deepseek_r1_8b_classifier_emits_reasoning_for_thinking_enabled_prompt( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; + + let mut classifier = model.sampled_token_classifier(); + let prompt_tokens = model.str_to_token(DEEPSEEK_R1_8B_THINKING_PROMPT, AddBos::Never)?; + let prompt_token_count = u64::try_from(prompt_tokens.len())?; + + let mut batch = LlamaBatch::new(2048, 1)?; + classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; + + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + context.decode(&mut batch)?; + + let promoted = classifier.commit_prompt_tokens(); + assert_eq!(promoted, prompt_token_count); + + let mut sampler = LlamaSampler::chain_simple([ + LlamaSampler::penalties(64, 1.1, 0.0, 0.0), + LlamaSampler::top_k(40), + LlamaSampler::top_p(0.9, 1), + LlamaSampler::min_p(0.05, 1), + LlamaSampler::temp(0.7), + LlamaSampler::dist(0x00C0_FFEE), + ]); + let initial_position = batch.n_tokens(); + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position, + max_generated_tokens: MAX_GENERATED_TOKENS, + } + .run()?; + + let usage = classifier.usage(); + let parse_outcome = 
model.parse_chat_message("[]", &outcome.generated_raw, false)?; + let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else { + bail!( + "DeepSeek-R1-8B chat template must be recognised by the parser; got Unrecognized" + ); + }; + + assert!( + !outcome.generated_raw.is_empty(), + "DeepSeek-R1-8B: must generate at least one token" + ); + assert!( + outcome.observed_reasoning > 0, + "DeepSeek-R1-8B: classifier must emit at least one Reasoning token when the prompt \ + opens a block; outcome={outcome:?}", + ); + assert!( + usage.reasoning_tokens > 0, + "DeepSeek-R1-8B: usage.reasoning_tokens must be non-zero when the prompt opens a \ + block; usage was {usage:?}" + ); + assert_eq!( + outcome.observed_undeterminable, 0, + "DeepSeek-R1-8B: prompt-token replay must move section to Reasoning before generation, \ + so no Undeterminable tokens may be emitted; outcome={outcome:?}" + ); + assert_eq!( + usage.undeterminable_tokens, 0, + "DeepSeek-R1-8B: usage.undeterminable_tokens must be zero; usage={usage:?}" + ); + assert_eq!( + usage.completion_tokens(), + outcome.observed_content + outcome.observed_reasoning, + "DeepSeek-R1-8B: completion tokens must equal observed Content + Reasoning" + ); + + if parsed.reasoning_content.is_empty() { + eprintln!( + "DeepSeek-R1-8B didn't close its reasoning block within {MAX_GENERATED_TOKENS} \ + tokens — skipping strict parser-equality assertions" + ); + } else { + assert_eq!( + outcome.reasoning_stream, parsed.reasoning_content, + "DeepSeek-R1-8B: per-token reasoning stream must equal parser-side reasoning_content \ + (any difference means a marker leaked into the user-visible stream)", + ); + assert_eq!( + outcome.content_stream, parsed.content, + "DeepSeek-R1-8B: per-token content stream must equal parser-side content \ + (any difference means a marker leaked into the user-visible stream)", + ); + } + + for forbidden in FORBIDDEN_MARKERS { + assert!( + !outcome.reasoning_stream.contains(forbidden), + "DeepSeek-R1-8B: 
reasoning_stream leaked marker {forbidden:?}; \ + reasoning_stream={:?}", + outcome.reasoning_stream + ); + assert!( + !outcome.content_stream.contains(forbidden), + "DeepSeek-R1-8B: content_stream leaked marker {forbidden:?}; \ + content_stream={:?}", + outcome.content_stream + ); + } + + Ok(()) + } +} + +mod deepseek_r1_8b_duck_types_gemma_paired_quote { + use anyhow::Result; + use anyhow::bail; + use llama_cpp_bindings::ChatMessageParseOutcome; + use llama_cpp_bindings::ToolCallArguments; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + const TOOLS_JSON: &str = r#"[ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather for a location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "The city name"} + }, + "required": ["location"] + } + } + } + ]"#; + + const GEMMA_PAIRED_QUOTE_PAYLOAD: &str = + "<|tool_call>call:get_weather{location:<|\"|>Paris<|\"|>}"; + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn deepseek_r1_8b_duck_types_gemma_paired_quote(fixture: &LlamaFixture<'_>) -> Result<()> { + let outcome = + fixture + .model + .parse_chat_message(TOOLS_JSON, GEMMA_PAIRED_QUOTE_PAYLOAD, false)?; + + let ChatMessageParseOutcome::Recognized(parsed) = outcome else { + bail!( + "duck-type pass must recognise Gemma paired-quote on a model with no registered \ + template; got Unrecognized" + ); + }; + assert_eq!( + parsed.tool_calls.len(), + 1, + "expected one tool call; got {:?}", + parsed.tool_calls + ); + assert_eq!(parsed.tool_calls[0].name, "get_weather"); + let location = match &parsed.tool_calls[0].arguments { + ToolCallArguments::ValidJson(value) => value + .get("location") + .and_then(|v| v.as_str()) + 
.map(str::to_owned), + ToolCallArguments::InvalidJson(raw) => { + bail!("expected ValidJson, got InvalidJson: {raw}"); + } + }; + assert_eq!(location.as_deref(), Some("Paris")); + + Ok(()) + } +} + +mod deepseek_r1_8b_duck_types_glm_key_value_tags { + use anyhow::Result; + use anyhow::bail; + use llama_cpp_bindings::ChatMessageParseOutcome; + use llama_cpp_bindings::ToolCallArguments; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + const TOOLS_JSON: &str = r#"[ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather for a location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "The city name"} + }, + "required": ["location"] + } + } + } + ]"#; + + const GLM_KEY_VALUE_PAYLOAD: &str = "get_weather\ + location\ + Paris\ + "; + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn deepseek_r1_8b_duck_types_glm_key_value_tags(fixture: &LlamaFixture<'_>) -> Result<()> { + let outcome = fixture + .model + .parse_chat_message(TOOLS_JSON, GLM_KEY_VALUE_PAYLOAD, false)?; + + let ChatMessageParseOutcome::Recognized(parsed) = outcome else { + bail!( + "duck-type pass must recognise GLM key-value tags on a model with no registered \ + template; got Unrecognized" + ); + }; + assert_eq!( + parsed.tool_calls.len(), + 1, + "expected one tool call; got {:?}", + parsed.tool_calls + ); + assert_eq!(parsed.tool_calls[0].name, "get_weather"); + let location = match &parsed.tool_calls[0].arguments { + ToolCallArguments::ValidJson(value) => value + .get("location") + .and_then(|v| v.as_str()) + .map(str::to_owned), + ToolCallArguments::InvalidJson(raw) => { + bail!("expected ValidJson, got InvalidJson: {raw}"); + } + }; + 
assert_eq!(location.as_deref(), Some("Paris")); + + Ok(()) + } +} + +mod deepseek_r1_8b_duck_types_mistral_bracketed_json { + use anyhow::Result; + use anyhow::bail; + use llama_cpp_bindings::ChatMessageParseOutcome; + use llama_cpp_bindings::ToolCallArguments; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + const TOOLS_JSON: &str = r#"[ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather for a location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "The city name"} + }, + "required": ["location"] + } + } + } + ]"#; + + const MISTRAL_BRACKETED_JSON_PAYLOAD: &str = + r#"[TOOL_CALLS]get_weather[ARGS]{"location":"Paris"}"#; + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn deepseek_r1_8b_duck_types_mistral_bracketed_json(fixture: &LlamaFixture<'_>) -> Result<()> { + let outcome = + fixture + .model + .parse_chat_message(TOOLS_JSON, MISTRAL_BRACKETED_JSON_PAYLOAD, false)?; + + let ChatMessageParseOutcome::Recognized(parsed) = outcome else { + bail!( + "duck-type pass must recognise Mistral bracketed-JSON on a model with no registered \ + template; got Unrecognized" + ); + }; + assert_eq!( + parsed.tool_calls.len(), + 1, + "expected one tool call; got {:?}", + parsed.tool_calls + ); + assert_eq!(parsed.tool_calls[0].name, "get_weather"); + let location = match &parsed.tool_calls[0].arguments { + ToolCallArguments::ValidJson(value) => value + .get("location") + .and_then(|v| v.as_str()) + .map(str::to_owned), + ToolCallArguments::InvalidJson(raw) => { + bail!("expected ValidJson, got InvalidJson: {raw}"); + } + }; + assert_eq!(location.as_deref(), Some("Paris")); + + Ok(()) + } +} + +mod 
deepseek_r1_8b_duck_types_qwen_xml { + use anyhow::Result; + use anyhow::bail; + use llama_cpp_bindings::ChatMessageParseOutcome; + use llama_cpp_bindings::ToolCallArguments; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + const TOOLS_JSON: &str = r#"[ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather for a location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "The city name"} + }, + "required": ["location"] + } + } + } + ]"#; + + const QWEN_XML_PAYLOAD: &str = "\n\ + \n\ + \n\ + Paris\n\ + \n\ + \n\ + "; + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn deepseek_r1_8b_duck_types_qwen_xml(fixture: &LlamaFixture<'_>) -> Result<()> { + let outcome = fixture + .model + .parse_chat_message(TOOLS_JSON, QWEN_XML_PAYLOAD, false)?; + + let ChatMessageParseOutcome::Recognized(parsed) = outcome else { + bail!( + "duck-type pass must recognise Qwen XML on a model with no registered template; \ + got Unrecognized" + ); + }; + assert_eq!( + parsed.tool_calls.len(), + 1, + "expected one tool call; got {:?}", + parsed.tool_calls + ); + assert_eq!(parsed.tool_calls[0].name, "get_weather"); + let location = match &parsed.tool_calls[0].arguments { + ToolCallArguments::ValidJson(value) => value + .get("location") + .and_then(|v| v.as_str()) + .map(str::to_owned), + ToolCallArguments::InvalidJson(raw) => { + bail!("expected ValidJson, got InvalidJson: {raw}"); + } + }; + assert_eq!(location.as_deref(), Some("Paris")); + + Ok(()) + } +} + +mod deepseek_r1_8b_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested { + use anyhow::Result; + use anyhow::bail; + use 
llama_cpp_bindings::ChatMessageParseOutcome; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + const TOOLS_JSON: &str = r#"[ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather for a location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "The city name"} + }, + "required": ["location"] + } + } + } + ]"#; + + const PLAIN_CONTENT: &str = "Sorry, I cannot help with that."; + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn deepseek_r1_8b_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let outcome = fixture + .model + .parse_chat_message(TOOLS_JSON, PLAIN_CONTENT, false)?; + + let ChatMessageParseOutcome::Recognized(parsed) = outcome else { + bail!( + "plain content with tools requested must produce Recognized (with empty tool_calls); \ + got Unrecognized" + ); + }; + assert!( + parsed.tool_calls.is_empty(), + "expected no tool calls; got {:?}", + parsed.tool_calls + ); + + Ok(()) + } +} + +mod deepseek_r1_8b_recognizes_empty_tool_calls_when_tools_not_requested { + use anyhow::Result; + use anyhow::bail; + use llama_cpp_bindings::ChatMessageParseOutcome; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + const PLAIN_CONTENT: &str = "Hello there."; + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn deepseek_r1_8b_recognizes_empty_tool_calls_when_tools_not_requested( + fixture: 
&LlamaFixture<'_>, + ) -> Result<()> { + let outcome = fixture + .model + .parse_chat_message("[]", PLAIN_CONTENT, false)?; + + let ChatMessageParseOutcome::Recognized(parsed) = outcome else { + bail!("plain content with empty tools array must produce Recognized; got Unrecognized"); + }; + assert!( + parsed.tool_calls.is_empty(), + "expected no tool calls; got {:?}", + parsed.tool_calls + ); + + Ok(()) + } +} + +mod gemma4_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt { + use anyhow::Result; + use llama_cpp_bindings::context::LlamaContext; + use llama_cpp_bindings::llama_batch::LlamaBatch; + use llama_cpp_bindings::model::AddBos; + use llama_cpp_bindings::sampling::LlamaSampler; + use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + const MAX_GENERATED_TOKENS: i32 = 200; + + const GEMMA4_THINKING_DISABLED_PROMPT: &str = "\ + user\nReply with the single word: four. 
Do not explain.\n\ + model\n<|channel>thought\n\n"; + + const FORBIDDEN_MARKERS: &[&str] = &["<|channel>thought", ""]; + + #[llama_test( + model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 2048, + n_ubatch = 512, + )] + fn gemma4_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; + + let mut classifier = model.sampled_token_classifier(); + let prompt_tokens = model.str_to_token(GEMMA4_THINKING_DISABLED_PROMPT, AddBos::Never)?; + let prompt_token_count = u64::try_from(prompt_tokens.len())?; + + let mut batch = LlamaBatch::new(2048, 1)?; + classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; + + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + context.decode(&mut batch)?; + + let promoted = classifier.commit_prompt_tokens(); + assert_eq!(promoted, prompt_token_count); + + let mut sampler = LlamaSampler::greedy(); + let initial_position = batch.n_tokens(); + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position, + max_generated_tokens: MAX_GENERATED_TOKENS, + } + .run()?; + + let usage = classifier.usage(); + + assert!( + !outcome.generated_raw.is_empty(), + "Gemma 4 must generate at least one token" + ); + assert_eq!( + outcome.observed_reasoning, 0, + "Gemma 4 thinking-disabled: classifier must not emit any Reasoning token \ + when the prompt closes the thought channel before generation begins; \ + generated={:?}", + outcome.generated_raw + ); + assert_eq!( + outcome.observed_undeterminable, 0, + "Gemma 4 thinking-disabled: prompt-token replay must move section to Content \ + before generation, so 
no Undeterminable tokens may be emitted; \ + generated={:?}", + outcome.generated_raw + ); + assert_eq!( + usage.reasoning_tokens, 0, + "Gemma 4 thinking-disabled: usage.reasoning_tokens must be zero; usage={usage:?}" + ); + assert_eq!( + usage.undeterminable_tokens, 0, + "Gemma 4 thinking-disabled: usage.undeterminable_tokens must be zero; usage={usage:?}" + ); + assert!( + outcome.observed_content > 0, + "Gemma 4 thinking-disabled: classifier must emit at least one Content token" + ); + assert_eq!( + usage.completion_tokens(), + outcome.observed_content, + "Gemma 4 thinking-disabled: completion tokens must equal observed Content tokens" + ); + + for forbidden in FORBIDDEN_MARKERS { + assert!( + !outcome.content_stream.contains(forbidden), + "Gemma 4 thinking-disabled: content_stream leaked marker {forbidden:?}; \ + content_stream={:?}", + outcome.content_stream + ); + } + + Ok(()) + } +} + +mod gemma4_classifier_emits_reasoning { + use anyhow::Result; + use anyhow::bail; + use llama_cpp_bindings::ChatMessageParseOutcome; + use llama_cpp_bindings::context::LlamaContext; + use llama_cpp_bindings::llama_batch::LlamaBatch; + use llama_cpp_bindings::model::AddBos; + use llama_cpp_bindings::sampling::LlamaSampler; + use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + const MAX_GENERATED_TOKENS: i32 = 1500; + + const GEMMA4_THINKING_PROMPT: &str = "\ + user\nReply with the single word: four. 
Do not explain.\n\ + model\n<|channel>thought\n"; + + const FORBIDDEN_MARKERS: &[&str] = &["<|channel>thought", ""]; + + #[llama_test( + model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 2048, + n_ubatch = 512, + )] + fn gemma4_classifier_emits_reasoning_for_thinking_prompt( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; + + let mut classifier = model.sampled_token_classifier(); + let prompt_tokens = model.str_to_token(GEMMA4_THINKING_PROMPT, AddBos::Never)?; + let prompt_token_count = u64::try_from(prompt_tokens.len())?; + + let mut batch = LlamaBatch::new(2048, 1)?; + classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; + + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + context.decode(&mut batch)?; + + let promoted = classifier.commit_prompt_tokens(); + assert_eq!(promoted, prompt_token_count); + + let mut sampler = LlamaSampler::greedy(); + let initial_position = batch.n_tokens(); + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position, + max_generated_tokens: MAX_GENERATED_TOKENS, + } + .run()?; + + let usage = classifier.usage(); + let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?; + let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else { + bail!("Gemma 4 chat template must be recognised by the parser; got Unrecognized"); + }; + + assert!( + !outcome.generated_raw.is_empty(), + "Gemma 4 must generate at least one token" + ); + assert!( + outcome.observed_reasoning > 0, + "Gemma 4 classifier must emit at least one Reasoning token when the model \ + emits a `<|channel>thought` block; outcome={outcome:?}", 
+ ); + assert!( + usage.reasoning_tokens > 0, + "Gemma 4 usage.reasoning_tokens must be non-zero when the model emits a \ + reasoning block; usage was {usage:?}" + ); + assert_eq!( + outcome.observed_undeterminable, 0, + "Gemma 4: classifier must not emit Undeterminable when the model emits a \ + detected `<|channel>thought` marker; outcome={outcome:?}" + ); + assert_eq!( + usage.undeterminable_tokens, 0, + "Gemma 4: usage.undeterminable_tokens must be zero; usage={usage:?}" + ); + assert_eq!( + usage.completion_tokens(), + outcome.observed_content + outcome.observed_reasoning, + "Gemma 4: completion tokens must equal observed Content + Reasoning" + ); + assert!( + !parsed.reasoning_content.is_empty(), + "Gemma 4 must close its reasoning block within {MAX_GENERATED_TOKENS} tokens; \ + increase the budget or pick a more direct prompt. generated={:?}", + outcome.generated_raw, + ); + + for forbidden in FORBIDDEN_MARKERS { + assert!( + !outcome.reasoning_stream.contains(forbidden), + "Gemma 4: reasoning_stream leaked marker {forbidden:?}; \ + reasoning_stream={:?}", + outcome.reasoning_stream + ); + assert!( + !outcome.content_stream.contains(forbidden), + "Gemma 4: content_stream leaked marker {forbidden:?}; \ + content_stream={:?}", + outcome.content_stream + ); + } + + Ok(()) + } +} + +mod gemma4_parses_tool_call_payload { + use anyhow::Result; + use anyhow::bail; + use llama_cpp_bindings::ChatMessageParseOutcome; + use llama_cpp_bindings::ToolCallArguments; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + const TOOLS_JSON: &str = r#"[ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather for a location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "The city name"} + }, + "required": ["location"] + } + } + } + ]"#; + + const GEMMA4_PAIRED_QUOTE_PAYLOAD: &str = + 
"<|tool_call>call:get_weather{location:<|\"|>Paris<|\"|>}"; + + #[llama_test( + model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn gemma4_parses_tool_call_payload(fixture: &LlamaFixture<'_>) -> Result<()> { + let outcome = + fixture + .model + .parse_chat_message(TOOLS_JSON, GEMMA4_PAIRED_QUOTE_PAYLOAD, false)?; + + let ChatMessageParseOutcome::Recognized(parsed) = outcome else { + bail!( + "expected Recognized for Gemma 4 PairedQuote on a Gemma-4 model; got Unrecognized" + ); + }; + assert_eq!( + parsed.tool_calls.len(), + 1, + "expected one tool call; got {:?}", + parsed.tool_calls + ); + assert_eq!(parsed.tool_calls[0].name, "get_weather"); + let location = match &parsed.tool_calls[0].arguments { + ToolCallArguments::ValidJson(value) => value + .get("location") + .and_then(|v| v.as_str()) + .map(str::to_owned), + ToolCallArguments::InvalidJson(raw) => { + bail!("expected ValidJson, got InvalidJson: {raw}"); + } + }; + assert_eq!(location.as_deref(), Some("Paris")); + + Ok(()) + } +} + +mod gemma4_template_override_returns_full_markers { + #![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" + )] + + use anyhow::Result; + use llama_cpp_bindings::ToolCallArgsShape; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + #[llama_test( + model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn gemma4_template_override_returns_full_markers(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let template = model + .chat_template(None) + .expect("Gemma 4 chat template must be present"); + let template_str = 
template.to_str().expect("template must be valid UTF-8"); + assert!( + template_str.contains("<|tool_call>call:"), + "Gemma 4 chat template must contain '<|tool_call>call:' fingerprint; \ + template starts with: {:?}", + &template_str[..template_str.len().min(200)], + ); + + let markers = model + .tool_call_markers() + .expect("Gemma 4 must produce ToolCallMarkers via override registry"); + + assert_eq!(markers.open, "<|tool_call>call:"); + assert_eq!(markers.close, "}"); + let ToolCallArgsShape::PairedQuote(shape) = markers.args_shape else { + panic!("expected PairedQuote variant, got {:?}", markers.args_shape); + }; + assert_eq!(shape.name_args_separator, "{"); + assert_eq!(shape.value_quote.open, "<|\"|>"); + assert_eq!(shape.value_quote.close, "<|\"|>"); + + Ok(()) + } +} + +mod glm47_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt { + use anyhow::Result; + use llama_cpp_bindings::context::LlamaContext; + use llama_cpp_bindings::llama_batch::LlamaBatch; + use llama_cpp_bindings::model::AddBos; + use llama_cpp_bindings::sampling::LlamaSampler; + use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + const MAX_GENERATED_TOKENS: i32 = 200; + + const GLM47_THINKING_DISABLED_PROMPT: &str = "\ + <|user|> + What is 2 + 2? 
+ <|assistant|> + + + "; + + const FORBIDDEN_MARKERS: &[&str] = &["", ""]; + + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 2048, + n_ubatch = 512, + )] + fn glm47_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; + + let mut classifier = model.sampled_token_classifier(); + let prompt_tokens = model.str_to_token(GLM47_THINKING_DISABLED_PROMPT, AddBos::Never)?; + let prompt_token_count = u64::try_from(prompt_tokens.len())?; + + let mut batch = LlamaBatch::new(2048, 1)?; + classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; + + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + context.decode(&mut batch)?; + + let promoted = classifier.commit_prompt_tokens(); + assert_eq!(promoted, prompt_token_count); + + let mut sampler = LlamaSampler::chain_simple([ + LlamaSampler::penalties(64, 1.1, 0.0, 0.0), + LlamaSampler::top_k(40), + LlamaSampler::top_p(0.9, 1), + LlamaSampler::min_p(0.05, 1), + LlamaSampler::temp(0.7), + LlamaSampler::dist(0x00C0_FFEE), + ]); + let initial_position = batch.n_tokens(); + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position, + max_generated_tokens: MAX_GENERATED_TOKENS, + } + .run()?; + + let usage = classifier.usage(); + + assert!(!outcome.generated_raw.is_empty()); + assert_eq!(outcome.observed_reasoning, 0); + assert_eq!(outcome.observed_undeterminable, 0); + assert_eq!(usage.reasoning_tokens, 0); + assert_eq!(usage.undeterminable_tokens, 0); + assert!(outcome.observed_content > 0); + assert_eq!(usage.completion_tokens(), outcome.observed_content); + + 
for forbidden in FORBIDDEN_MARKERS { + assert!(!outcome.content_stream.contains(forbidden)); + } + + Ok(()) + } +} + +mod glm47_classifier_emits_reasoning { + use anyhow::Result; + use anyhow::bail; + use llama_cpp_bindings::ChatMessageParseOutcome; + use llama_cpp_bindings::context::LlamaContext; + use llama_cpp_bindings::llama_batch::LlamaBatch; + use llama_cpp_bindings::model::AddBos; + use llama_cpp_bindings::sampling::LlamaSampler; + use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + const MAX_GENERATED_TOKENS: i32 = 1500; + + const GLM47_THINKING_PROMPT: &str = "\ + <|user|> + What is 2 + 2? + <|assistant|> + + "; + + const FORBIDDEN_MARKERS: &[&str] = &["", ""]; + + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 2048, + n_ubatch = 512, + )] + fn glm47_classifier_emits_reasoning_for_thinking_enabled_prompt( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; + + let mut classifier = model.sampled_token_classifier(); + let prompt_tokens = model.str_to_token(GLM47_THINKING_PROMPT, AddBos::Never)?; + let prompt_token_count = u64::try_from(prompt_tokens.len())?; + + let mut batch = LlamaBatch::new(2048, 1)?; + classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; + + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + context.decode(&mut batch)?; + + let promoted = classifier.commit_prompt_tokens(); + assert_eq!(promoted, prompt_token_count); + + let mut sampler = LlamaSampler::chain_simple([ + LlamaSampler::penalties(64, 1.1, 0.0, 0.0), + LlamaSampler::top_k(40), + LlamaSampler::top_p(0.9, 1), + LlamaSampler::min_p(0.05, 1), + 
LlamaSampler::temp(0.7), + LlamaSampler::dist(0x00C0_FFEE), + ]); + let initial_position = batch.n_tokens(); + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position, + max_generated_tokens: MAX_GENERATED_TOKENS, + } + .run()?; + + let usage = classifier.usage(); + let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?; + let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else { + bail!("GLM-4.7 chat template must be recognised by the parser; got Unrecognized"); + }; + + assert!(!outcome.generated_raw.is_empty()); + assert!(outcome.observed_reasoning > 0); + assert!(usage.reasoning_tokens > 0); + assert_eq!(outcome.observed_undeterminable, 0); + assert_eq!(usage.undeterminable_tokens, 0); + assert_eq!( + usage.completion_tokens(), + outcome.observed_content + outcome.observed_reasoning + ); + + if parsed.reasoning_content.is_empty() { + eprintln!( + "GLM-4.7 didn't close its reasoning block within {MAX_GENERATED_TOKENS} tokens — \ + skipping strict parser-equality assertions" + ); + } else { + assert_eq!(outcome.reasoning_stream, parsed.reasoning_content); + assert_eq!(outcome.content_stream, parsed.content); + } + + for forbidden in FORBIDDEN_MARKERS { + assert!(!outcome.reasoning_stream.contains(forbidden)); + assert!(!outcome.content_stream.contains(forbidden)); + } + + Ok(()) + } +} + +mod glm47_parses_tool_call_payload { + use anyhow::Result; + use anyhow::bail; + use llama_cpp_bindings::ChatMessageParseOutcome; + use llama_cpp_bindings::ToolCallArguments; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + const TOOLS_JSON: &str = r#"[ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather for a location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "The city 
name"} + }, + "required": ["location"] + } + } + } + ]"#; + + const GLM47_KEY_VALUE_PAYLOAD: &str = "get_weather\ + location\ + Paris\ + "; + + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn glm47_parses_tool_call_payload(fixture: &LlamaFixture<'_>) -> Result<()> { + let outcome = + fixture + .model + .parse_chat_message(TOOLS_JSON, GLM47_KEY_VALUE_PAYLOAD, false)?; + + let ChatMessageParseOutcome::Recognized(parsed) = outcome else { + bail!( + "expected Recognized for GLM-4.7 key-value tags on a GLM-4.7-Flash model; got Unrecognized" + ); + }; + assert_eq!(parsed.tool_calls.len(), 1); + assert_eq!(parsed.tool_calls[0].name, "get_weather"); + let location = match &parsed.tool_calls[0].arguments { + ToolCallArguments::ValidJson(value) => value + .get("location") + .and_then(|v| v.as_str()) + .map(str::to_owned), + ToolCallArguments::InvalidJson(raw) => { + bail!("expected ValidJson, got InvalidJson: {raw}"); + } + }; + assert_eq!(location.as_deref(), Some("Paris")); + + Ok(()) + } +} + +mod glm47_template_override_returns_full_markers { + #![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" + )] + + use anyhow::Result; + use llama_cpp_bindings::ToolCallArgsShape; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn glm47_template_override_returns_full_markers(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let template = model + .chat_template(None) + .expect("GLM-4.7 chat template must be present"); + let template_str = 
template.to_str().expect("template must be valid UTF-8"); + assert!(template_str.contains("")); + + let markers = model + .tool_call_markers() + .expect("GLM-4.7 must produce ToolCallMarkers via override registry"); + + assert_eq!(markers.open, ""); + assert_eq!(markers.close, ""); + let ToolCallArgsShape::KeyValueXmlTags(shape) = markers.args_shape else { + panic!( + "expected KeyValueXmlTags variant, got {:?}", + markers.args_shape + ); + }; + assert_eq!(shape.key_open, ""); + assert_eq!(shape.key_close, ""); + assert_eq!(shape.value_open, ""); + assert_eq!(shape.value_close, ""); + + Ok(()) + } +} + +mod mistral3_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt { + use anyhow::Result; + use llama_cpp_bindings::context::LlamaContext; + use llama_cpp_bindings::llama_batch::LlamaBatch; + use llama_cpp_bindings::model::AddBos; + use llama_cpp_bindings::sampling::LlamaSampler; + use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + const MAX_GENERATED_TOKENS: i32 = 200; + + const MISTRAL3_THINKING_DISABLED_PROMPT: &str = "\ + [INST]Reply with the single word: four. 
Do not explain.[/INST][THINK][/THINK]"; + + const FORBIDDEN_MARKERS: &[&str] = &["[THINK]", "[/THINK]"]; + + #[llama_test( + model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 2048, + n_ubatch = 512, + )] + fn mistral3_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; + + let mut classifier = model.sampled_token_classifier(); + let prompt_tokens = + model.str_to_token(MISTRAL3_THINKING_DISABLED_PROMPT, AddBos::Always)?; + let prompt_token_count = u64::try_from(prompt_tokens.len())?; + + let mut batch = LlamaBatch::new(2048, 1)?; + classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; + + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + context.decode(&mut batch)?; + + let promoted = classifier.commit_prompt_tokens(); + assert_eq!(promoted, prompt_token_count); + + let mut sampler = LlamaSampler::greedy(); + let initial_position = batch.n_tokens(); + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position, + max_generated_tokens: MAX_GENERATED_TOKENS, + } + .run()?; + + let usage = classifier.usage(); + + assert!(!outcome.generated_raw.is_empty()); + assert_eq!(outcome.observed_reasoning, 0); + assert_eq!(outcome.observed_undeterminable, 0); + assert_eq!(usage.reasoning_tokens, 0); + assert_eq!(usage.undeterminable_tokens, 0); + assert!(outcome.observed_content > 0); + assert_eq!(usage.completion_tokens(), outcome.observed_content); + + for forbidden in FORBIDDEN_MARKERS { + assert!(!outcome.content_stream.contains(forbidden)); + } + + Ok(()) + } +} + +mod 
mistral3_classifier_emits_reasoning { + use anyhow::Result; + use anyhow::bail; + use llama_cpp_bindings::ChatMessageParseOutcome; + use llama_cpp_bindings::context::LlamaContext; + use llama_cpp_bindings::llama_batch::LlamaBatch; + use llama_cpp_bindings::model::AddBos; + use llama_cpp_bindings::sampling::LlamaSampler; + use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + const MAX_GENERATED_TOKENS: i32 = 768; + + const MISTRAL3_THINKING_PROMPT: &str = "\ + [SYSTEM_PROMPT]# HOW YOU SHOULD THINK AND ANSWER\n\n\ + First draft your thinking process (inner monologue) until you arrive at a response. \ + Format your response using Markdown, and use LaTeX for any mathematical equations. \ + Write both your thoughts and the response in the same language as the input.\n\n\ + Your thinking process must follow the template below:\ + [THINK]Your thoughts or/and draft, like working through an exercise on scratch paper. \ + Be as casual and as long as you want until you are confident to generate the response \ + to the user.[/THINK]Here, provide a self-contained response.[/SYSTEM_PROMPT]\ + [INST]Reply with the single word: four. 
Do not explain.[/INST]"; + + const FORBIDDEN_MARKERS: &[&str] = &["[THINK]", "[/THINK]"]; + + #[llama_test( + model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 2048, + n_ubatch = 512, + )] + fn mistral3_classifier_emits_reasoning_for_thinking_prompt( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; + + let mut classifier = model.sampled_token_classifier(); + let prompt_tokens = model.str_to_token(MISTRAL3_THINKING_PROMPT, AddBos::Always)?; + let prompt_token_count = u64::try_from(prompt_tokens.len())?; + + let mut batch = LlamaBatch::new(2048, 1)?; + classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; + + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + context.decode(&mut batch)?; + + let promoted = classifier.commit_prompt_tokens(); + assert_eq!(promoted, prompt_token_count); + + let mut sampler = LlamaSampler::greedy(); + let initial_position = batch.n_tokens(); + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position, + max_generated_tokens: MAX_GENERATED_TOKENS, + } + .run()?; + + let usage = classifier.usage(); + let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?; + let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else { + bail!("Mistral 3 chat template must be recognised by the parser; got Unrecognized"); + }; + + assert!(!outcome.generated_raw.is_empty()); + assert!(outcome.observed_reasoning > 0); + assert!(usage.reasoning_tokens > 0); + assert_eq!(outcome.observed_undeterminable, 0); + assert_eq!(usage.undeterminable_tokens, 0); + assert_eq!( + usage.completion_tokens(), + 
outcome.observed_content + outcome.observed_reasoning, + ); + assert!(!parsed.reasoning_content.is_empty()); + assert_eq!(outcome.reasoning_stream, parsed.reasoning_content); + assert_eq!(outcome.content_stream, parsed.content); + + for forbidden in FORBIDDEN_MARKERS { + assert!(!outcome.reasoning_stream.contains(forbidden)); + assert!(!outcome.content_stream.contains(forbidden)); + } + + Ok(()) + } +} + +mod mistral3_parses_tool_call_payload { + use anyhow::Result; + use anyhow::bail; + use llama_cpp_bindings::ChatMessageParseOutcome; + use llama_cpp_bindings::ToolCallArguments; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + const TOOLS_JSON: &str = r#"[ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather for a location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "The city name"} + }, + "required": ["location"] + } + } + } + ]"#; + + const MISTRAL3_BRACKETED_JSON_PAYLOAD: &str = + r#"[TOOL_CALLS]get_weather[ARGS]{"location":"Paris"}"#; + + #[llama_test( + model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn mistral3_parses_tool_call_payload(fixture: &LlamaFixture<'_>) -> Result<()> { + let outcome = + fixture + .model + .parse_chat_message(TOOLS_JSON, MISTRAL3_BRACKETED_JSON_PAYLOAD, false)?; + + let ChatMessageParseOutcome::Recognized(parsed) = outcome else { + bail!( + "expected Recognized for Mistral 3 BracketedJson on a Mistral-3 model; got Unrecognized" + ); + }; + assert_eq!(parsed.tool_calls.len(), 1); + assert_eq!(parsed.tool_calls[0].name, "get_weather"); + let location = match &parsed.tool_calls[0].arguments { + ToolCallArguments::ValidJson(value) => value + .get("location") + .and_then(|v| v.as_str()) + 
.map(str::to_owned), + ToolCallArguments::InvalidJson(raw) => { + bail!("expected ValidJson, got InvalidJson: {raw}"); + } + }; + assert_eq!(location.as_deref(), Some("Paris")); + + Ok(()) + } +} + +mod qwen35_chat_inference_emits_reasoning_when_template_auto_opens { + use anyhow::Result; + use anyhow::bail; + use llama_cpp_bindings::ChatMessageParseOutcome; + use llama_cpp_bindings::context::LlamaContext; + use llama_cpp_bindings::llama_batch::LlamaBatch; + use llama_cpp_bindings::model::AddBos; + use llama_cpp_bindings::model::LlamaChatMessage; + use llama_cpp_bindings::sampling::LlamaSampler; + use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 2048, + n_batch = 512, + n_ubatch = 128, + )] + fn qwen35_chat_inference_emits_reasoning_when_template_auto_opens( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; + + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let chat_template = model.chat_template(None)?; + let messages = vec![LlamaChatMessage::new( + "user".to_owned(), + "Hello! 
How are you?".to_owned(), + )?]; + let prompt = model.apply_chat_template(&chat_template, &messages, true)?; + + let mut classifier = model.sampled_token_classifier(); + let tokens = model.str_to_token(&prompt, AddBos::Always)?; + let prompt_token_count = u64::try_from(tokens.len())?; + + let mut batch = LlamaBatch::new(512, 1)?; + classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?; + + context.decode(&mut batch)?; + + let promoted = classifier.commit_prompt_tokens(); + assert_eq!(promoted, prompt_token_count); + + let mut sampler = LlamaSampler::greedy(); + let initial_position = batch.n_tokens(); + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position, + max_generated_tokens: 1024, + } + .run()?; + + assert!(!outcome.generated_raw.is_empty()); + assert!(outcome.observed_reasoning > 0); + assert!(outcome.observed_content > 0); + assert_eq!(outcome.observed_undeterminable, 0); + assert_eq!(outcome.observed_tool_call, 0); + + let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?; + let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else { + bail!("Qwen3.5 chat template must be recognised by the parser; got Unrecognized"); + }; + assert!(!parsed.content.is_empty()); + + let usage = classifier.into_usage(); + assert_eq!(usage.prompt_tokens, prompt_token_count); + assert_eq!(usage.reasoning_tokens, outcome.observed_reasoning); + assert_eq!(usage.undeterminable_tokens, 0); + + Ok(()) + } +} + +mod qwen35_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt { + use anyhow::Result; + use llama_cpp_bindings::context::LlamaContext; + use llama_cpp_bindings::llama_batch::LlamaBatch; + use llama_cpp_bindings::model::AddBos; + use llama_cpp_bindings::sampling::LlamaSampler; + use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; + use llama_cpp_test_harness::LlamaFixture; + 
use llama_cpp_test_harness::llama_test; + + const MAX_GENERATED_TOKENS: i32 = 200; + + const QWEN35_THINKING_DISABLED_PROMPT: &str = "\ + <|im_start|>user + What is 2 + 2?<|im_end|> + <|im_start|>assistant + + + + + "; + + const FORBIDDEN_MARKERS: &[&str] = &["", ""]; + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 2048, + n_ubatch = 512, + )] + fn qwen35_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; + + let mut classifier = model.sampled_token_classifier(); + let prompt_tokens = model.str_to_token(QWEN35_THINKING_DISABLED_PROMPT, AddBos::Never)?; + let prompt_token_count = u64::try_from(prompt_tokens.len())?; + + let mut batch = LlamaBatch::new(2048, 1)?; + classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; + + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + context.decode(&mut batch)?; + + let promoted = classifier.commit_prompt_tokens(); + assert_eq!(promoted, prompt_token_count); + + let mut sampler = LlamaSampler::chain_simple([ + LlamaSampler::penalties(64, 1.1, 0.0, 0.0), + LlamaSampler::top_k(40), + LlamaSampler::top_p(0.9, 1), + LlamaSampler::min_p(0.05, 1), + LlamaSampler::temp(0.7), + LlamaSampler::dist(0x00C0_FFEE), + ]); + let initial_position = batch.n_tokens(); + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position, + max_generated_tokens: MAX_GENERATED_TOKENS, + } + .run()?; + + let usage = classifier.usage(); + + assert!(!outcome.generated_raw.is_empty()); + assert_eq!(outcome.observed_reasoning, 0); + assert_eq!(outcome.observed_undeterminable, 0); + 
assert_eq!(usage.reasoning_tokens, 0); + assert_eq!(usage.undeterminable_tokens, 0); + assert!(outcome.observed_content > 0); + assert_eq!(usage.completion_tokens(), outcome.observed_content); + + for forbidden in FORBIDDEN_MARKERS { + assert!(!outcome.content_stream.contains(forbidden)); + } + + Ok(()) + } +} + +mod qwen35_classifier_emits_reasoning { + use anyhow::Result; + use anyhow::bail; + use llama_cpp_bindings::ChatMessageParseOutcome; + use llama_cpp_bindings::context::LlamaContext; + use llama_cpp_bindings::llama_batch::LlamaBatch; + use llama_cpp_bindings::model::AddBos; + use llama_cpp_bindings::sampling::LlamaSampler; + use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + const MAX_GENERATED_TOKENS: i32 = 1500; + + const QWEN35_THINKING_PROMPT: &str = "\ + <|im_start|>user + What is 2 + 2?<|im_end|> + <|im_start|>assistant + + "; + + const FORBIDDEN_MARKERS: &[&str] = &["", ""]; + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 2048, + n_ubatch = 512, + )] + fn qwen35_classifier_emits_reasoning_for_thinking_enabled_prompt( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; + + let mut classifier = model.sampled_token_classifier(); + let prompt_tokens = model.str_to_token(QWEN35_THINKING_PROMPT, AddBos::Never)?; + let prompt_token_count = u64::try_from(prompt_tokens.len())?; + + let mut batch = LlamaBatch::new(2048, 1)?; + classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; + + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + context.decode(&mut batch)?; + + let promoted = classifier.commit_prompt_tokens(); + assert_eq!(promoted, 
prompt_token_count); + + let mut sampler = LlamaSampler::chain_simple([ + LlamaSampler::penalties(64, 1.1, 0.0, 0.0), + LlamaSampler::top_k(40), + LlamaSampler::top_p(0.9, 1), + LlamaSampler::min_p(0.05, 1), + LlamaSampler::temp(0.7), + LlamaSampler::dist(0x00C0_FFEE), + ]); + let initial_position = batch.n_tokens(); + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position, + max_generated_tokens: MAX_GENERATED_TOKENS, + } + .run()?; + + let usage = classifier.usage(); + let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?; + let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else { + bail!("Qwen3.5 chat template must be recognised by the parser; got Unrecognized"); + }; + + assert!(!outcome.generated_raw.is_empty()); + assert!(outcome.observed_reasoning > 0); + assert!(usage.reasoning_tokens > 0); + assert_eq!(outcome.observed_undeterminable, 0); + assert_eq!(usage.undeterminable_tokens, 0); + assert_eq!( + usage.completion_tokens(), + outcome.observed_content + outcome.observed_reasoning, + ); + + if parsed.reasoning_content.is_empty() { + eprintln!( + "Qwen3.5 didn't close its reasoning block within {MAX_GENERATED_TOKENS} tokens — \ + skipping strict parser-equality assertions" + ); + } else { + assert_eq!(outcome.reasoning_stream, parsed.reasoning_content); + assert_eq!(outcome.content_stream, parsed.content); + } + + for forbidden in FORBIDDEN_MARKERS { + assert!(!outcome.reasoning_stream.contains(forbidden)); + assert!(!outcome.content_stream.contains(forbidden)); + } + + Ok(()) + } +} + +mod qwen35_parses_constrained_schema_payload { + use anyhow::Result; + use anyhow::bail; + use llama_cpp_bindings::ChatMessageParseOutcome; + use llama_cpp_bindings::ToolCallArguments; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + use serde_json::Value; + use serde_json::json; + + 
const NEGOTIATE_WITH_CAT_TOOLS_JSON: &str = r#"[ + { + "type": "function", + "function": { + "name": "negotiate_with_cat", + "description": "Attempt to negotiate with a cat. Outcomes are not guaranteed and may include the silent treatment.", + "parameters": { + "type": "object", + "properties": { + "topic": { + "type": "string", + "description": "What you are trying to negotiate, e.g. 'get off the keyboard' or 'stop knocking things off the table'" + }, + "bribe": { + "type": "string", + "enum": ["tuna", "salmon", "treats", "ear_scritches", "cardboard_box", "none"], + "description": "What you are offering in exchange" + }, + "desperation_level": { + "type": "integer", + "description": "How desperate you are, on a scale from 1 (mildly annoyed human) to 10 (it is 3am)", + "minimum": 1, + "maximum": 10 + } + }, + "required": ["topic"], + "additionalProperties": false + } + } + } + ]"#; + + const NEGOTIATE_WITH_CAT_INPUT: &str = "\n\ + \n\ + \n\ + tuna\n\ + \n\ + \n\ + 8\n\ + \n\ + \n\ + get off the keyboard\n\ + \n\ + \n\ + "; + + fn arguments_as_json(arguments: &ToolCallArguments) -> Result<&Value> { + match arguments { + ToolCallArguments::ValidJson(value) => Ok(value), + ToolCallArguments::InvalidJson(raw) => { + bail!("expected ValidJson arguments, got InvalidJson: {raw}") + } + } + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn qwen35_parses_constrained_schema_payload(fixture: &LlamaFixture<'_>) -> Result<()> { + let outcome = fixture.model.parse_chat_message( + NEGOTIATE_WITH_CAT_TOOLS_JSON, + NEGOTIATE_WITH_CAT_INPUT, + false, + )?; + + let ChatMessageParseOutcome::Recognized(parsed) = outcome else { + bail!( + "Qwen 3.5's tool-call payload must be parsed by the wrapper-side duck-type pass; \ + got Unrecognized" + ); + }; + + assert_eq!(parsed.tool_calls.len(), 1); + 
assert_eq!(parsed.tool_calls[0].name, "negotiate_with_cat"); + assert_eq!(parsed.tool_calls[0].id, "call_0"); + assert_eq!( + arguments_as_json(&parsed.tool_calls[0].arguments)?, + &json!({ + "bribe": "tuna", + "desperation_level": 8, + "topic": "get off the keyboard", + }), + ); + + Ok(()) + } +} + +mod qwen35_parses_tool_call_payload { + use anyhow::Result; + use anyhow::bail; + use llama_cpp_bindings::ChatMessageParseOutcome; + use llama_cpp_bindings::ToolCallArguments; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + const TOOLS_JSON: &str = r#"[ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather for a location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "The city name"} + }, + "required": ["location"] + } + } + } + ]"#; + + const QWEN_XML_PAYLOAD: &str = "\n\ + \n\ + \n\ + Paris\n\ + \n\ + \n\ + "; + + const PARTIAL_QWEN_XML_PAYLOAD: &str = "\n\n) -> Result<()> { + let outcome = fixture + .model + .parse_chat_message(TOOLS_JSON, QWEN_XML_PAYLOAD, false)?; + + let ChatMessageParseOutcome::Recognized(parsed) = outcome else { + bail!("expected Recognized for Qwen XML on a Qwen-3.5 model; got Unrecognized"); + }; + assert_eq!(parsed.tool_calls.len(), 1); + assert_eq!(parsed.tool_calls[0].name, "get_weather"); + let location = match &parsed.tool_calls[0].arguments { + ToolCallArguments::ValidJson(value) => value + .get("location") + .and_then(|v| v.as_str()) + .map(str::to_owned), + ToolCallArguments::InvalidJson(raw) => { + bail!("expected ValidJson, got InvalidJson: {raw}"); + } + }; + assert_eq!(location.as_deref(), Some("Paris")); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn 
qwen35_parses_partial_tool_call_returns_pending_state( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let outcome = + fixture + .model + .parse_chat_message(TOOLS_JSON, PARTIAL_QWEN_XML_PAYLOAD, true)?; + + let ChatMessageParseOutcome::Recognized(parsed) = outcome else { + bail!("expected Recognized for partial Qwen XML on a Qwen-3.5 model; got Unrecognized"); + }; + assert!(parsed.tool_calls.is_empty() || parsed.tool_calls.len() == 1); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn qwen35_parses_multiple_tool_calls(fixture: &LlamaFixture<'_>) -> Result<()> { + let outcome = fixture + .model + .parse_chat_message(TOOLS_JSON, TWO_QWEN_XML_PAYLOADS, false)?; + + let ChatMessageParseOutcome::Recognized(parsed) = outcome else { + bail!( + "expected Recognized for two Qwen XML payloads on a Qwen-3.5 model; got Unrecognized" + ); + }; + assert!( + !parsed.tool_calls.is_empty(), + "expected at least one tool call; got {:?}", + parsed.tool_calls + ); + + Ok(()) + } +} + +mod qwen35_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested { + use anyhow::Result; + use anyhow::bail; + use llama_cpp_bindings::ChatMessageParseOutcome; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + const TOOLS_JSON: &str = r#"[ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather for a location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "The city name"} + }, + "required": ["location"] + } + } + } + ]"#; + + const PLAIN_CONTENT: &str = "Sorry, I cannot help with that."; + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + 
use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn qwen35_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let outcome = fixture + .model + .parse_chat_message(TOOLS_JSON, PLAIN_CONTENT, false)?; + + let ChatMessageParseOutcome::Recognized(parsed) = outcome else { + bail!( + "Qwen 3.5 with tools requested + plain content must produce Recognized (with empty \ + tool_calls); got Unrecognized" + ); + }; + assert!( + parsed.tool_calls.is_empty(), + "expected no tool calls; got {:?}", + parsed.tool_calls + ); + + Ok(()) + } +} + +mod qwen36_chat_inference_emits_reasoning_when_template_auto_opens { + use anyhow::Result; + use anyhow::bail; + use llama_cpp_bindings::ChatMessageParseOutcome; + use llama_cpp_bindings::context::LlamaContext; + use llama_cpp_bindings::llama_batch::LlamaBatch; + use llama_cpp_bindings::model::AddBos; + use llama_cpp_bindings::model::LlamaChatMessage; + use llama_cpp_bindings::sampling::LlamaSampler; + use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 2048, + n_batch = 512, + n_ubatch = 128, + )] + fn qwen36_chat_inference_emits_reasoning_when_template_auto_opens( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; + + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let chat_template = model.chat_template(None)?; + let messages = vec![LlamaChatMessage::new( + "user".to_owned(), + "Hello! 
How are you?".to_owned(), + )?]; + let prompt = model.apply_chat_template(&chat_template, &messages, true)?; + + let mut classifier = model.sampled_token_classifier(); + let tokens = model.str_to_token(&prompt, AddBos::Always)?; + let prompt_token_count = u64::try_from(tokens.len())?; + + let mut batch = LlamaBatch::new(512, 1)?; + classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?; + + context.decode(&mut batch)?; + + let promoted = classifier.commit_prompt_tokens(); + assert_eq!(promoted, prompt_token_count); + + let mut sampler = LlamaSampler::greedy(); + let initial_position = batch.n_tokens(); + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position, + max_generated_tokens: 1024, + } + .run()?; + + assert!(!outcome.generated_raw.is_empty()); + assert!(outcome.observed_reasoning > 0); + assert!(outcome.observed_content > 0); + assert_eq!(outcome.observed_undeterminable, 0); + assert_eq!(outcome.observed_tool_call, 0); + + let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?; + let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else { + bail!("Qwen3.6 chat template must be recognised by the parser; got Unrecognized"); + }; + assert!(!parsed.content.is_empty()); + + let usage = classifier.into_usage(); + assert_eq!(usage.prompt_tokens, prompt_token_count); + assert_eq!(usage.reasoning_tokens, outcome.observed_reasoning); + assert_eq!(usage.undeterminable_tokens, 0); + + Ok(()) + } +} + +mod qwen36_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt { + use anyhow::Result; + use llama_cpp_bindings::context::LlamaContext; + use llama_cpp_bindings::llama_batch::LlamaBatch; + use llama_cpp_bindings::model::AddBos; + use llama_cpp_bindings::sampling::LlamaSampler; + use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; + use llama_cpp_test_harness::LlamaFixture; + 
use llama_cpp_test_harness::llama_test; + + const MAX_GENERATED_TOKENS: i32 = 200; + + const QWEN36_THINKING_DISABLED_PROMPT: &str = "\ + <|im_start|>user + What is 2 + 2?<|im_end|> + <|im_start|>assistant + + + + + "; + + const FORBIDDEN_MARKERS: &[&str] = &["", ""]; + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 2048, + n_ubatch = 512, + )] + fn qwen36_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; + + let mut classifier = model.sampled_token_classifier(); + let prompt_tokens = model.str_to_token(QWEN36_THINKING_DISABLED_PROMPT, AddBos::Never)?; + let prompt_token_count = u64::try_from(prompt_tokens.len())?; + + let mut batch = LlamaBatch::new(2048, 1)?; + classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; + + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + context.decode(&mut batch)?; + + let promoted = classifier.commit_prompt_tokens(); + assert_eq!(promoted, prompt_token_count); + + let mut sampler = LlamaSampler::chain_simple([ + LlamaSampler::penalties(64, 1.1, 0.0, 0.0), + LlamaSampler::top_k(40), + LlamaSampler::top_p(0.9, 1), + LlamaSampler::min_p(0.05, 1), + LlamaSampler::temp(0.7), + LlamaSampler::dist(0x00C0_FFEE), + ]); + let initial_position = batch.n_tokens(); + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position, + max_generated_tokens: MAX_GENERATED_TOKENS, + } + .run()?; + + let usage = classifier.usage(); + + assert!(!outcome.generated_raw.is_empty()); + assert_eq!(outcome.observed_reasoning, 0); + assert_eq!(outcome.observed_undeterminable, 
0); + assert_eq!(usage.reasoning_tokens, 0); + assert_eq!(usage.undeterminable_tokens, 0); + assert!(outcome.observed_content > 0); + assert_eq!(usage.completion_tokens(), outcome.observed_content); + + for forbidden in FORBIDDEN_MARKERS { + assert!(!outcome.content_stream.contains(forbidden)); + } + + Ok(()) + } +} + +mod qwen36_classifier_emits_reasoning { + use anyhow::Result; + use anyhow::bail; + use llama_cpp_bindings::ChatMessageParseOutcome; + use llama_cpp_bindings::context::LlamaContext; + use llama_cpp_bindings::llama_batch::LlamaBatch; + use llama_cpp_bindings::model::AddBos; + use llama_cpp_bindings::sampling::LlamaSampler; + use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + const MAX_GENERATED_TOKENS: i32 = 1500; + + const QWEN36_THINKING_PROMPT: &str = "\ + <|im_start|>user + What is 2 + 2?<|im_end|> + <|im_start|>assistant + + "; + + const FORBIDDEN_MARKERS: &[&str] = &["", ""]; + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 2048, + n_ubatch = 512, + )] + fn qwen36_classifier_emits_reasoning_for_thinking_enabled_prompt( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; + + let mut classifier = model.sampled_token_classifier(); + let prompt_tokens = model.str_to_token(QWEN36_THINKING_PROMPT, AddBos::Never)?; + let prompt_token_count = u64::try_from(prompt_tokens.len())?; + + let mut batch = LlamaBatch::new(2048, 1)?; + classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; + + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + context.decode(&mut batch)?; + + let promoted = classifier.commit_prompt_tokens(); + 
assert_eq!(promoted, prompt_token_count); + + let mut sampler = LlamaSampler::chain_simple([ + LlamaSampler::penalties(64, 1.1, 0.0, 0.0), + LlamaSampler::top_k(40), + LlamaSampler::top_p(0.9, 1), + LlamaSampler::min_p(0.05, 1), + LlamaSampler::temp(0.7), + LlamaSampler::dist(0x00C0_FFEE), + ]); + let initial_position = batch.n_tokens(); + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position, + max_generated_tokens: MAX_GENERATED_TOKENS, + } + .run()?; + + let usage = classifier.usage(); + let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, true)?; + let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else { + bail!("Qwen3.6 chat template must be recognised by the parser; got Unrecognized"); + }; + + assert!(!outcome.generated_raw.is_empty()); + assert!(outcome.observed_reasoning > 0); + assert!(usage.reasoning_tokens > 0); + assert_eq!(outcome.observed_undeterminable, 0); + assert_eq!(usage.undeterminable_tokens, 0); + assert_eq!( + usage.completion_tokens(), + outcome.observed_content + outcome.observed_reasoning, + ); + + if parsed.reasoning_content.is_empty() { + eprintln!( + "Qwen3.6 parser returned empty reasoning_content — relying on FORBIDDEN_MARKERS" + ); + } else { + assert_eq!(outcome.reasoning_stream, parsed.reasoning_content); + assert_eq!(outcome.content_stream, parsed.content); + } + + for forbidden in FORBIDDEN_MARKERS { + assert!(!outcome.reasoning_stream.contains(forbidden)); + assert!(!outcome.content_stream.contains(forbidden)); + } + + Ok(()) + } +} + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/reranker.rs b/llama-cpp-bindings-tests/tests/reranker.rs deleted file mode 100644 index d08de7eb..00000000 --- a/llama-cpp-bindings-tests/tests/reranker.rs +++ /dev/null @@ -1,158 +0,0 @@ -use std::time::Duration; - -use anyhow::{Context, Result, bail}; -use 
llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::ggml_time_us; -use llama_cpp_bindings::llama_batch::LlamaBatch; -use llama_cpp_bindings::model::AddBos; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -fn normalize(input: &[f32]) -> Vec { - let magnitude = input - .iter() - .fold(0.0, |accumulator, &value| value.mul_add(value, accumulator)) - .sqrt(); - - input.iter().map(|&value| value / magnitude).collect() -} - -fn cosine_similarity(vec_a: &[f32], vec_b: &[f32]) -> f32 { - vec_a - .iter() - .zip(vec_b.iter()) - .map(|(left, right)| left * right) - .sum::() -} - -#[llama_test( - model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, - n_seq_max = 2, - n_threads_batch = 8, - embeddings = true, -)] -fn reranking_produces_scores(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - - let query = "What is machine learning?"; - let documents = [ - "Machine learning is a subset of artificial intelligence.", - "The weather today is sunny and warm.", - ]; - - let document_count = documents.len(); - assert_eq!( - u32::try_from(document_count)?, - fixture.context_params.n_seq_max, - "attribute n_seq_max must match the document count this trial expects", - ); - - let mut ctx = LlamaContext::from_model( - model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - ) - .with_context(|| "unable to create context")?; - - let prompt_lines: Vec = documents - .iter() - .map(|document| format!("{query}{document}")) - .collect(); - - let tokens_lines_list = prompt_lines - .iter() - .map(|line| model.str_to_token(line, AddBos::Always)) - .collect::, _>>() - .with_context(|| "failed to tokenize prompts")?; - - let n_ctx = usize::try_from(ctx.n_ctx())?; - - if 
tokens_lines_list.iter().any(|tokens| n_ctx < tokens.len()) { - bail!("one of the provided prompts exceeds the size of the context window"); - } - - let mut classifier = model.sampled_token_classifier(); - let mut batch = LlamaBatch::new(2048, i32::try_from(document_count)?)?; - let t_main_start = ggml_time_us(); - - for (sequence_index, tokens) in tokens_lines_list.iter().enumerate() { - classifier.feed_prompt_sequence_to_batch( - &mut batch, - tokens, - i32::try_from(sequence_index)?, - false, - )?; - } - - let total_tokens: usize = tokens_lines_list.iter().map(Vec::len).sum(); - let total_token_count = u64::try_from(total_tokens)?; - - assert_eq!(classifier.pending_prompt_tokens(), total_token_count); - assert_eq!(classifier.usage().prompt_tokens, 0); - - ctx.clear_kv_cache(); - ctx.decode(&mut batch) - .with_context(|| "llama_decode() failed")?; - - let promoted = classifier.commit_prompt_tokens(); - assert_eq!(promoted, total_token_count); - - let mut embeddings = Vec::with_capacity(document_count); - - for sequence_index in 0..document_count { - let raw_embedding = ctx - .embeddings_seq_ith(i32::try_from(sequence_index)?) 
- .with_context(|| "failed to get sequence embeddings")?; - embeddings.push(normalize(raw_embedding)); - } - - let t_main_end = ggml_time_us(); - let duration = Duration::from_micros(u64::try_from(t_main_end - t_main_start)?); - - #[expect( - clippy::cast_precision_loss, - reason = "logged throughput tolerates f32 precision" - )] - let tokens_per_second = total_tokens as f32 / duration.as_secs_f32(); - - eprintln!( - "created embeddings for {total_tokens} tokens in {:.2} s, speed {tokens_per_second:.2} t/s", - duration.as_secs_f32(), - ); - - assert_eq!( - embeddings.len(), - document_count, - "should produce one embedding per document" - ); - - for (index, embedding) in embeddings.iter().enumerate() { - assert!( - !embedding.is_empty(), - "embedding {index} should not be empty" - ); - } - - let similarity = cosine_similarity(&embeddings[0], &embeddings[1]); - eprintln!("cosine similarity between document embeddings: {similarity:.4}"); - - assert!( - similarity.is_finite(), - "cosine similarity should be a finite number" - ); - - let usage = classifier.into_usage(); - assert_eq!(usage.prompt_tokens, total_token_count); - assert_eq!(usage.completion_tokens(), 0); - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/sampled_token_classifier_markers.rs b/llama-cpp-bindings-tests/tests/sampled_token_classifier_markers.rs deleted file mode 100644 index 4127fc58..00000000 --- a/llama-cpp-bindings-tests/tests/sampled_token_classifier_markers.rs +++ /dev/null @@ -1,513 +0,0 @@ -#![expect( - clippy::unnecessary_wraps, - reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" -)] - -use anyhow::Result; -use llama_cpp_bindings::SampledToken; -use llama_cpp_bindings::llama_batch::LlamaBatch; -use llama_cpp_bindings::sampled_token_classifier::SampledTokenClassifier; -use llama_cpp_bindings::sampled_token_section::SampledTokenSection; -use llama_cpp_bindings::streaming_markers::StreamingMarkers; -use 
llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn classifier_starts_in_pending_section_for_default_fixture( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let classifier = fixture.model.sampled_token_classifier(); - - assert_eq!(classifier.current_section(), SampledTokenSection::Pending); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] 
-#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn classifier_construction_is_idempotent_across_calls(fixture: &LlamaFixture<'_>) -> Result<()> { - let first = fixture.model.sampled_token_classifier(); - let second = fixture.model.sampled_token_classifier(); - - assert_eq!(first.current_section(), second.current_section()); - assert_eq!(first.usage(), second.usage()); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn ingest_with_no_markers_emits_undeterminable_with_visible_and_raw_piece( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let model = fixture.model; - let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default()); - - let outcomes = classifier.ingest(model.token_bos()); - - assert_eq!(outcomes.len(), 1); - let outcome = &outcomes[0]; - assert!(matches!( - outcome.sampled_token, - SampledToken::Undeterminable(_) - )); - assert_eq!(outcome.visible_piece, outcome.raw_piece); - 
assert_eq!(classifier.usage().undeterminable_tokens, 1); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn ingest_with_no_markers_decodes_each_token_independently( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let model = fixture.model; - let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default()); - - let _ = classifier.ingest(model.token_bos()); - let _ = classifier.ingest(model.token_eos()); - - assert_eq!(classifier.usage().undeterminable_tokens, 2); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - 
use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn ingest_prompt_token_with_no_markers_is_a_noop(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default()); - let usage_before = *classifier.usage(); - - classifier.ingest_prompt_token(model.token_bos()); - classifier.ingest_prompt_tokens(&[model.token_eos(), model.token_nl()]); - - assert_eq!(*classifier.usage(), usage_before); - assert_eq!(classifier.current_section(), SampledTokenSection::Pending); - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn feed_prompt_to_batch_increments_pending_prompt_tokens(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default()); - let mut batch = LlamaBatch::new(8, 1)?; - - 
classifier.feed_prompt_to_batch(&mut batch, model.token_bos(), 0, &[0], false)?; - classifier.feed_prompt_to_batch(&mut batch, model.token_eos(), 1, &[0], false)?; - - assert_eq!(classifier.pending_prompt_tokens(), 2); - assert_eq!(batch.n_tokens(), 2); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn feed_prompt_sequence_to_batch_stages_all_tokens(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default()); - let mut batch = LlamaBatch::new(8, 1)?; - - let tokens = vec![model.token_bos(), model.token_eos(), model.token_nl()]; - classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?; - - assert_eq!(classifier.pending_prompt_tokens(), 3); - assert_eq!(batch.n_tokens(), 3); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = 
HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn commit_prompt_tokens_promotes_pending_count_to_usage_and_clears( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let model = fixture.model; - let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default()); - let mut batch = LlamaBatch::new(8, 1)?; - - classifier.feed_prompt_to_batch(&mut batch, model.token_bos(), 0, &[0], false)?; - classifier.feed_prompt_to_batch(&mut batch, model.token_eos(), 1, &[0], false)?; - - let promoted = classifier.commit_prompt_tokens(); - - assert_eq!(promoted, 2); - assert_eq!(classifier.pending_prompt_tokens(), 0); - assert_eq!(classifier.usage().prompt_tokens, 2); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = 
HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn discard_pending_prompt_tokens_clears_count_without_recording_usage( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let model = fixture.model; - let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default()); - let mut batch = LlamaBatch::new(8, 1)?; - - classifier.feed_prompt_to_batch(&mut batch, model.token_bos(), 0, &[0], false)?; - - let discarded = classifier.discard_pending_prompt_tokens(); - - assert_eq!(discarded, 1); - assert_eq!(classifier.pending_prompt_tokens(), 0); - assert_eq!(classifier.usage().prompt_tokens, 0); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 128, - n_ubatch = 64, -)] -fn diagnose_tool_call_synthetic_renders_returns_a_pair_of_strings( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let (left, right) = fixture.model.diagnose_tool_call_synthetic_renders()?; - let _ = left; - let _ = right; - Ok(()) -} - -llama_tests_main!(); diff --git 
a/llama-cpp-bindings-tests/tests/sampling.rs b/llama-cpp-bindings-tests/tests/sampling.rs deleted file mode 100644 index d03e965e..00000000 --- a/llama-cpp-bindings-tests/tests/sampling.rs +++ /dev/null @@ -1,429 +0,0 @@ -#![expect( - clippy::unnecessary_wraps, - reason = "every trial returns anyhow::Result<()> to match the LlamaTestFn signature" -)] - -use anyhow::Result; -use llama_cpp_bindings::GrammarError; -use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::llama_batch::LlamaBatch; -use llama_cpp_bindings::model::AddBos; -use llama_cpp_bindings::sampling::LlamaSampler; -use llama_cpp_bindings::token::LlamaToken; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, -)] -fn dry_sampler_with_model(fixture: &LlamaFixture<'_>) -> Result<()> { - let breakers: Vec<&[u8]> = vec![b"\n", b"\t"]; - let _sampler = LlamaSampler::dry(fixture.model, 1.5, 2.0, 128, 2, &breakers); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, -)] -fn dry_sampler_with_null_byte_in_seq_breakers_returns_error( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let breakers: Vec<&[u8]> = vec![b"hello\0world"]; - let result = LlamaSampler::dry(fixture.model, 1.5, 2.0, 128, 2, breakers); - - assert!(result.is_err()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, -)] -fn grammar_returns_sampler_for_valid_grammar(fixture: 
&LlamaFixture<'_>) -> Result<()> { - let sampler = LlamaSampler::grammar(fixture.model, "root ::= \"hello\"", "root"); - - assert!(sampler.is_ok()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, -)] -fn grammar_lazy_returns_sampler_for_valid_grammar_with_triggers( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let trigger_words: Vec<&[u8]> = vec![b"function"]; - let sampler = LlamaSampler::grammar_lazy( - fixture.model, - "root ::= \"hello\"", - "root", - trigger_words, - &[], - ); - - assert!(sampler.is_ok()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, -)] -fn grammar_lazy_patterns_returns_sampler_for_valid_grammar_with_patterns( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let patterns = vec!["\\{.*".to_owned()]; - let sampler = LlamaSampler::grammar_lazy_patterns( - fixture.model, - "root ::= \"hello\"", - "root", - &patterns, - &[], - ); - - assert!(sampler.is_ok()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, -)] -fn grammar_lazy_with_root_not_found_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { - let trigger_words: Vec<&[u8]> = vec![b"function"]; - let result = LlamaSampler::grammar_lazy( - fixture.model, - "expr ::= \"hello\"", - "root", - trigger_words, - &[], - ); - - assert!(matches!(result, Err(GrammarError::RootNotFound))); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - 
use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, -)] -fn grammar_lazy_with_null_byte_in_trigger_word_returns_error( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let trigger_words: Vec<&[u8]> = vec![b"hel\0lo"]; - let result = LlamaSampler::grammar_lazy( - fixture.model, - "root ::= \"hello\"", - "root", - trigger_words, - &[], - ); - - assert!(matches!(result, Err(GrammarError::TriggerWordNullBytes(_)))); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, -)] -fn grammar_lazy_patterns_with_root_not_found_returns_error( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let patterns = vec!["\\{.*".to_owned()]; - let result = LlamaSampler::grammar_lazy_patterns( - fixture.model, - "expr ::= \"hello\"", - "root", - &patterns, - &[], - ); - - assert!(matches!(result, Err(GrammarError::RootNotFound))); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, -)] -fn grammar_lazy_patterns_with_null_byte_in_pattern_returns_error( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let patterns = vec!["hel\0lo".to_owned()]; - let result = LlamaSampler::grammar_lazy_patterns( - fixture.model, - "root ::= \"hello\"", - "root", - &patterns, - &[], - ); - - assert!(matches!(result, Err(GrammarError::GrammarNullBytes(_)))); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, -)] -fn grammar_lazy_patterns_with_malformed_regex_returns_invalid_trigger_pattern( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let patterns = 
vec!["[".to_owned()]; - let result = LlamaSampler::grammar_lazy_patterns( - fixture.model, - "root ::= \"hello\"", - "root", - &patterns, - &[], - ); - - assert!(matches!( - result, - Err(GrammarError::InvalidTriggerPattern { .. }), - )); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, -)] -fn llguidance_method_creates_sampler(fixture: &LlamaFixture<'_>) -> Result<()> { - let result = LlamaSampler::llguidance(fixture.model, "regex", r"yes|no"); - - assert!(result.is_ok()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, -)] -fn logit_bias_with_empty_biases_succeeds(_fixture: &LlamaFixture<'_>) -> Result<()> { - let result = LlamaSampler::logit_bias(0, &[]); - - assert!(result.is_ok()); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, -)] -fn dry_sampler_with_root_not_found_grammar_does_not_apply( - fixture: &LlamaFixture<'_>, -) -> Result<()> { - let breakers: Vec<&[u8]> = vec![b"\n"]; - let _sampler = LlamaSampler::dry(fixture.model, 1.5, 2.0, 128, 2, &breakers); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, -)] -fn accept_many_iterates_over_borrowed_tokens(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut sampler = LlamaSampler::chain_simple([LlamaSampler::greedy()]); - let tokens = vec![fixture.model.token_bos(), 
fixture.model.token_eos()]; - - sampler.accept_many(&tokens)?; - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, -)] -fn with_tokens_returns_self_after_accepting_each_token(fixture: &LlamaFixture<'_>) -> Result<()> { - let sampler = LlamaSampler::chain_simple([LlamaSampler::greedy()]); - let tokens = [fixture.model.token_bos(), fixture.model.token_eos()]; - - let _consumed = sampler.with_tokens(tokens.iter().copied())?; - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, -)] -fn accept_consumes_a_single_token(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut sampler = LlamaSampler::chain_simple([LlamaSampler::greedy()]); - - sampler.accept(fixture.model.token_bos())?; - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, -)] -fn try_accept_returns_ok_for_a_valid_token(_fixture: &LlamaFixture<'_>) -> Result<()> { - let mut sampler = LlamaSampler::chain_simple([LlamaSampler::greedy()]); - - sampler.try_accept(LlamaToken::new(0))?; - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, -)] -fn apply_runs_sampler_over_token_data_array(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let tokens = 
fixture.model.str_to_token("Hi", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - - let mut data_array = context.token_data_array_ith(batch.n_tokens() - 1)?; - let sampler = LlamaSampler::greedy(); - sampler.apply(&mut data_array); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 2048, - n_ubatch = 512, -)] -fn sample_returns_token_after_decode(fixture: &LlamaFixture<'_>) -> Result<()> { - let mut context = LlamaContext::from_model( - fixture.model, - fixture.backend, - (*fixture.context_params).into_llama_context_params(), - )?; - let tokens = fixture.model.str_to_token("Hello", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - let mut sampler = LlamaSampler::chain_simple([LlamaSampler::temp(0.8), LlamaSampler::greedy()]); - let result = sampler.sample(&context, batch.n_tokens() - 1); - - assert!(result.is_ok()); - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/sampling_and_constrained_decoding.rs b/llama-cpp-bindings-tests/tests/sampling_and_constrained_decoding.rs new file mode 100644 index 00000000..dc9395aa --- /dev/null +++ b/llama-cpp-bindings-tests/tests/sampling_and_constrained_decoding.rs @@ -0,0 +1,2518 @@ +use llama_cpp_test_harness::llama_tests_main; + +mod model_sampling { + use anyhow::Result; + use llama_cpp_bindings::SampledToken; + use llama_cpp_bindings::context::LlamaContext; + use llama_cpp_bindings::json_schema_to_grammar; + use llama_cpp_bindings::llama_batch::LlamaBatch; + use llama_cpp_bindings::model::AddBos; + use llama_cpp_bindings::sampling::LlamaSampler; + use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; + use llama_cpp_test_harness::LlamaFixture; + use 
llama_cpp_test_harness::llama_test; + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 256, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 256, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 256, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 256, + n_batch = 128, + n_ubatch = 64, + )] + fn sample_returns_result_and_succeeds_with_valid_index( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let model = fixture.model; + let mut context = LlamaContext::from_model( + model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let tokens = model.str_to_token("Hello", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + + batch.add_sequence(&tokens, 0, false)?; + + context.decode(&mut batch)?; + + let mut sampler = + LlamaSampler::chain_simple([LlamaSampler::temp(0.8), LlamaSampler::greedy()]); + + let result = sampler.sample(&context, batch.n_tokens() - 1); + + assert!(result.is_ok()); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers 
= 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn grammar_sampler_constrains_output_to_yes_or_no(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mut context = LlamaContext::from_model( + model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let prompt = "<|im_start|>user\nIs the sky blue? Answer yes or no.<|im_end|>\n<|im_start|>assistant\n\n\n\n\n"; + let tokens = model.str_to_token(prompt, AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + + batch.add_sequence(&tokens, 0, false)?; + + context.decode(&mut batch)?; + + let mut sampler = LlamaSampler::chain_simple([ + LlamaSampler::grammar(model, r"root ::= [Yy] [Ee] [Ss] | [Nn] [Oo]", "root")?, + LlamaSampler::temp(0.8), + LlamaSampler::greedy(), + ]); + + let mut classifier = model.sampled_token_classifier(); + let (raw_token, mut outcomes) = + classifier.sample(&mut sampler, &context, batch.n_tokens() - 1)?; + outcomes.extend(classifier.flush()); + + assert_eq!( + outcomes.len(), + 1, + "expected one finalised outcome after flush" + ); + let outcome = &outcomes[0]; + + let raw_as_sampled = SampledToken::Content(raw_token); + assert!( + !model.is_eog_token(&raw_as_sampled), + "Grammar sampler should not allow EOS as first token" + ); + + let piece = &outcome.raw_piece; + let first_char = piece + .chars() + .next() + .ok_or_else(|| anyhow::anyhow!("piece should have at least one character"))? 
+ .to_lowercase() + .next() + .ok_or_else(|| anyhow::anyhow!("lowercase iterator should yield a character"))?; + + assert!( + first_char == 'y' || first_char == 'n', + "Grammar should constrain first token to start with y/n, got: '{piece}'" + ); + assert_eq!( + classifier.usage().completion_tokens(), + 1, + "exactly one completion token sampled" + ); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn json_schema_grammar_sampler_constrains_output_to_json( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let model = fixture.model; + let mut context = LlamaContext::from_model( + model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let prompt = "<|im_start|>user\nWhat is 2+2? 
Respond with a JSON object.<|im_end|>\n<|im_start|>assistant\n\n\n\n\n"; + let tokens = model.str_to_token(prompt, AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + + batch.add_sequence(&tokens, 0, false)?; + + context.decode(&mut batch)?; + + let grammar_str = json_schema_to_grammar( + r#"{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}"#, + )?; + + let mut sampler = LlamaSampler::chain_simple([ + LlamaSampler::grammar(model, &grammar_str, "root")?, + LlamaSampler::temp(0.8), + LlamaSampler::greedy(), + ]); + + let mut classifier = model.sampled_token_classifier(); + let (raw_token, mut outcomes) = + classifier.sample(&mut sampler, &context, batch.n_tokens() - 1)?; + outcomes.extend(classifier.flush()); + + assert_eq!( + outcomes.len(), + 1, + "expected one finalised outcome after flush" + ); + let outcome = &outcomes[0]; + + let raw_as_sampled = SampledToken::Content(raw_token); + assert!( + !model.is_eog_token(&raw_as_sampled), + "Grammar sampler should not allow EOS as first token" + ); + + let piece = &outcome.raw_piece; + + assert!( + piece.starts_with('{'), + "JSON schema grammar should constrain first token to start with '{{', got: '{piece}'" + ); + assert_eq!( + classifier.usage().completion_tokens(), + 1, + "exactly one completion token sampled" + ); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx 
= 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn sample_with_grammar_produces_constrained_output_in_loop( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let model = fixture.model; + let mut context = LlamaContext::from_model( + model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let prompt = "<|im_start|>user\nIs the sky blue? yes or no<|im_end|>\n<|im_start|>assistant\n\n\n\n\n"; + let tokens = model.str_to_token(prompt, AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + + let mut classifier = model.sampled_token_classifier(); + classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?; + + context.decode(&mut batch)?; + classifier.commit_prompt_tokens(); + + let mut sampler = LlamaSampler::chain_simple([ + LlamaSampler::grammar(model, r#"root ::= "yes" | "no""#, "root")?, + LlamaSampler::temp(0.8), + LlamaSampler::greedy(), + ]); + + let initial_position = batch.n_tokens(); + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position, + max_generated_tokens: 10, + } + .run()?; + + let lowercase = outcome.generated_raw.to_lowercase(); + assert!( + lowercase == "yes" || lowercase == "no", + "Grammar loop should produce 'yes' or 'no', got: '{}'", + outcome.generated_raw + ); + assert!( + outcome.eog_seen, + "loop must terminate via EOG once grammar accepts, not by exhausting the budget; outcome={outcome:?}" + ); + assert_eq!(outcome.observed_reasoning, 0); + assert_eq!(outcome.observed_undeterminable, 0); + assert_eq!(outcome.observed_tool_call, 0); + assert!(outcome.observed_content > 0); + + let usage = classifier.into_usage(); + 
assert_eq!(usage.completion_tokens(), outcome.observed_content); + assert_eq!(usage.reasoning_tokens, 0); + assert_eq!(usage.undeterminable_tokens, 0); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn sample_without_grammar_produces_multiple_tokens(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mut context = LlamaContext::from_model( + model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let prompt = + "<|im_start|>user\nSay hello<|im_end|>\n<|im_start|>assistant\n\n\n\n\n"; + let tokens = model.str_to_token(prompt, AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + + batch.add_sequence(&tokens, 0, false)?; + + context.decode(&mut batch)?; + + let mut sampler = + LlamaSampler::chain_simple([LlamaSampler::temp(0.8), LlamaSampler::greedy()]); + + let mut classifier = model.sampled_token_classifier(); + let mut sampled_count: u64 = 0; + + for (position, _) in (batch.n_tokens()..).zip(0..5) { + let (raw_token, _outcomes) = classifier.sample(&mut sampler, &context, -1)?; + let raw_as_sampled = 
SampledToken::Content(raw_token); + + if model.is_eog_token(&raw_as_sampled) { + break; + } + + sampled_count += 1; + + batch.clear(); + batch.add(&raw_as_sampled, position, &[0], true)?; + + context.decode(&mut batch)?; + } + + let _ = classifier.flush(); + + assert!( + sampled_count > 0, + "Should produce at least one token without grammar" + ); + let usage = classifier.into_usage(); + assert!( + usage.completion_tokens() >= sampled_count, + "completion_tokens ({}) must include the {sampled_count} non-EOG samples", + usage.completion_tokens() + ); + + Ok(()) + } +} + +mod sampling { + #![expect( + clippy::unnecessary_wraps, + reason = "every trial returns anyhow::Result<()> to match the LlamaTestFn signature" + )] + + use anyhow::Result; + use llama_cpp_bindings::GrammarError; + use llama_cpp_bindings::context::LlamaContext; + use llama_cpp_bindings::llama_batch::LlamaBatch; + use llama_cpp_bindings::model::AddBos; + use llama_cpp_bindings::sampling::LlamaSampler; + use llama_cpp_bindings::token::LlamaToken; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + )] + fn dry_sampler_with_model(fixture: &LlamaFixture<'_>) -> Result<()> { + let breakers: Vec<&[u8]> = vec![b"\n", b"\t"]; + let _sampler = LlamaSampler::dry(fixture.model, 1.5, 2.0, 128, 2, &breakers); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + )] + fn dry_sampler_with_null_byte_in_seq_breakers_returns_error( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let breakers: Vec<&[u8]> = vec![b"hello\0world"]; + let result = LlamaSampler::dry(fixture.model, 1.5, 2.0, 
128, 2, breakers); + + assert!(result.is_err()); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + )] + fn grammar_returns_sampler_for_valid_grammar(fixture: &LlamaFixture<'_>) -> Result<()> { + let sampler = LlamaSampler::grammar(fixture.model, "root ::= \"hello\"", "root"); + + assert!(sampler.is_ok()); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + )] + fn grammar_lazy_returns_sampler_for_valid_grammar_with_triggers( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let trigger_words: Vec<&[u8]> = vec![b"function"]; + let sampler = LlamaSampler::grammar_lazy( + fixture.model, + "root ::= \"hello\"", + "root", + trigger_words, + &[], + ); + + assert!(sampler.is_ok()); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + )] + fn grammar_lazy_patterns_returns_sampler_for_valid_grammar_with_patterns( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let patterns = vec!["\\{.*".to_owned()]; + let sampler = LlamaSampler::grammar_lazy_patterns( + fixture.model, + "root ::= \"hello\"", + "root", + &patterns, + &[], + ); + + assert!(sampler.is_ok()); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + )] + fn grammar_lazy_with_root_not_found_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let trigger_words: Vec<&[u8]> = 
vec![b"function"]; + let result = LlamaSampler::grammar_lazy( + fixture.model, + "expr ::= \"hello\"", + "root", + trigger_words, + &[], + ); + + assert!(matches!(result, Err(GrammarError::RootNotFound))); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + )] + fn grammar_lazy_with_null_byte_in_trigger_word_returns_error( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let trigger_words: Vec<&[u8]> = vec![b"hel\0lo"]; + let result = LlamaSampler::grammar_lazy( + fixture.model, + "root ::= \"hello\"", + "root", + trigger_words, + &[], + ); + + assert!(matches!(result, Err(GrammarError::TriggerWordNullBytes(_)))); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + )] + fn grammar_lazy_patterns_with_root_not_found_returns_error( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let patterns = vec!["\\{.*".to_owned()]; + let result = LlamaSampler::grammar_lazy_patterns( + fixture.model, + "expr ::= \"hello\"", + "root", + &patterns, + &[], + ); + + assert!(matches!(result, Err(GrammarError::RootNotFound))); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + )] + fn grammar_lazy_patterns_with_null_byte_in_pattern_returns_error( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let patterns = vec!["hel\0lo".to_owned()]; + let result = LlamaSampler::grammar_lazy_patterns( + fixture.model, + "root ::= \"hello\"", + "root", + &patterns, + &[], + ); + + assert!(matches!(result, Err(GrammarError::GrammarNullBytes(_)))); + + 
Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + )] + fn grammar_lazy_patterns_with_malformed_regex_returns_invalid_trigger_pattern( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let patterns = vec!["[".to_owned()]; + let result = LlamaSampler::grammar_lazy_patterns( + fixture.model, + "root ::= \"hello\"", + "root", + &patterns, + &[], + ); + + assert!(matches!( + result, + Err(GrammarError::InvalidTriggerPattern { .. }), + )); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + )] + fn llguidance_method_creates_sampler(fixture: &LlamaFixture<'_>) -> Result<()> { + let result = LlamaSampler::llguidance(fixture.model, "regex", r"yes|no"); + + assert!(result.is_ok()); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + )] + fn logit_bias_with_empty_biases_succeeds(_fixture: &LlamaFixture<'_>) -> Result<()> { + let result = LlamaSampler::logit_bias(0, &[]); + + assert!(result.is_ok()); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + )] + fn dry_sampler_with_root_not_found_grammar_does_not_apply( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let breakers: Vec<&[u8]> = vec![b"\n"]; + let _sampler = LlamaSampler::dry(fixture.model, 1.5, 2.0, 128, 2, &breakers); + + Ok(()) + } + + #[llama_test( + model_source = 
HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + )] + fn accept_many_iterates_over_borrowed_tokens(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut sampler = LlamaSampler::chain_simple([LlamaSampler::greedy()]); + let tokens = vec![fixture.model.token_bos(), fixture.model.token_eos()]; + + sampler.accept_many(&tokens)?; + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + )] + fn with_tokens_returns_self_after_accepting_each_token( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let sampler = LlamaSampler::chain_simple([LlamaSampler::greedy()]); + let tokens = [fixture.model.token_bos(), fixture.model.token_eos()]; + + let _consumed = sampler.with_tokens(tokens.iter().copied())?; + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + )] + fn accept_consumes_a_single_token(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut sampler = LlamaSampler::chain_simple([LlamaSampler::greedy()]); + + sampler.accept(fixture.model.token_bos())?; + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + )] + fn try_accept_returns_ok_for_a_valid_token(_fixture: &LlamaFixture<'_>) -> Result<()> { + let mut sampler = LlamaSampler::chain_simple([LlamaSampler::greedy()]); + + sampler.try_accept(LlamaToken::new(0))?; + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", 
"Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + )] + fn apply_runs_sampler_over_token_data_array(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = fixture.model.str_to_token("Hi", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + + let mut data_array = context.token_data_array_ith(batch.n_tokens() - 1)?; + let sampler = LlamaSampler::greedy(); + sampler.apply(&mut data_array); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + )] + fn sample_returns_token_after_decode(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = fixture.model.str_to_token("Hello", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + let mut sampler = + LlamaSampler::chain_simple([LlamaSampler::temp(0.8), LlamaSampler::greedy()]); + let result = sampler.sample(&context, batch.n_tokens() - 1); + + assert!(result.is_ok()); + + Ok(()) + } +} + +mod text_generation { + use std::io::Write; + use std::time::Duration; + + use anyhow::Context as _; + use anyhow::Result; + use llama_cpp_bindings::context::LlamaContext; + use llama_cpp_bindings::ggml_time_us; + use llama_cpp_bindings::llama_batch::LlamaBatch; + use llama_cpp_bindings::model::AddBos; + use llama_cpp_bindings::model::LlamaChatMessage; + use llama_cpp_bindings::sampled_token::SampledToken; + use 
llama_cpp_bindings::sampling::LlamaSampler; + use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn raw_prompt_completion_with_timing(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; + let mut ctx = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + ) + .with_context(|| "unable to create context")?; + + let prompt = "Hello my name is"; + let max_generated_tokens: i32 = 64; + + let mut classifier = model.sampled_token_classifier(); + let tokens_list = model + .str_to_token(prompt, AddBos::Always) + .with_context(|| format!("failed to tokenize {prompt}"))?; + let prompt_token_count = u64::try_from(tokens_list.len())?; + + let mut decoder = encoding_rs::UTF_8.new_decoder(); + + for token in &tokens_list { + eprint!( + "{}", + model.token_to_piece(&SampledToken::Content(*token), &mut decoder, true, None)? 
+ ); + } + std::io::stderr().flush()?; + + let mut batch = LlamaBatch::new(512, 1)?; + classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens_list, 0, false)?; + + assert_eq!(classifier.pending_prompt_tokens(), prompt_token_count); + assert_eq!(classifier.usage().prompt_tokens, 0); + + ctx.decode(&mut batch) + .with_context(|| "llama_decode() failed")?; + + let promoted = classifier.commit_prompt_tokens(); + assert_eq!(promoted, prompt_token_count); + assert_eq!(classifier.usage().prompt_tokens, prompt_token_count); + + let mut sampler = + LlamaSampler::chain_simple([LlamaSampler::dist(1234), LlamaSampler::greedy()]); + let initial_position = batch.n_tokens(); + let t_main_start = ggml_time_us(); + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut ctx, + batch: &mut batch, + initial_position, + max_generated_tokens, + } + .run()?; + let t_main_end = ggml_time_us(); + let duration = Duration::from_micros(u64::try_from(t_main_end - t_main_start)?); + let total_observed = + outcome.observed_content + outcome.observed_reasoning + outcome.observed_undeterminable; + + #[expect( + clippy::cast_precision_loss, + reason = "logged throughput tolerates f32 precision" + )] + let tokens_per_second = total_observed as f32 / duration.as_secs_f32(); + + eprintln!( + "\ndecoded {total_observed} tokens in {:.2} s, speed {tokens_per_second:.2} t/s", + duration.as_secs_f32(), + ); + + assert!( + !outcome.generated_raw.is_empty(), + "model should generate at least one token" + ); + assert_eq!( + outcome.observed_tool_call, 0, + "raw prompt without tool-call markers must not produce ToolCall tokens; \ + outcome={outcome:?}" + ); + assert!( + total_observed > 0, + "model must produce at least one classified token; outcome={outcome:?}" + ); + + let usage = classifier.into_usage(); + assert_eq!( + usage.prompt_tokens, prompt_token_count, + "prompt_tokens must equal the tokenizer's prompt length" + ); + assert_eq!( + 
usage.content_tokens, outcome.observed_content, + "content_tokens must equal observed Content variants" + ); + assert_eq!( + usage.reasoning_tokens, outcome.observed_reasoning, + "reasoning_tokens must equal observed Reasoning variants" + ); + assert_eq!( + usage.undeterminable_tokens, outcome.observed_undeterminable, + "undeterminable_tokens must equal observed Undeterminable variants" + ); + assert_eq!( + usage.tool_call_tokens, outcome.observed_tool_call, + "tool_call_tokens must equal observed ToolCall variants" + ); + assert_eq!( + usage.completion_tokens(), + total_observed, + "completion_tokens must equal Content + Reasoning + Undeterminable" + ); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 2048, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 2048, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 2048, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 2048, + n_batch = 512, + n_ubatch = 128, + )] + fn chat_inference_produces_coherent_output(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let chat_template = model.chat_template(None)?; + let messages = vec![LlamaChatMessage::new( + 
"user".to_string(), + "Hello! How are you?".to_string(), + )?]; + let prompt = model.apply_chat_template(&chat_template, &messages, true)?; + + let mut classifier = model.sampled_token_classifier(); + let tokens = model.str_to_token(&prompt, AddBos::Always)?; + let prompt_token_count = u64::try_from(tokens.len())?; + + let mut batch = LlamaBatch::new(512, 1)?; + classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?; + + assert_eq!(classifier.pending_prompt_tokens(), prompt_token_count); + assert_eq!(classifier.usage().prompt_tokens, 0); + + context.decode(&mut batch)?; + + let promoted = classifier.commit_prompt_tokens(); + assert_eq!(promoted, prompt_token_count); + + let mut sampler = LlamaSampler::greedy(); + let initial_position = batch.n_tokens(); + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position, + max_generated_tokens: 1024, + } + .run()?; + + println!(); + + assert!( + !outcome.generated_raw.is_empty(), + "model should generate at least one token" + ); + let total_observed = + outcome.observed_content + outcome.observed_reasoning + outcome.observed_undeterminable; + assert!( + total_observed > 0, + "model must produce at least one classified token; outcome={outcome:?}" + ); + assert_eq!( + outcome.observed_tool_call, 0, + "chat without tool definitions must not produce ToolCall tokens; outcome={outcome:?}" + ); + + let usage = classifier.into_usage(); + + assert_eq!( + usage.prompt_tokens, prompt_token_count, + "prompt_tokens must equal the tokenizer's prompt length" + ); + assert_eq!( + usage.content_tokens, outcome.observed_content, + "content_tokens must equal observed Content variants" + ); + assert_eq!( + usage.reasoning_tokens, outcome.observed_reasoning, + "reasoning_tokens must equal observed Reasoning variants" + ); + assert_eq!( + usage.undeterminable_tokens, outcome.observed_undeterminable, + 
"undeterminable_tokens must equal observed Undeterminable variants" + ); + assert_eq!( + usage.completion_tokens(), + total_observed, + "completion_tokens must equal Content + Reasoning + Undeterminable" + ); + assert_eq!( + usage.tool_call_tokens, outcome.observed_tool_call, + "tool_call_tokens must equal observed ToolCall variants" + ); + + Ok(()) + } +} + +mod constrained_decoding { + use std::io::Write; + + use anyhow::Result; + use llama_cpp_bindings::context::LlamaContext; + use llama_cpp_bindings::llama_batch::LlamaBatch; + use llama_cpp_bindings::model::AddBos; + use llama_cpp_bindings::sampled_token::SampledToken; + use llama_cpp_bindings::sampling::LlamaSampler; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn json_schema_constrains_output(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; + + let prompt = "The weather in Paris is sunny and 22 degrees. 
Extract as JSON:\n"; + + let mut ctx = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let tokens_list = model.str_to_token(prompt, AddBos::Always)?; + + let mut batch = LlamaBatch::new(512, 1)?; + let last_index = i32::try_from(tokens_list.len())? - 1; + + for (index, token) in (0_i32..).zip(&tokens_list) { + batch.add( + &SampledToken::Content(*token), + index, + &[0], + index == last_index, + )?; + } + + ctx.decode(&mut batch)?; + + let schema = r#"{ + "type": "object", + "properties": { + "city": { "type": "string" }, + "temperature": { "type": "number" } + }, + "required": ["city", "temperature"] + }"#; + + let llg_sampler = LlamaSampler::llguidance(model, "json", schema)?; + let mut sampler = LlamaSampler::chain_simple([llg_sampler, LlamaSampler::greedy()]); + + let mut n_cur = batch.n_tokens(); + let mut decoder = encoding_rs::UTF_8.new_decoder(); + let mut generated = String::new(); + + while n_cur <= 128 { + let token = SampledToken::Content(sampler.sample(&ctx, batch.n_tokens() - 1)?); + + if model.is_eog_token(&token) { + break; + } + + let output_string = model.token_to_piece(&token, &mut decoder, true, None)?; + generated.push_str(&output_string); + print!("{output_string}"); + std::io::stdout().flush()?; + + batch.clear(); + batch.add(&token, n_cur, &[0], true)?; + n_cur += 1; + ctx.decode(&mut batch)?; + } + + println!(); + + let parsed = serde_json::Deserializer::from_str(&generated) + .into_iter::() + .next() + .ok_or_else(|| anyhow::anyhow!("model produced no JSON value"))??; + + assert!(parsed.get("city").is_some()); + assert!(parsed.get("temperature").is_some()); + + Ok(()) + } +} + +mod llguidance { + #![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" + )] + + use std::ffi::CStr; + use std::sync::Arc; + + use anyhow::Result; + use llama_cpp_bindings::context::LlamaContext; + use 
llama_cpp_bindings::llama_batch::LlamaBatch; + use llama_cpp_bindings::llguidance_sampler::create_llg_sampler; + use llama_cpp_bindings::model::AddBos; + use llama_cpp_bindings::sampling::LlamaSampler; + use llama_cpp_bindings::token::LlamaToken; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + const JSON_SCHEMA: &str = + r#"{"type":"object","properties":{"answer":{"type":"string"}},"required":["answer"]}"#; + const REGEX_GRAMMAR: &str = r"yes|no"; + const LARK_GRAMMAR: &str = r#"start: "yes" | "no""#; + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn creates_sampler_with_valid_json_schema(fixture: &LlamaFixture<'_>) -> Result<()> { + let sampler = create_llg_sampler(fixture.model, "json", JSON_SCHEMA)?; + + assert!(!sampler.sampler.is_null()); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = 
HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn creates_sampler_with_valid_regex_grammar(fixture: &LlamaFixture<'_>) -> Result<()> { + let sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?; + + assert!(!sampler.sampler.is_null()); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn creates_sampler_with_valid_lark_grammar(fixture: &LlamaFixture<'_>) -> Result<()> { + let sampler = create_llg_sampler(fixture.model, "lark", LARK_GRAMMAR)?; + + assert!(!sampler.sampler.is_null()); + + Ok(()) 
+ } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn returns_error_for_unknown_grammar_kind(fixture: &LlamaFixture<'_>) -> Result<()> { + let result = create_llg_sampler(fixture.model, "not_a_real_kind", "anything"); + + assert!(result.is_err()); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + 
use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn returns_error_for_malformed_json_schema(fixture: &LlamaFixture<'_>) -> Result<()> { + let result = create_llg_sampler(fixture.model, "json", "{this is not valid json"); + + assert!(result.is_err()); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn returns_error_for_malformed_regex(fixture: &LlamaFixture<'_>) -> Result<()> { + let result = create_llg_sampler(fixture.model, "regex", "[invalid"); + + assert!(result.is_err()); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + 
n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn name_callback_returns_llguidance(fixture: &LlamaFixture<'_>) -> Result<()> { + let sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?; + + let name_ptr = unsafe { llama_cpp_bindings_sys::llama_sampler_name(sampler.sampler) }; + assert!(!name_ptr.is_null()); + let name = unsafe { CStr::from_ptr(name_ptr) }.to_str()?; + + assert_eq!(name, "llguidance"); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn clone_via_ffi_creates_independent_sampler(fixture: &LlamaFixture<'_>) -> Result<()> { + let sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?; + + let cloned = unsafe { llama_cpp_bindings_sys::llama_sampler_clone(sampler.sampler) }; + + assert!(!cloned.is_null()); + + unsafe { 
llama_cpp_bindings_sys::llama_sampler_free(cloned) }; + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn samples_token_constrained_by_grammar(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let prompt = "Answer yes or no:"; + let tokens = model.str_to_token(prompt, AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + + let llg_sampler = create_llg_sampler(model, "regex", REGEX_GRAMMAR)?; + let mut chain = LlamaSampler::chain_simple([llg_sampler, LlamaSampler::greedy()]); + + let token = chain.sample(&context, batch.n_tokens() - 1)?; + chain.accept(token)?; + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + 
)] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn accept_invalid_token_id_does_not_panic(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?; + + let huge_token = LlamaToken(i32::MAX - 1); + let _ = sampler.accept(huge_token); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn approximate_tok_env_returns_same_arc_across_calls(fixture: &LlamaFixture<'_>) -> Result<()> { + let first = 
fixture.model.approximate_tok_env(); + let second = fixture.model.approximate_tok_env(); + + assert!(Arc::ptr_eq(&first, &second)); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn approximate_tok_env_drives_consistent_grammar_constraint( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let first = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?; + let second = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?; + + assert!(!first.sampler.is_null()); + assert!(!second.sampler.is_null()); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", 
"Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + fn apply_through_chain_during_sample_does_not_panic(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let tokens = model.str_to_token("Answer:", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + + let llg_sampler = create_llg_sampler(model, "regex", REGEX_GRAMMAR)?; + let mut chain = LlamaSampler::chain_simple([llg_sampler, LlamaSampler::greedy()]); + let _ = chain.sample(&context, batch.n_tokens() - 1); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, + 
)] + fn reset_clears_sampler_state(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?; + let huge_token = LlamaToken(i32::MAX - 1); + let _ = sampler.accept(huge_token); + sampler.reset(); + let after = sampler.accept(LlamaToken(0)); + assert!( + after.is_ok() || after.is_err(), + "after reset, sampler.accept must return Ok or Err (not panic)" + ); + Ok(()) + } +} + +mod sampled_token_classifier_markers { + #![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" + )] + + use anyhow::Result; + use llama_cpp_bindings::SampledToken; + use llama_cpp_bindings::llama_batch::LlamaBatch; + use llama_cpp_bindings::sampled_token_classifier::SampledTokenClassifier; + use llama_cpp_bindings::sampled_token_section::SampledTokenSection; + use llama_cpp_bindings::streaming_markers::StreamingMarkers; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn 
classifier_starts_in_pending_section_for_default_fixture( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let classifier = fixture.model.sampled_token_classifier(); + + assert_eq!(classifier.current_section(), SampledTokenSection::Pending); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn classifier_construction_is_idempotent_across_calls( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let first = fixture.model.sampled_token_classifier(); + let second = fixture.model.sampled_token_classifier(); + + assert_eq!(first.current_section(), second.current_section()); + assert_eq!(first.usage(), second.usage()); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + 
#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn ingest_with_no_markers_emits_undeterminable_with_visible_and_raw_piece( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let model = fixture.model; + let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default()); + + let outcomes = classifier.ingest(model.token_bos()); + + assert_eq!(outcomes.len(), 1); + let outcome = &outcomes[0]; + assert!(matches!( + outcome.sampled_token, + SampledToken::Undeterminable(_) + )); + assert_eq!(outcome.visible_piece, outcome.raw_piece); + assert_eq!(classifier.usage().undeterminable_tokens, 1); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn 
ingest_with_no_markers_decodes_each_token_independently( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let model = fixture.model; + let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default()); + + let _ = classifier.ingest(model.token_bos()); + let _ = classifier.ingest(model.token_eos()); + + assert_eq!(classifier.usage().undeterminable_tokens, 2); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn ingest_prompt_token_with_no_markers_is_a_noop(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default()); + let usage_before = *classifier.usage(); + + classifier.ingest_prompt_token(model.token_bos()); + classifier.ingest_prompt_tokens(&[model.token_eos(), model.token_nl()]); + + assert_eq!(*classifier.usage(), usage_before); + assert_eq!(classifier.current_section(), SampledTokenSection::Pending); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + 
use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn feed_prompt_to_batch_increments_pending_prompt_tokens( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let model = fixture.model; + let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default()); + let mut batch = LlamaBatch::new(8, 1)?; + + classifier.feed_prompt_to_batch(&mut batch, model.token_bos(), 0, &[0], false)?; + classifier.feed_prompt_to_batch(&mut batch, model.token_eos(), 1, &[0], false)?; + + assert_eq!(classifier.pending_prompt_tokens(), 2); + assert_eq!(batch.n_tokens(), 2); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + 
#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn feed_prompt_sequence_to_batch_stages_all_tokens(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default()); + let mut batch = LlamaBatch::new(8, 1)?; + + let tokens = vec![model.token_bos(), model.token_eos(), model.token_nl()]; + classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?; + + assert_eq!(classifier.pending_prompt_tokens(), 3); + assert_eq!(batch.n_tokens(), 3); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn commit_prompt_tokens_promotes_pending_count_to_usage_and_clears( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let model = fixture.model; + let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default()); + let mut batch = LlamaBatch::new(8, 1)?; + + classifier.feed_prompt_to_batch(&mut batch, 
model.token_bos(), 0, &[0], false)?; + classifier.feed_prompt_to_batch(&mut batch, model.token_eos(), 1, &[0], false)?; + + let promoted = classifier.commit_prompt_tokens(); + + assert_eq!(promoted, 2); + assert_eq!(classifier.pending_prompt_tokens(), 0); + assert_eq!(classifier.usage().prompt_tokens, 2); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn discard_pending_prompt_tokens_clears_count_without_recording_usage( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let model = fixture.model; + let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default()); + let mut batch = LlamaBatch::new(8, 1)?; + + classifier.feed_prompt_to_batch(&mut batch, model.token_bos(), 0, &[0], false)?; + + let discarded = classifier.discard_pending_prompt_tokens(); + + assert_eq!(discarded, 1); + assert_eq!(classifier.pending_prompt_tokens(), 0); + assert_eq!(classifier.usage().prompt_tokens, 0); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = 
true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn diagnose_tool_call_synthetic_renders_returns_a_pair_of_strings( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let (left, right) = fixture.model.diagnose_tool_call_synthetic_renders()?; + let _ = left; + let _ = right; + Ok(()) + } +} + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/text_generation.rs b/llama-cpp-bindings-tests/tests/text_generation.rs deleted file mode 100644 index 57fd54d7..00000000 --- a/llama-cpp-bindings-tests/tests/text_generation.rs +++ /dev/null @@ -1,298 +0,0 @@ -use std::io::Write; -use std::time::Duration; - -use anyhow::Context as _; -use anyhow::Result; -use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::ggml_time_us; -use llama_cpp_bindings::llama_batch::LlamaBatch; -use llama_cpp_bindings::model::AddBos; -use llama_cpp_bindings::model::LlamaChatMessage; -use llama_cpp_bindings::sampled_token::SampledToken; -use llama_cpp_bindings::sampling::LlamaSampler; -use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; -use llama_cpp_test_harness::LlamaFixture; -use llama_cpp_test_harness::llama_test; -use llama_cpp_test_harness::llama_tests_main; - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", 
"DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 512, - n_batch = 512, - n_ubatch = 128, -)] -fn raw_prompt_completion_with_timing(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - let mut ctx = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - ) - .with_context(|| "unable to create context")?; - - let prompt = "Hello my name is"; - let max_generated_tokens: i32 = 64; - - let mut classifier = model.sampled_token_classifier(); - let tokens_list = model - .str_to_token(prompt, AddBos::Always) - .with_context(|| format!("failed to tokenize {prompt}"))?; - let prompt_token_count = u64::try_from(tokens_list.len())?; - - let mut decoder = encoding_rs::UTF_8.new_decoder(); - - for token in &tokens_list { - eprint!( - "{}", - model.token_to_piece(&SampledToken::Content(*token), &mut decoder, true, None)? 
- ); - } - std::io::stderr().flush()?; - - let mut batch = LlamaBatch::new(512, 1)?; - classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens_list, 0, false)?; - - assert_eq!(classifier.pending_prompt_tokens(), prompt_token_count); - assert_eq!(classifier.usage().prompt_tokens, 0); - - ctx.decode(&mut batch) - .with_context(|| "llama_decode() failed")?; - - let promoted = classifier.commit_prompt_tokens(); - assert_eq!(promoted, prompt_token_count); - assert_eq!(classifier.usage().prompt_tokens, prompt_token_count); - - let mut sampler = - LlamaSampler::chain_simple([LlamaSampler::dist(1234), LlamaSampler::greedy()]); - let initial_position = batch.n_tokens(); - let t_main_start = ggml_time_us(); - let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut ctx, - batch: &mut batch, - initial_position, - max_generated_tokens, - } - .run()?; - let t_main_end = ggml_time_us(); - let duration = Duration::from_micros(u64::try_from(t_main_end - t_main_start)?); - let total_observed = - outcome.observed_content + outcome.observed_reasoning + outcome.observed_undeterminable; - - #[expect( - clippy::cast_precision_loss, - reason = "logged throughput tolerates f32 precision" - )] - let tokens_per_second = total_observed as f32 / duration.as_secs_f32(); - - eprintln!( - "\ndecoded {total_observed} tokens in {:.2} s, speed {tokens_per_second:.2} t/s", - duration.as_secs_f32(), - ); - - assert!( - !outcome.generated_raw.is_empty(), - "model should generate at least one token" - ); - assert_eq!( - outcome.observed_tool_call, 0, - "raw prompt without tool-call markers must not produce ToolCall tokens; \ - outcome={outcome:?}" - ); - assert!( - total_observed > 0, - "model must produce at least one classified token; outcome={outcome:?}" - ); - - let usage = classifier.into_usage(); - assert_eq!( - usage.prompt_tokens, prompt_token_count, - "prompt_tokens must equal the tokenizer's prompt length" - ); - assert_eq!( - 
usage.content_tokens, outcome.observed_content, - "content_tokens must equal observed Content variants" - ); - assert_eq!( - usage.reasoning_tokens, outcome.observed_reasoning, - "reasoning_tokens must equal observed Reasoning variants" - ); - assert_eq!( - usage.undeterminable_tokens, outcome.observed_undeterminable, - "undeterminable_tokens must equal observed Undeterminable variants" - ); - assert_eq!( - usage.tool_call_tokens, outcome.observed_tool_call, - "tool_call_tokens must equal observed ToolCall variants" - ); - assert_eq!( - usage.completion_tokens(), - total_observed, - "completion_tokens must equal Content + Reasoning + Undeterminable" - ); - - Ok(()) -} - -#[llama_test( - model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 2048, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 2048, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 2048, - n_batch = 512, - n_ubatch = 128, -)] -#[llama_test( - model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), - n_gpu_layers = 999, - use_mmap = true, - use_mlock = false, - n_ctx = 2048, - n_batch = 512, - n_ubatch = 128, -)] -fn chat_inference_produces_coherent_output(fixture: &LlamaFixture<'_>) -> Result<()> { - let model = fixture.model; - let backend = fixture.backend; - let mut context = LlamaContext::from_model( - model, - backend, - (*fixture.context_params).into_llama_context_params(), - )?; - - let chat_template = model.chat_template(None)?; - let messages = vec![LlamaChatMessage::new( - "user".to_string(), - 
"Hello! How are you?".to_string(), - )?]; - let prompt = model.apply_chat_template(&chat_template, &messages, true)?; - - let mut classifier = model.sampled_token_classifier(); - let tokens = model.str_to_token(&prompt, AddBos::Always)?; - let prompt_token_count = u64::try_from(tokens.len())?; - - let mut batch = LlamaBatch::new(512, 1)?; - classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?; - - assert_eq!(classifier.pending_prompt_tokens(), prompt_token_count); - assert_eq!(classifier.usage().prompt_tokens, 0); - - context.decode(&mut batch)?; - - let promoted = classifier.commit_prompt_tokens(); - assert_eq!(promoted, prompt_token_count); - - let mut sampler = LlamaSampler::greedy(); - let initial_position = batch.n_tokens(); - let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut context, - batch: &mut batch, - initial_position, - max_generated_tokens: 1024, - } - .run()?; - - println!(); - - assert!( - !outcome.generated_raw.is_empty(), - "model should generate at least one token" - ); - let total_observed = - outcome.observed_content + outcome.observed_reasoning + outcome.observed_undeterminable; - assert!( - total_observed > 0, - "model must produce at least one classified token; outcome={outcome:?}" - ); - assert_eq!( - outcome.observed_tool_call, 0, - "chat without tool definitions must not produce ToolCall tokens; outcome={outcome:?}" - ); - - let usage = classifier.into_usage(); - - assert_eq!( - usage.prompt_tokens, prompt_token_count, - "prompt_tokens must equal the tokenizer's prompt length" - ); - assert_eq!( - usage.content_tokens, outcome.observed_content, - "content_tokens must equal observed Content variants" - ); - assert_eq!( - usage.reasoning_tokens, outcome.observed_reasoning, - "reasoning_tokens must equal observed Reasoning variants" - ); - assert_eq!( - usage.undeterminable_tokens, outcome.observed_undeterminable, - "undeterminable_tokens must equal observed 
Undeterminable variants" - ); - assert_eq!( - usage.completion_tokens(), - total_observed, - "completion_tokens must equal Content + Reasoning + Undeterminable" - ); - assert_eq!( - usage.tool_call_tokens, outcome.observed_tool_call, - "tool_call_tokens must equal observed ToolCall variants" - ); - - Ok(()) -} - -llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/vocabulary_and_metadata.rs b/llama-cpp-bindings-tests/tests/vocabulary_and_metadata.rs new file mode 100644 index 00000000..7b26c7ee --- /dev/null +++ b/llama-cpp-bindings-tests/tests/vocabulary_and_metadata.rs @@ -0,0 +1,1978 @@ +use llama_cpp_test_harness::llama_tests_main; + +mod model_properties { + #![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" + )] + + use anyhow::Result; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn model_loads_with_valid_metadata(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + + 
assert!(model.n_vocab() > 0); + assert!(model.n_embd() > 0); + assert!(model.n_params() > 0); + assert!(model.n_ctx_train()? > 0); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn n_layer_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> { + assert!(fixture.model.n_layer()? 
> 0); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn n_head_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> { + assert!(fixture.model.n_head()? 
> 0); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn n_head_kv_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> { + assert!(fixture.model.n_head_kv()? 
> 0); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn model_size_returns_nonzero(fixture: &LlamaFixture<'_>) -> Result<()> { + assert!(fixture.model.size() > 0); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn 
is_recurrent_returns_false_for_transformer(fixture: &LlamaFixture<'_>) -> Result<()> { + assert!(!fixture.model.is_recurrent()); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn is_hybrid_returns_false_for_non_hybrid_default_models( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + assert!( + !fixture.model.is_hybrid(), + "DeepSeek-R1-Distill-Llama-8B and GLM-4.7-Flash are pure transformers, not hybrid; got is_hybrid=true" + ); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn is_hybrid_returns_true_for_qwen_default_models(fixture: &LlamaFixture<'_>) -> Result<()> { + assert!( + fixture.model.is_hybrid(), + "Qwen 3.5 and Qwen 3.6 default GGUFs are reported as hybrid by llama.cpp; got is_hybrid=false" + ); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + 
use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn rope_type_returns_a_known_variant_for_rope_carrying_default_models( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + use llama_cpp_bindings::model::rope_type::RopeType; + let rope = fixture.model.rope_type(); + assert!( + matches!( + rope, + Some(RopeType::Norm | RopeType::NeoX | RopeType::MRope | RopeType::Vision) + ), + "rope_type must be a known variant for DeepSeek and GLM-4.7; got {rope:?}" + ); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn rope_type_returns_none_for_qwen_default_models(fixture: &LlamaFixture<'_>) -> Result<()> { + let rope = fixture.model.rope_type(); + assert!( + rope.is_none(), + "Qwen 3.5 and Qwen 3.6 default GGUFs do not expose a rope_type in their metadata; got {rope:?}" + ); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", 
"Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn vocab_type_returns_a_known_variant(fixture: &LlamaFixture<'_>) -> Result<()> { + use llama_cpp_bindings::model::vocab_type::VocabType; + let vocab = fixture.model.vocab_type()?; + assert!( + matches!(vocab, VocabType::BPE | VocabType::SPM), + "vocab_type must be a known variant; got {vocab:?}" + ); + Ok(()) + } +} + +mod model_metadata_kv { + #![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" + )] + + use anyhow::Result; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn meta_count_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> { + assert!(fixture.model.meta_count() > 0); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + 
use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn meta_key_by_index_returns_valid_key(fixture: &LlamaFixture<'_>) -> Result<()> { + let key = fixture.model.meta_key_by_index(0)?; + assert!(!key.is_empty()); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn meta_val_str_by_index_returns_valid_value(fixture: &LlamaFixture<'_>) -> Result<()> { + let value = fixture.model.meta_val_str_by_index(0)?; + 
assert!(!value.is_empty()); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn meta_key_by_index_out_of_range_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let result = fixture.model.meta_key_by_index(999_999); + assert!(result.is_err()); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, 
+ use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn meta_val_str_by_index_out_of_range_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let result = fixture.model.meta_val_str_by_index(999_999); + assert!(result.is_err()); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn meta_val_str_returns_value_for_known_key(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let first_key = model.meta_key_by_index(0)?; + let value = model.meta_val_str(&first_key)?; + assert!(!value.is_empty()); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", 
"Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn meta_val_str_with_long_value_triggers_buffer_resize( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let model = fixture.model; + let count = model.meta_count(); + + for index in 0..count { + let key = model.meta_key_by_index(index); + let value = model.meta_val_str_by_index(index); + assert!(key.is_ok()); + assert!(value.is_ok()); + } + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn meta_val_str_with_null_byte_in_key_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let result = fixture.model.meta_val_str("key\0with_null"); + assert!(result.is_err()); + Ok(()) + } +} + +mod model_params { + #![expect( + clippy::similar_names, + reason = "model_path_str and model_path_cstr are both genuinely needed; renaming 
would not improve clarity" + )] + + use std::ffi::CString; + use std::pin::pin; + + use anyhow::Result; + use llama_cpp_bindings::context::params::LlamaContextParams; + use llama_cpp_bindings::max_devices; + use llama_cpp_bindings::model::params::LlamaModelParams; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn fit_params_succeeds_with_test_model(fixture: &LlamaFixture<'_>) -> Result<()> { + let model_path_str = fixture + .model_path + .to_str() + .ok_or_else(|| anyhow::anyhow!("model path is not valid UTF-8"))?; + let model_path_cstr = CString::new(model_path_str)?; + + let mut params = pin!(LlamaModelParams::default()); + let mut context_params = LlamaContextParams::default(); + let mut margins = vec![0usize; max_devices()]; + + let result = params.as_mut().fit_params( + &model_path_cstr, + &mut context_params, + &mut margins, + 512, + llama_cpp_bindings_sys::GGML_LOG_LEVEL_NONE, + ); + + let fit = + result.map_err(|fit_error| anyhow::anyhow!("fit_params failed: {fit_error:?}"))?; + assert!(fit.n_ctx > 0); + + Ok(()) 
+ } +} + +mod model_special_tokens { + #![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" + )] + + use anyhow::Result; + use llama_cpp_bindings::SampledToken; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn special_tokens_exist(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let bos = model.token_bos(); + let eos = model.token_eos(); + + assert_ne!(bos, eos); + assert!(model.is_eog_token(&SampledToken::Content(eos))); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + 
#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn token_nl_returns_valid_token(fixture: &LlamaFixture<'_>) -> Result<()> { + let nl_token = fixture.model.token_nl(); + assert!(nl_token.0 >= 0); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn is_eog_token_classifies_reasoning_variant(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let eos = model.token_eos(); + assert!(model.is_eog_token(&SampledToken::Reasoning(eos))); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 
64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn is_eog_token_classifies_tool_call_variant(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let eos = model.token_eos(); + assert!(model.is_eog_token(&SampledToken::ToolCall(eos))); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn is_eog_token_classifies_undeterminable_variant(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let eos = model.token_eos(); + 
assert!(model.is_eog_token(&SampledToken::Undeterminable(eos))); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn decode_start_token_returns_valid_token(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let token = model.decode_start_token(); + let n_vocab = model.n_vocab(); + assert!( + token.0 == -1 || (0..n_vocab).contains(&token.0), + "decode_start_token must be either -1 (no decoder-start defined) or a valid vocab index < {n_vocab}; got {token}" + ); + assert_eq!( + token, + model.decode_start_token(), + "decode_start_token must be deterministic across calls" + ); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + 
model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn token_sep_returns_valid_token(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let token = model.token_sep(); + let n_vocab = model.n_vocab(); + assert!( + token.0 == -1 || (0..n_vocab).contains(&token.0), + "token_sep must be either -1 (no SEP token defined) or a valid vocab index < {n_vocab}; got {token}" + ); + assert_eq!( + token, + model.token_sep(), + "token_sep must be deterministic across calls" + ); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn token_attr_returns_attrs_for_bos(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let bos = model.token_bos(); + let attrs = model.token_attr(bos)?; + let 
bit_repr = format!("{:?}", *attrs); + assert!( + !bit_repr.is_empty(), + "token_attr(bos) must produce Debug output" + ); + Ok(()) + } +} + +mod model_str_to_token { + use anyhow::Result; + use llama_cpp_bindings::model::AddBos; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn str_to_token_roundtrip(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let tokens = model.str_to_token("hello world", AddBos::Never)?; + assert!(!tokens.is_empty()); + let mut decoder = encoding_rs::UTF_8.new_decoder(); + let piece = model.token_to_piece( + &llama_cpp_bindings::SampledToken::Content(tokens[0]), + &mut decoder, + false, + None, + )?; + + assert!(!piece.is_empty()); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", 
"GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn str_to_token_grows_buffer_when_initial_estimation_too_small( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let many_short_chars = "a b c d e f g h i j k l"; + let tokens = fixture + .model + .str_to_token(many_short_chars, AddBos::Always)?; + + assert!( + tokens.len() > 8, + "expected regrow; got {} tokens", + tokens.len() + ); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn str_to_token_with_add_bos_never(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let 
tokens_with_bos = model.str_to_token("hello", AddBos::Always)?; + let tokens_without_bos = model.str_to_token("hello", AddBos::Never)?; + + assert!(tokens_with_bos.len() >= tokens_without_bos.len()); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn str_to_token_with_many_tokens_triggers_buffer_resize( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + use std::fmt::Write; + + let many_numbers = (0..2000).fold(String::new(), |mut accumulator, number| { + let _ = write!(accumulator, "{number} "); + accumulator + }); + + let tokens = fixture.model.str_to_token(&many_numbers, AddBos::Always)?; + + assert!(tokens.len() > many_numbers.len() / 2); + + Ok(()) + } +} + +mod model_token_to_piece { + #![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" + )] + + use std::num::NonZeroU16; + + use anyhow::Result; + use llama_cpp_bindings::SampledToken; + use llama_cpp_bindings::model::AddBos; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + #[llama_test( + model_source = 
HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn token_to_piece_bytes_returns_bytes_for_known_token( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let model = fixture.model; + let tokens = model.str_to_token("hello", AddBos::Never)?; + let bytes = model.token_to_piece_bytes(tokens[0], 32, false, None)?; + + assert!(!bytes.is_empty()); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", 
"Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn token_to_piece_handles_large_token_requiring_buffer_resize( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let model = fixture.model; + let mut decoder = encoding_rs::UTF_8.new_decoder(); + + for (token, _) in model.tokens(true).take(200) { + let result = + model.token_to_piece(&SampledToken::Content(token), &mut decoder, true, None); + assert!(result.is_ok()); + } + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn token_to_piece_bytes_insufficient_buffer_returns_error( + fixture: &LlamaFixture<'_>, + ) -> Result<()> { + let model = fixture.model; + let tokens = model.str_to_token("hello", AddBos::Never)?; + let result = model.token_to_piece_bytes(tokens[0], 1, false, None); + + assert!( + result + .unwrap_err() + .to_string() + .contains("Insufficient Buffer Space") + ); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 
999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn token_to_piece_with_lstrip(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mut decoder = encoding_rs::UTF_8.new_decoder(); + let tokens = model.str_to_token("hello", AddBos::Never)?; + let result = model.token_to_piece( + &SampledToken::Content(tokens[0]), + &mut decoder, + false, + NonZeroU16::new(1), + ); + + assert!(result.is_ok()); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, 
+ use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn token_to_piece_decodes_reasoning_variant(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mut decoder = encoding_rs::UTF_8.new_decoder(); + let tokens = model.str_to_token("hi", AddBos::Never)?; + + let piece = model.token_to_piece( + &SampledToken::Reasoning(tokens[0]), + &mut decoder, + true, + None, + )?; + + assert!(!piece.is_empty()); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn token_to_piece_decodes_tool_call_variant(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mut decoder = encoding_rs::UTF_8.new_decoder(); + let tokens = model.str_to_token("hi", AddBos::Never)?; + + let piece = + model.token_to_piece(&SampledToken::ToolCall(tokens[0]), &mut decoder, true, None)?; + + assert!(!piece.is_empty()); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + 
n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn token_to_piece_decodes_undeterminable_variant(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mut decoder = encoding_rs::UTF_8.new_decoder(); + let tokens = model.str_to_token("hi", AddBos::Never)?; + + let piece = model.token_to_piece( + &SampledToken::Undeterminable(tokens[0]), + &mut decoder, + true, + None, + )?; + + assert!(!piece.is_empty()); + Ok(()) + } +} + +mod model_tokens_iterator { + #![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" + )] + + use anyhow::Result; + use llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, 
+ use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn tokens_iterator_produces_valid_entries(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mut count = 0; + + for (token, _piece_result) in model.tokens(false) { + assert!(token.0 >= 0); + count += 1; + + if count >= 100 { + break; + } + } + + assert_eq!(count, 100); + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + )] + fn n_vocab_matches_tokens_iterator_count(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let n_vocab = model.n_vocab(); + let count = model.tokens(false).count(); + + assert_eq!(count, usize::try_from(n_vocab)?); + Ok(()) + } +} + +mod model_helpers { + #![expect( + clippy::unnecessary_wraps, + reason = "every trial returns anyhow::Result<()> to match the LlamaTestFn signature" + )] + + use anyhow::Result; + use 
llama_cpp_test_harness::LlamaFixture; + use llama_cpp_test_harness::llama_test; + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 2048, + n_batch = 512, + n_ubatch = 128 + )] + fn debug_format_includes_struct_name_and_model_field(fixture: &LlamaFixture<'_>) -> Result<()> { + let formatted = format!("{:?}", fixture.model); + + assert!(formatted.contains("LlamaModel")); + assert!(formatted.contains("model")); + + Ok(()) + } + + #[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 2048, + n_batch = 512, + n_ubatch = 128 + )] + fn approximate_tok_env_is_cached_across_calls(fixture: &LlamaFixture<'_>) -> Result<()> { + let first = fixture.model.approximate_tok_env(); + let second = fixture.model.approximate_tok_env(); + + assert!(std::sync::Arc::ptr_eq(&first, &second)); + + Ok(()) + } +} + +llama_tests_main!(); diff --git a/llama-cpp-test-harness/Cargo.toml b/llama-cpp-test-harness/Cargo.toml index 041ea779..477362da 100644 --- a/llama-cpp-test-harness/Cargo.toml +++ b/llama-cpp-test-harness/Cargo.toml @@ -13,6 +13,7 @@ inventory = { workspace = true } libtest-mimic = { workspace = true } llama-cpp-bindings = { workspace = true } llama-cpp-test-harness-macros = { workspace = true } +thiserror = { workspace = true } [features] cuda = ["llama-cpp-bindings/cuda"] diff --git a/llama-cpp-test-harness/src/deterministic_arguments.rs b/llama-cpp-test-harness/src/deterministic_arguments.rs deleted file mode 100644 index 353053dd..00000000 --- a/llama-cpp-test-harness/src/deterministic_arguments.rs +++ /dev/null @@ -1,50 +0,0 @@ -use libtest_mimic::Arguments; - -const fn build_deterministic_arguments(mut arguments: Arguments) -> Arguments { - arguments.test_threads = Some(1); - arguments -} - -#[must_use] -pub fn 
deterministic_arguments_from_cli() -> Arguments { - build_deterministic_arguments(Arguments::from_args()) -} - -#[cfg(test)] -mod tests { - use libtest_mimic::Arguments; - - use super::build_deterministic_arguments; - - #[test] - fn build_deterministic_arguments_forces_test_threads_to_one() { - let input = Arguments { - test_threads: Some(8), - ..Arguments::default() - }; - let output = build_deterministic_arguments(input); - - assert_eq!(output.test_threads, Some(1)); - } - - #[test] - fn build_deterministic_arguments_overrides_unset_test_threads() { - let input = Arguments::default(); - let output = build_deterministic_arguments(input); - - assert_eq!(output.test_threads, Some(1)); - } - - #[test] - fn build_deterministic_arguments_preserves_other_settings() { - let input = Arguments { - list: true, - filter: Some("foo".to_owned()), - ..Arguments::default() - }; - let output = build_deterministic_arguments(input); - - assert!(output.list); - assert_eq!(output.filter.as_deref(), Some("foo")); - } -} diff --git a/llama-cpp-test-harness/src/execution_plan.rs b/llama-cpp-test-harness/src/execution_plan.rs index 52f6dd4c..927c87a8 100644 --- a/llama-cpp-test-harness/src/execution_plan.rs +++ b/llama-cpp-test-harness/src/execution_plan.rs @@ -16,10 +16,10 @@ use std::collections::BTreeMap; use std::sync::Arc; +use libtest_mimic::Arguments; use libtest_mimic::Conclusion; use llama_cpp_bindings::llama_backend::LlamaBackend; -use crate::deterministic_arguments::deterministic_arguments_from_cli; use crate::execution_phase::ExecutionPhase; use crate::llama_test_registration::LlamaTestRegistration; @@ -65,13 +65,12 @@ impl ExecutionPlan { } #[must_use] - pub fn run(&self, backend: &Arc<LlamaBackend>) -> Vec<Conclusion> { - let arguments = deterministic_arguments_from_cli(); + pub fn run(&self, backend: &Arc<LlamaBackend>, arguments: &Arguments) -> Vec<Conclusion> { let total = self.phases.len(); let mut conclusions = Vec::with_capacity(total); for (index, phase) in self.phases.iter().enumerate() { phase.print_header(index,
total); - conclusions.push(phase.run(backend, &arguments)); + conclusions.push(phase.run(backend, arguments)); } conclusions } diff --git a/llama-cpp-test-harness/src/harness_arguments_error.rs b/llama-cpp-test-harness/src/harness_arguments_error.rs new file mode 100644 index 00000000..53db2279 --- /dev/null +++ b/llama-cpp-test-harness/src/harness_arguments_error.rs @@ -0,0 +1,9 @@ +use thiserror::Error; + +#[derive(Debug, Error)] +pub enum HarnessArgumentsError { + #[error( + "the test harness requires --test-threads=1 (or unset); got --test-threads={requested}" + )] + ConflictingTestThreads { requested: usize }, +} diff --git a/llama-cpp-test-harness/src/lib.rs b/llama-cpp-test-harness/src/lib.rs index fb0c1230..8f112b9f 100644 --- a/llama-cpp-test-harness/src/lib.rs +++ b/llama-cpp-test-harness/src/lib.rs @@ -8,10 +8,10 @@ //! See the workspace README and `tests/` directory for usage examples. pub mod context_params; -pub mod deterministic_arguments; pub mod download_model; pub mod execution_phase; pub mod execution_plan; +pub mod harness_arguments_error; pub mod llama_fixture; pub mod llama_test_fn; pub mod llama_test_registration; @@ -21,6 +21,7 @@ pub mod mmproj_source; pub mod model_load_params; pub mod model_source; pub mod no_op; +pub mod parse_harness_arguments; pub mod phase_state; pub mod run; pub mod run_to_conclusions; diff --git a/llama-cpp-test-harness/src/parse_harness_arguments.rs b/llama-cpp-test-harness/src/parse_harness_arguments.rs new file mode 100644 index 00000000..b4b3ce72 --- /dev/null +++ b/llama-cpp-test-harness/src/parse_harness_arguments.rs @@ -0,0 +1,82 @@ +use libtest_mimic::Arguments; + +use crate::harness_arguments_error::HarnessArgumentsError; + +fn validate(mut arguments: Arguments) -> Result<Arguments, HarnessArgumentsError> { + match arguments.test_threads { + None | Some(1) => { + arguments.test_threads = Some(1); + Ok(arguments) + } + Some(requested) => Err(HarnessArgumentsError::ConflictingTestThreads { requested }), + } +} + +/// Parses the test-binary
CLI into [`libtest_mimic::Arguments`], enforcing the harness's +/// single-thread requirement. +/// +/// `--test-threads` left unset is treated as `1`; `--test-threads=1` is accepted unchanged. +/// +/// # Errors +/// +/// Returns [`HarnessArgumentsError::ConflictingTestThreads`] when `--test-threads` is set to +/// any value other than `1`. The harness orchestrates phase batching itself and cannot share +/// that responsibility with `libtest_mimic`'s thread pool. +pub fn parse_harness_arguments() -> Result<Arguments, HarnessArgumentsError> { + validate(Arguments::from_args()) +} + +#[cfg(test)] +mod tests { + use libtest_mimic::Arguments; + + use crate::harness_arguments_error::HarnessArgumentsError; + + use super::validate; + + #[test] + fn validate_accepts_unset_test_threads_and_defaults_to_one() { + let input = Arguments::default(); + let output = validate(input).expect("unset must be accepted"); + + assert_eq!(output.test_threads, Some(1)); + } + + #[test] + fn validate_accepts_explicit_single_thread() { + let input = Arguments { + test_threads: Some(1), + ..Arguments::default() + }; + let output = validate(input).expect("--test-threads=1 must be accepted"); + + assert_eq!(output.test_threads, Some(1)); + } + + #[test] + fn validate_rejects_non_one_test_threads() { + let input = Arguments { + test_threads: Some(8), + ..Arguments::default() + }; + let error = validate(input).expect_err("--test-threads=8 must be rejected"); + + assert!(matches!( + error, + HarnessArgumentsError::ConflictingTestThreads { requested: 8 } + )); + } + + #[test] + fn validate_preserves_other_settings() { + let input = Arguments { + list: true, + filter: Some("foo".to_owned()), + ..Arguments::default() + }; + let output = validate(input).expect("default test_threads must pass"); + + assert!(output.list); + assert_eq!(output.filter.as_deref(), Some("foo")); + } +} diff --git a/llama-cpp-test-harness/src/run.rs b/llama-cpp-test-harness/src/run.rs index 6d13b1b4..376cbbae 100644 --- a/llama-cpp-test-harness/src/run.rs +++
b/llama-cpp-test-harness/src/run.rs @@ -5,6 +5,7 @@ use libtest_mimic::Conclusion; use llama_cpp_bindings::llama_backend::LlamaBackend; use crate::execution_plan::ExecutionPlan; +use crate::parse_harness_arguments::parse_harness_arguments; fn aggregate_exit_code(conclusions: &[Conclusion]) -> ExitCode { if conclusions.iter().any(Conclusion::has_failed) { @@ -16,6 +17,13 @@ fn aggregate_exit_code(conclusions: &[Conclusion]) -> ExitCode { #[must_use] pub fn run() -> ExitCode { + let arguments = match parse_harness_arguments() { + Ok(arguments) => arguments, + Err(error) => { + eprintln!("llama-cpp-test-harness: {error}"); + return ExitCode::from(2); + } + }; let mut backend = match LlamaBackend::init() { Ok(backend) => backend, Err(error) => { @@ -28,7 +36,7 @@ pub fn run() -> ExitCode { backend.void_logs(); } let backend = Arc::new(backend); - aggregate_exit_code(&plan.run(&backend)) + aggregate_exit_code(&plan.run(&backend, &arguments)) } #[cfg(test)] diff --git a/llama-cpp-test-harness/src/run_to_conclusions.rs b/llama-cpp-test-harness/src/run_to_conclusions.rs index 8de67e11..67c90003 100644 --- a/llama-cpp-test-harness/src/run_to_conclusions.rs +++ b/llama-cpp-test-harness/src/run_to_conclusions.rs @@ -4,6 +4,7 @@ use libtest_mimic::Conclusion; use llama_cpp_bindings::llama_backend::LlamaBackend; use crate::execution_plan::ExecutionPlan; +use crate::parse_harness_arguments::parse_harness_arguments; /// Runs every registered test against its declared model and returns one [`Conclusion`] per phase. /// @@ -13,10 +14,15 @@ use crate::execution_plan::ExecutionPlan; /// /// # Panics /// -/// Panics if [`LlamaBackend::init`] fails. The harness is meaningless without a backend; a -/// crash is the loudest possible failure signal. +/// Panics if [`LlamaBackend::init`] fails or if the CLI arguments conflict with the harness's +/// single-thread requirement. 
The harness is meaningless without a backend or with conflicting +/// thread-count flags; a crash is the loudest possible failure signal. #[must_use] pub fn run_to_conclusions() -> Vec<Conclusion> { + let arguments = match parse_harness_arguments() { + Ok(arguments) => arguments, + Err(error) => panic!("llama-cpp-test-harness: {error}"), + }; let mut backend = match LlamaBackend::init() { Ok(backend) => backend, Err(error) => panic!("llama-cpp-test-harness: backend init failed: {error}"), @@ -26,7 +32,7 @@ pub fn run_to_conclusions() -> Vec<Conclusion> { backend.void_logs(); } let backend = Arc::new(backend); - plan.run(&backend) + plan.run(&backend, &arguments) } #[cfg(test)]