intentee · mcharytoniuk · May 25, 2026 · May 23, 2026 · May 23, 2026 · May 23, 2026
diff --git a/.claude/skills/run-all-tests/SKILL.md b/.claude/skills/run-all-tests/SKILL.md
@@ -26,19 +26,6 @@ echo "Device: $DEVICE"
 
 ## Step 2: run the suites
 
-Sequentially, from the workspace root. 
-
-Copy this checklist and tick each item as the suite completes:
-
-```
-Test progress:
-- [ ] make test.unit
-- [ ] make test.qwen3.5_0.8B
-- [ ] make test.qwen3.6_35b_a3b
-- [ ] make test.glm4_7_flash
-- [ ] make test.deepseek_r1_distill_llama_8b
-```
-
 Translate `$DEVICE` into the value the Makefile expects. `TEST_DEVICE` holds **only** the backend name (`cuda` / `metal` / `vulkan` / `rocm`), or empty for CPU since there is no `cpu` feature:
 
 ```bash
@@ -48,20 +35,11 @@ Translate `$DEVICE` into the value the Makefile expects. `TEST_DEVICE` holds **o
 Then run exactly:
 
 ```bash
-make test.unit TEST_DEVICE="$FEAT"
-make test.qwen3.5_0.8B TEST_DEVICE="$FEAT"
-make test.qwen3.6_35b_a3b TEST_DEVICE="$FEAT"
-make test.glm4_7_flash TEST_DEVICE="$FEAT"
-make test.deepseek_r1_distill_llama_8b TEST_DEVICE="$FEAT"
+make test.llms TEST_DEVICE="$FEAT"
 ```
 
-The Makefile's `$(if $(TEST_DEVICE),--features $(TEST_DEVICE),)` already skips the `--features` flag when `$FEAT` is empty, so the CPU path needs no further special-casing.
-
-Do not run `make test.llms` or `make test`. Those bundle every LLM suite into one cargo invocation, which loses per-suite failure attribution and breaks the checklist above.
-
 ## Step 3: rules during the run
 
-- **Serialize GPU suites.** When `$DEVICE` is `cuda` or `metal`, run test suites sequentially to avoid device contention.
 - **Per-test 30 s budget.** Flag any individual test that exceeds 30 s wall-clock. That is a real bug — production or test — not flakiness.
 
 ## Step 4: report

diff --git a/.claude/skills/run-coverage/SKILL.md b/.claude/skills/run-coverage/SKILL.md
@@ -0,0 +1,47 @@
+---
+name: run-coverage
+description: Runs the code coverage checker on the fastest available device. Use when the user asks to run coverage or to check code coverage.
+---
+
+# Checking the code coverage
+
+Run every instrumented test suite in the workspace, picking the fastest compiled device backend for the host, then make sure code coverage is within the required limits.
+
+The Makefile is the source of truth for the gated values and the code coverage setup.
+
+## Step 1: detect the device
+
+Run this once at the start and echo the chosen device:
+
+```bash
+if [[ "$OSTYPE" == "darwin"* ]]; then
+  DEVICE=metal
+elif command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi >/dev/null 2>&1; then
+  DEVICE=cuda
+else
+  DEVICE=cpu
+fi
+echo "Device: $DEVICE"
+```
+
+`$DEVICE` selects the backend feature for every suite in Step 2, including `test.unit`. Passing the same device through every target keeps the cmake hash stable, so llama.cpp is compiled once and reused across all suites.
+
+## Step 2: run the suites
+
+Translate `$DEVICE` into the value the Makefile expects. `TEST_DEVICE` holds **only** the backend name (`cuda` / `metal` / `vulkan` / `rocm`), or empty for CPU since there is no `cpu` feature:
+
+```bash
+[ "$DEVICE" = "cpu" ] && FEAT= || FEAT="$DEVICE"
+```
+
+Then run exactly:
+
+```bash
+make coverage TEST_DEVICE="$FEAT"
+```
+
+## Step 3: report
+
+After all suites finish, summarize the results in an actionable report and confirm that every code coverage gate is met.
+
+
diff --git a/.github/actions/install-rust-toolchain/action.yml b/.github/actions/install-rust-toolchain/action.yml
@@ -1,11 +1,11 @@
 name: install-rust-toolchain
-description: Install the pinned stable Rust toolchain (with rustfmt and clippy) and configure the cargo build cache.
+description: Install the toolchain pinned by rust-toolchain.toml and configure the cargo build cache.
 
 runs:
   using: composite
   steps:
-    - uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8 # stable
-      with:
-        components: rustfmt, clippy
+    - name: Install toolchain pinned by rust-toolchain.toml
+      shell: bash
+      run: cargo --version
 
     - uses: Swatinem/rust-cache@v2
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -7,6 +7,8 @@ members = [
   "llama-cpp-bindings",
   "llama-cpp-bindings-tests",
   "llama-cpp-log-decoder",
+  "llama-cpp-test-harness",
+  "llama-cpp-test-harness-macros",
 ]
 
 [workspace.package]
@@ -25,17 +27,25 @@ enumflags2 = "=0.7.12"
 find_cuda_helper = "=0.2.0"
 glob = "=0.3.3"
 hf-hub = "=0.5.0"
+inventory = "=0.3.24"
+libtest-mimic = "=0.8.2"
 llama-cpp-bindings = { path = "llama-cpp-bindings", version = "=0.7.0" }
 llama-cpp-bindings-build = { path = "llama-cpp-bindings-build", version = "=0.7.0" }
 llama-cpp-bindings-sys = { path = "llama-cpp-bindings-sys", version = "=0.7.0" }
 llama-cpp-bindings-types = { path = "llama-cpp-bindings-types", version = "=0.7.0" }
 llama-cpp-log-decoder = { path = "llama-cpp-log-decoder", version = "=0.7.0" }
+llama-cpp-test-harness = { path = "llama-cpp-test-harness", version = "=0.7.0" }
+llama-cpp-test-harness-macros = { path = "llama-cpp-test-harness-macros", version = "=0.7.0" }
 llguidance = "=1.7.0"
 log = "=0.4.29"
 nom = "=8.0.0"
+proc-macro2 = "=1.0.106"
+quote = "=1.0.45"
 serde = { version = "=1.0.228", features = ["derive"] }
 serde_json = "=1.0.149"
 serial_test = "=3.4.0"
+syn = { version = "=2.0.117", features = ["full"] }
 thiserror = "=2.0.18"
 toktrie = "=1.7.0"
+trybuild = "=1.0.116"
 walkdir = "=2.5.0"