UbiquitousLearning · FarmersWrap · Mar 20, 2026 · Mar 20, 2026 · Mar 21, 2026 · Mar 21, 2026
diff --git a/pymllm/mem_cache/mamba_radix_cache.py b/pymllm/mem_cache/mamba_radix_cache.py
@@ -534,14 +534,12 @@ def _add_leaf(
         value: torch.Tensor,
         mamba_value: Optional[torch.Tensor] = None,
     ) -> MambaTreeNode:
-        # Parent may lose leaf status
-        if (
-            len(parent.children) == 0
-            and parent != self.root_node
-            and parent.full_lock_ref == 0
-            and not parent.evicted
-        ):
-            self._full_evictable -= len(parent.key)
+        # Note: we intentionally do NOT subtract parent's tokens from
+        # _full_evictable when a leaf gains its first child.  Internal
+        # nodes are still reclaimable via cascade eviction (evict children
+        # first, then the childless parent cascades).  Subtracting here
+        # would break the invariant that evictable + protected == total
+        # tree tokens.  See RadixCache._add_leaf for full rationale.
 
         new_node = MambaTreeNode()
         new_node.parent = parent

diff --git a/pymllm/mem_cache/radix_cache.py b/pymllm/mem_cache/radix_cache.py
@@ -639,13 +639,26 @@ def _add_leaf(
         value: torch.Tensor,
         swa_tombstone: bool = False,
     ) -> TreeNode:
-        if (
-            len(parent.children) == 0
-            and parent != self.root_node
-            and parent.lock_ref == 0
-            and not parent.evicted
-        ):
-            self._evictable_size -= len(parent.key)
+        # Note: we do NOT subtract parent's tokens from _evictable_size
+        # when a leaf gains its first child, even though the parent is no
+        # longer directly evictable as a leaf.  Reasons:
+        #
+        # 1. Internal nodes ARE reclaimable via cascade eviction: evict()
+        #    evicts all children first, then the childless parent cascades.
+        #    So _evictable_size correctly tracks "total reclaimable tokens".
+        #
+        # 2. _split_node (which also creates internal nodes) does not adjust
+        #    _evictable_size.  Subtracting here but not there would create
+        #    an inconsistency.
+        #
+        # 3. inc_lock_ref / dec_lock_ref assume ALL non-root tokens are
+        #    partitioned into evictable + protected.  Subtracting here
+        #    breaks that invariant, causing _evictable_size to go negative
+        #    when locks walk up through internal nodes whose tokens were
+        #    already removed.
+        #
+        # A safety guard in _alloc_kv_with_eviction() aborts the allocation
+        # (returns None) if evict() frees 0 tokens despite evictable_size > 0.
 
         new_node = TreeNode()
         new_node.parent = parent

diff --git a/pymllm/orchestrator/model_runner_process.py b/pymllm/orchestrator/model_runner_process.py
@@ -900,6 +900,15 @@ def _alloc_kv_with_eviction(self, num_tokens: int) -> Optional[torch.Tensor]:
                 evict_target,
             )
 
+            # Safety: if evict() freed nothing despite evictable_size > 0,
+            # the size accounting is stale — return None to avoid spinning.
+            if evict_result.full_evicted == 0:
+                logger.warning(
+                    "KV allocation failed: evictable_size=%d but evict freed 0 tokens",
+                    evictable,
+                )
+                return None
+
             # Retry allocation
             result = runner.token_to_kv_pool_allocator.alloc(num_tokens)
             if result is not None: