Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 6 additions & 8 deletions pymllm/mem_cache/mamba_radix_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -534,14 +534,12 @@ def _add_leaf(
value: torch.Tensor,
mamba_value: Optional[torch.Tensor] = None,
) -> MambaTreeNode:
# Parent may lose leaf status
if (
len(parent.children) == 0
and parent != self.root_node
and parent.full_lock_ref == 0
and not parent.evicted
):
self._full_evictable -= len(parent.key)
# Note: we intentionally do NOT subtract parent's tokens from
# _full_evictable when a leaf gains its first child. Internal
# nodes are still reclaimable via cascade eviction (evict children
# first, then the childless parent cascades). Subtracting here
# would break the invariant that evictable + protected == total
# tree tokens. See RadixCache._add_leaf for full rationale.

new_node = MambaTreeNode()
new_node.parent = parent
Expand Down
27 changes: 20 additions & 7 deletions pymllm/mem_cache/radix_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -639,13 +639,26 @@ def _add_leaf(
value: torch.Tensor,
swa_tombstone: bool = False,
) -> TreeNode:
if (
len(parent.children) == 0
and parent != self.root_node
and parent.lock_ref == 0
and not parent.evicted
):
self._evictable_size -= len(parent.key)
# Note: we do NOT subtract parent's tokens from _evictable_size
# when a leaf gains its first child, even though the parent is no
# longer directly evictable as a leaf. Reasons:
#
# 1. Internal nodes ARE reclaimable via cascade eviction: evict()
# evicts all children first, then the childless parent cascades.
# So _evictable_size correctly tracks "total reclaimable tokens".
#
# 2. _split_node (which also creates internal nodes) does not adjust
# _evictable_size. Subtracting here but not there would create
# an inconsistency.
#
# 3. inc_lock_ref / dec_lock_ref assume ALL non-root tokens are
# partitioned into evictable + protected. Subtracting here
# breaks that invariant, causing _evictable_size to go negative
# when locks walk up through internal nodes whose tokens were
# already removed.
#
# A safety guard in _alloc_kv_with_eviction() aborts the allocation
# (returns None) if evict() frees 0 tokens despite evictable_size > 0.

new_node = TreeNode()
new_node.parent = parent
Expand Down
9 changes: 9 additions & 0 deletions pymllm/orchestrator/model_runner_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -900,6 +900,15 @@ def _alloc_kv_with_eviction(self, num_tokens: int) -> Optional[torch.Tensor]:
evict_target,
)

# Safety: if evict() freed nothing despite evictable_size > 0,
# the size accounting is stale — give up (return None) to avoid spinning.
if evict_result.full_evicted == 0:
logger.warning(
"KV allocation failed: evictable_size=%d but evict freed 0 tokens",
evictable,
)
return None

# Retry allocation
result = runner.token_to_kv_pool_allocator.alloc(num_tokens)
if result is not None:
Expand Down
Loading