diff --git a/Cargo.lock b/Cargo.lock index 4a084bee5f..07668d8e11 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8619,9 +8619,11 @@ dependencies = [ "restate-limiter", "restate-memory", "restate-partition-store", + "restate-platform", "restate-rocksdb", "restate-serde-util", "restate-storage-api", + "restate-time-util", "restate-types", "restate-util-string", "restate-worker-api", diff --git a/crates/partition-store/src/journal_table_v2/mod.rs b/crates/partition-store/src/journal_table_v2/mod.rs index 8bb7fcbdfd..33ebf9bd4d 100644 --- a/crates/partition-store/src/journal_table_v2/mod.rs +++ b/crates/partition-store/src/journal_table_v2/mod.rs @@ -769,23 +769,23 @@ fn budgeted_journal_v2_stream<'a, DB: DBAccess + Send>( impl WriteJournalTable for PartitionStoreTransaction<'_> { fn put_journal_entry( &mut self, - invocation_id: InvocationId, + invocation_id: &InvocationId, index: u32, entry: &StoredRawEntry, related_completion_ids: &[CompletionId], ) -> Result<()> { - self.assert_partition_key(&invocation_id)?; - put_journal_entry(self, &invocation_id, index, entry, related_completion_ids) + self.assert_partition_key(invocation_id)?; + put_journal_entry(self, invocation_id, index, entry, related_completion_ids) } fn delete_journal( &mut self, - invocation_id: InvocationId, + invocation_id: &InvocationId, journal_length: EntryIndex, ) -> Result<()> { - self.assert_partition_key(&invocation_id)?; + self.assert_partition_key(invocation_id)?; let _x = RocksDbPerfGuard::new("delete-journal"); - delete_journal(self, &invocation_id, journal_length) + delete_journal(self, invocation_id, journal_length) } } diff --git a/crates/partition-store/src/tests/journal_table_v2_test/mod.rs b/crates/partition-store/src/tests/journal_table_v2_test/mod.rs index da58dcd129..d5f6226010 100644 --- a/crates/partition-store/src/tests/journal_table_v2_test/mod.rs +++ b/crates/partition-store/src/tests/journal_table_v2_test/mod.rs @@ -99,7 +99,7 @@ fn 
mock_one_way_call_command(invocation_id_completion_id: CompletionId) -> Entry fn populate_sleep_journal(txn: &mut T) { for i in 0..5 { txn.put_journal_entry( - MOCK_INVOCATION_ID_1, + &MOCK_INVOCATION_ID_1, i, &StoredRawEntry::new( StoredRawEntryHeader::new(MillisSinceEpoch::now()), @@ -111,7 +111,7 @@ fn populate_sleep_journal(txn: &mut T) { } for i in 5..10 { txn.put_journal_entry( - MOCK_INVOCATION_ID_1, + &MOCK_INVOCATION_ID_1, i, &StoredRawEntry::new( StoredRawEntryHeader::new(MillisSinceEpoch::now()), @@ -199,7 +199,7 @@ async fn sleep_point_lookups(txn: &mut T) { } fn delete_journal(txn: &mut T, length: usize) { - txn.delete_journal(MOCK_INVOCATION_ID_1, length as u32) + txn.delete_journal(&MOCK_INVOCATION_ID_1, length as u32) .unwrap(); } @@ -259,7 +259,7 @@ async fn test_call_journal() { // Populate txn.put_journal_entry( - MOCK_INVOCATION_ID_1, + &MOCK_INVOCATION_ID_1, 0, &StoredRawEntry::new( StoredRawEntryHeader::new(MillisSinceEpoch::now()), @@ -269,7 +269,7 @@ async fn test_call_journal() { ) .unwrap(); txn.put_journal_entry( - MOCK_INVOCATION_ID_1, + &MOCK_INVOCATION_ID_1, 1, &StoredRawEntry::new( StoredRawEntryHeader::new(MillisSinceEpoch::now()), diff --git a/crates/partition-store/src/tests/vqueue_table_test/mod.rs b/crates/partition-store/src/tests/vqueue_table_test/mod.rs index be7d060657..67a784b8ed 100644 --- a/crates/partition-store/src/tests/vqueue_table_test/mod.rs +++ b/crates/partition-store/src/tests/vqueue_table_test/mod.rs @@ -20,17 +20,21 @@ //! Invariants: //! - The waiting cursor must not cross the boundary to adjacent vqueue ids. //! - The waiting cursor must not cross partition-key boundaries either. -//! - When the waiting cursor is "seeked" to to first after a higher-order item has been inserted -//! (i.e. item with has_lock=true, or with older run_at), the cursor must show this added item. +//! - The waiting cursor uses snapshot semantics: it captures a consistent +//! view of storage at creation time. 
Writes/deletes that happen after the +//! cursor is created are NOT visible to that cursor — callers must create +//! a fresh cursor to observe them. use restate_clock::time::MillisSinceEpoch; use restate_storage_api::Transaction; +use restate_storage_api::vqueue_table::ScanVQueueInboxStages; use restate_storage_api::vqueue_table::{ - EntryKey, EntryMetadata, EntryValue, Stage, Status, VQueueCursor, VQueueStore, - WriteVQueueTable, stats::EntryStatistics, + EntryKey, EntryMetadata, EntryValue, Options, Stage, Status, VQueueCursor, VQueueRunningCursor, + VQueueStore, WriteVQueueTable, stats::EntryStatistics, }; use restate_types::clock::UniqueTimestamp; use restate_types::identifiers::PartitionKey; +use restate_types::sharding::KeyRange; use restate_types::vqueues::{EntryId, EntryKind, VQueueId}; use crate::PartitionStore; @@ -90,7 +94,16 @@ fn default_entry(id: u8) -> (EntryKey, EntryValue) { entry(id, false, 0, id as u64) } -/// Collects all items from a cursor into a Vec +fn inbox_reader(db: &crate::PartitionDb, qid: &VQueueId) -> impl VQueueCursor + use<> { + db.new_inbox_reader( + qid, + Options { + allow_blocking_io: true, + }, + ) +} + +/// Collects all items from an inbox cursor. fn collect_cursor(cursor: &mut C) -> Vec<(EntryKey, EntryValue)> { let mut items = Vec::new(); cursor.seek_to_first(); @@ -101,6 +114,17 @@ fn collect_cursor(cursor: &mut C) -> Vec<(EntryKey, EntryValue) items } +/// Collects all items from a running cursor (different error type than inbox). 
+fn collect_running(cursor: &mut C) -> Vec<(EntryKey, EntryValue)> { + let mut items = Vec::new(); + cursor.seek_to_first(); + while let Ok(Some(item)) = cursor.peek() { + items.push(item); + cursor.advance(); + } + items +} + fn collect_ids(items: &[(EntryKey, EntryValue)]) -> Vec { items.iter().map(|(key, _)| *key.entry_id()).collect() } @@ -125,7 +149,7 @@ async fn key_ordering_by_has_lock_run_at_seq(txn: &mut W) { fn verify_key_ordering_by_has_lock_run_at_seq(db: &crate::PartitionDb) { let qid = test_qid(); - let mut reader = db.new_inbox_reader(&qid); + let mut reader = inbox_reader(db, &qid); let items = collect_cursor(&mut reader); assert_eq!(items.len(), 5, "Expected 5 items in inbox"); @@ -165,7 +189,7 @@ async fn ordering_within_same_lock_domain(txn: &mut W) { fn verify_ordering_within_same_lock_domain(db: &crate::PartitionDb) { let qid = VQueueId::custom(2000, "1"); - let mut reader = db.new_inbox_reader(&qid); + let mut reader = inbox_reader(db, &qid); let items = collect_cursor(&mut reader); assert_eq!(items.len(), 4, "Expected 4 items"); @@ -199,12 +223,12 @@ fn verify_running_and_inbox_are_separate(db: &crate::PartitionDb) { // Running reader should only see the Run stage entry let mut run_reader = db.new_run_reader(&qid); - let run_items = collect_cursor(&mut run_reader); + let run_items = collect_running(&mut run_reader); assert_eq!(run_items.len(), 1, "Running reader should see 1 item"); assert_eq!(*run_items[0].0.entry_id(), entry_id(10)); // Inbox reader should only see the Inbox stage entries - let mut inbox_reader = db.new_inbox_reader(&qid); + let mut inbox_reader = inbox_reader(db, &qid); let inbox_items = collect_cursor(&mut inbox_reader); assert_eq!(inbox_items.len(), 2, "Inbox reader should see 2 items"); assert_eq!(collect_ids(&inbox_items), vec![entry_id(20), entry_id(21)]); @@ -226,10 +250,10 @@ fn verify_seek_after_works(db: &crate::PartitionDb) { let entries: Vec<_> = (1..=5).map(default_entry).collect(); - let mut reader = 
db.new_inbox_reader(&qid); + let mut reader = inbox_reader(db, &qid); // Seek after the 3rd entry (id=3) - reader.seek_after(&qid, &entries[2].0); + reader.seek_after(&entries[2].0); let item = reader.peek().unwrap(); assert!(item.is_some(), "Should have items after seek_after"); // Next item should be entry 4 (strictly after entry 3) @@ -251,7 +275,7 @@ fn verify_empty_queue_returns_none(db: &crate::PartitionDb) { "Empty running queue should return None" ); - let mut inbox_reader = db.new_inbox_reader(&qid); + let mut inbox_reader = inbox_reader(db, &qid); inbox_reader.seek_to_first(); assert!( inbox_reader.peek().unwrap().is_none(), @@ -283,17 +307,17 @@ fn verify_vqueue_isolation(db: &crate::PartitionDb) { let qid3 = VQueueId::custom(pkey, "3"); // Each queue should only see its own entry - let mut reader1 = db.new_inbox_reader(&qid1); + let mut reader1 = inbox_reader(db, &qid1); let items1 = collect_cursor(&mut reader1); assert_eq!(items1.len(), 1); assert_eq!(*items1[0].0.entry_id(), entry_id(1)); - let mut reader2 = db.new_inbox_reader(&qid2); + let mut reader2 = inbox_reader(db, &qid2); let items2 = collect_cursor(&mut reader2); assert_eq!(items2.len(), 1); assert_eq!(*items2[0].0.entry_id(), entry_id(2)); - let mut reader3 = db.new_inbox_reader(&qid3); + let mut reader3 = inbox_reader(db, &qid3); let items3 = collect_cursor(&mut reader3); assert_eq!(items3.len(), 1); assert_eq!(*items3[0].0.entry_id(), entry_id(3)); @@ -320,7 +344,7 @@ fn verify_waiting_cursor_boundary_is_respected(db: &crate::PartitionDb) { let qid_b = VQueueId::custom(pkey, "b"); let a2 = entry(12, false, 10, 2); - let mut reader_a = db.new_inbox_reader(&qid_a); + let mut reader_a = inbox_reader(db, &qid_a); reader_a.seek_to_first(); assert_eq!( @@ -339,13 +363,13 @@ fn verify_waiting_cursor_boundary_is_respected(db: &crate::PartitionDb) { "Reader for qid_a must stop before qid_b entries" ); - reader_a.seek_after(&qid_a, &a2.0); + reader_a.seek_after(&a2.0); assert!( 
reader_a.peek().unwrap().is_none(), "seek_after(last_item) must not cross into the next vqueue" ); - let mut reader_b = db.new_inbox_reader(&qid_b); + let mut reader_b = inbox_reader(db, &qid_b); let items_b = collect_cursor(&mut reader_b); assert_eq!(collect_ids(&items_b), vec![entry_id(21)]); } @@ -376,7 +400,7 @@ fn verify_waiting_cursor_partition_prefix_boundary_is_respected(db: &crate::Part let qid_next_partition = VQueueId::custom(PartitionKey::from(5_201u64), "shared-boundary"); let target2 = entry(42, false, 10, 2); - let mut reader_target = db.new_inbox_reader(&qid_target_partition); + let mut reader_target = inbox_reader(db, &qid_target_partition); reader_target.seek_to_first(); assert_eq!( @@ -403,33 +427,33 @@ fn verify_waiting_cursor_partition_prefix_boundary_is_respected(db: &crate::Part "Reader for target qid must stop before adjacent partition keys" ); - reader_target.seek_after(&qid_target_partition, &target2.0); + reader_target.seek_after(&target2.0); assert!( reader_target.peek().unwrap().is_none(), "seek_after(last_item) must not cross into adjacent partition-key prefixes" ); - let mut reader_prev = db.new_inbox_reader(&qid_prev_partition); + let mut reader_prev = inbox_reader(db, &qid_prev_partition); assert_eq!( collect_ids(&collect_cursor(&mut reader_prev)), vec![entry_id(31)] ); - let mut reader_next = db.new_inbox_reader(&qid_next_partition); + let mut reader_next = inbox_reader(db, &qid_next_partition); assert_eq!( collect_ids(&collect_cursor(&mut reader_next)), vec![entry_id(51)] ); } -/// Test: Tailing iterator sees newly enqueued items after seek_to_first. +/// Test: a freshly created reader sees the current state of storage, +/// including writes that landed after a previous reader was created. /// -/// This verifies that the inbox reader (which uses a tailing iterator) can see -/// items that were added after the reader was created, when re-seeking. 
-async fn tailing_iterator_sees_new_items_on_reseek(rocksdb: &mut PartitionStore) { +/// This is the snapshot-semantics counterpart to the previous tailing-iterator +/// tests: callers must construct a new reader to observe post-creation writes. +async fn fresh_reader_sees_current_state(rocksdb: &mut PartitionStore) { let qid = VQueueId::custom(6000, "1"); - // Insert initial entries let entry1 = default_entry(1); let entry2 = default_entry(2); { @@ -439,475 +463,60 @@ async fn tailing_iterator_sees_new_items_on_reseek(rocksdb: &mut PartitionStore) txn.commit().await.expect("commit should succeed"); } - // Create reader and verify initial state - let db = rocksdb.partition_db(); - let mut reader = db.new_inbox_reader(&qid); - reader.seek_to_first(); - - let item = reader.peek().unwrap(); - assert_eq!(item.as_ref().map(|e| *e.0.entry_id()), Some(entry_id(1))); - reader.advance(); - - let item = reader.peek().unwrap(); - assert_eq!(item.as_ref().map(|e| *e.0.entry_id()), Some(entry_id(2))); - reader.advance(); - - // Should be empty now - assert!(reader.peek().unwrap().is_none()); - - // Now add a new entry while the reader is still open - let entry3 = default_entry(3); + // Create the original reader and drain it. 
{ - let mut txn = rocksdb.transaction(); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry3.0, &entry3.1); - txn.commit().await.expect("commit should succeed"); + let db = rocksdb.partition_db(); + let mut reader = inbox_reader(db, &qid); + let items = collect_cursor(&mut reader); + assert_eq!(collect_ids(&items), vec![entry_id(1), entry_id(2)]); } - // Re-seek to first - should now see all 3 entries - reader.seek_to_first(); - let items = { - let mut items = Vec::new(); - while let Ok(Some(item)) = reader.peek() { - items.push(item); - reader.advance(); - } - items - }; - - assert_eq!(items.len(), 3, "Should see all 3 items after reseek"); - assert_eq!( - collect_ids(&items), - vec![entry_id(1), entry_id(2), entry_id(3)] - ); -} - -/// Test: Re-seek to first sees newly inserted higher-order items. -/// -/// This covers both invariants mentioned in the module docs: -/// - new item with older `run_at` appears first after re-seek -/// - new item with `has_lock=true` appears first after re-seek -async fn reseek_shows_new_higher_order_items(rocksdb: &mut PartitionStore) { - let qid = VQueueId::custom(6_500, "1"); - - let base1 = entry(1, false, 100, 1); - let base2 = entry(2, false, 200, 2); - { - let mut txn = rocksdb.transaction(); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &base1.0, &base1.1); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &base2.0, &base2.1); - txn.commit().await.expect("commit should succeed"); - } - - let db = rocksdb.partition_db(); - let mut reader = db.new_inbox_reader(&qid); - reader.seek_to_first(); - assert_eq!(*reader.peek().unwrap().unwrap().0.entry_id(), entry_id(1)); - - // Add an item with older run_at (higher order among unlocked entries) - let older_run_at = entry(3, false, 50, 3); - { - let mut txn = rocksdb.transaction(); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &older_run_at.0, &older_run_at.1); - txn.commit().await.expect("commit should succeed"); - } - - reader.seek_to_first(); - 
assert_eq!(*reader.peek().unwrap().unwrap().0.entry_id(), entry_id(3)); - - // Add an item that has a lock (always higher order than unlocked entries) - let locked = entry(4, true, 300, 4); - { - let mut txn = rocksdb.transaction(); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &locked.0, &locked.1); - txn.commit().await.expect("commit should succeed"); - } - - reader.seek_to_first(); - let items = { - let mut items = Vec::new(); - while let Ok(Some(item)) = reader.peek() { - items.push(item); - reader.advance(); - } - items - }; - - assert_eq!( - collect_ids(&items), - vec![entry_id(4), entry_id(3), entry_id(1), entry_id(2)] - ); -} - -/// Test: Tailing iterator sees newly enqueued items via seek_after. -/// -/// This verifies that when using seek_after to resume iteration, newly added -/// items that sort after the seek position are visible. -async fn tailing_iterator_sees_new_items_on_seek_after(rocksdb: &mut PartitionStore) { - let qid = VQueueId::custom(7000, "1"); - - // Insert initial entries with different key prefixes - let entry_first = entry(1, true, 10, 1); - let entry_second = entry(2, false, 10, 2); - { - let mut txn = rocksdb.transaction(); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry_first.0, &entry_first.1); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry_second.0, &entry_second.1); - txn.commit().await.expect("commit should succeed"); - } - - // Create reader and read the first item - let db = rocksdb.partition_db(); - let mut reader = db.new_inbox_reader(&qid); - reader.seek_to_first(); - - let first = reader.peek().unwrap().unwrap(); - assert_eq!(*first.0.entry_id(), entry_id(1)); - - // Now add a new entry that sorts after the seek position - let entry_third = entry(3, false, 10, 3); - { - let mut txn = rocksdb.transaction(); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry_third.0, &entry_third.1); - txn.commit().await.expect("commit should succeed"); - } - - // Seek after the first item - should see entry_second and entry_third - 
reader.seek_after(&qid, &first.0); - - let mut remaining = Vec::new(); - while let Ok(Some(item)) = reader.peek() { - remaining.push(item); - reader.advance(); - } - - assert_eq!(remaining.len(), 2, "Should see 2 items after seek_after"); - assert_eq!(collect_ids(&remaining), vec![entry_id(2), entry_id(3)]); -} - -/// Test: Tailing iterator sees items inserted ahead of current position without re-seek. -/// -/// Scenario (1): after the cursor has advanced at least once, if a new item is inserted -/// at a key greater than the current position, it should become visible by continuing to -/// call `advance()`/`peek()` without any seek. -async fn tailing_iterator_sees_inserted_ahead_without_reseek(rocksdb: &mut PartitionStore) { - let qid = VQueueId::custom(7_200, "1"); - - let entry1 = entry(1, false, 10, 1); - let entry3 = entry(3, false, 30, 3); - let entry5 = entry(5, false, 50, 5); - { - let mut txn = rocksdb.transaction(); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry1.0, &entry1.1); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry3.0, &entry3.1); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry5.0, &entry5.1); - txn.commit().await.expect("commit should succeed"); - } - - let db = rocksdb.partition_db(); - let mut reader = db.new_inbox_reader(&qid); - reader.seek_to_first(); - - assert_eq!(*reader.peek().unwrap().unwrap().0.entry_id(), entry_id(1)); - reader.advance(); - assert_eq!(*reader.peek().unwrap().unwrap().0.entry_id(), entry_id(3)); - - let entry4 = entry(4, false, 40, 4); - { - let mut txn = rocksdb.transaction(); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry4.0, &entry4.1); - txn.commit().await.expect("commit should succeed"); - } - - // Continue from current position without seek. 
- reader.advance(); - let maybe_inserted = reader.peek().unwrap().as_ref().map(|e| *e.0.entry_id()); - assert_eq!( - maybe_inserted, - Some(entry_id(4)), - "Inserted item ahead of current position should be visible without re-seek" - ); - reader.advance(); - assert_eq!(*reader.peek().unwrap().unwrap().0.entry_id(), entry_id(5)); -} - -/// Test: Tailing iterator sees flushed insertions while mid-iteration without re-seek. -/// -/// Scenario: the cursor advances a couple of times and still has items to read. -/// New entries are inserted ahead of the current position, but before the next -/// existing item (splicing), and memtables are flushed. -/// Continuing with `advance()`/`peek()` (without seek) should surface the flushed items. -async fn tailing_iterator_sees_flushed_insertions_mid_iteration(rocksdb: &mut PartitionStore) { - let qid = VQueueId::custom(7_250, "1"); - - let entry1 = entry(1, false, 10, 1); - let entry3 = entry(3, false, 30, 3); - let entry6 = entry(6, false, 60, 6); - let entry9 = entry(9, false, 90, 9); - { - let mut txn = rocksdb.transaction(); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry1.0, &entry1.1); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry3.0, &entry3.1); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry6.0, &entry6.1); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry9.0, &entry9.1); - txn.commit().await.expect("commit should succeed"); - } - - let db = rocksdb.partition_db(); - let mut reader = db.new_inbox_reader(&qid); - reader.seek_to_first(); - - assert_eq!(*reader.peek().unwrap().unwrap().0.entry_id(), entry_id(1)); - reader.advance(); - assert_eq!(*reader.peek().unwrap().unwrap().0.entry_id(), entry_id(3)); - - // Splice entries between current position (3) and next existing item (6). 
- let entry4 = entry(4, false, 40, 4); - let entry5 = entry(5, false, 50, 5); - { - let mut txn = rocksdb.transaction(); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry4.0, &entry4.1); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry5.0, &entry5.1); - txn.commit().await.expect("commit should succeed"); - } - - rocksdb - .partition_db() - .flush_memtables(true) - .await - .expect("flush memtables should succeed"); - - // Continue from current position without seek. - reader.advance(); - let mut remaining_ids = Vec::new(); - while let Ok(Some(item)) = reader.peek() { - remaining_ids.push(*item.0.entry_id()); - reader.advance(); - } - - assert_eq!( - remaining_ids, - vec![entry_id(4), entry_id(5), entry_id(6), entry_id(9)], - "Spliced flushed items should be visible and not skipped without re-seek" - ); -} - -/// Test: Seeked tailing iterator sees spliced insertions after a pre-insert flush. -/// -/// Scenario: perform one seek to position the cursor on the item immediately -/// before the splice point, flush memtables, then insert a new item in the middle. -/// Advancing from that seeked position should return the new spliced item first. 
-async fn seeked_tailing_iterator_sees_spliced_insertions_after_preflush( - rocksdb: &mut PartitionStore, -) { - let qid = VQueueId::custom(7_255, "1"); - - let entry1 = entry(1, false, 10, 1); - let entry3 = entry(3, false, 30, 3); - let entry6 = entry(6, false, 60, 6); - let entry9 = entry(9, false, 90, 9); - { - let mut txn = rocksdb.transaction(); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry1.0, &entry1.1); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry3.0, &entry3.1); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry6.0, &entry6.1); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry9.0, &entry9.1); - txn.commit().await.expect("commit should succeed"); - } - - let db = rocksdb.partition_db(); - let mut reader = db.new_inbox_reader(&qid); - - rocksdb - .partition_db() - .flush_memtables(true) - .await - .expect("flush memtables should succeed"); - - // Single seek before inserting new items: land on entry3. - reader.seek_after(&qid, &entry1.0); - assert_eq!(*reader.peek().unwrap().unwrap().0.entry_id(), entry_id(3)); - - // Splice one entry between current position (3) and next existing item (6). - let entry4 = entry(4, false, 40, 4); - { - let mut txn = rocksdb.transaction(); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry4.0, &entry4.1); - txn.commit().await.expect("commit should succeed"); - } - - // Continue from the seeked position without another seek. - reader.advance(); - - // This is the proof that we need to re-seek because the iterator will be blind - // to the newly added item if the seek happened after the mutable memtable flush. - assert_ne!( - *reader.peek().unwrap().unwrap().0.entry_id(), - entry_id(4), - "advance from seeked predecessor should surface the spliced item first" - ); - - // Re-seeking should make it visible again. 
- reader.seek_after(&qid, &entry1.0); - reader.advance(); - assert_eq!( - *reader.peek().unwrap().unwrap().0.entry_id(), - entry_id(4), - "advance from seeked predecessor should surface the spliced item first" - ); - - reader.advance(); - let mut tail_ids = Vec::new(); - while let Ok(Some(item)) = reader.peek() { - tail_ids.push(*item.0.entry_id()); - reader.advance(); - } - - assert_eq!( - tail_ids, - vec![entry_id(6), entry_id(9)], - "Remaining tail after the spliced item should keep original order" - ); -} - -/// Test: Seeked tailing iterator sees appended insertions after a pre-insert flush. -/// -/// Scenario: perform one seek, flush memtables, then insert new items that are strictly -/// after the existing tail. Continuing with `advance()`/`peek()` from that seeked -/// position should eventually surface the appended items. -async fn seeked_tailing_iterator_sees_appended_insertions_after_preflush( - rocksdb: &mut PartitionStore, -) { - let qid = VQueueId::custom(7_256, "1"); - - let entry1 = entry(1, false, 10, 1); - let entry3 = entry(3, false, 30, 3); - let entry6 = entry(6, false, 60, 6); - let entry9 = entry(9, false, 90, 9); + // Append a new item. + let entry3 = default_entry(3); { let mut txn = rocksdb.transaction(); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry1.0, &entry1.1); txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry3.0, &entry3.1); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry6.0, &entry6.1); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry9.0, &entry9.1); txn.commit().await.expect("commit should succeed"); } + // A fresh reader sees all three items. let db = rocksdb.partition_db(); - let mut reader = db.new_inbox_reader(&qid); - - // Single seek before appending new tail items: land on entry3. 
- reader.seek_after(&qid, &entry1.0); - assert_eq!(*reader.peek().unwrap().unwrap().0.entry_id(), entry_id(3)); - - rocksdb - .partition_db() - .flush_memtables(true) - .await - .expect("flush memtables should succeed"); - - let entry10 = entry(10, false, 100, 10); - let entry11 = entry(11, false, 110, 11); - { - let mut txn = rocksdb.transaction(); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry10.0, &entry10.1); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry11.0, &entry11.1); - txn.commit().await.expect("commit should succeed"); - } - - // Continue from the seeked position without another seek. - reader.advance(); - let mut remaining_ids = Vec::new(); - while let Ok(Some(item)) = reader.peek() { - remaining_ids.push(*item.0.entry_id()); - reader.advance(); - } - + let mut reader = inbox_reader(db, &qid); + let items = collect_cursor(&mut reader); assert_eq!( - remaining_ids, - vec![entry_id(6), entry_id(9), entry_id(10), entry_id(11)], - "Appended flushed items should be visible after traversing existing tail" + collect_ids(&items), + vec![entry_id(1), entry_id(2), entry_id(3)] ); } -/// Test: Tailing iterator sees items inserted after reaching the end without re-seek. -/// -/// Scenario (2): after the cursor reaches end-of-iteration, if a new item is inserted, -/// continuing with `advance()`/`peek()` (without seek) should surface the new item. -async fn tailing_iterator_sees_item_added_after_end_without_reseek(rocksdb: &mut PartitionStore) { - let qid = VQueueId::custom(7_300, "1"); +/// Test: an existing reader holds a snapshot — writes after creation are not +/// observable, even after `seek_to_first` or `seek_after`. 
+async fn existing_reader_does_not_see_post_snapshot_writes(rocksdb: &mut PartitionStore) { + let qid = VQueueId::custom(6_100, "1"); - let entry1 = entry(1, false, 10, 1); + let entry1 = default_entry(1); + let entry2 = default_entry(2); { let mut txn = rocksdb.transaction(); txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry1.0, &entry1.1); + txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry2.0, &entry2.1); txn.commit().await.expect("commit should succeed"); } let db = rocksdb.partition_db(); - let mut reader = db.new_inbox_reader(&qid); + let mut reader = inbox_reader(db, &qid); reader.seek_to_first(); - assert_eq!(*reader.peek().unwrap().unwrap().0.entry_id(), entry_id(1)); - reader.advance(); - assert!(reader.peek().unwrap().is_none(), "Reader should be at end"); - - let entry2 = entry(2, false, 20, 2); - { - let mut txn = rocksdb.transaction(); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry2.0, &entry2.1); - txn.commit().await.expect("commit should succeed"); - } - - // Stay in the same iterator path, no re-seek. - reader.advance(); - assert!( - reader.peek().unwrap().is_none(), - "Inserted item after end is not visible without re-seek" - ); - - // Re-seeking makes the newly inserted item visible. - reader.seek_after(&qid, &entry1.0); - assert_eq!(*reader.peek().unwrap().unwrap().0.entry_id(), entry_id(2)); -} - -/// Test: Deleted items don't appear after reseek. -/// -/// This verifies that when an item is deleted while the reader is open, -/// re-seeking will not return the deleted item. -async fn deleted_items_not_visible_after_reseek(rocksdb: &mut PartitionStore) { - let qid = VQueueId::custom(8000, "1"); - // Insert initial entries - let entry1 = default_entry(1); - let entry2 = default_entry(2); + // Insert a new item after the reader has taken its snapshot. 
let entry3 = default_entry(3); { let mut txn = rocksdb.transaction(); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry1.0, &entry1.1); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry2.0, &entry2.1); txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry3.0, &entry3.1); txn.commit().await.expect("commit should succeed"); } - // Create reader and verify we see all 3 - let db = rocksdb.partition_db(); - let mut reader = db.new_inbox_reader(&qid); - let items = collect_cursor(&mut reader); - assert_eq!(items.len(), 3); - - // Delete the middle entry while the reader is still open - { - let mut txn = rocksdb.transaction(); - assert!( - txn.get_vqueue_inbox(&qid, Stage::Inbox, &entry2.0) - .unwrap() - .is_some() - ); - txn.delete_vqueue_inbox(&qid, Stage::Inbox, &entry2.0); - txn.commit().await.expect("commit should succeed"); - } - - // Re-seek and verify we only see entries 1 and 3 + // Re-seeking the same reader still only shows the snapshot's two items. reader.seek_to_first(); let items = { let mut items = Vec::new(); @@ -917,123 +526,68 @@ async fn deleted_items_not_visible_after_reseek(rocksdb: &mut PartitionStore) { } items }; - - assert_eq!(items.len(), 2, "Should only see 2 items after deletion"); - assert_eq!(collect_ids(&items), vec![entry_id(1), entry_id(3)]); + assert_eq!(collect_ids(&items), vec![entry_id(1), entry_id(2)]); } -/// Test: Deleted items don't appear after seek_after. +/// Test: Stage scan reads only the requested stage key kind. /// -/// This verifies that deleted items are not returned when using seek_after -/// to resume iteration past a certain point. -async fn deleted_items_not_visible_after_seek_after(rocksdb: &mut PartitionStore) { - let qid = VQueueId::custom(8500, "1"); +/// This validates the datafusion-oriented scan API and ensures stage-specific +/// scans do not leak rows from adjacent stage key kinds or partition keys. 
+async fn stage_scan_is_filtered_by_stage(rocksdb: &mut PartitionStore) { + let target_partition_key = PartitionKey::from(9_300u64); + let other_partition_key = PartitionKey::from(9_301u64); + let target_qid = VQueueId::custom(target_partition_key, "scan-target"); + let other_qid = VQueueId::custom(other_partition_key, "scan-target"); + + let stages = [ + Stage::Inbox, + Stage::Running, + Stage::Suspended, + Stage::Paused, + Stage::Finished, + ]; - // Insert entries - let entry1 = default_entry(1); - let entry2 = default_entry(2); - let entry3 = default_entry(3); { let mut txn = rocksdb.transaction(); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry1.0, &entry1.1); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry2.0, &entry2.1); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry3.0, &entry3.1); - txn.commit().await.expect("commit should succeed"); - } - - // Create reader - let db = rocksdb.partition_db(); - let mut reader = db.new_inbox_reader(&qid); - reader.seek_to_first(); - - // Read first item - let first = reader.peek().unwrap().unwrap(); - assert_eq!(*first.0.entry_id(), entry_id(1)); + for (index, stage) in stages.into_iter().enumerate() { + let entry_id = 100 + index as u8; + let target_entry = default_entry(entry_id); + let other_entry = default_entry(entry_id + 10); - // Delete entries 2 and 3 while reader is open - { - let mut txn = rocksdb.transaction(); - assert!( - txn.get_vqueue_inbox(&qid, Stage::Inbox, &entry2.0) - .unwrap() - .is_some() - ); - txn.delete_vqueue_inbox(&qid, Stage::Inbox, &entry2.0); - assert!( - txn.get_vqueue_inbox(&qid, Stage::Inbox, &entry3.0) - .unwrap() - .is_some() - ); - txn.delete_vqueue_inbox(&qid, Stage::Inbox, &entry3.0); + txn.put_vqueue_inbox(&target_qid, stage, &target_entry.0, &target_entry.1); + txn.put_vqueue_inbox(&other_qid, stage, &other_entry.0, &other_entry.1); + } txn.commit().await.expect("commit should succeed"); } - // Seek after first - should see nothing since 2 and 3 are deleted - 
reader.seek_after(&qid, &first.0); - assert!( - reader.peek().unwrap().is_none(), - "Should see no items after seek_after when remaining items are deleted" - ); -} + let range = KeyRange::from(target_partition_key..=target_partition_key); -/// Test: Concurrent enqueue and delete operations are handled correctly. -/// -/// This tests a more complex scenario where items are both added and removed -/// while the reader is open. -async fn concurrent_enqueue_and_delete(rocksdb: &mut PartitionStore) { - let qid = VQueueId::custom(9000, "1"); - - // Insert initial entries in deterministic key order - let entry_high = entry(10, true, 10, 10); - let entry_mid = entry(20, false, 20, 20); - let entry_low = entry(30, false, 30, 30); - { - let mut txn = rocksdb.transaction(); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry_high.0, &entry_high.1); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry_mid.0, &entry_mid.1); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry_low.0, &entry_low.1); - txn.commit().await.expect("commit should succeed"); - } + for (index, stage) in stages.into_iter().enumerate() { + let expected_key = default_entry(100 + index as u8).0; + let rows = std::sync::Arc::new(std::sync::Mutex::new(Vec::new())); + let rows_for_scan = rows.clone(); - // Create reader and read first item - let db = rocksdb.partition_db(); - let mut reader = db.new_inbox_reader(&qid); - reader.seek_to_first(); + rocksdb + .for_each_vqueue_inbox_entry(range, stage, move |(qid, got_stage, key, _)| { + rows_for_scan + .lock() + .expect("stage scan lock should not be poisoned") + .push((qid.clone(), got_stage, *key)); + std::ops::ControlFlow::Continue(()) + }) + .expect("stage scan setup should succeed") + .await + .expect("stage scan should succeed"); - let first = reader.peek().unwrap().unwrap(); - assert_eq!(*first.0.entry_id(), entry_id(10)); - - // Simultaneously: delete entry_mid, add a new entry that sorts first - let entry_new_first = entry(5, true, 5, 5); - { - let mut txn = 
rocksdb.transaction(); - assert!( - txn.get_vqueue_inbox(&qid, Stage::Inbox, &entry_mid.0) - .unwrap() - .is_some() - ); - txn.delete_vqueue_inbox(&qid, Stage::Inbox, &entry_mid.0); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry_new_first.0, &entry_new_first.1); - txn.commit().await.expect("commit should succeed"); + let rows = rows + .lock() + .expect("stage scan lock should not be poisoned") + .clone(); + assert_eq!(rows.len(), 1, "stage {stage} should return one row"); + assert_eq!(rows[0].0, target_qid, "stage {stage} returned wrong qid"); + assert_eq!(rows[0].1, stage, "stage {stage} returned wrong stage"); + assert_eq!(rows[0].2, expected_key, "stage {stage} returned wrong key"); } - - // Re-seek from start - should see: id=5, id=10, id=30 - // (id=20 is deleted, id=5 is added) - reader.seek_to_first(); - let items = { - let mut items = Vec::new(); - while let Ok(Some(item)) = reader.peek() { - items.push(item); - reader.advance(); - } - items - }; - - assert_eq!(items.len(), 3, "Should see 3 items"); - assert_eq!( - collect_ids(&items), - vec![entry_id(5), entry_id(10), entry_id(30)] - ); } pub(crate) async fn run_tests(mut rocksdb: PartitionStore) { @@ -1062,16 +616,9 @@ pub(crate) async fn run_tests(mut rocksdb: PartitionStore) { verify_waiting_cursor_boundary_is_respected(db); verify_waiting_cursor_partition_prefix_boundary_is_respected(db); - // Tailing iterator tests - these need mutable access to rocksdb for writes - tailing_iterator_sees_new_items_on_reseek(&mut rocksdb).await; - reseek_shows_new_higher_order_items(&mut rocksdb).await; - tailing_iterator_sees_new_items_on_seek_after(&mut rocksdb).await; - tailing_iterator_sees_inserted_ahead_without_reseek(&mut rocksdb).await; - tailing_iterator_sees_flushed_insertions_mid_iteration(&mut rocksdb).await; - seeked_tailing_iterator_sees_spliced_insertions_after_preflush(&mut rocksdb).await; - seeked_tailing_iterator_sees_appended_insertions_after_preflush(&mut rocksdb).await; - 
tailing_iterator_sees_item_added_after_end_without_reseek(&mut rocksdb).await; - deleted_items_not_visible_after_reseek(&mut rocksdb).await; - deleted_items_not_visible_after_seek_after(&mut rocksdb).await; - concurrent_enqueue_and_delete(&mut rocksdb).await; + stage_scan_is_filtered_by_stage(&mut rocksdb).await; + // Snapshot-iterator tests — exercise the contract that a fresh reader + // sees current storage and that an existing reader holds a fixed view. + fresh_reader_sees_current_state(&mut rocksdb).await; + existing_reader_does_not_see_post_snapshot_writes(&mut rocksdb).await; } diff --git a/crates/partition-store/src/vqueue_table/mod.rs b/crates/partition-store/src/vqueue_table/mod.rs index 57f115ac79..c1283c2368 100644 --- a/crates/partition-store/src/vqueue_table/mod.rs +++ b/crates/partition-store/src/vqueue_table/mod.rs @@ -18,6 +18,7 @@ mod running_reader; mod waiting_reader; use std::io::Cursor; +use std::pin::Pin; pub use entry::{EntryStatusKey, EntryStatusKeyRef, StatusHeaderRaw}; pub use inbox::InboxKey; @@ -32,12 +33,12 @@ use tracing::error; use restate_rocksdb::Priority; use restate_storage_api::StorageError; -use restate_storage_api::vqueue_table::ScanVQueueMetaTable; use restate_storage_api::vqueue_table::metadata::{VQueueMeta, VQueueMetaRef, VQueueMetaUpdates}; use restate_storage_api::vqueue_table::{ EntryKey, EntryMetadata, EntryStatusHeader, EntryValue, LazyEntryStatus, ReadVQueueTable, ScanVQueueTable, Stage, Status, WriteVQueueTable, stats::EntryStatistics, }; +use restate_storage_api::vqueue_table::{ScanVQueueInboxStages, ScanVQueueMetaTable}; use restate_types::sharding::{KeyRange, PartitionKey}; use restate_types::vqueues::{EntryId, Seq, VQueueId}; @@ -474,6 +475,118 @@ impl ScanVQueueMetaTable for PartitionStore { } } +fn stage_from_key_kind(key_kind: KeyKind) -> Option { + match key_kind { + KeyKind::VQueueInboxStage => Some(Stage::Inbox), + KeyKind::VQueueRunningStage => Some(Stage::Running), + KeyKind::VQueueSuspendedStage => 
Some(Stage::Suspended), + KeyKind::VQueuePausedStage => Some(Stage::Paused), + KeyKind::VQueueFinishedStage => Some(Stage::Finished), + _ => None, + } +} + +fn scan_vqueue_inbox_stage<'store, K, F>( + partition_store: &'store PartitionStore, + scanner_name: &'static str, + range: KeyRange, + stage: Stage, + mut f: F, +) -> Result> + Send + 'store>>> +where + K: EncodeTableKeyPrefix + 'store, + F: for<'row> FnMut( + (&'row VQueueId, Stage, &'row EntryKey, &'row EntryValue), + ) -> std::ops::ControlFlow<()> + + Send + + Sync + + 'static, +{ + let future = partition_store + .iterator_for_each( + scanner_name, + Priority::Low, + TableScan::FullScanPartitionKeyRange::(range), + move |(mut key, mut value)| { + let key_kind = break_on_err(KeyKind::deserialize(&mut key))?; + let key_stage = break_on_err( + stage_from_key_kind(key_kind).ok_or(StorageError::DataIntegrityError), + )?; + if key_stage != stage { + return std::ops::ControlFlow::Break(Err(StorageError::DataIntegrityError)); + } + + let qid: VQueueId = break_on_err(crate::keys::deserialize(&mut key))?; + let entry_key: EntryKey = break_on_err(crate::keys::deserialize(&mut key))?; + let entry = break_on_err( + EntryValue::decode(&mut value).map_err(StorageError::BilrostDecode), + )?; + + f((&qid, stage, &entry_key, &entry)).map_break(Ok) + }, + ) + .map_err(|_| StorageError::OperationalError)?; + + Ok(Box::pin(future)) +} + +impl ScanVQueueInboxStages for PartitionStore { + fn for_each_vqueue_inbox_entry< + F: for<'a> FnMut( + (&'a VQueueId, Stage, &'a EntryKey, &'a EntryValue), + ) -> std::ops::ControlFlow<()> + + Send + + Sync + + 'static, + >( + &self, + range: KeyRange, + stage: Stage, + f: F, + ) -> Result> + Send> { + match stage { + Stage::Unknown => Err(StorageError::Generic(anyhow::anyhow!( + "Unknown stage can't be scanned" + ))), + Stage::Inbox => scan_vqueue_inbox_stage::( + self, + "df-vqueue-inbox", + range, + stage, + f, + ), + Stage::Running => scan_vqueue_inbox_stage::( + self, + 
"df-vqueue-running", + range, + stage, + f, + ), + Stage::Suspended => scan_vqueue_inbox_stage::( + self, + "df-vqueue-suspended", + range, + stage, + f, + ), + Stage::Paused => scan_vqueue_inbox_stage::( + self, + "df-vqueue-paused", + range, + stage, + f, + ), + Stage::Finished => scan_vqueue_inbox_stage::( + self, + "df-vqueue-finished", + range, + stage, + f, + ), + } + } +} + // ## Safety // The iterator is guaranteed to be dropped before the database is dropped, we hold to the // PartitionDb in this struct for as long as the iterator is alive. diff --git a/crates/partition-store/src/vqueue_table/reader.rs b/crates/partition-store/src/vqueue_table/reader.rs index 99a1870f0a..ab8bfface0 100644 --- a/crates/partition-store/src/vqueue_table/reader.rs +++ b/crates/partition-store/src/vqueue_table/reader.rs @@ -8,7 +8,7 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. -use restate_storage_api::vqueue_table::VQueueStore; +use restate_storage_api::vqueue_table::{Options, VQueueStore}; use restate_types::vqueues::VQueueId; use crate::PartitionDb; @@ -24,7 +24,7 @@ impl VQueueStore for PartitionDb { VQueueRunningReader::new(self, qid) } - fn new_inbox_reader(&self, qid: &VQueueId) -> Self::InboxReader { - VQueueWaitingReader::new(self, qid) + fn new_inbox_reader(&self, qid: &VQueueId, opts: Options) -> Self::InboxReader { + VQueueWaitingReader::new(self, qid, opts) } } diff --git a/crates/partition-store/src/vqueue_table/running_reader.rs b/crates/partition-store/src/vqueue_table/running_reader.rs index cc21bfba67..f8d44aa5a5 100644 --- a/crates/partition-store/src/vqueue_table/running_reader.rs +++ b/crates/partition-store/src/vqueue_table/running_reader.rs @@ -13,7 +13,7 @@ use bilrost::OwnedMessage; use rocksdb::DBRawIteratorWithThreadMode; use restate_storage_api::StorageError; -use restate_storage_api::vqueue_table::{EntryKey, EntryValue, VQueueCursor}; +use 
restate_storage_api::vqueue_table::{EntryKey, EntryValue, VQueueRunningCursor}; use restate_types::vqueues::VQueueId; use crate::PartitionDb; @@ -60,57 +60,15 @@ impl VQueueRunningReader { } } -impl VQueueCursor for VQueueRunningReader { +impl VQueueRunningCursor for VQueueRunningReader { fn seek_to_first(&mut self) { self.it.seek_to_first(); } - fn seek_after(&mut self, _qid: &VQueueId, _key: &EntryKey) { - panic!("seek_after is not supported for running snapshot reader"); - } - fn advance(&mut self) { self.it.next(); } - /// Returns the current key under cursor - fn current_key(&mut self) -> Result, StorageError> { - if let Some(key) = self.it.key() { - debug_assert_eq!(key.len(), RunningKey::serialized_length_fixed()); - - // The portion we are interested in is everything that represents the EntryKey - let entry_key = - ::decode(&mut &key[RunningKey::offset_of_entry_key()..])?; - Ok(Some(entry_key)) - } else { - // we reached the end (or an error). We cannot recover from this without seek. - // todo: add support for iterator refresh(). - self.it - .status() - .context("peek into vqueue snapshot iterator") - .map_err(StorageError::Generic)?; - // iterator is beyond the end, we can't peek - Ok(None) - } - } - - /// Returns the current value under cursor - fn current_value(&mut self) -> Result, StorageError> { - if let Some(mut value) = self.it.value() { - let value = EntryValue::decode(&mut value)?; - Ok(Some(value)) - } else { - // we reached the end (or an error). We cannot recover from this without seek. - // todo: add support for iterator refresh(). 
- self.it - .status() - .context("peek into vqueue snapshot iterator") - .map_err(StorageError::Generic)?; - // iterator is beyond the end, we can't peek - Ok(None) - } - } - fn peek(&mut self) -> Result, StorageError> { if let Some((key, mut value)) = self.it.item() { debug_assert_eq!(key.len(), RunningKey::serialized_length_fixed()); diff --git a/crates/partition-store/src/vqueue_table/waiting_reader.rs b/crates/partition-store/src/vqueue_table/waiting_reader.rs index 48a91b4a35..0f81d9f268 100644 --- a/crates/partition-store/src/vqueue_table/waiting_reader.rs +++ b/crates/partition-store/src/vqueue_table/waiting_reader.rs @@ -8,12 +8,16 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. -use anyhow::Context; +use std::sync::Arc; + use bilrost::OwnedMessage; use rocksdb::DBRawIteratorWithThreadMode; +use restate_rocksdb::RocksDb; use restate_storage_api::StorageError; -use restate_storage_api::vqueue_table::{EntryKey, EntryValue, Stage, VQueueCursor}; +use restate_storage_api::vqueue_table::{ + CursorError, EntryKey, EntryValue, Options, Stage, VQueueCursor, +}; use restate_types::vqueues::VQueueId; use crate::PartitionDb; @@ -22,20 +26,34 @@ use crate::vqueue_table::InboxKey; use crate::vqueue_table::inbox::InboxKeyRef; pub struct VQueueWaitingReader { + qid: VQueueId, it: DBRawIteratorWithThreadMode<'static, rocksdb::DB>, + // Safety: This must be dropped last since the iterator references memory allocated by it. + // This is only set to Some if the iterator is configured to run with blocking IO. The + // assumption is that blocking iterators will run in background threads, so we need to pin + // the database until the iterator is dropped. 
+ _db: Option>, } impl VQueueWaitingReader { - pub(crate) fn new(db: &PartitionDb, qid: &VQueueId) -> Self { + pub(crate) fn new(db: &PartitionDb, qid: &VQueueId, opts: Options) -> Self { let mut readopts = rocksdb::ReadOptions::default(); readopts.set_async_io(true); // this is not the place to be concerned about corruption, we favor speed // over safety for this particular use-case. readopts.set_verify_checksums(false); - readopts.set_tailing(true); + // We re-create this reader on every refill, so a fresh snapshot is what + // we want. A tailing iterator would see new writes but is unsafe across + // memtable flushes. // use prefix extractors for efficient filtering. readopts.set_prefix_same_as_start(true); + if opts.allow_blocking_io { + readopts.set_read_tier(rocksdb::ReadTier::All); + } else { + readopts.set_read_tier(rocksdb::ReadTier::BlockCache); + } + // we know how big the prefix is let mut key_buf = [0u8; InboxKey::by_qid_prefix_len()]; InboxKeyRef::builder() @@ -54,10 +72,14 @@ impl VQueueWaitingReader { .raw_iterator_cf_opt(db.cf_handle(), readopts); Self { + qid: qid.clone(), // Safety: // The iterator is guaranteed to be dropped before the database is dropped, we hold to the // PartitionDb in the scheduler which pins the database and the column family. + // + // We also pin the database if blocking IO is configured. 
it: unsafe { super::ignore_iterator_lifetime(it) }, + _db: opts.allow_blocking_io.then(|| db.rocksdb().clone()), } } } @@ -67,9 +89,8 @@ impl VQueueCursor for VQueueWaitingReader { self.it.seek_to_first(); } - fn seek_after(&mut self, qid: &VQueueId, key: &EntryKey) { - tracing::trace!("Seeking after {key:?}"); - let mut key_buf = super::inbox::encode_stage_key(Stage::Inbox, qid, key); + fn seek_after(&mut self, key: &EntryKey) { + let mut key_buf = super::inbox::encode_stage_key(Stage::Inbox, &self.qid, key); let success = crate::convert_to_upper_bound(&mut key_buf); debug_assert!(success); self.it.seek(key_buf); @@ -79,57 +100,20 @@ impl VQueueCursor for VQueueWaitingReader { self.it.next(); } - /// Returns the current key under cursor - fn current_key(&mut self) -> Result, StorageError> { - if let Some(key) = self.it.key() { - debug_assert_eq!(key.len(), InboxKey::serialized_length_fixed()); - - // The portion we are interested in is everything that represents the EntryKey - let entry_key = - ::decode(&mut &key[InboxKey::offset_of_entry_key()..])?; - Ok(Some(entry_key)) - } else { - // we reached the end (or an error). We cannot recover from this without seek. - // todo: add support for iterator refresh(). - self.it - .status() - .context("peek into vqueue snapshot iterator") - .map_err(StorageError::Generic)?; - // iterator is beyond the end, we can't peek - Ok(None) - } - } - /// Returns the current value under cursor - fn current_value(&mut self) -> Result, StorageError> { - if let Some(mut value) = self.it.value() { - let value = EntryValue::decode(&mut value)?; - Ok(Some(value)) - } else { - // we reached the end (or an error). We cannot recover from this without seek. - // todo: add support for iterator refresh(). 
- self.it - .status() - .context("peek into vqueue snapshot iterator") - .map_err(StorageError::Generic)?; - // iterator is beyond the end, we can't peek - Ok(None) - } - } - - fn peek(&mut self) -> Result, StorageError> { + fn peek(&mut self) -> Result, CursorError> { if let Some((key, mut value)) = self.it.item() { debug_assert_eq!(key.len(), InboxKey::serialized_length_fixed()); let entry_key = ::decode(&mut &key[InboxKey::offset_of_entry_key()..])?; - let value = EntryValue::decode(&mut value)?; + + let value = EntryValue::decode(&mut value).map_err(StorageError::BilrostDecode)?; Ok(Some((entry_key, value))) } else { - // We reached the end (or an error). We cannot recover from this without seek. - self.it - .status() - .context("peek into vqueue iterator") - .map_err(StorageError::Generic)?; + self.it.status().map_err(|err| match err.kind() { + rocksdb::ErrorKind::Incomplete => CursorError::WouldBlock, + _ => CursorError::Other(StorageError::Generic(err.into())), + })?; // iterator is beyond the end, we can't peek Ok(None) } diff --git a/crates/rocksdb/src/background.rs b/crates/rocksdb/src/background.rs index 5252543a07..6584d70fa0 100644 --- a/crates/rocksdb/src/background.rs +++ b/crates/rocksdb/src/background.rs @@ -64,7 +64,7 @@ where ) .increment(1); - let span = tracing::Span::current().clone(); + let span = tracing::Span::current(); move || span.in_scope(|| self.run()) } @@ -75,7 +75,7 @@ where OP_TYPE => self.kind.as_static_str(), ) .increment(1); - let span = tracing::Span::current().clone(); + let span = tracing::Span::current(); move || { span.in_scope(|| { diff --git a/crates/storage-api/src/journal_table_v2/mod.rs b/crates/storage-api/src/journal_table_v2/mod.rs index 6eb68de893..00233f01c4 100644 --- a/crates/storage-api/src/journal_table_v2/mod.rs +++ b/crates/storage-api/src/journal_table_v2/mod.rs @@ -113,14 +113,14 @@ pub trait WriteJournalTable { /// Related completion ids to this RawEntry, used to build the internal index fn 
put_journal_entry( &mut self, - invocation_id: InvocationId, + invocation_id: &InvocationId, index: EntryIndex, entry: &StoredRawEntry, related_completion_ids: &[CompletionId], ) -> Result<()>; /// When length is available, it is suggested to provide it as it makes the delete more efficient. - fn delete_journal(&mut self, invocation_id: InvocationId, length: EntryIndex) -> Result<()>; + fn delete_journal(&mut self, invocation_id: &InvocationId, length: EntryIndex) -> Result<()>; } #[derive(Debug, Clone)] diff --git a/crates/storage-api/src/vqueue_table/entry.rs b/crates/storage-api/src/vqueue_table/entry.rs index 4e7990f62f..e2b40475fb 100644 --- a/crates/storage-api/src/vqueue_table/entry.rs +++ b/crates/storage-api/src/vqueue_table/entry.rs @@ -205,7 +205,7 @@ impl<'a> From<&'a EntryMetadata> for EntryMetadataRef<'a> { pub struct EntryMetadata { // todo: This is temporary placeholder, type and name _will_ change. #[bilrost(tag(1))] - deployment: Option, + pub deployment: Option, } #[cfg(test)] diff --git a/crates/storage-api/src/vqueue_table/store.rs b/crates/storage-api/src/vqueue_table/store.rs index d882e6103f..eca01a30dd 100644 --- a/crates/storage-api/src/vqueue_table/store.rs +++ b/crates/storage-api/src/vqueue_table/store.rs @@ -10,30 +10,54 @@ use restate_types::vqueues::VQueueId; +use crate::StorageError; + use super::{EntryKey, EntryValue}; -use crate::Result; + +pub struct Options { + /// Allows blocking IO when we need to read from the storage; + /// operations that block should be performed on IO threads. + /// + /// When set to false, seek and read operations may return `WouldBlock` if + /// the operation cannot be performed without blocking. + pub allow_blocking_io: bool, +} /// Storage for vqueue heads (e.g., RocksDB `ready_idx`). 
pub trait VQueueStore { - type RunningReader: VQueueCursor + Send + Unpin; - type InboxReader: VQueueCursor + Send + Unpin; + type RunningReader: VQueueRunningCursor + Send + Unpin; + type InboxReader: VQueueCursor + Send + Unpin + 'static; fn new_run_reader(&self, qid: &VQueueId) -> Self::RunningReader; - fn new_inbox_reader(&self, qid: &VQueueId) -> Self::InboxReader; + fn new_inbox_reader(&self, qid: &VQueueId, opts: Options) -> Self::InboxReader; } -/// Iterator over vqueue entries +/// Iterator over "waiting inbox" vqueue entries pub trait VQueueCursor { /// Moves the cursor to the beginning (min key) of the vqueue fn seek_to_first(&mut self); /// Moves the cursor to point strictly after `item` - fn seek_after(&mut self, qid: &VQueueId, item: &EntryKey); - /// Returns the current key under cursor - fn current_key(&mut self) -> Result>; - /// Returns the current value under cursor - fn current_value(&mut self) -> Result>; + fn seek_after(&mut self, item: &EntryKey); + /// Advancing the cursor. + fn advance(&mut self); /// Peek item without advancing the cursor - fn peek(&mut self) -> Result>; + fn peek(&mut self) -> Result, CursorError>; +} + +/// Iterator over already running vqueue entries +pub trait VQueueRunningCursor { + /// Moves the cursor to the beginning (min key) of the vqueue + fn seek_to_first(&mut self); + /// Peek item without advancing the cursor + fn peek(&mut self) -> Result, StorageError>; /// Advancing the cursor. 
 If this fails, the error is returned on the next call to peek() fn advance(&mut self); } + +#[derive(Debug, thiserror::Error)] +pub enum CursorError { + #[error("operation cannot be completed without blocking this thread")] + WouldBlock, + #[error(transparent)] + Other(#[from] StorageError), +} diff --git a/crates/storage-api/src/vqueue_table/tables.rs b/crates/storage-api/src/vqueue_table/tables.rs index 87142d2bfb..4984c07164 100644 --- a/crates/storage-api/src/vqueue_table/tables.rs +++ b/crates/storage-api/src/vqueue_table/tables.rs @@ -221,3 +221,20 @@ pub trait ScanVQueueMetaTable { f: F, ) -> Result> + Send>; } + +pub trait ScanVQueueInboxStages { + /// Used for data-fusion queries + fn for_each_vqueue_inbox_entry< + F: for<'a> FnMut( + (&'a VQueueId, Stage, &'a EntryKey, &'a EntryValue), + ) -> std::ops::ControlFlow<()> + + Send + + Sync + + 'static, + >( + &self, + range: KeyRange, + stage: Stage, + f: F, + ) -> Result> + Send>; +} diff --git a/crates/storage-query-datafusion/src/context.rs b/crates/storage-query-datafusion/src/context.rs index 162a9fff55..a85b830f30 100644 --- a/crates/storage-query-datafusion/src/context.rs +++ b/crates/storage-query-datafusion/src/context.rs @@ -282,6 +282,12 @@ where self.partition_store_manager.clone(), &self.remote_scanner_manager, )?; + crate::vqueues::register_self( + ctx, + self.partition_selector.clone(), + self.partition_store_manager.clone(), + &self.remote_scanner_manager, + )?; ctx.datafusion_context.sql(SYS_INVOCATION_VIEW).await?; diff --git a/crates/storage-query-datafusion/src/filter.rs b/crates/storage-query-datafusion/src/filter.rs index eb2b740172..fe65313c7e 100644 --- a/crates/storage-query-datafusion/src/filter.rs +++ b/crates/storage-query-datafusion/src/filter.rs @@ -22,6 +22,7 @@ use datafusion::physical_expr_common::physical_expr::snapshot_physical_expr; use datafusion::physical_plan::PhysicalExpr; use datafusion::physical_plan::expressions::{BinaryExpr, Column, InListExpr, Literal}; +use 
restate_storage_api::vqueue_table::Stage; use restate_types::PartitionedResourceId; use restate_types::identifiers::partitioner::HashPartitioner; use restate_types::identifiers::{ @@ -282,6 +283,78 @@ fn extract_column_literal<'a>( Some((col, lit)) } +#[derive(Debug, Clone)] +pub struct VQueueFilter { + pub partition_keys: KeyRange, + pub stages: Option>, +} + +impl ScanLocalPartitionFilter for VQueueFilter { + fn new(range: KeyRange, predicate: Option>) -> Self { + let mut stages: Option> = None; + + if let Some(predicate) = predicate + && let Ok(predicate) = snapshot_physical_expr(predicate) + { + for conjunct in split_conjunction(&predicate) { + let Some(conjunct_stages) = parse_vqueue_stages("stage", conjunct) else { + continue; + }; + + stages = Some(match stages { + Some(current) => current.intersection(&conjunct_stages).copied().collect(), + None => conjunct_stages, + }); + } + } + + Self { + partition_keys: range, + stages, + } + } +} + +fn parse_vqueue_stages( + column_name: &str, + predicate: &Arc, +) -> Option> { + // OR-chain recursion depth; enough for `stage = a OR stage = b OR ...` over all stages. 
+ let in_list = InList::parse(predicate, 5)?; + + if in_list.col.name() != column_name || in_list.negated { + return None; + } + + let mut stages = BTreeSet::new(); + for literal in in_list.list { + let Some(Some(stage_str)) = literal.try_as_str() else { + continue; + }; + + if let Some(stage) = parse_stage_literal(stage_str) { + stages.insert(stage); + } + } + + if stages.is_empty() { + None + } else { + Some(stages) + } +} + +fn parse_stage_literal(value: &str) -> Option { + match value.to_ascii_lowercase().as_str() { + "inbox" => Some(Stage::Inbox), + "run" | "running" => Some(Stage::Running), + "suspended" => Some(Stage::Suspended), + "paused" => Some(Stage::Paused), + "finished" => Some(Stage::Finished), + _ => None, + } +} + #[derive(Debug, Clone)] pub struct InvocationIdFilter { pub partition_keys: KeyRange, @@ -347,12 +420,13 @@ mod tests { use datafusion::physical_plan::PhysicalExpr; use datafusion::physical_plan::expressions::{BinaryExpr, Column, InListExpr, Literal}; + use restate_storage_api::vqueue_table::Stage; use restate_types::identifiers::{InvocationId, ServiceId, StateMutationId, WithPartitionKey}; use restate_types::invocation::{InvocationTarget, VirtualObjectHandlerType}; use restate_types::sharding::KeyRange; use crate::filter::{ - FirstMatchingPartitionKeyExtractor, InvocationIdFilter, PartitionKeyExtractor, + FirstMatchingPartitionKeyExtractor, InvocationIdFilter, PartitionKeyExtractor, VQueueFilter, }; use crate::partition_store_scanner::ScanLocalPartitionFilter; @@ -716,4 +790,80 @@ mod tests { let filter = InvocationIdFilter::new(FULL_RANGE, Some(predicate)); assert!(filter.invocation_ids.is_none()); } + + #[test] + fn vqueue_filter_single_stage_eq() { + let predicate = eq(col("stage"), utf8_lit("running")); + + let filter = VQueueFilter::new(FULL_RANGE, Some(predicate)); + assert_eq!( + filter.stages, + Some(std::collections::BTreeSet::from([Stage::Running])) + ); + } + + #[test] + fn vqueue_filter_in_list() { + let predicate = 
in_list("stage", vec![utf8_lit("running"), utf8_lit("paused")]); + + let filter = VQueueFilter::new(FULL_RANGE, Some(predicate)); + assert_eq!( + filter.stages, + Some(std::collections::BTreeSet::from([ + Stage::Running, + Stage::Paused, + ])) + ); + } + + #[test] + fn vqueue_filter_or_expression() { + let predicate = or( + eq(col("stage"), utf8_lit("finished")), + eq(col("stage"), utf8_lit("inbox")), + ); + + let filter = VQueueFilter::new(FULL_RANGE, Some(predicate)); + assert_eq!( + filter.stages, + Some(std::collections::BTreeSet::from([ + Stage::Finished, + Stage::Inbox, + ])) + ); + } + + #[test] + fn vqueue_filter_conjunction_intersection() { + let predicate = and( + in_list( + "stage", + vec![utf8_lit("running"), utf8_lit("paused"), utf8_lit("inbox")], + ), + eq(col("stage"), utf8_lit("paused")), + ); + + let filter = VQueueFilter::new(FULL_RANGE, Some(predicate)); + assert_eq!( + filter.stages, + Some(std::collections::BTreeSet::from([Stage::Paused])) + ); + } + + #[test] + fn vqueue_filter_invalid_stage_falls_back() { + let predicate = eq(col("stage"), utf8_lit("not-a-stage")); + + let filter = VQueueFilter::new(FULL_RANGE, Some(predicate)); + assert!(filter.stages.is_none()); + assert_eq!(filter.partition_keys, FULL_RANGE); + } + + #[test] + fn vqueue_filter_no_predicate() { + let filter = VQueueFilter::new(FULL_RANGE, None); + + assert!(filter.stages.is_none()); + assert_eq!(filter.partition_keys, FULL_RANGE); + } } diff --git a/crates/storage-query-datafusion/src/lib.rs b/crates/storage-query-datafusion/src/lib.rs index 139185dd1f..cd0cd69124 100644 --- a/crates/storage-query-datafusion/src/lib.rs +++ b/crates/storage-query-datafusion/src/lib.rs @@ -41,6 +41,7 @@ mod table_macro; mod table_providers; mod user_limits; mod vqueue_meta; +mod vqueues; pub use table_providers::Scan; pub mod table_util; diff --git a/crates/storage-query-datafusion/src/table_docs.rs b/crates/storage-query-datafusion/src/table_docs.rs index 12c68c198d..7468e484b6 100644 --- 
a/crates/storage-query-datafusion/src/table_docs.rs +++ b/crates/storage-query-datafusion/src/table_docs.rs @@ -10,7 +10,7 @@ use crate::{ deployment, inbox, invocation_state, invocation_status, journal, journal_events, - keyed_service_status, promise, scheduler_status, service, state, vqueue_meta, + keyed_service_status, promise, scheduler_status, service, state, vqueue_meta, vqueues, }; use std::borrow::Cow; @@ -28,6 +28,7 @@ pub const ALL_TABLE_DOCS: &[StaticTableDocs] = &[ service::schema::TABLE_DOCS, state::schema::TABLE_DOCS, vqueue_meta::schema::TABLE_DOCS, + vqueues::schema::TABLE_DOCS, ]; pub trait TableDocs { diff --git a/crates/storage-query-datafusion/src/vqueues/mod.rs b/crates/storage-query-datafusion/src/vqueues/mod.rs new file mode 100644 index 0000000000..9e16f0e6d3 --- /dev/null +++ b/crates/storage-query-datafusion/src/vqueues/mod.rs @@ -0,0 +1,18 @@ +// Copyright (c) 2023 - 2026 Restate Software, Inc., Restate GmbH. +// All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +mod row; +pub(crate) mod schema; +mod table; + +pub(crate) use table::register_self; + +#[cfg(test)] +mod tests; diff --git a/crates/storage-query-datafusion/src/vqueues/row.rs b/crates/storage-query-datafusion/src/vqueues/row.rs new file mode 100644 index 0000000000..8b357e9295 --- /dev/null +++ b/crates/storage-query-datafusion/src/vqueues/row.rs @@ -0,0 +1,101 @@ +// Copyright (c) 2023 - 2026 Restate Software, Inc., Restate GmbH. +// All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. 
+// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use restate_storage_api::vqueue_table::{EntryKey, EntryKind, EntryValue, Stage}; +use restate_types::vqueues::VQueueId; + +use super::schema::SysVqueuesBuilder; + +#[inline] +pub(crate) fn append_vqueues_row<'a>( + builder: &mut SysVqueuesBuilder, + qid: &'a VQueueId, + stage: Stage, + entry_key: &'a EntryKey, + entry: &'a EntryValue, +) { + let mut row = builder.row(); + + row.partition_key(qid.partition_key()); + if row.is_id_defined() { + row.fmt_id(qid); + } + if row.is_stage_defined() { + row.fmt_stage(stage); + } + if row.is_status_defined() { + row.fmt_status(entry.status); + } + + if row.is_has_lock_defined() { + row.has_lock(entry_key.has_lock()); + } + if matches!(stage, Stage::Inbox) && row.is_run_at_defined() { + row.run_at(entry_key.run_at().as_unix_millis().as_u64() as i64); + } + if row.is_sequence_number_defined() { + row.sequence_number(entry_key.seq().as_u64()); + } + + if row.is_entry_id_defined() { + row.fmt_entry_id(entry_key.entry_id().display(qid.partition_key())); + } + + if row.is_entry_kind_defined() { + row.entry_kind(match entry_key.kind() { + EntryKind::Invocation => "invocation", + EntryKind::StateMutation => "state-mutation", + EntryKind::Unknown => "unknown", + }); + } + + if row.is_created_at_defined() { + row.created_at(entry.stats.created_at.to_unix_millis().as_u64() as i64); + } + + if row.is_transitioned_at_defined() { + row.transitioned_at(entry.stats.transitioned_at.to_unix_millis().as_u64() as i64); + } + + if row.is_num_attempts_defined() { + row.num_attempts(entry.stats.num_attempts); + } + if row.is_num_pauses_defined() { + row.num_pauses(entry.stats.num_paused); + } + if row.is_num_suspensions_defined() { + row.num_suspensions(entry.stats.num_suspensions); + } + if row.is_num_yields_defined() { + row.num_yields(entry.stats.num_yields); + 
} + + if row.is_first_attempt_at_defined() + && let Some(first_attempt_at) = entry.stats.first_attempt_at + { + row.first_attempt_at(first_attempt_at.to_unix_millis().as_u64() as i64); + } + + if row.is_latest_attempt_at_defined() + && let Some(latest_attempt_at) = entry.stats.latest_attempt_at + { + row.latest_attempt_at(latest_attempt_at.to_unix_millis().as_u64() as i64); + } + + if row.is_first_runnable_at_defined() { + row.first_runnable_at(entry.stats.first_runnable_at.as_u64() as i64); + } + + if row.is_deployment_defined() + && let Some(deployment) = &entry.metadata.deployment + { + row.fmt_deployment(deployment); + } +} diff --git a/crates/storage-query-datafusion/src/vqueues/schema.rs b/crates/storage-query-datafusion/src/vqueues/schema.rs new file mode 100644 index 0000000000..2bb8452c5b --- /dev/null +++ b/crates/storage-query-datafusion/src/vqueues/schema.rs @@ -0,0 +1,79 @@ +// Copyright (c) 2023 - 2026 Restate Software, Inc., Restate GmbH. +// All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use crate::table_macro::*; + +use datafusion::arrow::datatypes::DataType; + +// Stage scans run concurrently and merge into a shared builder, so only +// `partition_key` is monotone within a single partition's output stream. +define_sort_order!(sys_vqueues(partition_key)); + +define_table!(sys_vqueues( + /// Internal column that is used for partitioning. Can be ignored. + partition_key: DataType::UInt64, + + /// The VQueue Identifier (vq_...). + id: DataType::Utf8, + + /// The stage this entry currently belongs to. Choices are 'inbox', 'running', 'paused', + /// 'suspended', and 'finished'. + stage: DataType::Utf8, + + /// The entry processing status. 
Examples are `new`, `scheduled`, `running`, + /// `backing_off`, `yielded`, `suspended`, `failed`, and `succeeded`. + status: DataType::Utf8, + + /// Whether this entry currently holds a lock. + has_lock: DataType::Boolean, + + /// The entry will be eligible to run after this timestamp. Only present for entries + /// that are in the waiting inbox. + run_at: TimestampMillisecond, + + /// Sequence number encoded in the queue ordering key. + sequence_number: DataType::UInt64, + + /// Identifier of the entry. + entry_id: DataType::Utf8, + + /// Entry kind (`invocation` or `state-mutation`). + entry_kind: DataType::Utf8, + + /// Creation timestamp of the entry. + created_at: TimestampMillisecond, + + /// Timestamp of the latest stage transition. + transitioned_at: TimestampMillisecond, + + /// Number of times this entry has been moved to the run queue. + num_attempts: DataType::UInt32, + + /// Number of times this entry has been moved to the paused stage. + num_pauses: DataType::UInt32, + + /// Number of times this entry has been moved to the suspended stage. + num_suspensions: DataType::UInt32, + + /// Number of times this entry has yielded execution. + num_yields: DataType::UInt32, + + /// Timestamp of the first attempt to run this entry. + first_attempt_at: TimestampMillisecond, + + /// Timestamp of the latest attempt to run this entry. + latest_attempt_at: TimestampMillisecond, + + /// The realistic earliest time at which this entry can run its first attempt. + first_runnable_at: TimestampMillisecond, + + /// If set, the entry's pinned deployment identifier. + deployment: DataType::Utf8, +)); diff --git a/crates/storage-query-datafusion/src/vqueues/table.rs b/crates/storage-query-datafusion/src/vqueues/table.rs new file mode 100644 index 0000000000..a23177a1d4 --- /dev/null +++ b/crates/storage-query-datafusion/src/vqueues/table.rs @@ -0,0 +1,159 @@ +// Copyright (c) 2023 - 2026 Restate Software, Inc., Restate GmbH. +// All rights reserved. 
+// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use std::fmt::Debug; +use std::ops::ControlFlow; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; + +use futures::future::try_join_all; +use parking_lot::Mutex; + +use restate_partition_store::{PartitionStore, PartitionStoreManager}; +use restate_storage_api::StorageError; +use restate_storage_api::vqueue_table::{EntryKey, EntryValue, ScanVQueueInboxStages, Stage}; +use restate_types::identifiers::StateMutationId; +use restate_types::vqueues::VQueueId; + +use crate::context::{QueryContext, SelectPartitions}; +use crate::filter::{FirstMatchingPartitionKeyExtractor, VQueueFilter}; +use crate::partition_store_scanner::{LocalPartitionsScanner, ScanLocalPartition}; +use crate::remote_query_scanner_manager::RemoteScannerManager; +use crate::table_providers::{PartitionedTableProvider, ScanPartition}; +use crate::vqueues::row::append_vqueues_row; +use crate::vqueues::schema::{SysVqueuesBuilder, sys_vqueues_sort_order}; + +const NAME: &str = "sys_vqueues"; + +pub(crate) fn register_self( + ctx: &QueryContext, + partition_selector: impl SelectPartitions, + partition_store_manager: Arc, + remote_scanner_manager: &RemoteScannerManager, +) -> datafusion::common::Result<()> { + let local_scanner = Arc::new(LocalPartitionsScanner::new( + partition_store_manager, + VQueuesScanner, + )) as Arc; + + let table = PartitionedTableProvider::new( + partition_selector, + SysVqueuesBuilder::schema(), + sys_vqueues_sort_order(), + remote_scanner_manager.create_distributed_scanner(NAME, local_scanner), + FirstMatchingPartitionKeyExtractor::default() + .with_partitioned_resource_id::("id") + // entry_id can be an invocation id or state mutation id + .with_invocation_id("entry_id") + 
.with_partitioned_resource_id::("entry_id"), + ); + + ctx.register_partitioned_table(NAME, Arc::new(table)) +} + +#[derive(Debug, Clone)] +struct VQueuesScanner; + +const SCAN_STAGE_ORDER: [Stage; 5] = [ + Stage::Paused, + Stage::Suspended, + Stage::Running, + Stage::Inbox, + Stage::Finished, +]; + +/// Bound on in-flight stage scans per partition scan. Keeps RocksDB iterator +/// pressure and the shared builder's `Mutex` contention in check while still +/// overlapping I/O across stages. +const MAX_CONCURRENT_STAGE_SCANS: usize = 2; + +impl ScanLocalPartition for VQueuesScanner { + type Builder = SysVqueuesBuilder; + type Item<'a> = (&'a VQueueId, Stage, &'a EntryKey, &'a EntryValue); + type ConversionError = std::convert::Infallible; + type Filter = VQueueFilter; + + fn for_each_row< + F: for<'a> FnMut(Self::Item<'a>) -> ControlFlow> + + Send + + Sync + + 'static, + >( + partition_store: &PartitionStore, + filter: VQueueFilter, + f: F, + ) -> Result> + Send, StorageError> { + let range = filter.partition_keys; + let stages: Vec<_> = if let Some(requested_stages) = filter.stages { + SCAN_STAGE_ORDER + .into_iter() + .filter(|stage| requested_stages.contains(stage)) + .collect() + } else { + SCAN_STAGE_ORDER.to_vec() + }; + + let callback = Arc::new(Mutex::new(f)); + let should_stop = Arc::new(AtomicBool::new(false)); + + Ok(async move { + let scan_result = async { + for stage_batch in stages.chunks(MAX_CONCURRENT_STAGE_SCANS) { + if should_stop.load(Ordering::Relaxed) { + break; + } + + let mut scans = Vec::with_capacity(stage_batch.len()); + for stage in stage_batch { + let callback = callback.clone(); + let should_stop = should_stop.clone(); + + scans.push(partition_store.for_each_vqueue_inbox_entry( + range, + *stage, + move |item| { + if should_stop.load(Ordering::Relaxed) { + return ControlFlow::Break(()); + } + + let mut callback = callback.lock(); + let control_flow = (*callback)(item); + if control_flow.is_break() { + should_stop.store(true, 
Ordering::Relaxed); + } + control_flow.map_break(Result::unwrap) + }, + )?); + } + + try_join_all(scans).await?; + } + + Ok(()) + } + .await; + + // The callback can own types with blocking cleanup logic (e.g. BatchSender). + // Drop it off the async runtime worker thread. + let _ = std::thread::spawn(move || drop(callback)).join(); + + scan_result + }) + } + + fn append_row<'a>( + row_builder: &mut Self::Builder, + value: Self::Item<'a>, + ) -> Result<(), Self::ConversionError> { + let (qid, stage, entry_key, entry) = value; + append_vqueues_row(row_builder, qid, stage, entry_key, entry); + Ok(()) + } +} diff --git a/crates/storage-query-datafusion/src/vqueues/tests.rs b/crates/storage-query-datafusion/src/vqueues/tests.rs new file mode 100644 index 0000000000..4192937860 --- /dev/null +++ b/crates/storage-query-datafusion/src/vqueues/tests.rs @@ -0,0 +1,192 @@ +// Copyright (c) 2023 - 2026 Restate Software, Inc., Restate GmbH. +// All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use crate::mocks::*; +use crate::row; + +use datafusion::arrow::array::{StringArray, TimestampMillisecondArray, UInt32Array}; +use datafusion::arrow::record_batch::RecordBatch; +use futures::StreamExt; +use googletest::all; +use googletest::prelude::{assert_that, eq}; + +use restate_storage_api::Transaction; +use restate_storage_api::vqueue_table::{ + EntryId, EntryKey, EntryKind, EntryMetadata, EntryValue, Stage, Status, WriteVQueueTable, + stats::EntryStatistics, +}; +use restate_types::clock::UniqueTimestamp; +use restate_types::time::MillisSinceEpoch; +use restate_types::vqueues::VQueueId; + +#[restate_core::test(flavor = "multi_thread", worker_threads = 2)] +async fn get_vqueue_entry_value_fields() { + let mut engine = MockQueryEngine::create().await; + + let qid = VQueueId::custom(1337, "df-vqueue"); + let key = EntryKey::new( + true, + MillisSinceEpoch::new(1_744_000_001_000), + 7, + EntryId::new(EntryKind::Invocation, [7; 16]), + ); + + let created_at = + UniqueTimestamp::try_from_unix_millis(MillisSinceEpoch::new(1_744_000_000_100)).unwrap(); + let transitioned_at = + UniqueTimestamp::try_from_unix_millis(MillisSinceEpoch::new(1_744_000_000_300)).unwrap(); + let first_attempt_at = + UniqueTimestamp::try_from_unix_millis(MillisSinceEpoch::new(1_744_000_000_400)).unwrap(); + let latest_attempt_at = + UniqueTimestamp::try_from_unix_millis(MillisSinceEpoch::new(1_744_000_000_500)).unwrap(); + + let mut stats = EntryStatistics::new(created_at, key.run_at().to_owned()); + stats.transitioned_at = transitioned_at; + stats.num_attempts = 3; + stats.num_paused = 2; + stats.num_suspensions = 4; + stats.num_yields = 5; + stats.first_attempt_at = Some(first_attempt_at); + stats.latest_attempt_at = Some(latest_attempt_at); + + let value = EntryValue { + status: Status::BackingOff, + metadata: EntryMetadata { + deployment: Some("dp_123".to_string()), + }, + stats, + }; + + // This row should be returned. 
+ let mut tx = engine.partition_store().transaction(); + tx.put_vqueue_inbox(&qid, Stage::Inbox, &key, &value); + + // This row should only be returned when stage filtering selects it. + tx.put_vqueue_inbox(&qid, Stage::Running, &key, &value); + tx.commit().await.unwrap(); + + let records = engine + .execute( + "SELECT stage, status, num_attempts, num_pauses, num_suspensions, num_yields, \ + created_at, transitioned_at, first_attempt_at, latest_attempt_at, first_runnable_at, \ + run_at, deployment FROM sys_vqueues WHERE stage = 'inbox' ORDER BY sequence_number", + ) + .await + .unwrap() + .stream + .collect::>>() + .await + .remove(0) + .unwrap(); + + assert_eq!(records.num_rows(), 1); + + assert_that!( + records, + all!(row!( + 0, + { + "stage" => StringArray: eq("inbox"), + "status" => StringArray: eq("backing_off"), + "num_attempts" => UInt32Array: eq(3), + "num_pauses" => UInt32Array: eq(2), + "num_suspensions" => UInt32Array: eq(4), + "num_yields" => UInt32Array: eq(5), + "created_at" => TimestampMillisecondArray: eq(created_at.to_unix_millis().as_u64() as i64), + "transitioned_at" => TimestampMillisecondArray: eq(transitioned_at.to_unix_millis().as_u64() as i64), + "first_attempt_at" => TimestampMillisecondArray: eq(first_attempt_at.to_unix_millis().as_u64() as i64), + "latest_attempt_at" => TimestampMillisecondArray: eq(latest_attempt_at.to_unix_millis().as_u64() as i64), + "first_runnable_at" => TimestampMillisecondArray: eq(value.stats.first_runnable_at.as_u64() as i64), + "run_at" => TimestampMillisecondArray: eq(key.run_at().as_unix_millis().as_u64() as i64), + "deployment" => StringArray: eq("dp_123"), + } + )) + ); +} + +#[restate_core::test(flavor = "multi_thread", worker_threads = 2)] +async fn vqueue_stage_filter_and_unfiltered_scan() { + let mut engine = MockQueryEngine::create().await; + + let qid = VQueueId::custom(2337, "df-vqueue-stages"); + + let mut tx = engine.partition_store().transaction(); + let stages = [ + Stage::Inbox, + 
Stage::Running, + Stage::Suspended, + Stage::Paused, + Stage::Finished, + ]; + for (index, stage) in stages.into_iter().enumerate() { + let key = EntryKey::new( + false, + MillisSinceEpoch::new(1_744_001_000_000 + index as u64), + (index + 1) as u64, + EntryId::new(EntryKind::Invocation, [index as u8 + 1; 16]), + ); + let value = EntryValue { + status: Status::Started, + metadata: EntryMetadata::default(), + stats: EntryStatistics::new( + UniqueTimestamp::try_from_unix_millis(MillisSinceEpoch::new( + 1_744_001_000_100 + index as u64, + )) + .unwrap(), + key.run_at().to_owned(), + ), + }; + tx.put_vqueue_inbox(&qid, stage, &key, &value); + } + tx.commit().await.unwrap(); + + let all_stages = engine + .execute("SELECT stage FROM sys_vqueues ORDER BY stage") + .await + .unwrap() + .stream + .collect::>>() + .await + .remove(0) + .unwrap(); + + assert_eq!(all_stages.num_rows(), 5); + assert_that!( + all_stages, + all!( + row!(0, { "stage" => StringArray: eq("finished") }), + row!(1, { "stage" => StringArray: eq("inbox") }), + row!(2, { "stage" => StringArray: eq("paused") }), + row!(3, { "stage" => StringArray: eq("running") }), + row!(4, { "stage" => StringArray: eq("suspended") }) + ) + ); + + let filtered_stages = engine + .execute( + "SELECT stage FROM sys_vqueues WHERE stage IN ('running', 'paused') ORDER BY stage", + ) + .await + .unwrap() + .stream + .collect::>>() + .await + .remove(0) + .unwrap(); + + assert_eq!(filtered_stages.num_rows(), 2); + assert_that!( + filtered_stages, + all!( + row!(0, { "stage" => StringArray: eq("paused") }), + row!(1, { "stage" => StringArray: eq("running") }) + ) + ); +} diff --git a/crates/vqueues/Cargo.toml b/crates/vqueues/Cargo.toml index 6ea4b67b02..7394eb4291 100644 --- a/crates/vqueues/Cargo.toml +++ b/crates/vqueues/Cargo.toml @@ -18,9 +18,11 @@ restate-clock = { workspace = true } restate-futures-util = { workspace = true } restate-limiter = { workspace = true } restate-memory = { workspace = true } +restate-platform = { 
workspace = true } restate-rocksdb = { workspace = true } restate-serde-util = { workspace = true } restate-storage-api = { workspace = true } +restate-time-util = { workspace = true } restate-types = { workspace = true } restate-util-string = { workspace = true } restate-worker-api = { workspace = true } diff --git a/crates/vqueues/src/cache.rs b/crates/vqueues/src/cache.rs index e90ea943e1..fcb8139300 100644 --- a/crates/vqueues/src/cache.rs +++ b/crates/vqueues/src/cache.rs @@ -8,19 +8,20 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. -use hashbrown::{HashMap, hash_map}; use slotmap::SlotMap; use tokio::task::JoinHandle; use tracing::{debug, trace}; +use restate_platform::hash::HashMap; use restate_storage_api::StorageError; use restate_storage_api::vqueue_table::metadata::{self, VQueueMeta}; use restate_storage_api::vqueue_table::{ReadVQueueTable, ScanVQueueTable}; +use restate_types::sharding::PartitionKey; use restate_types::vqueues::VQueueId; type Result = std::result::Result; -slotmap::new_key_type! { pub struct VQueueCacheKey; } +slotmap::new_key_type! { pub struct VQueueHandle; } // A read-only view over the in-memory stash of vqueues. #[derive(Copy, Clone)] @@ -34,10 +35,15 @@ impl<'a> VQueuesMeta<'a> { Self { inner: cache } } - pub fn get(&self, key: VQueueCacheKey) -> Option<&Slot> { + pub fn get(&self, key: VQueueHandle) -> Option<&Slot> { self.inner.slab.get(key) } + /// Lookup the cache handle of the vqueue by its id. 
+ pub fn handle_for(&self, qid: &VQueueId) -> Option { + self.inner.queues.get(qid).copied() + } + pub fn get_vqueue(&'a self, qid: &VQueueId) -> Option<&'a VQueueMeta> { self.inner .queues @@ -46,11 +52,13 @@ impl<'a> VQueuesMeta<'a> { .map(|slot| &slot.meta) } - pub fn iter_active_vqueues(&'a self) -> impl Iterator { + pub fn iter_active_vqueues( + &'a self, + ) -> impl Iterator { self.inner .slab - .values() - .filter_map(|Slot { qid, meta }| meta.is_active().then_some((qid, meta))) + .iter() + .filter_map(|(key, Slot { qid, meta })| meta.is_active().then_some((key, qid, meta))) } pub fn num_active(&self) -> usize { @@ -64,6 +72,10 @@ impl<'a> VQueuesMeta<'a> { pub fn report(&self) { self.inner.report(); } + + pub fn capacity(&self) -> usize { + self.inner.slab.capacity() + } } #[derive(Clone)] @@ -78,6 +90,11 @@ impl Slot { &self.qid } + #[inline(always)] + pub fn partition_key(&self) -> PartitionKey { + self.qid.partition_key() + } + #[inline(always)] pub fn meta(&self) -> &VQueueMeta { &self.meta @@ -97,8 +114,12 @@ impl Slot { // Needs rewriting after the workload pattern becomes more clear. #[derive(Clone)] pub struct VQueuesMetaCache { - queues: HashMap, - slab: SlotMap, + queues: HashMap, + slab: SlotMap, + /// Soft cap; partition processor triggers `compact()` once `len()` reaches + /// this number. The slab/hashmap will still grow past this if compaction + /// frees nothing. 
+ target_capacity: usize, } impl VQueuesMetaCache { @@ -106,31 +127,70 @@ impl VQueuesMetaCache { VQueuesMeta::new(self) } - pub fn get(&self, key: VQueueCacheKey) -> Option<&Slot> { + pub fn get(&self, key: VQueueHandle) -> Option<&Slot> { self.slab.get(key) } - pub fn get_mut(&mut self, key: VQueueCacheKey) -> Option<&mut Slot> { + pub fn get_mut(&mut self, key: VQueueHandle) -> Option<&mut Slot> { self.slab.get_mut(key) } + pub fn len(&self) -> usize { + self.slab.len() + } + + pub fn is_empty(&self) -> bool { + self.slab.is_empty() + } + + /// Sweeps the cache and evicts entries whose vqueues are no longer needed. + /// A slot is safe to evict when its meta reports `!is_active()`. The scheduler + /// drops its handle eagerly on the events that flip the meta inactive + /// (`RemovedFromInbox`, `QueuePaused`), so meta inactivity implies the + /// scheduler has already released the handle. Returns the number of evicted + /// entries. + /// + /// Triggered automatically by `insert` once occupancy reaches + /// `target_capacity`, so steady-state operations never pay for compaction. + fn compact(&mut self) -> usize { + let mut evicted = 0; + self.slab.retain(|_handle, slot| { + if slot.meta.is_active() { + true + } else { + self.queues.remove(&slot.qid); + evicted += 1; + false + } + }); + evicted + } + #[cfg(any(test, feature = "test-util"))] - pub fn new_empty() -> Self { + pub fn new_empty(target_capacity: usize) -> Self { Self { - slab: Default::default(), - queues: Default::default(), + slab: SlotMap::with_capacity_and_key(target_capacity), + queues: HashMap::with_capacity(target_capacity), + target_capacity, } } /// Initializes the vqueue cache by loading all active vqueues into the cache. /// + /// `target_capacity` is the soft cap that drives compaction; the cache will + /// still grow past it if compaction frees nothing. + /// /// From this point on, the cache remains in-sync with the storage state by /// using the "apply_updates" method. 
- pub async fn create(storage: S) -> Result { + pub async fn create( + storage: S, + target_capacity: usize, + ) -> Result { let handle: JoinHandle> = tokio::task::spawn_blocking({ move || { - let mut slab = SlotMap::default(); - let mut queues = HashMap::default(); + // Allocation doesn't bump the RSS until we write to the allocated pages. + let mut slab = SlotMap::with_capacity_and_key(target_capacity); + let mut queues = HashMap::with_capacity(target_capacity); // find and load all active vqueues. storage.scan_active_vqueues(|qid, meta| { let key = slab.insert(Slot { @@ -148,35 +208,44 @@ impl VQueuesMetaCache { .await .map_err(|e| StorageError::Generic(e.into()))??; - Ok(Self { slab, queues }) + Ok(Self { + slab, + queues, + target_capacity, + }) } pub async fn load( &mut self, storage: &mut S, qid: &VQueueId, - ) -> Result> { - Ok(match self.queues.entry_ref(qid) { - hash_map::EntryRef::Occupied(entry) => Some(*entry.get()), - hash_map::EntryRef::Vacant(entry) => { - // We don't have it in cache, we must check storage. - match storage.get_vqueue(qid).await? { - None => None, - Some(meta) => { - let key = self.slab.insert(Slot { - qid: qid.clone(), - meta, - }); - entry.insert(key); - Some(key) - } - } - } - }) + ) -> Result> { + if self.queues.contains_key(qid) { + return Ok(self.queues.get(qid).copied()); + } + // Not in cache; consult storage. + match storage.get_vqueue(qid).await? { + None => Ok(None), + Some(meta) => Ok(Some(self.insert(qid.clone(), meta))), + } } - /// Inserts a vqueue metadata unconditionally to the cache. - pub(super) fn insert(&mut self, qid: VQueueId, meta: VQueueMeta) -> VQueueCacheKey { + /// Inserts a vqueue metadata unconditionally to the cache. The single point + /// of entry into the cache; runs compaction when occupancy reaches the + /// configured `target_capacity`. 
+ pub(super) fn insert(&mut self, qid: VQueueId, meta: VQueueMeta) -> VQueueHandle { + if self.slab.len() >= self.target_capacity { + let evicted = self.compact(); + if evicted == 0 { + tracing::info!( + "vqueue cache at {} entries with no inactive queues to evict; cache will grow past target_capacity={}", + self.slab.len(), + self.target_capacity, + ); + } else { + trace!("vqueue cache compaction freed {evicted} entries"); + } + } let key = self.slab.insert(Slot { qid: qid.clone(), meta, @@ -196,3 +265,149 @@ impl VQueuesMetaCache { } } } + +#[cfg(test)] +mod tests { + use restate_clock::time::MillisSinceEpoch; + use restate_limiter::LimitKey; + use restate_storage_api::vqueue_table::Stage; + use restate_storage_api::vqueue_table::metadata::{Action, MoveMetrics, Update, VQueueLink}; + use restate_types::clock::UniqueTimestamp; + + use super::*; + + fn ts(unix_ms: u64) -> UniqueTimestamp { + UniqueTimestamp::from_unix_millis_unchecked(MillisSinceEpoch::new(unix_ms)) + } + + fn empty_meta(at: UniqueTimestamp) -> VQueueMeta { + VQueueMeta::new(at, None, LimitKey::None, VQueueLink::None) + } + + /// Bumps inbox count so `is_active()` returns true. + fn enqueue_to_inbox(meta: &mut VQueueMeta, at: UniqueTimestamp) { + let metrics = MoveMetrics { + last_transition_at: at, + has_started: false, + blocked_on_concurrency_rules_ms: 0, + blocked_on_invoker_throttling_ms: 0, + first_runnable_at: at.to_unix_millis(), + }; + meta.apply_update(&Update::new( + at, + Action::Move { + prev_stage: None, + next_stage: Stage::Inbox, + metrics, + }, + )); + } + + #[test] + fn compact_evicts_inactive_and_keeps_active() { + let now = ts(1_744_000_000_000); + // Use a high cap so insert() does not auto-compact during setup. + let mut cache = VQueuesMetaCache::new_empty(1024); + + let qid_active = VQueueId::custom(1, "alive"); + let qid_inactive = VQueueId::custom(2, "dormant"); + let qid_empty_paused = VQueueId::custom(3, "paused"); + + // Active: has an inbox entry. 
+ let mut active_meta = empty_meta(now); + enqueue_to_inbox(&mut active_meta, now); + let h_active = cache.insert(qid_active.clone(), active_meta); + + // Inactive: brand-new meta has no entries. + let h_inactive = cache.insert(qid_inactive.clone(), empty_meta(now)); + + // Paused queue with empty inbox is also !is_active. + let mut paused_meta = empty_meta(now); + paused_meta.apply_update(&Update::new(now, Action::PauseVQueue {})); + let h_paused = cache.insert(qid_empty_paused.clone(), paused_meta); + + assert_eq!(cache.len(), 3); + + let evicted = cache.compact(); + + assert_eq!(evicted, 2); + assert_eq!(cache.len(), 1); + + // Active stayed; inactive ones gone from both maps. + assert!(cache.get(h_active).is_some()); + assert!(cache.get(h_inactive).is_none()); + assert!(cache.get(h_paused).is_none()); + assert_eq!(cache.view().handle_for(&qid_active), Some(h_active)); + assert_eq!(cache.view().handle_for(&qid_inactive), None); + assert_eq!(cache.view().handle_for(&qid_empty_paused), None); + } + + #[test] + fn compact_on_empty_cache_is_noop() { + let mut cache = VQueuesMetaCache::new_empty(1024); + assert_eq!(cache.compact(), 0); + assert_eq!(cache.len(), 0); + } + + #[test] + fn compact_on_all_active_evicts_nothing() { + let now = ts(1_744_000_000_000); + let mut cache = VQueuesMetaCache::new_empty(1024); + for i in 0..5 { + let qid = VQueueId::custom(i, "q"); + let mut meta = empty_meta(now); + enqueue_to_inbox(&mut meta, now); + cache.insert(qid, meta); + } + assert_eq!(cache.compact(), 0); + assert_eq!(cache.len(), 5); + } + + #[test] + fn insert_auto_compacts_when_capacity_hit() { + let now = ts(1_744_000_000_000); + // Tiny cap so we can drive the auto-compact path in two steps. + let mut cache = VQueuesMetaCache::new_empty(3); + + // Fill with 3 inactive entries. 
+ for i in 0..3 { + cache.insert(VQueueId::custom(i, "stale"), empty_meta(now)); + } + assert_eq!(cache.len(), 3); + + // The next insert should observe `len >= target_capacity` and compact + // the inactive ones before adding the new entry. + let mut active_meta = empty_meta(now); + enqueue_to_inbox(&mut active_meta, now); + cache.insert(VQueueId::custom(99, "fresh"), active_meta); + + assert_eq!(cache.len(), 1); + assert!( + cache + .view() + .handle_for(&VQueueId::custom(99, "fresh")) + .is_some() + ); + } + + #[test] + fn insert_grows_past_capacity_when_nothing_to_evict() { + let now = ts(1_744_000_000_000); + let mut cache = VQueuesMetaCache::new_empty(2); + + // Two active entries fill the cap. + for i in 0..2 { + let mut meta = empty_meta(now); + enqueue_to_inbox(&mut meta, now); + cache.insert(VQueueId::custom(i, "active"), meta); + } + assert_eq!(cache.len(), 2); + + // Third insert: compact attempts but frees nothing; cache grows. + let mut meta = empty_meta(now); + enqueue_to_inbox(&mut meta, now); + cache.insert(VQueueId::custom(2, "active"), meta); + + assert_eq!(cache.len(), 3); + } +} diff --git a/crates/vqueues/src/lib.rs b/crates/vqueues/src/lib.rs index 94ff262348..b5d64fc3cb 100644 --- a/crates/vqueues/src/lib.rs +++ b/crates/vqueues/src/lib.rs @@ -14,13 +14,14 @@ pub mod scheduler; mod util; // Re-exports -pub use cache::{VQueuesMeta, VQueuesMetaCache}; +pub use cache::{VQueueHandle, VQueuesMeta, VQueuesMetaCache}; pub use metric_definitions::describe_metrics; pub use restate_worker_api::{ResourceKind, SchedulingStatus, VQueueSchedulerStatus}; pub use scheduler::{ResourceManager, SchedulerService}; +pub use util::*; + use smallvec::SmallVec; use tracing::debug; -pub use util::*; use restate_clock::RoughTimestamp; use restate_clock::time::MillisSinceEpoch; @@ -33,42 +34,37 @@ use restate_storage_api::vqueue_table::{ WriteVQueueTable, metadata, }; use restate_storage_api::{StorageError, lock_table}; +use restate_types::ServiceName; use 
restate_types::clock::UniqueTimestamp; use restate_types::identifiers::PartitionKey; use restate_types::invocation::InvocationTarget; use restate_types::vqueues::{EntryId, Seq, VQueueId}; -use restate_types::{LockName, Scope, ServiceName}; +#[cfg(test)] +use restate_types::{LockName, Scope}; use restate_util_string::ReString; -use self::cache::VQueueCacheKey; -use self::scheduler::MetaLiteUpdate; - // Token bucket used for throttling over all vqueues type GlobalTokenBucket = gardal::TokenBucket; #[derive(Debug)] pub enum EventDetails { - // A vqueue that had empty inbox and now the scheduler needs to monitor - // - // It's implied that this vqueue is active (not in paused state) - AddVQueue { - scope: Option, - limit_key: LimitKey, - lock_name: Option, - }, - // An inbox enqueue, inbox remove, or pause/unpause of the queue. - InboxUpdate(MetaLiteUpdate), - LockReleased { - scope: Option, - lock_name: LockName, + /// The vqueue was manually paused + QueuePaused, + /// The vqueue was resumed + QueueResumed, + RemovedFromInbox(EntryKey), + EnqueuedToInbox { + key: EntryKey, + value: EntryValue, }, + LockReleased, } #[derive(Debug)] pub struct VQueueEvent { - pub qid: VQueueId, - pub updates: SmallVec<[EventDetails; 2]>, + pub queue: VQueueHandle, + pub updates: SmallVec<[EventDetails; 1]>, } impl VQueueEvent { @@ -76,9 +72,9 @@ impl VQueueEvent { self.updates.is_empty() } - pub const fn new(qid: VQueueId) -> Self { + pub const fn new(queue: VQueueHandle) -> Self { Self { - qid, + queue, updates: SmallVec::new_const(), } } @@ -95,7 +91,7 @@ impl VQueueEvent { pub struct VQueue<'a, A, S> { storage: &'a mut S, cache: &'a mut VQueuesMetaCache, - cache_key: VQueueCacheKey, + handle: VQueueHandle, // action collector is only available if we have a scheduler to notify action_collector: Option<&'a mut Vec>, } @@ -117,8 +113,12 @@ where A: From + 'static, S: WriteVQueueTable + ReadVQueueTable + WriteLockTable, { + pub fn handle(&self) -> VQueueHandle { + self.handle + } + pub 
fn meta(&self) -> &VQueueMeta { - self.cache.get(self.cache_key).unwrap().meta() + self.cache.get(self.handle).unwrap().meta() } /// The entry has completed execution and it needs to be removed from the vqueue. @@ -164,7 +164,7 @@ where Ok(Some(Self { storage, - cache_key, + handle: cache_key, cache, action_collector, })) @@ -202,7 +202,7 @@ where Ok(Self { storage, - cache_key, + handle: cache_key, cache, action_collector, }) @@ -240,7 +240,7 @@ where Ok(Self { storage, - cache_key, + handle: cache_key, cache, action_collector, }) @@ -257,7 +257,7 @@ where entry_id: impl Into, metadata: impl Into, ) { - let meta = self.cache.get_mut(self.cache_key).unwrap(); + let meta = self.cache.get_mut(self.handle).unwrap(); let created_at_unix = created_at.to_unix_millis(); let (run_at, status) = match run_at { @@ -290,7 +290,6 @@ where ); // Update cache - let was_inbox_empty = meta.meta().is_inbox_empty(); let (was_active_before, is_active_now) = meta.apply_update(&update); // Update vqueue meta in storage @@ -329,19 +328,8 @@ where if let Some(collector) = self.action_collector.as_deref_mut() { // Let the scheduler know about the new entry to keep its head-of-line cache of the vqueue // as fresh as possible. 
- let mut event = VQueueEvent::new(meta.vqueue_id().clone()); - if was_inbox_empty { - event.push(EventDetails::AddVQueue { - scope: meta.meta().scope().clone(), - limit_key: meta.meta().limit_key().clone(), - lock_name: meta.meta().lock_name().cloned(), - }); - } - - event.push(EventDetails::InboxUpdate(MetaLiteUpdate::EnqueuedToInbox { - key, - value, - })); + let mut event = VQueueEvent::new(self.handle); + event.push(EventDetails::EnqueuedToInbox { key, value }); collector.push(A::from(event)); } @@ -365,7 +353,7 @@ where ) -> EntryKey { let vqueue_id = header.vqueue_id(); let partition_key = vqueue_id.partition_key(); - let meta = self.cache.get_mut(self.cache_key).unwrap(); + let meta = self.cache.get_mut(self.handle).unwrap(); assert_eq!(vqueue_id, meta.vqueue_id()); assert!(matches!(header.stage(), Stage::Inbox)); @@ -478,7 +466,7 @@ where updated_metadata: Option, ) { let vqueue_id = header.vqueue_id(); - let meta = self.cache.get_mut(self.cache_key).unwrap(); + let meta = self.cache.get_mut(self.handle).unwrap(); assert_eq!(vqueue_id, meta.vqueue_id()); assert!(matches!(header.stage(), Stage::Paused | Stage::Suspended)); @@ -496,7 +484,6 @@ where ); // Update cache - let was_inbox_empty = meta.meta().is_inbox_empty(); let (was_active_before, is_active_now) = meta.apply_update(&update); // Update vqueue meta in storage self.storage.update_vqueue(vqueue_id, &update); @@ -543,20 +530,12 @@ where .put_vqueue_inbox(vqueue_id, Stage::Inbox, &modified_key, &value); if let Some(collector) = self.action_collector.as_deref_mut() { - let mut event = VQueueEvent::new(vqueue_id.clone()); - - if was_inbox_empty { - event.push(EventDetails::AddVQueue { - scope: meta.meta().scope().clone(), - limit_key: meta.meta().limit_key().clone(), - lock_name: meta.meta().lock_name().cloned(), - }); - } + let mut event = VQueueEvent::new(self.handle); - event.push(EventDetails::InboxUpdate(MetaLiteUpdate::EnqueuedToInbox { + event.push(EventDetails::EnqueuedToInbox { key: 
modified_key, value, - })); + }); collector.push(A::from(event)); } } @@ -586,7 +565,7 @@ where next_stage: Stage, ) { let vqueue_id = header.vqueue_id(); - let meta = self.cache.get_mut(self.cache_key).unwrap(); + let meta = self.cache.get_mut(self.handle).unwrap(); assert_eq!(vqueue_id, meta.vqueue_id()); assert!(matches!(next_stage, Stage::Paused | Stage::Suspended)); @@ -649,10 +628,8 @@ where if let Some(collector) = self.action_collector.as_deref_mut() && matches!(header.stage(), Stage::Inbox) { - let mut event = VQueueEvent::new(vqueue_id.clone()); - event.push(EventDetails::InboxUpdate(MetaLiteUpdate::RemovedFromInbox( - *header.entry_key(), - ))); + let mut event = VQueueEvent::new(self.handle); + event.push(EventDetails::RemovedFromInbox(*header.entry_key())); collector.push(A::from(event)); } } @@ -670,7 +647,7 @@ where new_status: Status, ) { let vqueue_id = header.vqueue_id(); - let meta = self.cache.get_mut(self.cache_key).unwrap(); + let meta = self.cache.get_mut(self.handle).unwrap(); assert_eq!(vqueue_id, meta.vqueue_id()); debug!( @@ -693,7 +670,6 @@ where ); // Update cache - let was_inbox_empty = meta.meta().is_inbox_empty(); let (was_active_before, is_active_now) = meta.apply_update(&update); self.storage.update_vqueue(vqueue_id, &update); @@ -741,31 +717,21 @@ where .put_vqueue_inbox(vqueue_id, Stage::Inbox, &modified_key, &value); if let Some(collector) = self.action_collector.as_deref_mut() { - let mut event = VQueueEvent::new(vqueue_id.clone()); - - if was_inbox_empty { - event.push(EventDetails::AddVQueue { - scope: meta.meta().scope().clone(), - limit_key: meta.meta().limit_key().clone(), - lock_name: meta.meta().lock_name().cloned(), - }); - } + let mut event = VQueueEvent::new(self.handle); if matches!(header.stage(), Stage::Inbox) { // Inbox -> Inbox // The scheduler is moving the item from inbox, so we need to confirm its // decision. We only do that if prev_stage is Inbox because it's the only // stage that needs confirmation. 
- event.push(EventDetails::InboxUpdate(MetaLiteUpdate::RemovedFromInbox( - *header.entry_key(), - ))); + event.push(EventDetails::RemovedFromInbox(*header.entry_key())); } // Enqueue the replaced entry now - event.push(EventDetails::InboxUpdate(MetaLiteUpdate::EnqueuedToInbox { + event.push(EventDetails::EnqueuedToInbox { key: modified_key, value, - })); + }); collector.push(A::from(event)); } @@ -780,7 +746,7 @@ where // todo: add a paramter to specify the "scrub time" for this item. ) { let vqueue_id = header.vqueue_id(); - let meta = self.cache.get_mut(self.cache_key).unwrap(); + let meta = self.cache.get_mut(self.handle).unwrap(); assert_eq!(vqueue_id, meta.vqueue_id()); // Remove from the current stage @@ -846,21 +812,14 @@ where } if let Some(collector) = self.action_collector.as_deref_mut() { - let mut event = VQueueEvent::new(vqueue_id.clone()); + let mut event = VQueueEvent::new(self.handle); // Release the lock if this entry has been holding a lock already - if header.has_lock() - && let Some(lock_name) = meta.meta().lock_name() - { - event.push(EventDetails::LockReleased { - scope: meta.meta().scope().clone(), - lock_name: lock_name.clone(), - }); + if header.has_lock() && meta.meta().lock_name().is_some() { + event.push(EventDetails::LockReleased); } if matches!(header.stage(), Stage::Inbox) { - event.push(EventDetails::InboxUpdate(MetaLiteUpdate::RemovedFromInbox( - *header.entry_key(), - ))); + event.push(EventDetails::RemovedFromInbox(*header.entry_key())); } if !event.is_empty() { collector.push(A::from(event)); @@ -908,7 +867,7 @@ where status: Status, ) { let vqueue_id = header.vqueue_id(); - let meta = self.cache.get_mut(self.cache_key).unwrap(); + let meta = self.cache.get_mut(self.handle).unwrap(); assert_eq!(vqueue_id, meta.vqueue_id()); assert!(matches!(header.stage(), Stage::Inbox)); @@ -979,12 +938,10 @@ where ); if let Some(collector) = self.action_collector.as_deref_mut() { - let mut event = VQueueEvent::new(vqueue_id.clone()); + let mut 
event = VQueueEvent::new(self.handle); // In the case of state mutation, we execute it inline and we can // ask the scheduler to drop any pending resources for this entry. - event.push(EventDetails::InboxUpdate(MetaLiteUpdate::RemovedFromInbox( - *header.entry_key(), - ))); + event.push(EventDetails::RemovedFromInbox(*header.entry_key())); collector.push(A::from(event)); } diff --git a/crates/vqueues/src/scheduler.rs b/crates/vqueues/src/scheduler.rs index e35995a0ae..9c6a91b49a 100644 --- a/crates/vqueues/src/scheduler.rs +++ b/crates/vqueues/src/scheduler.rs @@ -23,9 +23,10 @@ use restate_types::vqueues::VQueueId; use restate_worker_api::resources::ReservedResources; use restate_worker_api::{SchedulingStatus, UserLimitCounterEntry, VQueueSchedulerStatus}; -use crate::VQueueEvent; -use crate::VQueuesMetaCache; +use crate::cache::VQueueHandle; use crate::metric_definitions::publish_scheduler_decision_metrics; +use crate::{VQueueEvent, VQueuesMeta}; +use crate::{VQueuesMetaCache, cache}; use self::drr::DRRScheduler; use self::resource_manager::PermitBuilder; @@ -35,18 +36,14 @@ mod clock; mod drr; mod eligible; mod queue; -mod queue_meta; mod resource_manager; mod vqueue_state; // Re-exports -pub use self::queue_meta::{MetaLiteUpdate, VQueueMetaLite}; pub use resource_manager::ResourceManager; type UnconfirmedAssignments = hashbrown::HashMap; -slotmap::new_key_type! 
{ pub(crate) struct VQueueHandle; } - fn status_from_detailed_eligibility(value: DetailedEligibility) -> SchedulingStatus { match value { DetailedEligibility::EligibleRunning | DetailedEligibility::EligibleInbox => { @@ -160,9 +157,9 @@ impl SchedulerService { Ok(Self { state }) } - pub fn on_inbox_event(&mut self, event: VQueueEvent) { + pub fn on_inbox_event(&mut self, metas: VQueuesMeta<'_>, event: VQueueEvent) { if let State::Active(ref mut drr_scheduler) = self.state { - drr_scheduler.as_mut().on_inbox_event(event); + drr_scheduler.as_mut().on_inbox_event(metas, event); } } @@ -172,46 +169,63 @@ impl SchedulerService { /// Resources will not be returned if the unconfirmed assignment was rejected or removed. pub fn confirm_run_attempt( &mut self, - qid: &VQueueId, + handle: VQueueHandle, + slot: &cache::Slot, key: &EntryKey, ) -> Option { if let State::Active(ref mut drr_scheduler) = self.state { - drr_scheduler.as_mut().confirm_run_attempt(qid, key) + drr_scheduler + .as_mut() + .confirm_run_attempt(handle, slot, key) } else { None } } - pub async fn schedule_next(&mut self) -> Result { - poll_fn(|cx| self.poll_schedule_next(cx)).await + pub async fn schedule_next( + &mut self, + metas: VQueuesMeta<'_>, + ) -> Result { + poll_fn(|cx| self.poll_schedule_next(metas, cx)).await } pub fn poll_schedule_next( &mut self, + metas: VQueuesMeta<'_>, cx: &mut std::task::Context<'_>, ) -> Poll> { match self.state { // if scheduler is disabled, we always return pending. State::Disabled => Poll::Pending, - State::Active(ref mut drr_scheduler) => drr_scheduler.as_mut().poll_schedule_next(cx), + State::Active(ref mut drr_scheduler) => { + drr_scheduler.as_mut().poll_schedule_next(metas, cx) + } } } /// Returns the scheduling status for a specific vqueue. 
- pub fn get_status(&self, qid: &VQueueId) -> VQueueSchedulerStatus { + #[cfg(test)] + pub fn get_status( + &self, + metas: VQueuesMeta<'_>, + handle: VQueueHandle, + ) -> VQueueSchedulerStatus { match self.state { State::Disabled => VQueueSchedulerStatus::default(), - State::Active(ref drr_scheduler) => drr_scheduler.get_status(qid), + State::Active(ref drr_scheduler) => drr_scheduler.get_status(metas, handle), } } /// Returns an iterator over the scheduling status of all tracked vqueues. /// /// Returns `None` if the scheduler is disabled. - pub fn iter_status(&self) -> Option> { + pub fn iter_status( + &self, + metas: VQueuesMeta<'_>, + ) -> Option> { match self.state { State::Disabled => None, - State::Active(ref drr_scheduler) => Some(drr_scheduler.iter_status()), + State::Active(ref drr_scheduler) => Some(drr_scheduler.iter_status(metas)), } } diff --git a/crates/vqueues/src/scheduler/drr.rs b/crates/vqueues/src/scheduler/drr.rs index 2d7d50ba4c..5159370647 100644 --- a/crates/vqueues/src/scheduler/drr.rs +++ b/crates/vqueues/src/scheduler/drr.rs @@ -14,10 +14,9 @@ use std::task::Poll; use std::task::Waker; use std::time::Duration; -use hashbrown::HashMap; use metrics::counter; use pin_project::pin_project; -use slotmap::SlotMap; +use slotmap::SecondaryMap; use tokio::time::Instant; use tracing::{info, trace, warn}; @@ -32,16 +31,16 @@ use restate_worker_api::UserLimitCounterEntry; use crate::EventDetails; use crate::VQueueEvent; use crate::VQueuesMeta; +use crate::cache; +use crate::cache::VQueueHandle; +use crate::metric_definitions::VQUEUE_ENQUEUE; use crate::metric_definitions::VQUEUE_RUN_CONFIRMED; -use crate::scheduler::MetaLiteUpdate; use crate::scheduler::eligible::EligibilityTracker; use crate::scheduler::vqueue_state::Pop; use super::Decisions; use super::ReservedResources; use super::ResourceManager; -use super::VQueueHandle; -use super::VQueueMetaLite; use super::VQueueSchedulerStatus; use super::vqueue_state::VQueueState; @@ -50,9 +49,7 @@ pub 
struct DRRScheduler { resource_manager: ResourceManager, // sorted by queue_id eligible: EligibilityTracker, - /// Mapping of vqueue_id -> handle for active vqueues - id_lookup: HashMap, - q: SlotMap>, + q: SecondaryMap>, /// Waker to be notified when scheduler is potentially able to scheduler more work waker: Waker, /// Time of the last memory reporting and memory compaction @@ -68,7 +65,6 @@ pub struct DRRScheduler { } impl DRRScheduler { - #[allow(clippy::too_many_arguments)] pub fn new( limit_qid_per_poll: NonZeroU16, max_items_per_decision: NonZeroU16, @@ -79,23 +75,14 @@ impl DRRScheduler { let mut total_running = 0; let mut total_waiting = 0; - let num_active = vqueues.num_active(); - let mut q = SlotMap::with_capacity_and_key(num_active); - let mut id_lookup = HashMap::with_capacity(num_active); - let mut eligible: EligibilityTracker = EligibilityTracker::with_capacity(num_active); + let mut q = SecondaryMap::with_capacity(vqueues.capacity()); + let mut eligible: EligibilityTracker = + EligibilityTracker::with_capacity(vqueues.capacity()); - for (qid, meta) in vqueues.iter_active_vqueues() { + for (handle, qid, meta) in vqueues.iter_active_vqueues() { total_running += meta.num_running(); total_waiting += meta.total_waiting(); - let handle = q.insert_with_key(|handle| { - VQueueState::new( - qid.clone(), - handle, - VQueueMetaLite::new(meta), - meta.num_running(), - ) - }); - id_lookup.insert(qid.clone(), handle); + q.insert(handle, VQueueState::new(qid, &storage, meta.num_running())); // We init all active vqueues as eligible first eligible.insert_eligible(handle); } @@ -107,7 +94,6 @@ impl DRRScheduler { Self { resource_manager, - id_lookup, q, eligible, waker: Waker::noop().clone(), @@ -128,6 +114,7 @@ impl DRRScheduler { pub fn poll_schedule_next( mut self: Pin<&mut Self>, + metas: VQueuesMeta<'_>, cx: &mut std::task::Context<'_>, ) -> Poll> { let mut decisions = Decisions::default(); @@ -156,32 +143,36 @@ impl DRRScheduler { break; } - let Some(handle) 
= this.eligible.next_eligible(this.storage, this.q)? else { + let Some(handle) = this + .eligible + .next_eligible(cx, metas, this.storage, this.q)? + else { break; }; let qstate = this.q.get_mut(handle).unwrap(); + let slot = metas.get(handle).unwrap(); - match qstate.try_pop(cx, this.storage, this.resource_manager)? { + match qstate.try_pop(cx, handle, slot, this.resource_manager)? { Pop::NeedsCredit => { this.eligible.rotate_one(); } Pop::Run(action) => { coop.made_progress(); - decisions.push(&qstate.qid, action); + decisions.push(slot.vqueue_id(), action); // We need to set the state so we check eligibility and setup // necessary schedules when we poll the queue again. this.eligible.front_needs_poll(); } Pop::Yield(action) => { coop.made_progress(); - decisions.push(&qstate.qid, action); + decisions.push(slot.vqueue_id(), action); // We need to set the state so we check eligibility and setup // necessary schedules when we poll the queue again. this.eligible.front_needs_poll(); } Pop::Blocked(resource) => { - trace!("VQueue {:?} is blocked on {resource:?}", qstate.qid); + trace!("VQueue {} is blocked on {resource:?}", slot.vqueue_id()); this.eligible.front_blocked(resource); } } @@ -214,7 +205,6 @@ impl DRRScheduler { .remove_vqueue(handle, &blocked_resource); } - self.id_lookup.remove(qid); self.q.remove(handle); self.eligible.remove(handle); } @@ -222,21 +212,21 @@ impl DRRScheduler { // To be replaced by an invoke event that runs the invocation within the scheduler itself. pub fn confirm_run_attempt( mut self: Pin<&mut Self>, - qid: &VQueueId, + handle: VQueueHandle, + slot: &cache::Slot, key: &EntryKey, ) -> Option { - let handle = self.id_lookup.get(qid).copied()?; let qstate = self.q.get_mut(handle)?; // I've the resources. Let's run it. let permit = qstate.remove_from_unconfirmed_assignments(key)?; - // To make sure the num_waiting counter is updated. 
- qstate.apply_meta_update(&MetaLiteUpdate::RemovedFromInbox(*key)); counter!(VQUEUE_RUN_CONFIRMED).increment(1); - if qstate.is_dormant() { - trace!("VQueue {qid} is no longer observed by the scheduler"); - self.id_lookup.remove(qid); + if qstate.is_dormant(slot.meta()) { + trace!( + "VQueue {} is no longer observed by the scheduler", + slot.vqueue_id() + ); self.q.remove(handle); self.eligible.remove(handle); } @@ -245,79 +235,51 @@ impl DRRScheduler { #[tracing::instrument(skip_all)] #[track_caller] - pub fn on_inbox_event(&mut self, event: VQueueEvent) { - let qid = event.qid; - let mut maybe_handle = self.id_lookup.get(&qid).copied(); - debug_assert!(maybe_handle.map(|h| self.q.contains_key(h)).unwrap_or(true)); + pub fn on_inbox_event(&mut self, metas: VQueuesMeta<'_>, event: VQueueEvent) { + let slot = metas + .get(event.queue) + .expect("vqueue meta must be in cache"); for update in event.updates { match update { - EventDetails::AddVQueue { - scope, - limit_key, - lock_name, - } => { - if maybe_handle.is_none() { - // add it. - trace!("VQueue {qid} is added to the scheduler"); - maybe_handle = Some(self.q.insert_with_key(|h| { - VQueueState::new_empty( - qid.clone(), - h, - VQueueMetaLite::new_empty(scope, limit_key, lock_name), - ) - })); - self.id_lookup.insert(qid.clone(), maybe_handle.unwrap()); - // Note: We don't make this eligible yet. We'll do that when we see - // an actual enqueue because we initialize the vqueue metadata with - // length 0. 
- } - } - EventDetails::LockReleased { scope, lock_name } => { - self.release_lock(&scope, &lock_name); + EventDetails::LockReleased => { + self.release_lock(slot.meta().scope(), slot.meta().lock_name().unwrap()); } - EventDetails::InboxUpdate(ref update @ MetaLiteUpdate::QueuePaused) => { - let Some(handle) = maybe_handle else { - continue; - }; - let qstate = self.q.get_mut(handle).unwrap(); - qstate.apply_meta_update(update); - if qstate.is_dormant() { - self.mark_vqueue_as_dormant(&qid, handle); + EventDetails::QueuePaused => { + let qstate = self.q.get_mut(event.queue).unwrap(); + if qstate.is_dormant(slot.meta()) { + self.mark_vqueue_as_dormant(slot.vqueue_id(), event.queue); } } - EventDetails::InboxUpdate(ref update @ MetaLiteUpdate::QueueResumed { .. }) => { - let Some(handle) = maybe_handle else { - continue; - }; - let qstate = self.q.get_mut(handle).unwrap(); - qstate.apply_meta_update(update); - if self.eligible.refresh_membership(qstate) { + EventDetails::QueueResumed => { + let qstate = self.q.get_mut(event.queue).unwrap(); + if self + .eligible + .refresh_membership(event.queue, slot.meta(), qstate) + { self.wake_up(); } } - EventDetails::InboxUpdate( - ref update @ MetaLiteUpdate::EnqueuedToInbox { ref key, ref value }, - ) => { - assert!( - maybe_handle.is_some(), - "scheduler must know about queue {qid} before an inbox enqueue", - ); - let handle = maybe_handle.unwrap(); - let qstate = self.q.get_mut(handle).unwrap(); - - // keep inbox size in sync - qstate.apply_meta_update(update); + EventDetails::EnqueuedToInbox { ref key, ref value } => { + let qstate = self.q.entry(event.queue).unwrap().or_insert_with(|| { + trace!("VQueue {} is added to the scheduler", slot.vqueue_id()); + VQueueState::new_empty() + }); + + counter!(VQUEUE_ENQUEUE).increment(1); if let Some(permit_builder) = qstate.notify_enqueued(key, value) { // The newly enqueued item became the head of the queue. 
// If the vqueue was blocked we need to place it back // on the ready ring if it wasn't already there. let mut wake_up = false; - if let Some(resource) = self.eligible.mark_queue_unblocked(handle) { - self.resource_manager.remove_vqueue(handle, &resource); + if let Some(resource) = self.eligible.mark_queue_unblocked(event.queue) { + self.resource_manager.remove_vqueue(event.queue, &resource); wake_up |= true; - } else if self.eligible.refresh_membership(qstate) { + } else if self + .eligible + .refresh_membership(event.queue, slot.meta(), qstate) + { wake_up |= true; } @@ -333,13 +295,10 @@ impl DRRScheduler { } } } - EventDetails::InboxUpdate( - ref update @ MetaLiteUpdate::RemovedFromInbox(ref key), - ) => { - let Some(handle) = maybe_handle else { + EventDetails::RemovedFromInbox(ref key) => { + let Some(qstate) = self.q.get_mut(event.queue) else { continue; }; - let qstate = self.q.get_mut(handle).unwrap(); // Three cases: // 1. The item was pending confirmation (we hold resources that should be @@ -347,11 +306,7 @@ impl DRRScheduler { // 2. The item was the head of the queue (not scheduled yet). We _may_ have a // permit builder in-flight that we can release. // 3. None of the above, removing only changes the vqueue metadata. - let mut wake_up = false; - // keep inbox size in sync - qstate.apply_meta_update(update); - // If we have been holding a concurrency permit for this item, we release it. if let Some(permit) = qstate.remove_from_unconfirmed_assignments(key) { // Case 1: @@ -363,15 +318,15 @@ impl DRRScheduler { // Case 2: // This means it might be the current head. Let's invalidate it if // that's the case. - if let Some(resource) = self.eligible.find_blocking_resource(handle) { + if let Some(resource) = self.eligible.find_blocking_resource(event.queue) { // The head was removed and the queue was blocked on a resource. 
- self.resource_manager.remove_vqueue(handle, resource); + self.resource_manager.remove_vqueue(event.queue, resource); } - if !qstate.is_dormant() { + if !qstate.is_dormant(slot.meta()) { // force the queue to be polled again since the head // will definitely be unknown at this point. - self.eligible.ensure_queue_needs_polling(handle); + self.eligible.ensure_queue_needs_polling(event.queue); wake_up |= true; } // let the other queues that were blocked on my partial permit @@ -382,9 +337,9 @@ impl DRRScheduler { .revert_permit_builder(&mut self.eligible, permit_builder); } - if qstate.is_dormant() { + if qstate.is_dormant(slot.meta()) { // the removal makes the queue dormant. Remove it from everything - self.mark_vqueue_as_dormant(&qid, handle); + self.mark_vqueue_as_dormant(slot.vqueue_id(), event.queue); } if wake_up { @@ -404,29 +359,39 @@ impl DRRScheduler { } } - pub fn iter_status(&self) -> impl Iterator { + pub fn iter_status( + &self, + metas: VQueuesMeta<'_>, + ) -> impl Iterator { // Resolver shared across every queue's status snapshot in this sweep — // the rule store doesn't change while we hold `&self`. 
let resolve_rule = |handle| self.resource_manager.resolve_user_rule(handle); - self.q.iter().map(move |(_handle, qstate)| { + self.q.iter().map(move |(handle, qstate)| { + let slot = metas.get(handle).unwrap(); let status = VQueueSchedulerStatus { wait_stats: qstate.get_head_wait_stats(), remaining_running: qstate.num_remaining_in_running_stage(), - waiting_inbox: qstate.num_waiting_inbox(), - status: self.eligible.get_status(qstate, &resolve_rule), + waiting_inbox: qstate.num_waiting_inbox(slot.meta()), + status: self + .eligible + .get_status(handle, slot.meta(), qstate, &resolve_rule), head_entry_id: qstate.head_entry_id(), }; - (qstate.qid.clone(), status) + (slot.vqueue_id().clone(), status) }) } - pub fn get_status(&self, qid: &VQueueId) -> VQueueSchedulerStatus { - let Some(qstate) = self - .id_lookup - .get(qid) - .and_then(|handle| self.q.get(*handle)) - else { + #[cfg(test)] + pub fn get_status( + &self, + metas: VQueuesMeta<'_>, + handle: VQueueHandle, + ) -> VQueueSchedulerStatus { + let Some(qstate) = self.q.get(handle) else { + return VQueueSchedulerStatus::default(); + }; + let Some(slot) = metas.get(handle) else { return VQueueSchedulerStatus::default(); }; @@ -434,8 +399,10 @@ impl DRRScheduler { VQueueSchedulerStatus { wait_stats: qstate.get_head_wait_stats(), remaining_running: qstate.num_remaining_in_running_stage(), - waiting_inbox: qstate.num_waiting_inbox(), - status: self.eligible.get_status(qstate, &resolve_rule), + waiting_inbox: qstate.num_waiting_inbox(slot.meta()), + status: self + .eligible + .get_status(handle, slot.meta(), qstate, &resolve_rule), head_entry_id: qstate.head_entry_id(), } } @@ -486,6 +453,8 @@ mod tests { use crate::cache::VQueuesMetaCache; use crate::{GlobalTokenBucket, SchedulingStatus, VQueue, VQueueEvent}; + const TEST_VQUEUES_CAPACITY: usize = 1024; + use super::*; const BASE_RUN_AT_MS: u64 = 1_744_000_000_000; @@ -653,10 +622,11 @@ mod tests { fn poll_scheduler( mut scheduler: Pin<&mut DRRScheduler>, + metas: 
VQueuesMeta<'_>, ) -> Poll> { let waker = std::task::Waker::noop(); let mut cx = std::task::Context::from_waker(waker); - scheduler.as_mut().poll_schedule_next(&mut cx) + scheduler.as_mut().poll_schedule_next(metas, &mut cx) } fn run_keys(decision: &Decisions) -> Vec { @@ -675,11 +645,13 @@ mod tests { async fn test_empty_scheduler_returns_pending() { let rocksdb = storage_test_environment().await; let db = rocksdb.partition_db(); - let cache = VQueuesMetaCache::create(db.clone()).await.unwrap(); + let cache = VQueuesMetaCache::create(db.clone(), TEST_VQUEUES_CAPACITY) + .await + .unwrap(); let mut scheduler = create_scheduler(db, &cache).await; assert!(matches!( - poll_scheduler(Pin::new(&mut scheduler)), + poll_scheduler(Pin::new(&mut scheduler), cache.view()), Poll::Pending )); } @@ -687,7 +659,7 @@ mod tests { #[restate_core::test] async fn test_poll_yields_enqueued_item() { let mut rocksdb = storage_test_environment().await; - let mut cache = VQueuesMetaCache::new_empty(); + let mut cache = VQueuesMetaCache::new_empty(TEST_VQUEUES_CAPACITY); let qid = test_qid(2000); let mut txn = rocksdb.transaction(); @@ -697,7 +669,7 @@ mod tests { let db = rocksdb.partition_db(); let mut scheduler = create_scheduler(db, &cache).await; - let result = poll_scheduler(Pin::new(&mut scheduler)); + let result = poll_scheduler(Pin::new(&mut scheduler), cache.view()); assert!(matches!(result, Poll::Ready(Ok(ref decision)) if !decision.is_empty())); let Poll::Ready(Ok(decision)) = result else { @@ -711,7 +683,7 @@ mod tests { #[restate_core::test] async fn test_run_at_below_now_preempts_within_inbox() { let mut rocksdb = storage_test_environment().await; - let mut cache = VQueuesMetaCache::new_empty(); + let mut cache = VQueuesMetaCache::new_empty(TEST_VQUEUES_CAPACITY); let qid = test_qid(2_100); let now = MillisSinceEpoch::now().as_u64(); @@ -739,7 +711,8 @@ mod tests { let db = rocksdb.partition_db(); let mut scheduler = create_scheduler(db, &cache).await; - let 
Poll::Ready(Ok(decision)) = poll_scheduler(Pin::new(&mut scheduler)) else { + let Poll::Ready(Ok(decision)) = poll_scheduler(Pin::new(&mut scheduler), cache.view()) + else { panic!("expected decision"); }; @@ -753,7 +726,7 @@ mod tests { #[restate_core::test] async fn test_round_robin_fairness() { let mut rocksdb = storage_test_environment().await; - let mut cache = VQueuesMetaCache::new_empty(); + let mut cache = VQueuesMetaCache::new_empty(TEST_VQUEUES_CAPACITY); let mut txn = rocksdb.transaction(); for i in 1..=3u64 { @@ -781,7 +754,8 @@ mod tests { cache.view(), ); - let Poll::Ready(Ok(decision)) = poll_scheduler(Pin::new(&mut scheduler)) else { + let Poll::Ready(Ok(decision)) = poll_scheduler(Pin::new(&mut scheduler), cache.view()) + else { panic!("expected decision"); }; @@ -804,7 +778,7 @@ mod tests { #[restate_core::test] async fn test_concurrency_limiting() { let mut rocksdb = storage_test_environment().await; - let mut cache = VQueuesMetaCache::new_empty(); + let mut cache = VQueuesMetaCache::new_empty(TEST_VQUEUES_CAPACITY); let qid1 = test_qid(7000); let qid2 = test_qid(7001); @@ -818,10 +792,19 @@ mod tests { let db = rocksdb.partition_db(); let mut scheduler = create_scheduler_with_concurrency(db, &cache, 1).await; - assert_eq!(scheduler.get_status(&qid1).status, SchedulingStatus::Ready,); - assert_eq!(scheduler.get_status(&qid2).status, SchedulingStatus::Ready,); + let h_qid1 = cache.view().handle_for(&qid1).unwrap(); + let h_qid2 = cache.view().handle_for(&qid2).unwrap(); + + assert_eq!( + scheduler.get_status(cache.view(), h_qid1).status, + SchedulingStatus::Ready, + ); + assert_eq!( + scheduler.get_status(cache.view(), h_qid2).status, + SchedulingStatus::Ready, + ); - let result = poll_scheduler(Pin::new(&mut scheduler)); + let result = poll_scheduler(Pin::new(&mut scheduler), cache.view()); assert!(matches!(result, Poll::Ready(Ok(ref d)) if d.total_items() == 1)); let Poll::Ready(Ok(result)) = result else { panic!("expected decision"); @@ -830,29 
+813,34 @@ mod tests { let first_pop_qid = result.qids.keys().next().unwrap().clone(); let first_pop_key = run_keys(&result)[0]; + let h_first_pop = cache.view().handle_for(&first_pop_qid).unwrap(); qids.retain(|qid| qid != &first_pop_qid); + let h_remaining = cache.view().handle_for(&qids[0]).unwrap(); assert!(matches!( - poll_scheduler(Pin::new(&mut scheduler)), + poll_scheduler(Pin::new(&mut scheduler), cache.view()), Poll::Pending )); assert_eq!( - scheduler.get_status(&first_pop_qid).status, + scheduler.get_status(cache.view(), h_first_pop).status, SchedulingStatus::Empty, ); assert_eq!( - scheduler.get_status(&qids[0]).status, + scheduler.get_status(cache.view(), h_remaining).status, SchedulingStatus::BlockedOn(BlockedResource::InvokerConcurrency), ); + let metas = cache.view(); + let first_pop_slot = metas.get(h_first_pop).unwrap(); let mut scheduler = Pin::new(&mut scheduler); - let resources = scheduler - .as_mut() - .confirm_run_attempt(&first_pop_qid, &first_pop_key); + let resources = + scheduler + .as_mut() + .confirm_run_attempt(h_first_pop, first_pop_slot, &first_pop_key); assert!(resources.is_some()); drop(resources); - let Poll::Ready(Ok(result2)) = poll_scheduler(scheduler.as_mut()) else { + let Poll::Ready(Ok(result2)) = poll_scheduler(scheduler.as_mut(), cache.view()) else { panic!("expected decision"); }; assert_eq!(result2.num_run(), 1); @@ -861,16 +849,19 @@ mod tests { qids.retain(|qid| qid != &second_pop_qid); assert!(qids.is_empty()); assert_eq!( - scheduler.get_status(&qid1).status, + scheduler.get_status(cache.view(), h_qid1).status, SchedulingStatus::Dormant, ); - assert_eq!(scheduler.get_status(&qid2).status, SchedulingStatus::Empty,); + assert_eq!( + scheduler.get_status(cache.view(), h_qid2).status, + SchedulingStatus::Empty, + ); } #[restate_core::test] async fn test_get_status_reports_invoker_throttling_retry_estimate() { let mut rocksdb = storage_test_environment().await; - let mut cache = VQueuesMetaCache::new_empty(); + let mut 
cache = VQueuesMetaCache::new_empty(TEST_VQUEUES_CAPACITY); let qid1 = test_qid(21_101); let qid2 = test_qid(21_102); @@ -901,7 +892,8 @@ mod tests { cache.view(), ); - let Poll::Ready(Ok(decision)) = poll_scheduler(Pin::new(&mut scheduler)) else { + let Poll::Ready(Ok(decision)) = poll_scheduler(Pin::new(&mut scheduler), cache.view()) + else { panic!("expected decision"); }; assert_eq!(decision.num_run(), 1); @@ -914,7 +906,10 @@ mod tests { .expect("a single queue should be scheduled"); let blocked_qid = if running_qid == qid1 { qid2 } else { qid1 }; - let blocked_status = scheduler.get_status(&blocked_qid).status; + let h_blocked = cache.view().handle_for(&blocked_qid).unwrap(); + let h_running = cache.view().handle_for(&running_qid).unwrap(); + + let blocked_status = scheduler.get_status(cache.view(), h_blocked).status; let SchedulingStatus::BlockedOn(BlockedResource::InvokerThrottling { estimated_retry_at }) = blocked_status else { @@ -923,7 +918,7 @@ mod tests { assert!(estimated_retry_at.is_some()); assert_eq!( - scheduler.get_status(&running_qid).status, + scheduler.get_status(cache.view(), h_running).status, SchedulingStatus::Empty ); } @@ -931,7 +926,7 @@ mod tests { #[restate_core::test] async fn test_waiters_outside_throttling_window_report_no_estimate() { let mut rocksdb = storage_test_environment().await; - let mut cache = VQueuesMetaCache::new_empty(); + let mut cache = VQueuesMetaCache::new_empty(TEST_VQUEUES_CAPACITY); let qids = [ test_qid(22_001), @@ -977,7 +972,8 @@ mod tests { cache.view(), ); - let Poll::Ready(Ok(_decision)) = poll_scheduler(Pin::new(&mut scheduler)) else { + let Poll::Ready(Ok(_decision)) = poll_scheduler(Pin::new(&mut scheduler), cache.view()) + else { panic!("expected decision"); }; @@ -985,9 +981,10 @@ mod tests { let mut none_estimate = 0; for qid in &qids { + let handle = cache.view().handle_for(qid).unwrap(); if let SchedulingStatus::BlockedOn(BlockedResource::InvokerThrottling { estimated_retry_at, - }) = 
scheduler.get_status(qid).status + }) = scheduler.get_status(cache.view(), handle).status { if estimated_retry_at.is_some() { some_estimate += 1; @@ -1004,7 +1001,7 @@ mod tests { #[restate_core::test] async fn test_max_items_per_decision_limit() { let mut rocksdb = storage_test_environment().await; - let mut cache = VQueuesMetaCache::new_empty(); + let mut cache = VQueuesMetaCache::new_empty(TEST_VQUEUES_CAPACITY); let mut txn = rocksdb.transaction(); for i in 1..=5u64 { @@ -1022,7 +1019,7 @@ mod tests { cache.view(), ); - if let Poll::Ready(Ok(decision)) = poll_scheduler(Pin::new(&mut scheduler)) { + if let Poll::Ready(Ok(decision)) = poll_scheduler(Pin::new(&mut scheduler), cache.view()) { assert!(!decision.is_empty()); assert!(decision.total_items() <= 2); } @@ -1031,7 +1028,7 @@ mod tests { #[restate_core::test] async fn test_higher_priority_preempts_between_polls() { let mut rocksdb = storage_test_environment().await; - let mut cache = VQueuesMetaCache::new_empty(); + let mut cache = VQueuesMetaCache::new_empty(TEST_VQUEUES_CAPACITY); let qid = test_qid(14_000); let mut events = Vec::new(); @@ -1050,7 +1047,8 @@ mod tests { cache.view(), ); - let Poll::Ready(Ok(decision)) = poll_scheduler(Pin::new(&mut scheduler)) else { + let Poll::Ready(Ok(decision)) = poll_scheduler(Pin::new(&mut scheduler), cache.view()) + else { panic!("expected decision"); }; let mut in_flight = run_keys(&decision); @@ -1063,10 +1061,11 @@ mod tests { enqueue_entry(&mut txn, &mut cache, &qid, 125, 0, Some(&mut events)).await; txn.commit().await.expect("commit should succeed"); for event in events.drain(..) 
{ - scheduler.on_inbox_event(event); + scheduler.on_inbox_event(cache.view(), event); } - let Poll::Ready(Ok(decision)) = poll_scheduler(Pin::new(&mut scheduler)) else { + let Poll::Ready(Ok(decision)) = poll_scheduler(Pin::new(&mut scheduler), cache.view()) + else { panic!("expected decision"); }; let next_keys = run_keys(&decision); @@ -1082,10 +1081,11 @@ mod tests { } txn.commit().await.expect("commit should succeed"); for event in events.drain(..) { - scheduler.on_inbox_event(event); + scheduler.on_inbox_event(cache.view(), event); } - let Poll::Ready(Ok(decision)) = poll_scheduler(Pin::new(&mut scheduler)) else { + let Poll::Ready(Ok(decision)) = poll_scheduler(Pin::new(&mut scheduler), cache.view()) + else { panic!("expected decision"); }; let keys = run_keys(&decision); @@ -1097,7 +1097,7 @@ mod tests { #[restate_core::test] async fn test_get_status_reflects_scheduling_states() { let mut rocksdb = storage_test_environment().await; - let mut cache = VQueuesMetaCache::new_empty(); + let mut cache = VQueuesMetaCache::new_empty(TEST_VQUEUES_CAPACITY); let qid1 = test_qid(21_001); let qid2 = test_qid(21_002); @@ -1120,16 +1120,26 @@ mod tests { cache.view(), ); - assert_eq!(scheduler.get_status(&qid1).status, SchedulingStatus::Ready); - assert_eq!(scheduler.get_status(&qid2).status, SchedulingStatus::Ready); + let h_qid1 = cache.view().handle_for(&qid1).unwrap(); + let h_qid2 = cache.view().handle_for(&qid2).unwrap(); - let Poll::Ready(Ok(decision)) = poll_scheduler(Pin::new(&mut scheduler)) else { + assert_eq!( + scheduler.get_status(cache.view(), h_qid1).status, + SchedulingStatus::Ready + ); + assert_eq!( + scheduler.get_status(cache.view(), h_qid2).status, + SchedulingStatus::Ready + ); + + let Poll::Ready(Ok(decision)) = poll_scheduler(Pin::new(&mut scheduler), cache.view()) + else { panic!("expected decision"); }; assert_eq!(decision.total_items(), 3); assert_eq!(decision.num_queues(), 2); - let status = scheduler.get_status(&qid1); + let status = 
scheduler.get_status(cache.view(), h_qid1); assert_eq!( status.status, SchedulingStatus::BlockedOn(BlockedResource::InvokerConcurrency) @@ -1137,7 +1147,7 @@ mod tests { assert_eq!(status.waiting_inbox, 1); assert_eq!(status.remaining_running, 0); - let status = scheduler.get_status(&qid2); + let status = scheduler.get_status(cache.view(), h_qid2); assert_eq!(status.status, SchedulingStatus::Empty); assert_eq!(status.waiting_inbox, 0); assert_eq!(status.remaining_running, 0); @@ -1153,22 +1163,32 @@ mod tests { }) .expect("qid2 run action should exist"); + let metas = cache.view(); + let qid2_slot = metas.get(h_qid2).unwrap(); let mut scheduler = Pin::new(&mut scheduler); - let resources = scheduler.as_mut().confirm_run_attempt(&qid2, &qid2_key); + let resources = scheduler + .as_mut() + .confirm_run_attempt(h_qid2, qid2_slot, &qid2_key); assert!(resources.is_some()); drop(resources); - let Poll::Ready(Ok(decision)) = poll_scheduler(scheduler.as_mut()) else { + let Poll::Ready(Ok(decision)) = poll_scheduler(scheduler.as_mut(), cache.view()) else { panic!("expected decision"); }; assert_eq!(decision.total_items(), 1); assert_eq!(decision.num_queues(), 1); - assert_eq!(scheduler.get_status(&qid1).status, SchedulingStatus::Empty); assert_eq!( - scheduler.get_status(&qid2).status, + scheduler.get_status(cache.view(), h_qid1).status, + SchedulingStatus::Empty + ); + assert_eq!( + scheduler.get_status(cache.view(), h_qid2).status, SchedulingStatus::Dormant ); - assert!(matches!(poll_scheduler(scheduler.as_mut()), Poll::Pending)); + assert!(matches!( + poll_scheduler(scheduler.as_mut(), cache.view()), + Poll::Pending + )); } } diff --git a/crates/vqueues/src/scheduler/eligible.rs b/crates/vqueues/src/scheduler/eligible.rs index dc4d281212..5aaf79840b 100644 --- a/crates/vqueues/src/scheduler/eligible.rs +++ b/crates/vqueues/src/scheduler/eligible.rs @@ -11,8 +11,9 @@ use std::collections::VecDeque; use std::task::Poll; +use 
restate_storage_api::vqueue_table::metadata::VQueueMeta; +use slotmap::SecondaryMap; use slotmap::secondary::Entry; -use slotmap::{SecondaryMap, SlotMap}; use tokio_util::time::{DelayQueue, delay_queue}; use restate_limiter::RuleHandle; @@ -22,9 +23,10 @@ use restate_types::time::MillisSinceEpoch; use restate_util_string::ReString; use restate_worker_api::{ResourceKind, SchedulingStatus}; -use super::VQueueHandle; use super::clock::SchedulerClock; use super::vqueue_state::VQueueState; +use crate::VQueuesMeta; +use crate::cache::VQueueHandle; use crate::scheduler::vqueue_state::Eligibility; #[derive(Debug, Copy, Clone)] @@ -71,14 +73,20 @@ impl EligibilityTracker { self.ready_ring.push_back(handle); } - pub fn get_status(&self, qstate: &VQueueState, resolve_rule: &R) -> SchedulingStatus + pub fn get_status( + &self, + handle: VQueueHandle, + meta: &VQueueMeta, + qstate: &VQueueState, + resolve_rule: &R, + ) -> SchedulingStatus where S: VQueueStore, R: Fn(RuleHandle) -> Option, { - match self.states.get(qstate.handle) { + match self.states.get(handle) { None | Some(State::NeedsPoll) | Some(State::Ready) => { - super::status_from_detailed_eligibility(qstate.check_eligibility()) + super::status_from_detailed_eligibility(qstate.check_eligibility(meta)) } Some(State::Scheduled { wake_up }) => SchedulingStatus::Scheduled { at: wake_up.ts.into(), @@ -110,10 +118,14 @@ impl EligibilityTracker { pub fn next_eligible( &mut self, + cx: &mut std::task::Context<'_>, + metas: VQueuesMeta<'_>, storage: &S, - vqueues: &mut SlotMap>, + vqueues: &mut SecondaryMap>, ) -> Result, StorageError> { - loop { + let n = self.ready_ring.len(); + // avoid rescanning the ready ring multiple rounds + for _ in 0..n { // what is my current status let Some(handle) = self.ready_ring.front().copied() else { return Ok(None); @@ -134,15 +146,17 @@ impl EligibilityTracker { continue; }; + let slot = metas.get(handle).unwrap(); + match current_state { State::NeedsPoll => { // update the state based on 
eligibility. - match qstate.poll_eligibility(storage)?.as_compact() { - Eligibility::Eligible => { + match qstate.poll_eligibility(cx, slot, storage) { + Poll::Ready(Ok(Eligibility::Eligible)) => { *current_state = State::Ready; return Ok(Some(handle)); } - Eligibility::EligibleAt(ts) => { + Poll::Ready(Ok(Eligibility::EligibleAt(ts))) => { let ts = ts.as_unix_millis(); let duration = ts.duration_since(SchedulerClock.now_millis()); let timer_key = self.delayed_eligibility.insert(handle, duration); @@ -152,24 +166,29 @@ impl EligibilityTracker { self.ready_ring.pop_front(); continue; } - Eligibility::NotEligible => { + Poll::Ready(Ok(Eligibility::NotEligible)) => { self.ready_ring.pop_front(); self.remove(handle); continue; } + Poll::Ready(Err(err)) => { + return Err(err); + } + Poll::Pending => { + continue; + } } } - State::BlockedOn(_) => { + State::BlockedOn(_) | State::Scheduled { .. } => { self.ready_ring.pop_front(); } State::Ready => { return Ok(Some(handle)); } - State::Scheduled { .. } => { - self.ready_ring.pop_front(); - } } } + + Ok(None) } pub fn remove(&mut self, handle: VQueueHandle) { @@ -254,8 +273,13 @@ impl EligibilityTracker { /// Returns true if scheduler should be woken up /// /// If the vqueue is blocked on a resource, this function will not touch it. - pub fn refresh_membership(&mut self, vqueue: &VQueueState) -> bool { - let Some(current_state) = self.states.entry(vqueue.handle) else { + pub fn refresh_membership( + &mut self, + handle: VQueueHandle, + meta: &VQueueMeta, + vqueue: &VQueueState, + ) -> bool { + let Some(current_state) = self.states.entry(handle) else { // the vqueue handle was removed from the original slot map. 
return false; }; @@ -263,11 +287,11 @@ impl EligibilityTracker { match current_state { Entry::Vacant(vacant_entry) => { vacant_entry.insert(State::NeedsPoll); - self.ready_ring.push_back(vqueue.handle); + self.ready_ring.push_back(handle); true } Entry::Occupied(mut occupied_entry) => { - let eligibility = vqueue.check_eligibility().as_compact(); + let eligibility = vqueue.check_eligibility(meta).as_compact(); match (occupied_entry.get(), eligibility) { (State::NeedsPoll, _) => { // do nothing. @@ -282,14 +306,14 @@ impl EligibilityTracker { // happens, we err on the safe side and switch to polling. self.delayed_eligibility.remove(&wake_up.timer_key); occupied_entry.insert(State::NeedsPoll); - self.ready_ring.push_back(vqueue.handle); + self.ready_ring.push_back(handle); true } (State::Scheduled { wake_up }, Eligibility::Eligible) => { // wake up now self.delayed_eligibility.remove(&wake_up.timer_key); - self.states.insert(vqueue.handle, State::NeedsPoll); - self.ready_ring.push_back(vqueue.handle); + self.states.insert(handle, State::NeedsPoll); + self.ready_ring.push_back(handle); true } (State::Scheduled { wake_up }, Eligibility::EligibleAt(eligible_at_ts)) => { diff --git a/crates/vqueues/src/scheduler/queue.rs b/crates/vqueues/src/scheduler/queue.rs index f381f3f5ac..53c60184ed 100644 --- a/crates/vqueues/src/scheduler/queue.rs +++ b/crates/vqueues/src/scheduler/queue.rs @@ -8,20 +8,197 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. 
-use restate_storage_api::StorageError; -use restate_storage_api::vqueue_table::{EntryKey, EntryValue, VQueueCursor, VQueueStore}; +use std::collections::VecDeque; +use std::num::NonZeroU32; +use std::pin::Pin; +use std::task::{Poll, ready}; + +use itertools::{EitherOrBoth, Itertools as _}; +use tokio::time::Instant; +use tracing::trace; + +use restate_storage_api::vqueue_table::CursorError; +use restate_storage_api::vqueue_table::{ + EntryKey, EntryValue, VQueueCursor, VQueueRunningCursor, VQueueStore, +}; +use restate_storage_api::{StorageError, vqueue_table}; +use restate_time_util::DurationExt; use restate_types::vqueues::VQueueId; use super::UnconfirmedAssignments; -#[derive(Debug)] -enum Head { - /// We need a seek+read to know the head. - Unknown, - /// The current cursor's head - Known { key: EntryKey, value: EntryValue }, - /// We know that we've reached the end of the vqueue - Empty, +/// The number of entries we are willing to keep in cache +const INBOX_CACHE_CAPACITY: usize = 24; + +enum Overlay { + Add(EntryValue), + Tombstone, +} + +/// In-flight async refill, plus an overlay of `notify_enqueued` / +/// `notify_removed` events that arrived while the task was running. +/// +/// # Storage-write-before-notify invariant +/// +/// The partition processor commits inbox writes/deletes to RocksDB *before* +/// dispatching `EnqueuedToInbox` / `RemovedFromInbox` events to the +/// scheduler. This ordering rule underpins the following overlay reasoning: +/// +/// 1. **Why tombstones are needed at all.** The refill task's RocksDB +/// snapshot is fixed at task-start time. A `notify_removed` that arrives +/// while the task is in flight may correspond to a delete that committed +/// *before* the task started its scan (snapshot still sees the row) or +/// *after* (snapshot doesn't). We can't tell from the notification +/// alone, so we record a tombstone and let `merge_join_by` suppress the +/// row if it shows up in the result. +/// +/// 2. 
**Why `push_added_item` panics on a key it already tombstoned.** A +/// later add of a key we previously tombstoned would imply the +/// notifications arrived out-of-order with respect to storage commits, +/// which the invariant rules out. +/// +/// 3. **Why cancelling an in-flight task discards the overlay safely** (see +/// [`RefillState::update_anchor`]). When cancellation kicks in, every +/// pending notification has already had its storage write committed. +/// The next fresh refill takes a new snapshot and sees post-commit +/// state directly, so the overlay would be redundant. +struct RefillTask { + started_at: Instant, + refill_anchor: Option, + /// Items that are known to be added or removed while the refill was in-flight + /// Sorted ascending by EntryKey. + overlay: VecDeque<(EntryKey, Overlay)>, + handle: tokio::task::JoinHandle, StorageError>>, + /// Exclusive upper bound on the overlay's coverage. `None` until the + /// first at-capacity event; once set, only shrinks. + /// + /// On overflow we set this (exclusive upper bound) instead of + /// evicting from the overlay. Overlay events and merge-time storage rows + /// with `key >= horizon` are then dropped; the next refill rediscovers + /// them via `seek_after(cache.back)` (always below `horizon`). Avoids + /// the silent-drop hazard of back-eviction — a popped tombstone could + /// re-admit a deleted row — at the cost of re-fetching some + /// already-read storage rows. + horizon: Option, +} +impl RefillTask { + /// Capacity/horizon handling shared by the `push_*` methods. Caller resolves + /// duplicates first; `pos` is insertion position. + /// + /// Returns `Some(pos)` to insert there, or `None` if the event was dropped. 
+ fn prepare_overlay_insert(&mut self, key: EntryKey, pos: usize) -> Option { + if self.horizon.is_some_and(|h| key >= h) { + return None; + } + if self.overlay.len() < INBOX_CACHE_CAPACITY { + return Some(pos); + } + // At capacity: shrink the horizon to seal the affected range. + if pos == self.overlay.len() { + // New key sorts past every overlay entry — it becomes the + // horizon itself and is not tracked. + self.horizon = Some(self.horizon.map_or(key, |h| h.min(key))); + None + } else { + // New key displaces the back; the back's key becomes the + // horizon (anything at-or-above it is now uncertain). + let back_key = self + .overlay + .back() + .expect("overlay at capacity must have a back") + .0; + self.horizon = Some(self.horizon.map_or(back_key, |h| h.min(back_key))); + self.overlay.pop_back(); + Some(pos) + } + } + + fn push_tombstone(&mut self, key_to_remove: &EntryKey) { + let pos = match self + .overlay + .binary_search_by_key(key_to_remove, |&(k, _)| k) + { + Ok(pos) => { + // Existing overlay entry → upgrade to Tombstone. + self.overlay.get_mut(pos).unwrap().1 = Overlay::Tombstone; + return; + } + Err(pos) => pos, + }; + if let Some(pos) = self.prepare_overlay_insert(*key_to_remove, pos) { + self.overlay + .insert(pos, (*key_to_remove, Overlay::Tombstone)); + } + } + + fn push_added_item(&mut self, key_to_add: &EntryKey, value: &EntryValue) { + let pos = match self.overlay.binary_search_by_key(key_to_add, |&(k, _)| k) { + Ok(_) => panic!( + "Something went wrong here. 
We should not see duplicate enqueues or an enqueue after a removal of the same key: {key_to_add:?}" + ), + Err(pos) => pos, + }; + if let Some(pos) = self.prepare_overlay_insert(*key_to_add, pos) { + self.overlay + .insert(pos, (*key_to_add, Overlay::Add(value.clone()))); + } + } +} + +#[derive(derive_more::Debug)] +enum RefillState { + #[debug("standby")] + Standby { + /// `seek_after` anchor for the next refill, and the upper bound used to + /// decide whether to accept a `notify_enqueued`. Items currently in the + /// cache, items previously consumed from the cache, and items in the + /// caller's skip set together account for every inbox key + /// `<= refill_anchor`. Keys `> refill_anchor` are undiscovered and will + /// appear on the next refill via `seek_after(refill_anchor)`. + refill_anchor: Option, + }, + #[debug("in-flight (age: {})", _0.started_at.elapsed().friendly())] + InFlight(Box), +} + +impl Default for RefillState { + fn default() -> Self { + Self::Standby { + refill_anchor: None, + } + } +} + +impl RefillState { + /// Updates the standby anchor, or cancels an in-flight task and resets + /// to standby with the given anchor. + /// + /// Dropping an in-flight `RefillTask` here also drops its overlay + /// (pending `Add` / `Tombstone` records). This is safe per the + /// storage-write-before-notify invariant documented on [`RefillTask`]: + /// every notification we have observed (and thus every overlay entry) + /// has its storage write already committed, so the next fresh refill's + /// snapshot reflects them directly without needing the overlay. + fn update_anchor(&mut self, new_anchor: Option) { + match self { + RefillState::Standby { refill_anchor } => { + *refill_anchor = new_anchor; + } + RefillState::InFlight(_) => { + trace!("Refill task was in-flight but no longer needed, cancelling it"); + // Drop the task; the receiver going away signals the worker + // thread to bail. See `RefillTask` doc for why discarding + // the overlay is safe here. 
+ *self = RefillState::Standby { + refill_anchor: new_anchor, + } + } + } + } + + fn is_in_flight(&self) -> bool { + matches!(self, Self::InFlight(_)) + } } #[derive(Debug)] @@ -38,683 +215,639 @@ pub enum QueueItem<'a> { } #[derive(derive_more::Debug)] -pub(crate) enum Reader { - /// Reader was never opened and might need to scan running items - New { already_running: u32 }, +enum Stage { + /// Brand-new queue; running items still need to be drained first. + New { + already_running: NonZeroU32, + reader: Option, + }, + /// In running stage. Single-item head, single-shot reader. #[debug("Running")] Running { - remaining: u32, + head: (EntryKey, EntryValue), reader: S::RunningReader, + remaining: u32, }, - #[debug("Inbox")] - Inbox(S::InboxReader), - // We can transition back to Reader::Inbox if new items have been added to the inbox - // but we should never return to `Running`. - #[debug("Closed")] - Closed, + /// In inbox stage. The queue's `inbox_cache` is the source of truth + /// between refills. + Inbox, + /// Inbox is fully drained. + Empty, } pub(crate) struct Queue { - head: Head, - reader: Reader, + stage: Stage, + /// Backing cache for the inbox stage. + /// Meaningful only when `stage` is `Inbox` or `Empty`; for + /// other stages it must be empty (invariant maintained by `advance` / + /// `remove` / `enqueue`). + /// + /// Sorted ascending by EntryKey. Front = current head. + items: VecDeque<(EntryKey, EntryValue)>, + refill_state: RefillState, } impl Queue { /// Creates a new queue that must first go through the given number of running items /// before it switches to reading the waiting inbox. 
- pub fn new(num_running: u32) -> Self { + pub fn new(num_running: u32, storage: &S, qid: &VQueueId) -> Self { + let stage = if num_running > 0 { + Stage::New { + already_running: NonZeroU32::new(num_running).unwrap(), + reader: Some(storage.new_run_reader(qid)), + } + } else { + Stage::Inbox + }; + Self { - head: Head::Unknown, - reader: Reader::New { - already_running: num_running, - }, + stage, + items: VecDeque::with_capacity(INBOX_CACHE_CAPACITY), + refill_state: Default::default(), } } /// Creates an empty queue pub fn new_closed() -> Self { Self { - head: Head::Empty, - reader: Reader::Closed, + stage: Stage::Empty, + items: VecDeque::with_capacity(INBOX_CACHE_CAPACITY), + refill_state: Default::default(), } } /// If the queue is known to be empty (no more items to dequeue) pub fn is_empty(&self) -> bool { - matches!(self.head, Head::Empty) + matches!(self.stage, Stage::Empty) } + /// Returns `true` iff the removed key was the head of the queue. + /// + /// While the queue is still in the running stage, this is a no-op: the + /// scheduler may still yield a running item after the state machine has + /// declared it removed, and the state machine must ignore that yield. pub fn remove(&mut self, key_to_remove: &EntryKey) -> bool { - // Can this be the known head? - // Yes. Perhaps it expired/ended externally. - // We do not do anything if the reader is still at the running stage, - // - // This means that the scheduler might still yield the "running" item after - // the state machine has declared it as completed/removed. The state machine - // must be able to handle this case and "ignore" the yield command of this item. - if matches!(self.reader, Reader::Closed | Reader::Inbox(..)) - && let Head::Known { ref key, .. } = self.head - && key == key_to_remove - { - self.head = Head::Unknown; - // Ensure that next advance would re-seek to the newly added item - self.reader = Reader::Closed; - true - } else { - false + if matches!( + self.stage, + Stage::New { .. 
} | Stage::Running { .. } | Stage::Empty + ) { + return false; } + + // We are in Inbox/Waiting stage. + let Ok(pos) = self.items.binary_search_by_key(key_to_remove, |&(k, _)| k) else { + // This branch handles when we don't have the item in cache. + // + // The removed item can be: + // - Cached + // - Unconfirmed (not in cache) || < refill anchor + // - > refill anchor + // - We have in-flight refill + // - No in-inflight refills + // + // Scenarios: + // - We have empty cache (or does it matter?), maybe what matters is if we have it in cache or not. + // - We have waiting inbox (inbox > 0) + // - We have waiting inbox but with unconfirmed items in flights, we even out at zero. + // which means that we should not see any removals outside what we have already + // decided to start. We are effectively (empty). + // - No inflight refill yet. Ignore. + // - In flight refill exists. --> Maybe tombstone it. + // - We have cached, no in-flight refill. + // - Remove if in cache, otherwise, ignore. + // - We have cached items, removal is < the refill anchor, we don't have it in cache. Ignore. + // - We have cached items, but the removal is > the refill anchor --> Maybe tombstone it. + // + // Removed item is beyond the refill anchor + match self.refill_state { + RefillState::Standby { .. } => {} + RefillState::InFlight(ref mut task) + if task + .refill_anchor + .is_none_or(|ref refill| key_to_remove > refill) => + { + // The refill task is in flight. We cannot determine if the key will impact + // its result or not, we'll keep at most INBOX_CACHE_CAPACITY tombstones to + // use as overlay when the refill is complete. + task.push_tombstone(key_to_remove); + } + RefillState::InFlight(_) => { + // Ignore it. This item has either been shipped (unconfirmed assignment) + // or was never in the inbox to start with. 
+ } + } + + return false; + }; + self.items.remove(pos); + // The anchor stays put — even if we removed the back, the entry was + // also removed from storage, so `seek_after(anchor)` will skip it + // naturally on the next refill. + pos == 0 } - /// Returns true if the head was changed + /// Returns `true` iff the new item became the head of the queue. pub fn enqueue(&mut self, key: &EntryKey, value: &EntryValue) -> bool { - match (&self.head, &self.reader) { - // we are only unknown if we are new and didn't read the running list yet, - // we might also be in a limbo state if advance() failed. - (_, Reader::New { .. } | Reader::Running { .. }) => { /* do nothing */ } - (Head::Unknown, _) => { /* do nothing */ } - (Head::Empty, _) => { - self.reader = Reader::Closed; - self.head = Head::Known { - key: *key, - value: value.clone(), - }; + match self.stage { + Stage::New { .. } | Stage::Running { .. } => return false, + Stage::Empty => { + // The cache is already allocated and empty (kept around + // across the previous `Inbox -> Empty` transition). Just + // re-seed it and flip the marker. + assert!(self.items.is_empty()); + assert!(!self.refill_state.is_in_flight()); + self.items.push_back((*key, value.clone())); + self.refill_state.update_anchor(Some(*key)); + self.stage = Stage::Inbox; return true; } - ( - Head::Known { - key: current_key, .. - }, - Reader::Inbox(_) | Reader::Closed, - ) => { - if key < current_key { - self.head = Head::Known { - key: *key, - value: value.clone(), - }; - // Ensure that next advance would re-seek to the newly added item - self.reader = Reader::Closed; - return true; - } else { - // This is a temporary fix to ensure that we perform a re-seek - // to fix the issue where the iterator wouldn't see the newly added - // items if the memtable was flushed prior the seek. - self.reader = Reader::Closed; - } + Stage::Inbox => { /* fall-through */ } + } + + // Insert `(key, value)` into the sorted cache. 
+ // + // Returns `true` iff the item became the new front (head). Returns + // `false` if the item was rejected because its key falls strictly above + // the cache's coverage zone (`> refill_anchor`); in that case the item + // will be discovered on the next refill. + + // Priority-queue rule: ignore items strictly above our coverage zone. + // The bound is `refill_anchor`, NOT `cache.back`. After consuming the + // back of the cache, `cache.back` can be lower than `refill_anchor`, + // and items in `(cache.back, refill_anchor]` would be lost (the next + // refill seek_after(refill_anchor) only returns keys strictly greater + // than the anchor). + match self.refill_state { + RefillState::Standby { refill_anchor, .. } + if refill_anchor.is_none_or(|ref refill| key > refill) => + { + // We cannot accept items beyond our coverage zone. + return false; + } + RefillState::Standby { .. } => { /* in coverage zone */ } + // This enqueue may or may not appear in the in-flight operation and + // there is no way to determine that. So, we stage it on the in-flight + // task overlay until we receive the result. + RefillState::InFlight(ref mut task) + if task.refill_anchor.is_none_or(|ref refill| key > refill) => + { + task.push_added_item(key, value); + return false; + } + RefillState::InFlight(ref task) => { + // the new item is < than what the refill task is interested in. Therefore, + // we add it right now. + assert!(task.refill_anchor.is_some_and(|ref refill| key <= refill)); } } - false - } - /// Returns the head if known, or None if the queue needs advancing - pub fn head(&self) -> Option> { - match (&self.head, &self.reader) { - (Head::Unknown, _) => None, - (_, Reader::New { .. }) => None, - (Head::Known { key, value }, Reader::Running { .. 
}) => { - Some(QueueItem::Running { key, value }) + let pos = match self.items.binary_search_by_key(key, |&(k, _)| k) { + Ok(_) => { + // We already know about this key which means that the head has not changed + // as a result of this enqueue. + return false; } - (Head::Known { key, value }, Reader::Inbox(_) | Reader::Closed) => { - Some(QueueItem::Inbox { key, value }) + Err(pos) => pos, + }; + + // We need to be careful about moving the refill anchor if there is an + // in-flight refill task. + // + // let's say we have a refill task in flight, and we are enqueueing smaller + // than it's anchor point, we are confident that the task will never return + // this item. If as a result of the enqueue we exceeded the cache capacity. + // + // If there was an in-flight refill, we must cancel it and reset our refill + // anchor to point to the back of the cache so that the next refill can + // re-discover it. This is an acceptable trade-off because we must be here + // because we have sufficiently populated the cache with recently enqueued + // items and we wouldn't need the refill immediately anyway. + + // At-cap fast path: avoid the VecDeque grow/shrink that would happen + // if we inserted first and evicted afterwards. + if self.items.len() >= INBOX_CACHE_CAPACITY { + if pos == self.items.len() { + // The new key would land at the back and be evicted on the + // very next step. Skip the insert, but still lower the anchor + // to the current back so the next refill can re-discover `key` + // via `seek_after(anchor)`. + // + // Cancels the in-flight task if exists + self.refill_state + .update_anchor(self.items.back().map(|(k, _)| *k)); + return false; } - (Head::Empty, _) => Some(QueueItem::None), + // Make room first; `pos` is unaffected because pos < old_len in + // this branch (we shift elements right of `pos` regardless). 
+ debug_assert!(pos < self.items.len()); + self.items.pop_back(); + self.items.insert(pos, (*key, value.clone())); + + // Cancels the in-flight task if exists + self.refill_state + .update_anchor(self.items.back().map(|(k, _)| *k)); + return pos == 0; } + + // Normal path: cache below capacity. + self.items.insert(pos, (*key, value.clone())); + // Anchor maintenance: if it was None (very first insert), set it to + // this key. Otherwise the precondition above guarantees + // `key <= anchor`, so the anchor stays put. + if let RefillState::Standby { refill_anchor } = &mut self.refill_state + && refill_anchor.is_none() + { + *refill_anchor = Some(*key); + } + pos == 0 } - pub fn advance_if_needed( + /// Returns the head if known, or `None` if the queue needs advancing. + pub fn head(&self) -> Option> { + match &self.stage { + Stage::New { .. } => None, + Stage::Running { head: (k, v), .. } => Some(QueueItem::Running { key: k, value: v }), + Stage::Inbox => self + .items + .front() + .map(|(k, v)| QueueItem::Inbox { key: k, value: v }), + Stage::Empty => Some(QueueItem::None), + } + } + + pub fn poll_advance_if_needed( &mut self, + cx: &mut std::task::Context<'_>, storage: &S, skip: &UnconfirmedAssignments, qid: &VQueueId, - ) -> Result, StorageError> { - // Keep advancing until the head is known - while matches!(self.head, Head::Unknown) { - self.advance(storage, skip, qid)?; - } - - match (&self.head, &self.reader) { - (Head::Unknown, _) => unreachable!("head must be known"), - (_, Reader::New { .. }) => unreachable!("reader cannot be new after poll"), - (Head::Known { key, value }, Reader::Running { .. }) => { - Ok(QueueItem::Running { key, value }) + effectively_empty: bool, + allow_blocking_io: bool, + ) -> Poll, StorageError>> { + loop { + let needs_advance = match self.stage { + Stage::New { .. 
} => true, + Stage::Inbox => self.items.is_empty(), + _ => false, + }; + if !needs_advance { + break; } - (Head::Known { key, value }, Reader::Inbox(_) | Reader::Closed) => { - Ok(QueueItem::Inbox { key, value }) + if !self.try_advance()? { + // We cannot advance without a refill + if self.refill_state.is_in_flight() { + ready!(self.poll_refill_task(cx, skip)); + // If we ended up also being empty and we can't determine if we are + // empty or not, we should start another refill task and return Pending. + // This happens automatically because in that case we'll "continue" + } else { + // Do we need to refill? + // We don't need to refill if we are at Inbox stage and we know no more + // inbox entries are available. + if effectively_empty && matches!(self.stage, Stage::Inbox) { + self.stage = Stage::Empty; + break; + } + // A) try refill immediate refill if allowed + match self.try_refill(storage, qid, skip, allow_blocking_io) { + Ok(_) => {} + Err(CursorError::WouldBlock) => { + // B) start an async refill task + self.start_refill_task(storage, qid); + } + Err(CursorError::Other(e)) => { + // C) fail miserably + return Poll::Ready(Err(e)); + } + } + } } - (Head::Empty, _) => Ok(QueueItem::None), } + + Poll::Ready(Ok(match &self.stage { + Stage::New { .. } => unreachable!("head must be resolved after advance_if_needed"), + Stage::Running { head: (k, v), .. } => QueueItem::Running { key: k, value: v }, + Stage::Inbox => { + let (k, v) = self + .items + .front() + .expect("inbox cache must have a head after advance_if_needed"); + QueueItem::Inbox { key: k, value: v } + } + Stage::Empty => QueueItem::None, + })) } /// Advances the queue to the next item. /// - /// The queue reader will skip over items in `skip` when reading the inbox stage. When reading - /// the running stage, the `skip` set is ignored. 
- pub fn advance( - &mut self, - storage: &S, - skip: &UnconfirmedAssignments, - qid: &VQueueId, - ) -> Result<(), StorageError> { + /// In the inbox stage this consumes the current head (if any) and exposes + /// the next cached item; the cache is refilled from storage in batches of + /// up to [`INBOX_CACHE_CAPACITY`] when it empties. The `skip` set is + /// consulted only when reading the inbox stage; it is ignored when + /// reading the running stage. + /// + /// + /// Returns false if advance was not possible and we need to perform a refill. + pub fn try_advance(&mut self) -> Result { + // Split into disjoint borrows so the `Stage::Inbox` arm below can + // mutate both `stage` and `inbox_cache` without fighting the + // borrow checker. + let Self { stage, items, .. } = self; loop { - match self.reader { - Reader::New { already_running } if already_running > 0 => { - let mut reader = storage.new_run_reader(qid); + match stage { + Stage::New { + already_running, + reader, + } => { + let mut reader = reader.take().unwrap(); reader.seek_to_first(); - let item = reader.peek()?; - if let Some((key, value)) = item { - self.head = Head::Known { key, value }; - self.reader = Reader::Running { - remaining: already_running, + if let Some((key, value)) = reader.peek()? { + *stage = Stage::Running { + head: (key, value), reader, + remaining: already_running.get(), }; - break; - } else { - assert!( - already_running > 0, - "vqueue {qid:?} has no running items but its metadata says that it has {already_running} running items", - ); - // move to inbox reading - self.head = Head::Unknown; - self.reader = Reader::Closed; + return Ok(true); } + debug_assert!( + false, + "vqueue has no running items but its metadata says it has {}", + already_running.get(), + ); + *stage = Stage::Inbox; } - Reader::New { .. 
} => { - // create new inbox reader - self.reader = Reader::Closed; - } - Reader::Running { - ref mut reader, - ref mut remaining, + Stage::Running { + reader, + remaining, + head, } => { reader.advance(); *remaining = remaining.saturating_sub(1); - let item = reader.peek()?; - if let Some((key, value)) = item { - debug_assert!(*remaining > 0); - self.head = Head::Known { key, value }; - break; - } else { - debug_assert_eq!(0, *remaining); - // move to inbox reading - self.head = Head::Unknown; - self.reader = Reader::Closed; - } - } - Reader::Inbox(ref mut reader) => { - reader.advance(); - let key = reader.current_key()?; - if let Some(key) = key { - if skip.contains_key(&key) { - continue; - } - self.head = Head::Known { - key, - value: reader.current_value()?.unwrap(), - }; - break; - } else { - // we are done reading inbox - self.head = Head::Empty; - self.reader = Reader::Closed; - break; - } - } - Reader::Closed => { - match self.head { - Head::Unknown => { - let mut reader = storage.new_inbox_reader(qid); - reader.seek_to_first(); - let key = reader.current_key()?; - if let Some(key) = key { - if skip.contains_key(&key) { - self.reader = Reader::Inbox(reader); - continue; - } - self.head = Head::Known { - key, - value: reader.current_value()?.unwrap(), - }; - self.reader = Reader::Inbox(reader); - break; - } else { - self.head = Head::Empty; - self.reader = Reader::Closed; - } - } - Head::Known { ref key, .. } => { - // seek to known head first, then advance. - let mut reader = storage.new_inbox_reader(qid); - reader.seek_after(qid, key); - let next_key = reader.current_key()?; - if let Some(next_key) = next_key { - if skip.contains_key(&next_key) { - self.reader = Reader::Inbox(reader); - continue; - } - self.head = Head::Known { - key: next_key, - value: reader.current_value()?.unwrap(), - }; - self.reader = Reader::Inbox(reader); - break; - } else { - self.head = Head::Empty; - self.reader = Reader::Closed; - } + match reader.peek()? 
{ + Some(next) => { + debug_assert!(*remaining > 0); + *head = next; + return Ok(true); } - Head::Empty => { - // do nothing. - return Ok(()); + None => { + debug_assert_eq!(0, *remaining); + *stage = Stage::Inbox; } } } + Stage::Inbox => return Ok(items.pop_front().is_some()), + Stage::Empty => return Ok(true), } } - Ok(()) } - pub(crate) fn remaining_in_running_stage(&self) -> u32 { - match self.reader { - Reader::New { already_running } => already_running, - Reader::Running { remaining, .. } => remaining, - Reader::Inbox(..) => 0, - Reader::Closed => 0, - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - use restate_clock::RoughTimestamp; - use restate_clock::time::MillisSinceEpoch; - use restate_core::TaskCenter; - use restate_partition_store::{PartitionDb, PartitionStore, PartitionStoreManager}; - use restate_rocksdb::RocksDbManager; - use restate_storage_api::Transaction; - use restate_storage_api::vqueue_table::{ - EntryId, EntryKey, EntryKind, EntryMetadata, EntryValue, Stage, Status, WriteVQueueTable, - stats::EntryStatistics, - }; - use restate_types::clock::UniqueTimestamp; - use restate_types::identifiers::{PartitionId, PartitionKey}; - use restate_types::partitions::Partition; - use restate_types::sharding::KeyRange; - use restate_types::vqueues::VQueueId; - - const BASE_RUN_AT_MS: u64 = 1_744_000_000_000; - - fn test_entry_at(id: u8, has_lock: bool, run_at: MillisSinceEpoch) -> (EntryKey, EntryValue) { - let run_at = RoughTimestamp::from_unix_millis_clamped(run_at); - let created_at = UniqueTimestamp::try_from(1_000u64 + id as u64).unwrap(); - let entry_id = EntryId::new(EntryKind::Invocation, [id; EntryId::REMAINDER_LEN]); - let key = EntryKey::new(has_lock, run_at, id as u64, entry_id); - let stats = EntryStatistics::new(created_at, run_at); - let value = EntryValue { - status: if stats.first_runnable_at > created_at.to_unix_millis() { - Status::Scheduled - } else { - Status::New - }, - metadata: EntryMetadata::default(), - stats, + /// 
Refills `cache` with up to [`INBOX_CACHE_CAPACITY`] items from storage if + /// it's possible to do so without blocking the thread. + /// + /// Starting at `cache.refill_anchor` (or the very first key if no anchor is + /// set yet). Returns true if items were added. Items in the `skip` set are + /// not added to the cache, but the anchor advances past them so they are not + /// reconsidered on the next refill. + fn try_refill( + &mut self, + storage: &S, + qid: &VQueueId, + skip: &UnconfirmedAssignments, + allow_blocking_io: bool, + ) -> Result<(), CursorError> { + let start = Instant::now(); + let mut reader = storage.new_inbox_reader(qid, vqueue_table::Options { allow_blocking_io }); + let RefillState::Standby { refill_anchor } = &mut self.refill_state else { + panic!("refill state must be standby"); }; - (key, value) - } - - fn test_entry(id: u8, has_lock: bool, run_at_ms: u64) -> (EntryKey, EntryValue) { - test_entry_at( - id, - has_lock, - MillisSinceEpoch::new(BASE_RUN_AT_MS + run_at_ms), - ) - } - - fn default_entry(id: u8) -> (EntryKey, EntryValue) { - test_entry(id, false, 0) - } - - fn test_qid(partition_key: u64) -> VQueueId { - VQueueId::custom(partition_key, "1") - } - - async fn storage_test_environment() -> PartitionStore { - let rocksdb_manager = RocksDbManager::init(); - TaskCenter::set_on_shutdown(Box::pin(async { - rocksdb_manager.shutdown().await; - })); - - let manager = PartitionStoreManager::create() - .await - .expect("DB storage creation succeeds"); - manager - .open( - &Partition::new(PartitionId::MIN, KeyRange::new(0, PartitionKey::MAX - 1)), - None, - ) - .await - .expect("DB storage creation succeeds") - } - - async fn insert_entries( - rocksdb: &mut PartitionStore, - qid: &VQueueId, - stage: Stage, - entries: &[(EntryKey, EntryValue)], - ) { - let mut txn = rocksdb.transaction(); - for (key, value) in entries { - txn.put_vqueue_inbox(qid, stage, key, value); + match refill_anchor { + Some(anchor) => reader.seek_after(anchor), + None => 
reader.seek_to_first(), } - txn.commit().await.expect("commit should succeed"); - } - #[restate_core::test] - async fn test_queue_running_to_inbox_to_empty() { - let mut rocksdb = storage_test_environment().await; - - let qid = test_qid(1000); - let running_entry = default_entry(1); - let inbox_entry = default_entry(2); - - insert_entries( - &mut rocksdb, - &qid, - Stage::Running, - std::slice::from_ref(&running_entry), - ) - .await; - insert_entries( - &mut rocksdb, - &qid, - Stage::Inbox, - std::slice::from_ref(&inbox_entry), - ) - .await; - - let db = rocksdb.partition_db(); - let mut queue: Queue = Queue::new(1); - let mut skip = UnconfirmedAssignments::new(); - - assert!(!queue.is_empty()); - - let head = queue.advance_if_needed(db, &skip, &qid).unwrap(); - assert!(matches!(head, QueueItem::Running { key, .. } if *key == running_entry.0)); - assert!( - matches!(queue.head(), Some(QueueItem::Running { key, .. }) if *key == running_entry.0) - ); + while self.items.len() < INBOX_CACHE_CAPACITY { + let Some((key, value)) = reader.peek()? else { + if self.items.is_empty() { + self.stage = Stage::Empty; + } + break; + }; - queue.advance(db, &skip, &qid).unwrap(); - assert!( - matches!(queue.head(), Some(QueueItem::Inbox { key, .. }) if *key == inbox_entry.0) + if !skip.contains_key(&key) { + self.items.push_back((key, value)); + } + *refill_anchor = Some(refill_anchor.map_or(key, |a| a.max(key))); + reader.advance(); + } + tracing::debug!( + "non-blocking refill finished in {}, cache has {} items", + start.elapsed().friendly(), + self.items.len() ); - let Some(QueueItem::Inbox { key, .. 
}) = queue.head() else { - panic!("expected inbox head"); - }; - skip.insert(*key, Default::default()); - assert!(!queue.is_empty()); - - queue.advance(db, &skip, &qid).unwrap(); - assert!(queue.is_empty()); - - let higher = default_entry(0); - assert!(queue.enqueue(&higher.0, &higher.1)); - let head = queue.advance_if_needed(db, &skip, &qid).unwrap(); - assert!(matches!(head, QueueItem::Inbox { key, .. } if *key == higher.0)); + Ok(()) } - #[restate_core::test] - async fn test_entry_key_ordering() { - let mut rocksdb = storage_test_environment().await; - - let qid = test_qid(2000); - let low = test_entry(1, false, 3_000); - let high = test_entry(2, false, 2_000); - let highest = test_entry(3, true, 9_000); - - insert_entries( - &mut rocksdb, - &qid, - Stage::Inbox, - &[low.clone(), high.clone(), highest.clone()], - ) - .await; - - let db = rocksdb.partition_db(); - let mut queue: Queue = Queue::new(0); - let skip = UnconfirmedAssignments::new(); - - queue.advance(db, &skip, &qid).unwrap(); - assert!(matches!(queue.head(), Some(QueueItem::Inbox { key, .. }) if *key == highest.0)); - - queue.advance(db, &skip, &qid).unwrap(); - assert!(matches!(queue.head(), Some(QueueItem::Inbox { key, .. }) if *key == high.0)); - - queue.advance(db, &skip, &qid).unwrap(); - assert!(matches!(queue.head(), Some(QueueItem::Inbox { key, .. 
}) if *key == low.0)); - } + fn start_refill_task(&mut self, storage: &S, qid: &VQueueId) { + assert!(!self.refill_state.is_in_flight()); + let RefillState::Standby { refill_anchor } = self.refill_state else { + panic!("refill state must be standby"); + }; - #[restate_core::test] - async fn test_run_at_below_now_bumps_entry_higher_in_inbox() { - let mut rocksdb = storage_test_environment().await; - - let qid = test_qid(2_500); - let now = MillisSinceEpoch::now().as_u64(); - let future_entry = - test_entry_at(1, false, MillisSinceEpoch::new(now.saturating_add(60_000))); - let overdue_entry = - test_entry_at(2, false, MillisSinceEpoch::new(now.saturating_sub(1_000))); - - insert_entries( - &mut rocksdb, - &qid, - Stage::Inbox, - &[future_entry.clone(), overdue_entry.clone()], - ) - .await; - - let db = rocksdb.partition_db(); - let mut queue: Queue = Queue::new(0); - let skip = UnconfirmedAssignments::new(); - - queue.advance(db, &skip, &qid).unwrap(); - assert!( - matches!(queue.head(), Some(QueueItem::Inbox { key, .. }) if *key == overdue_entry.0) + let mut reader = storage.new_inbox_reader( + qid, + vqueue_table::Options { + allow_blocking_io: true, + }, ); - queue.advance(db, &skip, &qid).unwrap(); - assert!( - matches!(queue.head(), Some(QueueItem::Inbox { key, .. 
}) if *key == future_entry.0) - ); - } + let handle = tokio::task::spawn_blocking(move || { + // collect and send the results at the end + let mut results = Vec::with_capacity(INBOX_CACHE_CAPACITY); - #[restate_core::test] - async fn test_enqueue_replaces_head_on_smaller_key() { - let mut rocksdb = storage_test_environment().await; - - let qid = test_qid(3000); - let initial = test_entry(1, false, 3_000); - - insert_entries( - &mut rocksdb, - &qid, - Stage::Inbox, - std::slice::from_ref(&initial), - ) - .await; - - let db = rocksdb.partition_db(); - let mut queue: Queue = Queue::new(0); - let skip = UnconfirmedAssignments::new(); - - queue.advance(db, &skip, &qid).unwrap(); - assert!(matches!(queue.head(), Some(QueueItem::Inbox { key, .. }) if *key == initial.0)); - - let higher = test_entry(2, false, 2_000); - assert!(queue.enqueue(&higher.0, &higher.1)); - assert!(matches!(queue.head(), Some(QueueItem::Inbox { key, .. }) if *key == higher.0)); - assert!( - matches!(queue.advance_if_needed(db, &skip, &qid).unwrap(), QueueItem::Inbox { key, .. } if *key == higher.0) - ); + match refill_anchor { + Some(anchor) => reader.seek_after(&anchor), + None => reader.seek_to_first(), + } - let lower = test_entry(3, false, 4_000); - assert!(!queue.enqueue(&lower.0, &lower.1)); - assert!(matches!(queue.head(), Some(QueueItem::Inbox { key, .. 
}) if *key == higher.0)); + while results.len() < INBOX_CACHE_CAPACITY { + // In this mode, we don't expect to see WouldBlock + match reader.peek() { + Ok(Some((key, value))) => { + results.push((key, value)); + reader.advance(); + } + Ok(None) => { + // no more items + break; + } + Err(CursorError::WouldBlock) => { + unreachable!("background refill task should never see WouldBlock"); + } + Err(CursorError::Other(e)) => { + tracing::error!("refill task failed: {e}"); + return Err(e); + } + } + } + Ok(results) + }); + + let task = Box::new(RefillTask { + started_at: Instant::now(), + refill_anchor, + horizon: None, + overlay: VecDeque::with_capacity(INBOX_CACHE_CAPACITY), + handle, + }); + + self.refill_state = RefillState::InFlight(task); } - #[restate_core::test] - async fn test_remove() { - let mut rocksdb = storage_test_environment().await; - - let qid = test_qid(4000); - let entry = default_entry(1); - - insert_entries( - &mut rocksdb, - &qid, - Stage::Inbox, - std::slice::from_ref(&entry), - ) - .await; - - let db = rocksdb.partition_db(); - let mut queue: Queue = Queue::new(0); - let mut skip = UnconfirmedAssignments::new(); - - queue.advance(db, &skip, &qid).unwrap(); - let head_key = match queue.head() { - Some(QueueItem::Inbox { key, .. }) => *key, - _ => panic!("expected inbox head"), + fn poll_refill_task( + &mut self, + cx: &mut std::task::Context<'_>, + skip: &UnconfirmedAssignments, + ) -> Poll<()> { + let (items, overlay, mut refill_anchor, horizon) = match self.refill_state { + RefillState::Standby { .. 
} => return Poll::Ready(()), + RefillState::InFlight(ref mut task) => { + match ready!(Pin::new(&mut task.handle).poll(cx)) { + Err(join_err) => { + tracing::error!("refill task panicked: {join_err}"); + self.refill_state = RefillState::Standby { + refill_anchor: task.refill_anchor.take(), + }; + return Poll::Ready(()); + } + Ok(Err(err)) => { + tracing::error!("refill task failed: {err}"); + self.refill_state = RefillState::Standby { + refill_anchor: task.refill_anchor.take(), + }; + return Poll::Ready(()); + } + Ok(Ok(result)) => { + tracing::debug!( + "refill task finished with {} items in {}", + result.len(), + task.started_at.elapsed().friendly() + ); + ( + result, + std::mem::take(&mut task.overlay), + task.refill_anchor.take(), + task.horizon.take(), + ) + } + } + } }; + // If we got less items than what we asked, then this must have been the end of the queue + // at the time of the refill. + let end_of_queue = items.len() < INBOX_CACHE_CAPACITY; - assert!(!queue.remove(&default_entry(99).0)); - assert!(queue.remove(&head_key)); - assert!(queue.head().is_none()); - assert!( - matches!(queue.advance_if_needed(db, &skip, &qid).unwrap(), QueueItem::Inbox { key, .. } if *key == entry.0) - ); + // We now need to merge the items we received, apply the overlays, and deduplicate + // with existing entries in cache. + // At the end, we figure out what's the next anchor to use for the next refill. + // + // + // The strategy here is more complex that the blocking/inline version because + // concurrency is hard, who knew! + // We need to do all that while keeping the cache capacity in check. We don't + // want to re-allocate/resize the cache. + // + // We navigate both the overlays and the items in together in semi-lockstep. + // Technically, this is a LSM-style read/compaction algorithm. 
+ for either in items + .into_iter() + .merge_join_by(overlay, |(key, _), (overlay_key, _)| key.cmp(overlay_key)) + { + // First key at-or-above the horizon ends the merge: overlay + // entries are all below it by construction, and the merge + // walks ascending, so the rest is storage that the next + // refill will rediscover via `seek_after`. + let key = match &either { + EitherOrBoth::Left((k, _)) + | EitherOrBoth::Right((k, _)) + | EitherOrBoth::Both((k, _), _) => *k, + }; + if horizon.is_some_and(|h| key >= h) { + break; + } + // Left is the item from db, right is the overlay + match either { + // Somewhat similar to a normal enqueue + EitherOrBoth::Left((key, value)) + | EitherOrBoth::Right((key, Overlay::Add(value))) + // overlay always wins. + | EitherOrBoth::Both((key, _), (_, Overlay::Add(value))) => { + if skip.contains_key(&key) { + // The key was already dispatched, skip it. + refill_anchor = Some(refill_anchor.map_or(key, |a| a.max(key))); + continue; + } + // Insert sorted in cache and ignore it if we already have it. + // If this item pushes us over the cache capcity, then we ignore it and reset + // the refill anchor to it. + let pos = match self.items.binary_search_by_key(&key, |&(k, _)| k) { + Ok(_) => continue, + Err(pos) => pos, + }; + if self.items.len() >= INBOX_CACHE_CAPACITY { + if pos == self.items.len() { + // beyond capacity, ignore the item. Reset the anchor. + refill_anchor = self.items.back().map(|(k, _)| *k); + break; + } + // Make room first; + self.items.pop_back(); + self.items.insert(pos, (key, value)); + refill_anchor = self.items.back().map(|(k, _)| *k); + break; + } - skip.insert(entry.0, Default::default()); - assert!(queue.remove(&head_key)); - assert!(queue.head().is_none()); - assert!(matches!( - queue.advance_if_needed(db, &skip, &qid).unwrap(), - QueueItem::None - )); - } + // Normal path: cache below capacity. 
+ self.items.insert(pos, (key, value)); + refill_anchor = Some(key); + } + EitherOrBoth::Right((key, Overlay::Tombstone)) + // overlay always wins. + | EitherOrBoth::Both((key, _), (_, Overlay::Tombstone)) => { + // In theory, we should never see a tombstone that impacts the existing + // cache. + debug_assert!(self.items.binary_search_by_key(&key, |&(k, _)| k).is_err()); + // we should push the anchor to this item. + refill_anchor = Some(key); + } + } + } - #[restate_core::test] - async fn test_skip_set() { - let mut rocksdb = storage_test_environment().await; - - let qid = test_qid(5000); - let entry1 = default_entry(1); - let entry2 = default_entry(2); - let entry3 = default_entry(3); - - insert_entries( - &mut rocksdb, - &qid, - Stage::Inbox, - &[entry1.clone(), entry2.clone(), entry3.clone()], - ) - .await; - - let db = rocksdb.partition_db(); - let mut queue: Queue = Queue::new(0); - let mut skip = UnconfirmedAssignments::new(); - skip.insert(entry1.0, Default::default()); - skip.insert(entry2.0, Default::default()); - - queue.advance(db, &skip, &qid).unwrap(); - assert!(matches!(queue.head(), Some(QueueItem::Inbox { key, .. 
}) if *key == entry3.0)); - } + // It's very important is that we must reset the task to standby + self.refill_state = RefillState::Standby { + refill_anchor: refill_anchor.or(self.items.back().map(|(k, _)| *k)), + }; - #[restate_core::test] - async fn test_running_before_inbox_regardless_of_key() { - let mut rocksdb = storage_test_environment().await; - - let qid = test_qid(6000); - let running_entry = default_entry(1); - let inbox_entry = test_entry(2, true, 0); - - insert_entries( - &mut rocksdb, - &qid, - Stage::Running, - std::slice::from_ref(&running_entry), - ) - .await; - insert_entries( - &mut rocksdb, - &qid, - Stage::Inbox, - std::slice::from_ref(&inbox_entry), - ) - .await; - - let db = rocksdb.partition_db(); - let mut queue: Queue = Queue::new(1); - let skip = UnconfirmedAssignments::new(); - - queue.advance(db, &skip, &qid).unwrap(); - assert!(matches!(queue.head(), Some(QueueItem::Running { .. }))); - - queue.advance(db, &skip, &qid).unwrap(); - assert!(matches!(queue.head(), Some(QueueItem::Inbox { .. }))); - } + // at the end, if the cache is empty and end_of_queue is true, then we must + // have exhausted the inbox. + if self.items.is_empty() && end_of_queue { + self.stage = Stage::Empty; + } - #[restate_core::test] - async fn test_enqueue_and_remove_ignored_during_running_stage() { - let mut rocksdb = storage_test_environment().await; - - let qid = test_qid(7000); - let running1 = default_entry(1); - let running2 = default_entry(2); - let inbox_entry = test_entry(10, true, 0); - - insert_entries( - &mut rocksdb, - &qid, - Stage::Running, - &[running1.clone(), running2.clone()], - ) - .await; - insert_entries( - &mut rocksdb, - &qid, - Stage::Inbox, - std::slice::from_ref(&inbox_entry), - ) - .await; - - let db = rocksdb.partition_db(); - let mut queue: Queue = Queue::new(2); - let skip = UnconfirmedAssignments::new(); - - queue.advance(db, &skip, &qid).unwrap(); - assert!(matches!(queue.head(), Some(QueueItem::Running { key, .. 
}) if *key == running1.0)); - - let even_higher = test_entry(11, true, 0); - assert!(!queue.enqueue(&even_higher.0, &even_higher.1)); - assert!(matches!(queue.head(), Some(QueueItem::Running { key, .. }) if *key == running1.0)); - - assert!(!queue.remove(&running2.0)); - assert!(matches!(queue.head(), Some(QueueItem::Running { key, .. }) if *key == running1.0)); - - assert!(!queue.remove(&running1.0)); - assert!(matches!(queue.head(), Some(QueueItem::Running { key, .. }) if *key == running1.0)); - - queue.advance(db, &skip, &qid).unwrap(); - assert!(matches!(queue.head(), Some(QueueItem::Running { key, .. }) if *key == running2.0)); - - queue.advance(db, &skip, &qid).unwrap(); - assert!( - matches!(queue.head(), Some(QueueItem::Inbox { key, .. }) if *key == inbox_entry.0) - ); + Poll::Ready(()) + } - assert!(queue.remove(&inbox_entry.0)); - assert!(queue.head().is_none()); + pub(crate) fn remaining_in_running_stage(&self) -> u32 { + match &self.stage { + Stage::New { + already_running, .. + } => already_running.get(), + Stage::Running { remaining, .. } => *remaining, + Stage::Inbox | Stage::Empty => 0, + } } } + +#[cfg(test)] +#[path = "queue_test.rs"] +mod queue_test; diff --git a/crates/vqueues/src/scheduler/queue_meta.rs b/crates/vqueues/src/scheduler/queue_meta.rs deleted file mode 100644 index 712773166a..0000000000 --- a/crates/vqueues/src/scheduler/queue_meta.rs +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright (c) 2023 - 2026 Restate Software, Inc., Restate GmbH. -// All rights reserved. -// -// Use of this software is governed by the Business Source License -// included in the LICENSE file. -// -// As of the Change Date specified in that file, in accordance with -// the Business Source License, use of this software will be governed -// by the Apache License, Version 2.0. 
- -use metrics::counter; - -use restate_limiter::LimitKey; -use restate_storage_api::vqueue_table::metadata::VQueueMeta; -use restate_storage_api::vqueue_table::{EntryKey, EntryValue}; -use restate_types::{LockName, Scope}; -use restate_util_string::ReString; - -use crate::metric_definitions::VQUEUE_ENQUEUE; - -/// A smaller version of the vqueue metadata that only contains the information needed for the -/// scheduler to operate. -#[derive(Debug, Clone)] -pub struct VQueueMetaLite { - /// if true, the vqueue is paused, we don't pop entries from it until it's resumed. - queue_is_paused: bool, - - /// the length of the inbox stage in the vqueue - num_waiting: usize, - scope: Option, - limit_key: LimitKey, - lock_name: Option, -} - -impl VQueueMetaLite { - pub fn new(original: &VQueueMeta) -> Self { - Self { - queue_is_paused: original.queue_is_paused(), - num_waiting: original.total_waiting() as usize, - scope: original.scope().clone(), - limit_key: original.limit_key().clone(), - lock_name: original.lock_name().cloned(), - } - } - - pub const fn new_empty( - scope: Option, - limit_key: LimitKey, - lock_name: Option, - ) -> Self { - Self { - queue_is_paused: false, - num_waiting: 0, - scope, - limit_key, - lock_name, - } - } - - pub fn inbox_len(&self) -> usize { - self.num_waiting - } - - pub fn is_inbox_empty(&self) -> bool { - self.num_waiting == 0 - } - - pub fn is_queue_paused(&self) -> bool { - self.queue_is_paused - } - - pub fn scope(&self) -> &Option { - &self.scope - } - - pub fn lock_name(&self) -> Option<&LockName> { - match self.lock_name { - Some(ref lock_name) => Some(lock_name), - _ => None, - } - } - - // pub fn service_name(&self) -> Option<&ServiceName> { - // match self.link { - // VQueueLink::Service(ref service_name) => Some(service_name), - // VQueueLink::Lock(ref lock) => Some(lock.service_name()), - // _ => None, - // } - // } - - pub fn limit_key(&self) -> &LimitKey { - &self.limit_key - } - - pub fn apply_update(&mut self, update: 
&MetaLiteUpdate) { - match update { - MetaLiteUpdate::QueuePaused => self.queue_is_paused = true, - MetaLiteUpdate::QueueResumed { num_waiting } => { - self.queue_is_paused = false; - self.num_waiting = *num_waiting as usize; - } - MetaLiteUpdate::EnqueuedToInbox { .. } => { - counter!(VQUEUE_ENQUEUE).increment(1); - self.num_waiting += 1; - } - MetaLiteUpdate::RemovedFromInbox(_) => { - self.num_waiting = self.num_waiting.saturating_sub(1) - } - } - } -} - -#[derive(Debug)] -pub enum MetaLiteUpdate { - QueuePaused, - QueueResumed { num_waiting: u64 }, - RemovedFromInbox(EntryKey), - EnqueuedToInbox { key: EntryKey, value: EntryValue }, -} diff --git a/crates/vqueues/src/scheduler/queue_test.rs b/crates/vqueues/src/scheduler/queue_test.rs new file mode 100644 index 0000000000..38551902a7 --- /dev/null +++ b/crates/vqueues/src/scheduler/queue_test.rs @@ -0,0 +1,403 @@ +// Copyright (c) 2023 - 2026 Restate Software, Inc., Restate GmbH. +// All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Test harness for the vqueue refill machinery. +//! +//! These tests use a fake [`VQueueStore`] (`GatedStore`) that lets the +//! caller freeze the snapshot a refill task sees and interleave +//! `notify_enqueued` / `notify_removed` events deterministically against +//! the in-flight refill thread. That's what makes overlay and cancellation +//! behaviour testable without a real RocksDB. +//! +//! Each [`GatedReader`] is created with `Options::allow_blocking_io` set +//! by the queue itself: +//! - The first attempt (`try_refill`) passes `allow_blocking_io: false`; +//! the fake responds with [`CursorError::WouldBlock`] which drives the +//! queue down the async refill path. +//! 
- The async path then constructs a second reader with +//! `allow_blocking_io: true`; that reader parks on its first `peek` +//! (signalling the test that the snapshot is frozen) and resumes once +//! the test releases the latch. + +use std::sync::{Arc, Condvar, Mutex}; +use std::task::{Context, Poll, Waker}; + +use restate_clock::RoughTimestamp; +use restate_clock::time::MillisSinceEpoch; +use restate_storage_api::StorageError; +use restate_storage_api::vqueue_table::stats::EntryStatistics; +use restate_storage_api::vqueue_table::{ + CursorError, EntryId, EntryKey, EntryKind, EntryMetadata, EntryValue, Options, Status, + VQueueCursor, VQueueRunningCursor, VQueueStore, +}; +use restate_types::clock::UniqueTimestamp; +use restate_types::vqueues::VQueueId; + +use super::*; + +// ---------- helpers ---------- + +const BASE_RUN_AT_MS: u64 = 1_744_000_000_000; + +/// Builds an entry whose sort position is determined entirely by `seq` +/// (other `EntryKey` components are fixed). +fn entry_at_seq(seq: u64) -> (EntryKey, EntryValue) { + let run_at = RoughTimestamp::from_unix_millis_clamped(MillisSinceEpoch::new(BASE_RUN_AT_MS)); + let created_at = UniqueTimestamp::try_from(1_000u64 + seq).unwrap(); + let entry_id = EntryId::new(EntryKind::Invocation, [0u8; EntryId::REMAINDER_LEN]); + let key = EntryKey::new(false, run_at, seq, entry_id); + let stats = EntryStatistics::new(created_at, run_at); + let value = EntryValue { + status: Status::New, + metadata: EntryMetadata::default(), + stats, + }; + (key, value) +} + +fn test_qid(partition_key: u64) -> VQueueId { + VQueueId::custom(partition_key, "1") +} + +/// Polls the queue once and asserts it returned `Pending`. Used to kick +/// off an in-flight refill task so the test can interleave events while +/// the refill thread is parked at its gate. 
+fn poll_once_expect_pending( + queue: &mut Queue, + storage: &S, + skip: &UnconfirmedAssignments, + qid: &VQueueId, +) { + let mut cx = Context::from_waker(Waker::noop()); + match queue.poll_advance_if_needed(&mut cx, storage, skip, qid, false, false) { + Poll::Pending => {} + Poll::Ready(Ok(item)) => panic!("expected Pending, got Ready({item:?})"), + Poll::Ready(Err(e)) => panic!("expected Pending, got error: {e:?}"), + } +} + +/// Drives `poll_advance_if_needed` until it returns `Ready`. Yields between +/// Pending iterations so the `spawn_blocking` refill task can make progress. +async fn drive_until_ready( + queue: &mut Queue, + storage: &S, + skip: &UnconfirmedAssignments, + qid: &VQueueId, +) { + let mut cx = Context::from_waker(Waker::noop()); + loop { + match queue.poll_advance_if_needed(&mut cx, storage, skip, qid, false, false) { + Poll::Ready(Ok(_)) => return, + Poll::Ready(Err(e)) => panic!("queue error: {e:?}"), + Poll::Pending => tokio::time::sleep(std::time::Duration::from_millis(1)).await, + } + } +} + +/// Pops everything currently in the cache (no refill, no I/O) and returns +/// the keys in head-to-tail order. +fn drain_cache(queue: &mut Queue) -> Vec { + let mut keys = vec![]; + while let Some(QueueItem::Inbox { key, .. }) = queue.head() { + keys.push(*key); + queue.try_advance().unwrap(); + } + keys +} + +// ---------- fake VQueueStore ---------- + +/// In-memory storage with hooks to gate the refill thread at a precise +/// moment, so tests can deterministically interleave queue events against +/// an in-flight refill. +struct GatedStore { + /// Source of truth for what `new_inbox_reader` will snapshot. Sorted + /// ascending by `EntryKey`. + entries: Mutex>, + /// Latch released by the test when it wants the parked refill thread + /// to proceed past its first `peek`. + release: Arc<(Mutex, Condvar)>, + /// Signalled by the reader the moment it parks. 
The test waits on + /// this to know the snapshot is frozen and queue events can now be + /// safely interleaved. + parked: Arc<(Mutex, Condvar)>, +} + +impl GatedStore { + fn new(entries: Vec<(EntryKey, EntryValue)>) -> Self { + Self { + entries: Mutex::new(entries), + release: Arc::new((Mutex::new(false), Condvar::new())), + parked: Arc::new((Mutex::new(false), Condvar::new())), + } + } + + fn release_refill_thread(&self) { + let (lock, cv) = &*self.release; + *lock.lock().unwrap() = true; + cv.notify_all(); + } + + fn wait_until_parked(&self) { + let (lock, cv) = &*self.parked; + let mut parked = lock.lock().unwrap(); + while !*parked { + parked = cv.wait(parked).unwrap(); + } + } +} + +struct GatedReader { + /// Frozen at construction time (matches RocksDB snapshot semantics). + /// Sorted ascending by `EntryKey`. + snapshot: Vec<(EntryKey, EntryValue)>, + cursor: usize, + /// `true` for blocking readers (those constructed via the async refill + /// path with `allow_blocking_io: true`); they park at the gate on the + /// first `peek` so the test can pin the snapshot moment. Non-blocking + /// readers always return `WouldBlock` instead, driving the queue down + /// the async refill path. + blocking: bool, + release: Arc<(Mutex, Condvar)>, + parked: Arc<(Mutex, Condvar)>, + /// Whether this reader has parked yet. Each reader parks at most + /// once, on its first peek/key/value call. + has_parked: bool, +} + +impl GatedReader { + fn block_until_released(&mut self) { + if self.has_parked { + return; + } + self.has_parked = true; + // Tell the test we're parked. + let (lock, cv) = &*self.parked; + *lock.lock().unwrap() = true; + cv.notify_all(); + // Wait for the test to release us. 
+ let (lock, cv) = &*self.release; + let mut released = lock.lock().unwrap(); + while !*released { + released = cv.wait(released).unwrap(); + } + } +} + +impl VQueueCursor for GatedReader { + fn seek_to_first(&mut self) { + self.cursor = 0; + } + + fn seek_after(&mut self, anchor: &EntryKey) { + self.cursor = self + .snapshot + .iter() + .position(|(k, _)| k > anchor) + .unwrap_or(self.snapshot.len()); + } + + fn advance(&mut self) { + if self.cursor < self.snapshot.len() { + self.cursor += 1; + } + } + + fn peek(&mut self) -> Result, CursorError> { + if !self.blocking { + return Err(CursorError::WouldBlock); + } + self.block_until_released(); + Ok(self.snapshot.get(self.cursor).cloned()) + } +} + +/// Stub for the running stage. Tests construct queues with +/// `num_running = 0`, so this is never actually exercised; it only exists +/// to satisfy the [`VQueueStore`] trait bound. +struct StubRunningReader; + +impl VQueueRunningCursor for StubRunningReader { + fn seek_to_first(&mut self) {} + fn peek(&mut self) -> Result, StorageError> { + Ok(None) + } + fn advance(&mut self) {} +} + +impl VQueueStore for GatedStore { + type RunningReader = StubRunningReader; + type InboxReader = GatedReader; + + fn new_run_reader(&self, _qid: &VQueueId) -> Self::RunningReader { + StubRunningReader + } + + fn new_inbox_reader(&self, _qid: &VQueueId, opts: Options) -> Self::InboxReader { + // Snapshot freezes here. + let snapshot = self.entries.lock().unwrap().clone(); + GatedReader { + snapshot, + cursor: 0, + blocking: opts.allow_blocking_io, + release: self.release.clone(), + parked: self.parked.clone(), + has_parked: false, + } + } +} + +// ---------- tests ---------- + +/// Sanity: a single in-flight refill with no concurrent events surfaces +/// every storage row in the cache, in `EntryKey` order. 
+#[restate_core::test] +async fn refill_without_overlay_activity() { + let entries: Vec<_> = (1..=5).map(entry_at_seq).collect(); + let storage = GatedStore::new(entries.clone()); + let qid = test_qid(1); + let mut queue: Queue = Queue::new(0, &storage, &qid); + let skip = UnconfirmedAssignments::new(); + + poll_once_expect_pending(&mut queue, &storage, &skip, &qid); + storage.wait_until_parked(); + storage.release_refill_thread(); + drive_until_ready(&mut queue, &storage, &skip, &qid).await; + + let drained = drain_cache(&mut queue); + assert_eq!( + drained, + entries.iter().map(|(k, _)| *k).collect::>(), + "cache should contain every storage row in order", + ); +} + +/// Tombstones in the overlay correctly suppress the matching storage row +/// during merge — no overflow happens here, so the overlay's information +/// is fully preserved. This is the "happy path" for the +/// commit-before-notify invariant. +#[restate_core::test] +async fn tombstone_in_overlay_suppresses_storage_row() { + let r_target = entry_at_seq(100); + let storage = GatedStore::new(vec![r_target.clone()]); + let qid = test_qid(2); + let mut queue: Queue = Queue::new(0, &storage, &qid); + let skip = UnconfirmedAssignments::new(); + + poll_once_expect_pending(&mut queue, &storage, &skip, &qid); + storage.wait_until_parked(); + // Storage commit (modelled here as the snapshot being frozen with + // r_target visible) lands first; the notify_removed lands second. + queue.remove(&r_target.0); + storage.release_refill_thread(); + drive_until_ready(&mut queue, &storage, &skip, &qid).await; + + let drained = drain_cache(&mut queue); + assert!( + !drained.contains(&r_target.0), + "deleted row should be suppressed by overlay tombstone, got: {drained:?}", + ); +} + +/// **Bug demonstration.** When the overlay is at capacity and the back +/// entry is a tombstone, `push_added_item`'s `pop_back` silently drops +/// that tombstone. The merge then lets the (already-deleted) row into +/// the cache. 
+/// +/// Layout at the moment of overflow: +/// +/// ```text +/// overlay[0] = Tombstone(seq=50) // pre-tombstone +/// overlay[1] = Tombstone(seq=100) // pre-tombstone +/// overlay[2..CAPACITY-1] = Add(seq=150..) // CAPACITY-3 adds +/// overlay[CAPACITY-1] = Tombstone(seq=500) // *r_target* +/// ``` +/// +/// The trigger add (seq=400) sorts at `pos = CAPACITY-1`, which is +/// `< overlay.len() == CAPACITY`, so `push_added_item` evicts the back +/// (`Tombstone(500)`) and inserts the trigger. After this, the overlay +/// holds only `[T(50), T(100), Add(150..), Add(400)]` — the tombstone +/// for `r_target` is gone. +/// +/// Storage is `[r_target=seq500]` (a single row that's been deleted but +/// is still in the in-flight task's snapshot, faithful to the race the +/// invariant doc on `RefillTask` describes). The merge produces: +/// +/// ```text +/// [T(50), T(100), Add(150..), Add(400), Left(seq500)] +/// ``` +/// +/// With the tombstone evicted there's nothing to suppress `Left(seq500)`, +/// so the merge inserts it into the cache. The drain then sees `seq500`, +/// which is the bug. +#[restate_core::test] +async fn tombstone_evicted_on_overlay_overflow_leaks_deleted_row() { + // Overlay layout requires CAPACITY >= 4 (2 front tombstones + at + // least one add + 1 back tombstone). Bail loudly if someone shrinks + // the cache below that. + const { assert!(INBOX_CACHE_CAPACITY >= 4) }; + + // Front-anchor tombstones: low seqs so they sort to the front of the + // overlay and don't influence the eviction we want to trigger. + let pre_tombstone_a = entry_at_seq(50); + let pre_tombstone_b = entry_at_seq(100); + + // The deleted row that the in-flight task's snapshot still sees. + let r_target = entry_at_seq(500); + + // Adds that fit between the front tombstones and r_target's + // tombstone. Together with the three tombstones the overlay reaches + // exactly CAPACITY. We use a contiguous seq range starting at 150. 
+ let n_adds = INBOX_CACHE_CAPACITY - 3; + let pre_adds: Vec<_> = (150..150 + n_adds as u64).map(entry_at_seq).collect(); + + // The trigger: an add that sorts BEFORE r_target's tombstone but + // after every other overlay entry. With `pos < overlay.len()` and + // the overlay at cap, `push_added_item` will `pop_back` — evicting + // `Tombstone(r_target)`. + let trigger_add = entry_at_seq(400); + + let storage = GatedStore::new(vec![r_target.clone()]); + let qid = test_qid(3); + let mut queue: Queue = Queue::new(0, &storage, &qid); + let skip = UnconfirmedAssignments::new(); + + // Kick off the in-flight refill so subsequent enqueue/remove events + // route through the overlay rather than the cache. + poll_once_expect_pending(&mut queue, &storage, &skip, &qid); + storage.wait_until_parked(); + + // Three tombstones; only Tombstone(r_target) matters for the bug. + queue.remove(&pre_tombstone_a.0); + queue.remove(&pre_tombstone_b.0); + queue.remove(&r_target.0); + + // Fill the overlay up to CAPACITY with the front-of-r_target adds. + for (k, v) in &pre_adds { + queue.enqueue(k, v); + } + + // Trigger the eviction. After this, Tombstone(r_target) is gone. + queue.enqueue(&trigger_add.0, &trigger_add.1); + + // Release the refill thread; it returns the snapshot ([r_target]). + storage.release_refill_thread(); + drive_until_ready(&mut queue, &storage, &skip, &qid).await; + + let drained = drain_cache(&mut queue); + + // Bug: r_target shows up in the cache even though we issued + // `notify_removed` for it before the merge ran. The matching + // tombstone was dropped on overlay overflow. 
+ assert!( + !drained.contains(&r_target.0), + "deleted row leaked into cache: {drained:?}", + ); +} diff --git a/crates/vqueues/src/scheduler/resource_manager.rs b/crates/vqueues/src/scheduler/resource_manager.rs index 8a430d8794..a99e8d8aa6 100644 --- a/crates/vqueues/src/scheduler/resource_manager.rs +++ b/crates/vqueues/src/scheduler/resource_manager.rs @@ -28,6 +28,7 @@ use restate_limiter::RuleHandle; use restate_memory::{MemoryPool, NonZeroByteCount}; use restate_storage_api::StorageError; use restate_storage_api::lock_table::LoadLocks; +use restate_storage_api::vqueue_table::metadata::VQueueMeta; use restate_storage_api::vqueue_table::{EntryKey, EntryMetadata}; use restate_types::identifiers::PartitionKey; use restate_types::vqueues::EntryKind; @@ -44,7 +45,6 @@ use self::permit::ProvisionalPermit; use self::user_limiter::UserLimiter; use super::VQueueHandle; use super::eligible::EligibilityTracker; -use super::queue_meta::VQueueMetaLite; use crate::GlobalTokenBucket; // A set of queues waiting on a resource @@ -177,7 +177,7 @@ impl ResourceManager { &mut self, cx: &mut std::task::Context<'_>, vqueue: VQueueHandle, - meta: &VQueueMetaLite, + meta: &VQueueMeta, key: &EntryKey, _metadata: &EntryMetadata, current_permit: &mut PermitBuilder, diff --git a/crates/vqueues/src/scheduler/vqueue_state.rs b/crates/vqueues/src/scheduler/vqueue_state.rs index b83fc73696..bcb3406224 100644 --- a/crates/vqueues/src/scheduler/vqueue_state.rs +++ b/crates/vqueues/src/scheduler/vqueue_state.rs @@ -8,18 +8,21 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. 
+use std::task::{Poll, ready}; use std::time::Duration; use enum_map::{Enum, EnumMap}; use metrics::counter; +use restate_storage_api::StorageError; +use restate_storage_api::vqueue_table::metadata::VQueueMeta; use tokio::time::Instant; use restate_clock::RoughTimestamp; -use restate_storage_api::StorageError; use restate_storage_api::vqueue_table::{EntryKey, EntryValue, VQueueStore, stats::WaitStats}; use restate_types::vqueues::{EntryId, VQueueId}; use restate_worker_api::ResourceKind; +use crate::cache::{self, VQueueHandle}; use crate::metric_definitions::{ VQUEUE_CONCURRENCY_RULES_WAIT_MS, VQUEUE_DEPLOYMENT_CONCURRENCY_WAIT_MS, VQUEUE_INVOKER_CONCURRENCY_WAIT_MS, VQUEUE_INVOKER_MEMORY_WAIT_MS, @@ -29,10 +32,8 @@ use crate::scheduler::queue::QueueItem; use super::clock::SchedulerClock; use super::queue::Queue; -use super::queue_meta::MetaLiteUpdate; -pub use super::queue_meta::VQueueMetaLite; use super::resource_manager::{AcquireOutcome, PermitBuilder}; -use super::{ResourceManager, RunAction, UnconfirmedAssignments, VQueueHandle, YieldAction}; +use super::{ResourceManager, RunAction, UnconfirmedAssignments, YieldAction}; const QUANTUM: i32 = 1; @@ -210,9 +211,6 @@ impl Stats { #[derive(derive_more::Debug)] pub struct VQueueState { - pub handle: VQueueHandle, - pub qid: VQueueId, - meta: VQueueMetaLite, deficit: i32, #[debug(skip)] unconfirmed_assignments: UnconfirmedAssignments, @@ -224,17 +222,9 @@ pub struct VQueueState { } impl VQueueState { - pub fn new( - qid: VQueueId, - handle: VQueueHandle, - meta: VQueueMetaLite, - num_running: u32, - ) -> Self { - let queue = Queue::new(num_running); + pub fn new(qid: &VQueueId, storage: &S, num_running: u32) -> Self { + let queue = Queue::new(num_running, storage, qid); Self { - handle, - qid, - meta, deficit: QUANTUM, unconfirmed_assignments: UnconfirmedAssignments::new(), queue, @@ -243,15 +233,8 @@ impl VQueueState { } } - pub fn apply_meta_update(&mut self, update: &MetaLiteUpdate) { - 
self.meta.apply_update(update) - } - - pub fn new_empty(qid: VQueueId, handle: VQueueHandle, meta: VQueueMetaLite) -> Self { + pub fn new_empty() -> Self { Self { - qid, - handle, - meta, unconfirmed_assignments: UnconfirmedAssignments::new(), queue: Queue::new_closed(), deficit: 0, @@ -263,7 +246,8 @@ impl VQueueState { pub fn try_pop( &mut self, cx: &mut std::task::Context<'_>, - storage: &S, + handle: VQueueHandle, + slot: &cache::Slot, resources: &mut ResourceManager, ) -> Result { let (inbox_head_key, inbox_head_value, is_running) = match self.queue.head() { @@ -271,8 +255,8 @@ impl VQueueState { Some(QueueItem::Running { key, value }) => (key, value, true), e @ Some(QueueItem::None) | e @ None => { unreachable!( - "cannot pop from empty queue or attempted to pop before polling {:?}/{e:?}", - self.handle + "cannot pop from empty queue or attempted to pop before polling {}/{e:?}", + slot.vqueue_id() ) } }; @@ -291,15 +275,14 @@ impl VQueueState { key: *inbox_head_key, next_run_at: None, }; - self.queue - .advance(storage, &self.unconfirmed_assignments, &self.qid)?; + self.queue.try_advance()?; return Ok(Pop::Yield(action)); } match resources.poll_acquire_permit( cx, - self.handle, - &self.meta, + handle, + slot.meta(), inbox_head_key, &inbox_head_value.metadata, &mut self.current_permit, @@ -313,8 +296,7 @@ impl VQueueState { wait_stats: self.head_stats.finalize(), }; - self.queue - .advance(storage, &self.unconfirmed_assignments, &self.qid)?; + self.queue.try_advance()?; Ok(Pop::Run(action)) } AcquireOutcome::BlockedOn(resource) => { @@ -327,22 +309,33 @@ impl VQueueState { } } - pub fn is_dormant(&self) -> bool { + pub fn is_dormant(&self, meta: &VQueueMeta) -> bool { // We hold on to the vqueue until we confirm/reject all pending assignments. If we didn't // do so, we risk revisiting/redequeuing the unconfirmed items if the vqueue popped back to life // (i.e., on enqueue). 
This is the reason why we check for `unconfirmed_assignments` - (self.queue.is_empty() || self.meta.is_inbox_empty() || self.meta.is_queue_paused()) + (self.queue.is_empty() || meta.is_inbox_empty() || meta.queue_is_paused()) && self.unconfirmed_assignments.is_empty() } - pub fn poll_eligibility(&mut self, storage: &S) -> Result { - self.queue - .advance_if_needed(storage, &self.unconfirmed_assignments, &self.qid)?; - - Ok(self.check_eligibility()) + pub fn poll_eligibility( + &mut self, + cx: &mut std::task::Context<'_>, + slot: &cache::Slot, + storage: &S, + ) -> Poll> { + ready!(self.queue.poll_advance_if_needed( + cx, + storage, + &self.unconfirmed_assignments, + slot.vqueue_id(), + self.num_waiting_inbox(slot.meta()) == 0, + false, + ))?; + + Poll::Ready(Ok(self.check_eligibility(slot.meta()).as_compact())) } - pub fn check_eligibility(&self) -> DetailedEligibility { + pub fn check_eligibility(&self, meta: &VQueueMeta) -> DetailedEligibility { let inbox_head_key = match self.queue.head() { Some(QueueItem::Running { .. }) => return DetailedEligibility::EligibleRunning, Some(QueueItem::Inbox { key, .. 
}) => key, @@ -350,7 +343,7 @@ impl VQueueState { None if self.queue.remaining_in_running_stage() > 0 => { return DetailedEligibility::EligibleRunning; } - None if self.meta.inbox_len() > 0 => return DetailedEligibility::EligibleInbox, + None if meta.total_waiting() > 0 => return DetailedEligibility::EligibleInbox, None => return DetailedEligibility::Empty, }; @@ -408,10 +401,9 @@ impl VQueueState { /// Takes into account the number of unconfirmed assignments when calculating /// the inbox length - pub fn num_waiting_inbox(&self) -> u64 { - self.meta - .inbox_len() - .saturating_sub(self.unconfirmed_assignments.len()) as u64 + pub fn num_waiting_inbox(&self, meta: &VQueueMeta) -> u64 { + meta.total_waiting() + .saturating_sub(self.unconfirmed_assignments.len() as u64) } } diff --git a/crates/worker/src/partition/leadership/leader_state.rs b/crates/worker/src/partition/leadership/leader_state.rs index c26110b0f6..3d1eeb9c6f 100644 --- a/crates/worker/src/partition/leadership/leader_state.rs +++ b/crates/worker/src/partition/leadership/leader_state.rs @@ -42,9 +42,9 @@ use restate_types::net::partition_processor::{ }; use restate_types::sharding::KeyRange; use restate_types::{RESTATE_VERSION_1_7_0, SemanticRestateVersion, Version, Versioned, vqueues}; -use restate_vqueues::SchedulerService; use restate_vqueues::VQueueEvent; use restate_vqueues::scheduler::Decisions; +use restate_vqueues::{SchedulerService, VQueuesMeta}; use restate_wal_protocol::Command; use restate_wal_protocol::control::UpsertSchema; use restate_worker_api::invoker::InvokerHandle; @@ -143,9 +143,13 @@ impl LeaderState { } } - pub fn read_scheduler_status(&self, keys: KeyRange) -> Vec { + pub fn read_scheduler_status( + &self, + metas: VQueuesMeta<'_>, + keys: KeyRange, + ) -> Vec { self.scheduler - .iter_status() + .iter_status(metas) .into_iter() .flatten() .filter(|(qid, _)| keys.contains(&qid.partition_key())) @@ -164,7 +168,11 @@ impl LeaderState { /// /// Important: The future needs to be 
cancellation safe since it is polled as a tokio::select /// arm! - pub async fn run(&mut self, state_machine: &StateMachine) -> Result, Error> { + pub async fn run( + &mut self, + state_machine: &StateMachine, + vqueue_metas: VQueuesMeta<'_>, + ) -> Result, Error> { let timer_stream = std::pin::pin!(stream::unfold( &mut self.timer_service, |timer_service| async { @@ -177,7 +185,7 @@ impl LeaderState { // if we have problems with latency let scheduler_stream = std::pin::pin!(stream::unfold(&mut self.scheduler, |scheduler| async { - match scheduler.schedule_next().await { + match scheduler.schedule_next(vqueue_metas).await { Ok(decisions) => Some((ActionEffect::Scheduler(decisions), scheduler)), Err(e) => { error!("Fatal error when polling scheduler: {e}"); @@ -518,7 +526,11 @@ impl LeaderState { } } - pub fn handle_actions(&mut self, actions: impl Iterator) -> Result<(), Error> { + pub fn handle_actions( + &mut self, + vqueues: VQueuesMeta<'_>, + actions: impl Iterator, + ) -> Result<(), Error> { for action in actions { let action_name = action.name(); @@ -532,13 +544,13 @@ impl LeaderState { ) .increment(1); - self.handle_action(action)?; + self.handle_action(vqueues, action)?; } Ok(()) } - fn handle_action(&mut self, action: Action) -> Result<(), Error> { + fn handle_action(&mut self, metas: VQueuesMeta<'_>, action: Action) -> Result<(), Error> { match action { Action::Invoke { invocation_id, @@ -684,25 +696,25 @@ impl LeaderState { } } Action::VQEvent(inbox_event) => { - self.handle_vqueue_inbox_event(inbox_event); + self.handle_vqueue_inbox_event(metas, inbox_event); } Action::VQInvoke { - qid, + vq_handle, key, invocation_target, - limit_key, idempotency_key, } => { + let slot = metas.get(vq_handle).expect("vqueue meta must be in cache"); // state mutations should not create Invoke actions. At least for now. 
assert!(matches!(key.kind(), vqueues::EntryKind::Invocation)); let invocation_id = key .entry_id() - .to_invocation_id(qid.partition_key()) + .to_invocation_id(slot.partition_key()) .unwrap(); - let run_permit = self.scheduler.confirm_run_attempt(&qid, &key).unwrap_or_else(|| { + let run_permit = self.scheduler.confirm_run_attempt(vq_handle, slot, &key).unwrap_or_else(|| { tracing::error!( - vqueue = %qid, + vqueue = %slot.vqueue_id(), restate.invocation.id = %invocation_id, "Cannot find a permit for entry key {key:?} in scheduler. Will not respect the invoker limit for this invocation" ); @@ -710,11 +722,11 @@ impl LeaderState { }); self.invoker_handle .vqueue_invoke( - qid, + slot.vqueue_id().clone(), run_permit, invocation_id, invocation_target, - limit_key, + slot.meta().limit_key().clone(), idempotency_key, ) .map_err(Error::Invoker)? @@ -728,8 +740,8 @@ impl LeaderState { &mut self.invoker_handle } - fn handle_vqueue_inbox_event(&mut self, event: VQueueEvent) { - self.scheduler.on_inbox_event(event); + fn handle_vqueue_inbox_event(&mut self, metas: VQueuesMeta<'_>, event: VQueueEvent) { + self.scheduler.on_inbox_event(metas, event); } } diff --git a/crates/worker/src/partition/leadership/mod.rs b/crates/worker/src/partition/leadership/mod.rs index 768b0b2182..88079c1715 100644 --- a/crates/worker/src/partition/leadership/mod.rs +++ b/crates/worker/src/partition/leadership/mod.rs @@ -64,7 +64,7 @@ use restate_types::{ GenerationalNodeId, RESTATE_VERSION_1_6_0, RESTATE_VERSION_1_7_0, SemanticRestateVersion, }; use restate_vqueues::scheduler::{self}; -use restate_vqueues::{ResourceManager, SchedulerService, VQueuesMetaCache}; +use restate_vqueues::{ResourceManager, SchedulerService, VQueuesMeta, VQueuesMetaCache}; use restate_wal_protocol::control::{AnnounceLeader, PartitionDurability, VersionBarrier}; use restate_wal_protocol::timer::TimerKeyValue; use restate_wal_protocol::{Command, Envelope}; @@ -651,13 +651,17 @@ where } } - pub fn handle_actions(&mut self, 
actions: impl Iterator) -> Result<(), Error> { + pub fn handle_actions( + &mut self, + vqueues: VQueuesMeta<'_>, + actions: impl Iterator, + ) -> Result<(), Error> { match &mut self.state { State::Follower | State::Candidate { .. } => { // nothing to do :-) } State::Leader(leader_state) => { - leader_state.handle_actions(actions)?; + leader_state.handle_actions(vqueues, actions)?; } } @@ -669,7 +673,11 @@ where /// * Follower: Nothing to do /// * Candidate: Monitor appender task /// * Leader: Await action effects and monitor appender task - pub async fn run(&mut self, state_machine: &StateMachine) -> Result, Error> { + pub async fn run( + &mut self, + state_machine: &StateMachine, + vqueues: VQueuesMeta<'_>, + ) -> Result, Error> { match &mut self.state { State::Follower => Ok(futures::future::pending::>().await), State::Candidate { self_proposer, .. } => Err(self_proposer @@ -678,7 +686,7 @@ where .join_on_err() .await .expect_err("never should never be returned")), - State::Leader(leader_state) => leader_state.run(state_machine).await, + State::Leader(leader_state) => leader_state.run(state_machine, vqueues).await, } } @@ -708,13 +716,17 @@ where } impl LeadershipState { - pub fn handle_leader_query(&self, leader_query_cmd: LeaderQueryCommand) { + pub fn handle_leader_query( + &self, + metas: VQueuesMeta<'_>, + leader_query_cmd: LeaderQueryCommand, + ) { let (request, response_tx) = leader_query_cmd.into_inner(); match (&self.state, request) { (State::Leader(leader_state), LeaderQueryRequest::SchedulerStatus { keys }) => { let _ = response_tx.send(LeaderQueryResponse::SchedulerStatus( - leader_state.read_scheduler_status(keys), + leader_state.read_scheduler_status(metas, keys), )); } (State::Leader(leader_state), LeaderQueryRequest::UserLimitCounters { keys }) => { @@ -942,7 +954,7 @@ mod tests { &mut partition_store, &replica_set_states, &Configuration::pinned(), - &mut VQueuesMetaCache::new_empty(), + &mut VQueuesMetaCache::new_empty(1024), ) .await?; diff 
--git a/crates/worker/src/partition/mod.rs b/crates/worker/src/partition/mod.rs index e459cb3e2b..655e91627a 100644 --- a/crates/worker/src/partition/mod.rs +++ b/crates/worker/src/partition/mod.rs @@ -80,7 +80,7 @@ use restate_types::schema::Schema; use restate_types::storage::StorageDecodeError; use restate_types::time::{MillisSinceEpoch, NanosSinceEpoch}; use restate_types::{GenerationalNodeId, SemanticRestateVersion, Version}; -use restate_vqueues::VQueuesMetaCache; +use restate_vqueues::{VQueuesMeta, VQueuesMetaCache}; use restate_wal_protocol::control::{ AnnounceLeader, CurrentReplicaSetConfiguration, NextReplicaSetConfiguration, }; @@ -98,6 +98,11 @@ use crate::partition::leadership::LeadershipState; use crate::partition::state_machine::{ActionCollector, StateMachine}; use crate::partition_processor_manager::PartitionLeaderHandlesRegistry; +// Soft cap for the in-memory vqueue cache; once reached, inactive +// entries are evicted at insert time. The cache will still grow past +// this if compaction frees nothing. +const VQUEUE_CACHE_CAPACITY: usize = 100_000; + /// Information needed to run as leader, including the epoch and partition configurations. 
#[derive(Clone, Debug)] pub struct LeadershipInfo { @@ -515,7 +520,11 @@ where tokio::time::interval(with_jitter(Duration::from_millis(500), 0.5)); status_update_timer.set_missed_tick_behavior(MissedTickBehavior::Skip); - let mut vqueues = VQueuesMetaCache::create(partition_store.partition_db().clone()).await?; + let mut vqueues = VQueuesMetaCache::create( + partition_store.partition_db().clone(), + VQUEUE_CACHE_CAPACITY, + ) + .await?; let mut action_collector = ActionCollector::default(); let mut command_buffer = @@ -687,9 +696,9 @@ where if let Some(lsn) = &self.status.last_applied_log_lsn { self.last_applied_log_lsn_watch.send_replace(*lsn); } - self.leadership_state.handle_actions(action_collector.drain(..))?; + self.leadership_state.handle_actions(vqueues.view(), action_collector.drain(..))?; }, - result = self.leadership_state.run(&self.state_machine) => { + result = self.leadership_state.run(&self.state_machine, vqueues.view()) => { let action_effects = result?; // We process the action_effects not directly in the run future because it // requires the run future to be cancellation safe. 
In the future this could be @@ -697,7 +706,7 @@ where self.leadership_state.handle_action_effects(action_effects).await?; } Some(leader_query_cmd) = self.leader_query_rx.recv() => { - self.on_leader_query(leader_query_cmd); + self.on_leader_query(vqueues.view(), leader_query_cmd); } } // Allow other tasks on this thread to run, but only if we have exhausted the coop @@ -728,8 +737,9 @@ where Ok(()) } - fn on_leader_query(&mut self, leader_query_cmd: LeaderQueryCommand) { - self.leadership_state.handle_leader_query(leader_query_cmd); + fn on_leader_query(&mut self, metas: VQueuesMeta<'_>, leader_query_cmd: LeaderQueryCommand) { + self.leadership_state + .handle_leader_query(metas, leader_query_cmd); } async fn on_pp_rpc_request( diff --git a/crates/worker/src/partition/state_machine/actions.rs b/crates/worker/src/partition/state_machine/actions.rs index dbb4fe32be..f421b6fc6b 100644 --- a/crates/worker/src/partition/state_machine/actions.rs +++ b/crates/worker/src/partition/state_machine/actions.rs @@ -11,7 +11,6 @@ use restate_storage_api::outbox_table::OutboxMessage; use restate_storage_api::timer_table::TimerKey; use restate_storage_api::vqueue_table::EntryKey; -use restate_types::LimitKey; use restate_types::identifiers::{EntryIndex, InvocationId, PartitionProcessorRpcRequestId}; use restate_types::invocation::InvocationTarget; use restate_types::invocation::client::{ @@ -21,9 +20,8 @@ use restate_types::invocation::client::{ use restate_types::journal_v2::{CommandIndex, NotificationId}; use restate_types::message::MessageIndex; use restate_types::time::MillisSinceEpoch; -use restate_types::vqueues::VQueueId; use restate_util_string::ReString; -use restate_vqueues::VQueueEvent; +use restate_vqueues::{VQueueEvent, VQueueHandle}; use restate_wal_protocol::timer::TimerKeyValue; pub type ActionCollector = Vec; @@ -34,10 +32,9 @@ pub enum Action { VQEvent(VQueueEvent), /// Tells invoker to run this invocation (similar to Invoke) but carries more information VQInvoke 
{ - qid: VQueueId, + vq_handle: VQueueHandle, key: EntryKey, invocation_target: InvocationTarget, - limit_key: LimitKey, idempotency_key: Option, }, Invoke { diff --git a/crates/worker/src/partition/state_machine/entries/mod.rs b/crates/worker/src/partition/state_machine/entries/mod.rs index 7cba1a4df7..7af7c3c1ed 100644 --- a/crates/worker/src/partition/state_machine/entries/mod.rs +++ b/crates/worker/src/partition/state_machine/entries/mod.rs @@ -380,7 +380,7 @@ where // Store journal entry WriteJournalTable::put_journal_entry( ctx.storage, - self.invocation_id, + &self.invocation_id, entry_index, // Make sure that a deterministic append time is set based on Bifrost's record creation // time. This ensures that the append time does not depend on the application time of diff --git a/crates/worker/src/partition/state_machine/lifecycle/migrate_journal_table.rs b/crates/worker/src/partition/state_machine/lifecycle/migrate_journal_table.rs index 97de8c1769..4a9b344ae0 100644 --- a/crates/worker/src/partition/state_machine/lifecycle/migrate_journal_table.rs +++ b/crates/worker/src/partition/state_machine/lifecycle/migrate_journal_table.rs @@ -63,7 +63,7 @@ where // Now write the entry in the new table, and remove it from the old one journal_table_v2::WriteJournalTable::put_journal_entry( ctx.storage, - self.invocation_id, + &self.invocation_id, 0, &StoredRawEntry::new( StoredRawEntryHeader::new(ctx.record_created_at), diff --git a/crates/worker/src/partition/state_machine/lifecycle/restart_as_new.rs b/crates/worker/src/partition/state_machine/lifecycle/restart_as_new.rs index 44f957699d..72f11dd890 100644 --- a/crates/worker/src/partition/state_machine/lifecycle/restart_as_new.rs +++ b/crates/worker/src/partition/state_machine/lifecycle/restart_as_new.rs @@ -156,7 +156,7 @@ where // Now copy to the new journal journal_table_v2::WriteJournalTable::put_journal_entry( ctx.storage, - new_invocation_id, + &new_invocation_id, new_journal_index, &entry, 
&related_completion_ids, @@ -176,7 +176,7 @@ where // Now copy to the new journal journal_table_v2::WriteJournalTable::put_journal_entry( ctx.storage, - new_invocation_id, + &new_invocation_id, new_journal_index, &entry, &[], @@ -202,7 +202,7 @@ where // Copy over this notification journal_table_v2::WriteJournalTable::put_journal_entry( ctx.storage, - new_invocation_id, + &new_invocation_id, new_journal_index, &entry, &[], @@ -311,7 +311,7 @@ where }; ctx.on_pre_flight_invocation( - new_invocation_id, + &new_invocation_id, pre_flight_invocation_metadata, None, &limit_key, diff --git a/crates/worker/src/partition/state_machine/mod.rs b/crates/worker/src/partition/state_machine/mod.rs index 699dc1e7df..027114c8c8 100644 --- a/crates/worker/src/partition/state_machine/mod.rs +++ b/crates/worker/src/partition/state_machine/mod.rs @@ -113,7 +113,7 @@ use restate_types::{RESTATE_VERSION_1_6_0, journal_v2}; use restate_types::{RestateVersion, SemanticRestateVersion}; use restate_types::{Versioned, journal::*}; use restate_util_string::ReString; -use restate_vqueues::{VQueue, VQueuesMetaCache}; +use restate_vqueues::{VQueue, VQueueHandle, VQueuesMetaCache}; use restate_wal_protocol::Command; use restate_wal_protocol::timer::TimerKeyDisplay; use restate_wal_protocol::timer::TimerKeyValue; @@ -772,7 +772,7 @@ impl StateMachineApplyContext<'_, S> { ); self.on_pre_flight_invocation( - invocation_id, + &invocation_id, pre_flight_invocation_metadata, submit_notification_sink, &limit_key, @@ -782,7 +782,7 @@ impl StateMachineApplyContext<'_, S> { async fn on_pre_flight_invocation( &mut self, - invocation_id: InvocationId, + invocation_id: &InvocationId, mut pre_flight_invocation_metadata: PreFlightInvocationMetadata, submit_notification_sink: Option, limit_key: &LimitKey, @@ -936,7 +936,7 @@ impl StateMachineApplyContext<'_, S> { // Uses vqueues, replaces on_pre_flight_invocation async fn vqueue_enqueue( &mut self, - invocation_id: InvocationId, + invocation_id: &InvocationId, 
metadata: PreFlightInvocationMetadata, submit_notification_sink: Option, limit_key: &LimitKey, @@ -970,7 +970,7 @@ impl StateMachineApplyContext<'_, S> { record_unique_ts, self.record_lsn, metadata.execution_time, - EntryId::from(&invocation_id), + EntryId::from(invocation_id), vqueue_table::EntryMetadata::default(), ); @@ -994,7 +994,7 @@ impl StateMachineApplyContext<'_, S> { }; self.storage - .put_invocation_status(&invocation_id, &invocation_status) + .put_invocation_status(invocation_id, &invocation_status) .map_err(Error::Storage)?; // Invocation was scheduled, send back the ingress attach notification and return @@ -1054,7 +1054,7 @@ impl StateMachineApplyContext<'_, S> { // Send submit notification self.send_submit_notification_if_needed( - service_invocation.invocation_id, + &service_invocation.invocation_id, previous_invocation_status.execution_time(), // is_new_invocation is true if the RPC ingress request is a duplicate. service_invocation.source @@ -1125,7 +1125,7 @@ impl StateMachineApplyContext<'_, S> { /// Returns the invocation in case the invocation should run immediately fn handle_service_invocation_execution_time( &mut self, - invocation_id: InvocationId, + invocation_id: &InvocationId, metadata: PreFlightInvocationMetadata, ) -> Result, Error> where @@ -1136,13 +1136,13 @@ impl StateMachineApplyContext<'_, S> { debug_if_leader!(self.is_leader, "Store scheduled invocation"); self.register_timer( - TimerKeyValue::neo_invoke(execution_time, invocation_id), + TimerKeyValue::neo_invoke(execution_time, *invocation_id), span_context, )?; self.storage .put_invocation_status( - &invocation_id, + invocation_id, &InvocationStatus::Scheduled( ScheduledInvocation::from_pre_flight_invocation_metadata( metadata, @@ -1161,7 +1161,7 @@ impl StateMachineApplyContext<'_, S> { /// Returns the invocation in case the invocation was not inboxed async fn handle_service_invocation_exclusive_handler( &mut self, - invocation_id: InvocationId, + invocation_id: 
&InvocationId, metadata: PreFlightInvocationMetadata, ) -> Result, Error> where @@ -1186,7 +1186,7 @@ impl StateMachineApplyContext<'_, S> { if let VirtualObjectStatus::Locked(_) = service_status { // If locked, enqueue in inbox and be done with it let inbox_seq_number = self - .enqueue_into_inbox(InboxEntry::Invocation(keyed_service_id, invocation_id)) + .enqueue_into_inbox(InboxEntry::Invocation(keyed_service_id, *invocation_id)) .await?; debug_if_leader!( @@ -1196,7 +1196,7 @@ impl StateMachineApplyContext<'_, S> { ); self.storage .put_invocation_status( - &invocation_id, + invocation_id, &InvocationStatus::Inboxed( InboxedInvocation::from_pre_flight_invocation_metadata( metadata, @@ -1219,7 +1219,7 @@ impl StateMachineApplyContext<'_, S> { self.storage .put_virtual_object_status( &keyed_service_id, - &VirtualObjectStatus::Locked(invocation_id), + &VirtualObjectStatus::Locked(*invocation_id), ) .map_err(Error::Storage)?; } @@ -1229,12 +1229,11 @@ impl StateMachineApplyContext<'_, S> { fn init_journal_and_vqueue_invoke( &mut self, - qid: &VQueueId, + vq_handle: VQueueHandle, key: &EntryKey, - invocation_id: InvocationId, + invocation_id: &InvocationId, mut in_flight_invocation_metadata: InFlightInvocationMetadata, invocation_input: Option, - limit_key: LimitKey, ) -> Result<(), Error> where S: WriteJournalTable + WriteInvocationStatusTable + journal_table_v2::WriteJournalTable, @@ -1261,19 +1260,13 @@ impl StateMachineApplyContext<'_, S> { // Emit the trace anchor span for the invocation. 
if self.is_leader { let _start = instrumentation::create_invocation_start_span( - &invocation_id, + invocation_id, &in_flight_invocation_metadata.invocation_target, &in_flight_invocation_metadata.journal_metadata.span_context, in_flight_invocation_metadata.timestamps.creation_time(), ); } - self.vqueue_invoke( - qid, - key, - invocation_id, - in_flight_invocation_metadata, - limit_key, - ) + self.vqueue_invoke(vq_handle, key, invocation_id, in_flight_invocation_metadata) } /// Inits the journal if invocation_input is `Some` and invokes the invocation. If @@ -1281,24 +1274,13 @@ impl StateMachineApplyContext<'_, S> { /// invoke the invocation. fn init_journal_and_invoke( &mut self, - invocation_id: InvocationId, + invocation_id: &InvocationId, mut in_flight_invocation_metadata: InFlightInvocationMetadata, invocation_input: Option, ) -> Result<(), Error> where S: WriteJournalTable + WriteInvocationStatusTable + journal_table_v2::WriteJournalTable, { - // Usage metering for "actions" should include the Input journal entry - // type, but it gets filtered out before reaching the state machine. - // Therefore we count it here, as a special case. - if self.is_leader { - counter!( - USAGE_LEADER_JOURNAL_ENTRY_COUNT, - "entry" => "Command/Input", - ) - .increment(1); - } - // Only init the journal if we have some invocation input if let Some(invocation_input) = invocation_input { self.init_journal( @@ -1310,8 +1292,17 @@ impl StateMachineApplyContext<'_, S> { // Emit the trace anchor span for the invocation. if self.is_leader { + // Usage metering for "actions" should include the Input journal entry + // type, but it gets filtered out before reaching the state machine. + // Therefore we count it here, as a special case. 
+ counter!( + USAGE_LEADER_JOURNAL_ENTRY_COUNT, + "entry" => "Command/Input", + ) + .increment(1); + let _start = instrumentation::create_invocation_start_span( - &invocation_id, + invocation_id, &in_flight_invocation_metadata.invocation_target, &in_flight_invocation_metadata.journal_metadata.span_context, in_flight_invocation_metadata.timestamps.creation_time(), @@ -1330,7 +1321,7 @@ impl StateMachineApplyContext<'_, S> { // will no longer be needed. fn init_journal( &mut self, - invocation_id: InvocationId, + invocation_id: &InvocationId, in_flight_invocation_metadata: &mut InFlightInvocationMetadata, invocation_input: InvocationInput, ) -> Result<(), Error> @@ -1380,7 +1371,7 @@ impl StateMachineApplyContext<'_, S> { )); journal_table::WriteJournalTable::put_journal_entry( self.storage, - &invocation_id, + invocation_id, 0, &input_entry, ) @@ -1392,11 +1383,10 @@ impl StateMachineApplyContext<'_, S> { fn vqueue_invoke( &mut self, - qid: &VQueueId, + vq_handle: VQueueHandle, key: &EntryKey, - invocation_id: InvocationId, + invocation_id: &InvocationId, in_flight_invocation_metadata: InFlightInvocationMetadata, - limit_key: LimitKey, ) -> Result<(), Error> where S: WriteInvocationStatusTable, @@ -1406,17 +1396,16 @@ impl StateMachineApplyContext<'_, S> { let status = InvocationStatus::Invoked(in_flight_invocation_metadata); self.storage - .put_invocation_status(&invocation_id, &status) + .put_invocation_status(invocation_id, &status) .map_err(Error::Storage)?; if self.is_leader { let invocation_metadata = status.into_invocation_metadata().unwrap(); let invocation_target = invocation_metadata.invocation_target; self.action_collector.push(Action::VQInvoke { - qid: qid.clone(), + vq_handle, key: *key, invocation_target, - limit_key, idempotency_key: invocation_metadata.idempotency_key.map(ReString::new_owned), }); } @@ -1426,7 +1415,7 @@ impl StateMachineApplyContext<'_, S> { fn invoke( &mut self, - invocation_id: InvocationId, + invocation_id: &InvocationId, 
in_flight_invocation_metadata: InFlightInvocationMetadata, ) -> Result<(), Error> where @@ -1434,13 +1423,15 @@ impl StateMachineApplyContext<'_, S> { { debug_if_leader!(self.is_leader, "Invoke"); - self.action_collector.push(Action::Invoke { - invocation_id, - invocation_target: in_flight_invocation_metadata.invocation_target.clone(), - }); + if self.is_leader { + self.action_collector.push(Action::Invoke { + invocation_id: *invocation_id, + invocation_target: in_flight_invocation_metadata.invocation_target.clone(), + }); + } self.storage .put_invocation_status( - &invocation_id, + invocation_id, &InvocationStatus::Invoked(in_flight_invocation_metadata), ) .map_err(Error::Storage)?; @@ -2349,11 +2340,11 @@ impl StateMachineApplyContext<'_, S> { .await?; Ok(()) } - Timer::NeoInvoke(invocation_id) => self.on_neo_invoke_timer(invocation_id).await, + Timer::NeoInvoke(ref invocation_id) => self.on_neo_invoke_timer(invocation_id).await, } } - async fn on_neo_invoke_timer(&mut self, invocation_id: InvocationId) -> Result<(), Error> + async fn on_neo_invoke_timer(&mut self, invocation_id: &InvocationId) -> Result<(), Error> where S: ReadVirtualObjectStatusTable + WriteVirtualObjectStatusTable @@ -2368,7 +2359,7 @@ impl StateMachineApplyContext<'_, S> { self.is_leader, "Handle scheduled invocation timer with invocation id {invocation_id}" ); - let invocation_status = self.get_invocation_status(&invocation_id).await?; + let invocation_status = self.get_invocation_status(invocation_id).await?; if let InvocationStatus::Free = &invocation_status { warn!( @@ -3055,15 +3046,13 @@ impl StateMachineApplyContext<'_, S> { .unwrap(); vqueue.run_entry(record_unique_ts, &header, wait_stats); - - let limit_key = vqueue.meta().limit_key().clone(); + let vq_handle = vqueue.handle(); if self.is_leader { self.action_collector.push(Action::VQInvoke { - qid: qid.clone(), + vq_handle, key: *entry_key, invocation_target: status.invocation_target().unwrap().clone(), - limit_key, // 
todo(tillrohrmann) avoid the transformation from ByteString to ReString by // storing the idempotency key as ReString in the first place idempotency_key: status @@ -3131,16 +3120,14 @@ impl StateMachineApplyContext<'_, S> { .unwrap(); vqueue.run_entry(record_unique_ts, &header, wait_stats); - - let limit_key = vqueue.meta().limit_key().clone(); + let vq_handle = vqueue.handle(); self.init_journal_and_vqueue_invoke( - qid, + vq_handle, entry_key, - invocation_id, + &invocation_id, metadata, invocation_input, - limit_key, )?; } _ => { @@ -3214,7 +3201,7 @@ impl StateMachineApplyContext<'_, S> { self.record_created_at, ); self.init_journal_and_invoke( - invocation_id, + &invocation_id, in_flight_invocation_meta, invocation_input, )?; @@ -4432,7 +4419,7 @@ impl StateMachineApplyContext<'_, S> { fn send_submit_notification_if_needed( &mut self, - invocation_id: InvocationId, + invocation_id: &InvocationId, execution_time: Option, is_new_invocation: bool, submit_notification_sink: Option, @@ -4880,7 +4867,7 @@ impl StateMachineApplyContext<'_, S> { if pinned_protocol_version.is_none_or(|sp| sp >= ServiceProtocolVersion::V4) { journal_table_v2::WriteJournalTable::delete_journal( self.storage, - invocation_id, + &invocation_id, journal_length, ) .map_err(Error::Storage)? 
diff --git a/crates/worker/src/partition/state_machine/tests/mod.rs b/crates/worker/src/partition/state_machine/tests/mod.rs index 9743d6d51e..e04d3ecf03 100644 --- a/crates/worker/src/partition/state_machine/tests/mod.rs +++ b/crates/worker/src/partition/state_machine/tests/mod.rs @@ -145,7 +145,7 @@ impl TestEnv { pub async fn apply(&mut self, command: Command) -> Vec { let mut transaction = self.storage.transaction(); let mut action_collector = ActionCollector::default(); - let mut vqueues = VQueuesMetaCache::new_empty(); + let mut vqueues = VQueuesMetaCache::new_empty(1024); self.state_machine .apply( command, @@ -167,7 +167,7 @@ impl TestEnv { pub async fn apply_fallible(&mut self, command: Command) -> Result, Error> { let mut transaction = self.storage.transaction(); let mut action_collector = ActionCollector::default(); - let mut vqueues = VQueuesMetaCache::new_empty(); + let mut vqueues = VQueuesMetaCache::new_empty(1024); self.state_machine .apply( command, diff --git a/tools/pp-bench/src/workload.rs b/tools/pp-bench/src/workload.rs index f55a119794..06f4d751e8 100644 --- a/tools/pp-bench/src/workload.rs +++ b/tools/pp-bench/src/workload.rs @@ -104,7 +104,8 @@ pub async fn run( // Initialize the vqueue cache from the (empty) partition store — this follows // the production init path and avoids requiring test-util features. - let mut vq_cache = VQueuesMetaCache::create(partition_store.partition_db().clone()).await?; + let mut vq_cache = + VQueuesMetaCache::create(partition_store.partition_db().clone(), 100_000).await?; let workload_name = format!("{:?}", opts.spec.workload); let batch_size = opts.batch_size;