diff --git a/Cargo.lock b/Cargo.lock index 4a084bee5f..07668d8e11 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8619,9 +8619,11 @@ dependencies = [ "restate-limiter", "restate-memory", "restate-partition-store", + "restate-platform", "restate-rocksdb", "restate-serde-util", "restate-storage-api", + "restate-time-util", "restate-types", "restate-util-string", "restate-worker-api", diff --git a/crates/partition-store/src/journal_table_v2/mod.rs b/crates/partition-store/src/journal_table_v2/mod.rs index 8bb7fcbdfd..33ebf9bd4d 100644 --- a/crates/partition-store/src/journal_table_v2/mod.rs +++ b/crates/partition-store/src/journal_table_v2/mod.rs @@ -769,23 +769,23 @@ fn budgeted_journal_v2_stream<'a, DB: DBAccess + Send>( impl WriteJournalTable for PartitionStoreTransaction<'_> { fn put_journal_entry( &mut self, - invocation_id: InvocationId, + invocation_id: &InvocationId, index: u32, entry: &StoredRawEntry, related_completion_ids: &[CompletionId], ) -> Result<()> { - self.assert_partition_key(&invocation_id)?; - put_journal_entry(self, &invocation_id, index, entry, related_completion_ids) + self.assert_partition_key(invocation_id)?; + put_journal_entry(self, invocation_id, index, entry, related_completion_ids) } fn delete_journal( &mut self, - invocation_id: InvocationId, + invocation_id: &InvocationId, journal_length: EntryIndex, ) -> Result<()> { - self.assert_partition_key(&invocation_id)?; + self.assert_partition_key(invocation_id)?; let _x = RocksDbPerfGuard::new("delete-journal"); - delete_journal(self, &invocation_id, journal_length) + delete_journal(self, invocation_id, journal_length) } } diff --git a/crates/partition-store/src/tests/journal_table_v2_test/mod.rs b/crates/partition-store/src/tests/journal_table_v2_test/mod.rs index da58dcd129..d5f6226010 100644 --- a/crates/partition-store/src/tests/journal_table_v2_test/mod.rs +++ b/crates/partition-store/src/tests/journal_table_v2_test/mod.rs @@ -99,7 +99,7 @@ fn 
mock_one_way_call_command(invocation_id_completion_id: CompletionId) -> Entry fn populate_sleep_journal(txn: &mut T) { for i in 0..5 { txn.put_journal_entry( - MOCK_INVOCATION_ID_1, + &MOCK_INVOCATION_ID_1, i, &StoredRawEntry::new( StoredRawEntryHeader::new(MillisSinceEpoch::now()), @@ -111,7 +111,7 @@ fn populate_sleep_journal(txn: &mut T) { } for i in 5..10 { txn.put_journal_entry( - MOCK_INVOCATION_ID_1, + &MOCK_INVOCATION_ID_1, i, &StoredRawEntry::new( StoredRawEntryHeader::new(MillisSinceEpoch::now()), @@ -199,7 +199,7 @@ async fn sleep_point_lookups(txn: &mut T) { } fn delete_journal(txn: &mut T, length: usize) { - txn.delete_journal(MOCK_INVOCATION_ID_1, length as u32) + txn.delete_journal(&MOCK_INVOCATION_ID_1, length as u32) .unwrap(); } @@ -259,7 +259,7 @@ async fn test_call_journal() { // Populate txn.put_journal_entry( - MOCK_INVOCATION_ID_1, + &MOCK_INVOCATION_ID_1, 0, &StoredRawEntry::new( StoredRawEntryHeader::new(MillisSinceEpoch::now()), @@ -269,7 +269,7 @@ async fn test_call_journal() { ) .unwrap(); txn.put_journal_entry( - MOCK_INVOCATION_ID_1, + &MOCK_INVOCATION_ID_1, 1, &StoredRawEntry::new( StoredRawEntryHeader::new(MillisSinceEpoch::now()), diff --git a/crates/partition-store/src/tests/vqueue_table_test/mod.rs b/crates/partition-store/src/tests/vqueue_table_test/mod.rs index be7d060657..67a784b8ed 100644 --- a/crates/partition-store/src/tests/vqueue_table_test/mod.rs +++ b/crates/partition-store/src/tests/vqueue_table_test/mod.rs @@ -20,17 +20,21 @@ //! Invariants: //! - The waiting cursor must not cross the boundary to adjacent vqueue ids. //! - The waiting cursor must not cross partition-key boundaries either. -//! - When the waiting cursor is "seeked" to to first after a higher-order item has been inserted -//! (i.e. item with has_lock=true, or with older run_at), the cursor must show this added item. +//! - The waiting cursor uses snapshot semantics: it captures a consistent +//! view of storage at creation time. 
Writes/deletes that happen after the +//! cursor is created are NOT visible to that cursor — callers must create +//! a fresh cursor to observe them. use restate_clock::time::MillisSinceEpoch; use restate_storage_api::Transaction; +use restate_storage_api::vqueue_table::ScanVQueueInboxStages; use restate_storage_api::vqueue_table::{ - EntryKey, EntryMetadata, EntryValue, Stage, Status, VQueueCursor, VQueueStore, - WriteVQueueTable, stats::EntryStatistics, + EntryKey, EntryMetadata, EntryValue, Options, Stage, Status, VQueueCursor, VQueueRunningCursor, + VQueueStore, WriteVQueueTable, stats::EntryStatistics, }; use restate_types::clock::UniqueTimestamp; use restate_types::identifiers::PartitionKey; +use restate_types::sharding::KeyRange; use restate_types::vqueues::{EntryId, EntryKind, VQueueId}; use crate::PartitionStore; @@ -90,7 +94,16 @@ fn default_entry(id: u8) -> (EntryKey, EntryValue) { entry(id, false, 0, id as u64) } -/// Collects all items from a cursor into a Vec +fn inbox_reader(db: &crate::PartitionDb, qid: &VQueueId) -> impl VQueueCursor + use<> { + db.new_inbox_reader( + qid, + Options { + allow_blocking_io: true, + }, + ) +} + +/// Collects all items from an inbox cursor. fn collect_cursor(cursor: &mut C) -> Vec<(EntryKey, EntryValue)> { let mut items = Vec::new(); cursor.seek_to_first(); @@ -101,6 +114,17 @@ fn collect_cursor(cursor: &mut C) -> Vec<(EntryKey, EntryValue) items } +/// Collects all items from a running cursor (different error type than inbox). 
+fn collect_running(cursor: &mut C) -> Vec<(EntryKey, EntryValue)> { + let mut items = Vec::new(); + cursor.seek_to_first(); + while let Ok(Some(item)) = cursor.peek() { + items.push(item); + cursor.advance(); + } + items +} + fn collect_ids(items: &[(EntryKey, EntryValue)]) -> Vec { items.iter().map(|(key, _)| *key.entry_id()).collect() } @@ -125,7 +149,7 @@ async fn key_ordering_by_has_lock_run_at_seq(txn: &mut W) { fn verify_key_ordering_by_has_lock_run_at_seq(db: &crate::PartitionDb) { let qid = test_qid(); - let mut reader = db.new_inbox_reader(&qid); + let mut reader = inbox_reader(db, &qid); let items = collect_cursor(&mut reader); assert_eq!(items.len(), 5, "Expected 5 items in inbox"); @@ -165,7 +189,7 @@ async fn ordering_within_same_lock_domain(txn: &mut W) { fn verify_ordering_within_same_lock_domain(db: &crate::PartitionDb) { let qid = VQueueId::custom(2000, "1"); - let mut reader = db.new_inbox_reader(&qid); + let mut reader = inbox_reader(db, &qid); let items = collect_cursor(&mut reader); assert_eq!(items.len(), 4, "Expected 4 items"); @@ -199,12 +223,12 @@ fn verify_running_and_inbox_are_separate(db: &crate::PartitionDb) { // Running reader should only see the Run stage entry let mut run_reader = db.new_run_reader(&qid); - let run_items = collect_cursor(&mut run_reader); + let run_items = collect_running(&mut run_reader); assert_eq!(run_items.len(), 1, "Running reader should see 1 item"); assert_eq!(*run_items[0].0.entry_id(), entry_id(10)); // Inbox reader should only see the Inbox stage entries - let mut inbox_reader = db.new_inbox_reader(&qid); + let mut inbox_reader = inbox_reader(db, &qid); let inbox_items = collect_cursor(&mut inbox_reader); assert_eq!(inbox_items.len(), 2, "Inbox reader should see 2 items"); assert_eq!(collect_ids(&inbox_items), vec![entry_id(20), entry_id(21)]); @@ -226,10 +250,10 @@ fn verify_seek_after_works(db: &crate::PartitionDb) { let entries: Vec<_> = (1..=5).map(default_entry).collect(); - let mut reader = 
db.new_inbox_reader(&qid); + let mut reader = inbox_reader(db, &qid); // Seek after the 3rd entry (id=3) - reader.seek_after(&qid, &entries[2].0); + reader.seek_after(&entries[2].0); let item = reader.peek().unwrap(); assert!(item.is_some(), "Should have items after seek_after"); // Next item should be entry 4 (strictly after entry 3) @@ -251,7 +275,7 @@ fn verify_empty_queue_returns_none(db: &crate::PartitionDb) { "Empty running queue should return None" ); - let mut inbox_reader = db.new_inbox_reader(&qid); + let mut inbox_reader = inbox_reader(db, &qid); inbox_reader.seek_to_first(); assert!( inbox_reader.peek().unwrap().is_none(), @@ -283,17 +307,17 @@ fn verify_vqueue_isolation(db: &crate::PartitionDb) { let qid3 = VQueueId::custom(pkey, "3"); // Each queue should only see its own entry - let mut reader1 = db.new_inbox_reader(&qid1); + let mut reader1 = inbox_reader(db, &qid1); let items1 = collect_cursor(&mut reader1); assert_eq!(items1.len(), 1); assert_eq!(*items1[0].0.entry_id(), entry_id(1)); - let mut reader2 = db.new_inbox_reader(&qid2); + let mut reader2 = inbox_reader(db, &qid2); let items2 = collect_cursor(&mut reader2); assert_eq!(items2.len(), 1); assert_eq!(*items2[0].0.entry_id(), entry_id(2)); - let mut reader3 = db.new_inbox_reader(&qid3); + let mut reader3 = inbox_reader(db, &qid3); let items3 = collect_cursor(&mut reader3); assert_eq!(items3.len(), 1); assert_eq!(*items3[0].0.entry_id(), entry_id(3)); @@ -320,7 +344,7 @@ fn verify_waiting_cursor_boundary_is_respected(db: &crate::PartitionDb) { let qid_b = VQueueId::custom(pkey, "b"); let a2 = entry(12, false, 10, 2); - let mut reader_a = db.new_inbox_reader(&qid_a); + let mut reader_a = inbox_reader(db, &qid_a); reader_a.seek_to_first(); assert_eq!( @@ -339,13 +363,13 @@ fn verify_waiting_cursor_boundary_is_respected(db: &crate::PartitionDb) { "Reader for qid_a must stop before qid_b entries" ); - reader_a.seek_after(&qid_a, &a2.0); + reader_a.seek_after(&a2.0); assert!( 
reader_a.peek().unwrap().is_none(), "seek_after(last_item) must not cross into the next vqueue" ); - let mut reader_b = db.new_inbox_reader(&qid_b); + let mut reader_b = inbox_reader(db, &qid_b); let items_b = collect_cursor(&mut reader_b); assert_eq!(collect_ids(&items_b), vec![entry_id(21)]); } @@ -376,7 +400,7 @@ fn verify_waiting_cursor_partition_prefix_boundary_is_respected(db: &crate::Part let qid_next_partition = VQueueId::custom(PartitionKey::from(5_201u64), "shared-boundary"); let target2 = entry(42, false, 10, 2); - let mut reader_target = db.new_inbox_reader(&qid_target_partition); + let mut reader_target = inbox_reader(db, &qid_target_partition); reader_target.seek_to_first(); assert_eq!( @@ -403,33 +427,33 @@ fn verify_waiting_cursor_partition_prefix_boundary_is_respected(db: &crate::Part "Reader for target qid must stop before adjacent partition keys" ); - reader_target.seek_after(&qid_target_partition, &target2.0); + reader_target.seek_after(&target2.0); assert!( reader_target.peek().unwrap().is_none(), "seek_after(last_item) must not cross into adjacent partition-key prefixes" ); - let mut reader_prev = db.new_inbox_reader(&qid_prev_partition); + let mut reader_prev = inbox_reader(db, &qid_prev_partition); assert_eq!( collect_ids(&collect_cursor(&mut reader_prev)), vec![entry_id(31)] ); - let mut reader_next = db.new_inbox_reader(&qid_next_partition); + let mut reader_next = inbox_reader(db, &qid_next_partition); assert_eq!( collect_ids(&collect_cursor(&mut reader_next)), vec![entry_id(51)] ); } -/// Test: Tailing iterator sees newly enqueued items after seek_to_first. +/// Test: a freshly created reader sees the current state of storage, +/// including writes that landed after a previous reader was created. /// -/// This verifies that the inbox reader (which uses a tailing iterator) can see -/// items that were added after the reader was created, when re-seeking. 
-async fn tailing_iterator_sees_new_items_on_reseek(rocksdb: &mut PartitionStore) { +/// This is the snapshot-semantics counterpart to the previous tailing-iterator +/// tests: callers must construct a new reader to observe post-creation writes. +async fn fresh_reader_sees_current_state(rocksdb: &mut PartitionStore) { let qid = VQueueId::custom(6000, "1"); - // Insert initial entries let entry1 = default_entry(1); let entry2 = default_entry(2); { @@ -439,475 +463,60 @@ async fn tailing_iterator_sees_new_items_on_reseek(rocksdb: &mut PartitionStore) txn.commit().await.expect("commit should succeed"); } - // Create reader and verify initial state - let db = rocksdb.partition_db(); - let mut reader = db.new_inbox_reader(&qid); - reader.seek_to_first(); - - let item = reader.peek().unwrap(); - assert_eq!(item.as_ref().map(|e| *e.0.entry_id()), Some(entry_id(1))); - reader.advance(); - - let item = reader.peek().unwrap(); - assert_eq!(item.as_ref().map(|e| *e.0.entry_id()), Some(entry_id(2))); - reader.advance(); - - // Should be empty now - assert!(reader.peek().unwrap().is_none()); - - // Now add a new entry while the reader is still open - let entry3 = default_entry(3); + // Create the original reader and drain it. 
{ - let mut txn = rocksdb.transaction(); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry3.0, &entry3.1); - txn.commit().await.expect("commit should succeed"); + let db = rocksdb.partition_db(); + let mut reader = inbox_reader(db, &qid); + let items = collect_cursor(&mut reader); + assert_eq!(collect_ids(&items), vec![entry_id(1), entry_id(2)]); } - // Re-seek to first - should now see all 3 entries - reader.seek_to_first(); - let items = { - let mut items = Vec::new(); - while let Ok(Some(item)) = reader.peek() { - items.push(item); - reader.advance(); - } - items - }; - - assert_eq!(items.len(), 3, "Should see all 3 items after reseek"); - assert_eq!( - collect_ids(&items), - vec![entry_id(1), entry_id(2), entry_id(3)] - ); -} - -/// Test: Re-seek to first sees newly inserted higher-order items. -/// -/// This covers both invariants mentioned in the module docs: -/// - new item with older `run_at` appears first after re-seek -/// - new item with `has_lock=true` appears first after re-seek -async fn reseek_shows_new_higher_order_items(rocksdb: &mut PartitionStore) { - let qid = VQueueId::custom(6_500, "1"); - - let base1 = entry(1, false, 100, 1); - let base2 = entry(2, false, 200, 2); - { - let mut txn = rocksdb.transaction(); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &base1.0, &base1.1); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &base2.0, &base2.1); - txn.commit().await.expect("commit should succeed"); - } - - let db = rocksdb.partition_db(); - let mut reader = db.new_inbox_reader(&qid); - reader.seek_to_first(); - assert_eq!(*reader.peek().unwrap().unwrap().0.entry_id(), entry_id(1)); - - // Add an item with older run_at (higher order among unlocked entries) - let older_run_at = entry(3, false, 50, 3); - { - let mut txn = rocksdb.transaction(); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &older_run_at.0, &older_run_at.1); - txn.commit().await.expect("commit should succeed"); - } - - reader.seek_to_first(); - 
assert_eq!(*reader.peek().unwrap().unwrap().0.entry_id(), entry_id(3)); - - // Add an item that has a lock (always higher order than unlocked entries) - let locked = entry(4, true, 300, 4); - { - let mut txn = rocksdb.transaction(); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &locked.0, &locked.1); - txn.commit().await.expect("commit should succeed"); - } - - reader.seek_to_first(); - let items = { - let mut items = Vec::new(); - while let Ok(Some(item)) = reader.peek() { - items.push(item); - reader.advance(); - } - items - }; - - assert_eq!( - collect_ids(&items), - vec![entry_id(4), entry_id(3), entry_id(1), entry_id(2)] - ); -} - -/// Test: Tailing iterator sees newly enqueued items via seek_after. -/// -/// This verifies that when using seek_after to resume iteration, newly added -/// items that sort after the seek position are visible. -async fn tailing_iterator_sees_new_items_on_seek_after(rocksdb: &mut PartitionStore) { - let qid = VQueueId::custom(7000, "1"); - - // Insert initial entries with different key prefixes - let entry_first = entry(1, true, 10, 1); - let entry_second = entry(2, false, 10, 2); - { - let mut txn = rocksdb.transaction(); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry_first.0, &entry_first.1); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry_second.0, &entry_second.1); - txn.commit().await.expect("commit should succeed"); - } - - // Create reader and read the first item - let db = rocksdb.partition_db(); - let mut reader = db.new_inbox_reader(&qid); - reader.seek_to_first(); - - let first = reader.peek().unwrap().unwrap(); - assert_eq!(*first.0.entry_id(), entry_id(1)); - - // Now add a new entry that sorts after the seek position - let entry_third = entry(3, false, 10, 3); - { - let mut txn = rocksdb.transaction(); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry_third.0, &entry_third.1); - txn.commit().await.expect("commit should succeed"); - } - - // Seek after the first item - should see entry_second and entry_third - 
reader.seek_after(&qid, &first.0); - - let mut remaining = Vec::new(); - while let Ok(Some(item)) = reader.peek() { - remaining.push(item); - reader.advance(); - } - - assert_eq!(remaining.len(), 2, "Should see 2 items after seek_after"); - assert_eq!(collect_ids(&remaining), vec![entry_id(2), entry_id(3)]); -} - -/// Test: Tailing iterator sees items inserted ahead of current position without re-seek. -/// -/// Scenario (1): after the cursor has advanced at least once, if a new item is inserted -/// at a key greater than the current position, it should become visible by continuing to -/// call `advance()`/`peek()` without any seek. -async fn tailing_iterator_sees_inserted_ahead_without_reseek(rocksdb: &mut PartitionStore) { - let qid = VQueueId::custom(7_200, "1"); - - let entry1 = entry(1, false, 10, 1); - let entry3 = entry(3, false, 30, 3); - let entry5 = entry(5, false, 50, 5); - { - let mut txn = rocksdb.transaction(); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry1.0, &entry1.1); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry3.0, &entry3.1); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry5.0, &entry5.1); - txn.commit().await.expect("commit should succeed"); - } - - let db = rocksdb.partition_db(); - let mut reader = db.new_inbox_reader(&qid); - reader.seek_to_first(); - - assert_eq!(*reader.peek().unwrap().unwrap().0.entry_id(), entry_id(1)); - reader.advance(); - assert_eq!(*reader.peek().unwrap().unwrap().0.entry_id(), entry_id(3)); - - let entry4 = entry(4, false, 40, 4); - { - let mut txn = rocksdb.transaction(); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry4.0, &entry4.1); - txn.commit().await.expect("commit should succeed"); - } - - // Continue from current position without seek. 
- reader.advance(); - let maybe_inserted = reader.peek().unwrap().as_ref().map(|e| *e.0.entry_id()); - assert_eq!( - maybe_inserted, - Some(entry_id(4)), - "Inserted item ahead of current position should be visible without re-seek" - ); - reader.advance(); - assert_eq!(*reader.peek().unwrap().unwrap().0.entry_id(), entry_id(5)); -} - -/// Test: Tailing iterator sees flushed insertions while mid-iteration without re-seek. -/// -/// Scenario: the cursor advances a couple of times and still has items to read. -/// New entries are inserted ahead of the current position, but before the next -/// existing item (splicing), and memtables are flushed. -/// Continuing with `advance()`/`peek()` (without seek) should surface the flushed items. -async fn tailing_iterator_sees_flushed_insertions_mid_iteration(rocksdb: &mut PartitionStore) { - let qid = VQueueId::custom(7_250, "1"); - - let entry1 = entry(1, false, 10, 1); - let entry3 = entry(3, false, 30, 3); - let entry6 = entry(6, false, 60, 6); - let entry9 = entry(9, false, 90, 9); - { - let mut txn = rocksdb.transaction(); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry1.0, &entry1.1); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry3.0, &entry3.1); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry6.0, &entry6.1); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry9.0, &entry9.1); - txn.commit().await.expect("commit should succeed"); - } - - let db = rocksdb.partition_db(); - let mut reader = db.new_inbox_reader(&qid); - reader.seek_to_first(); - - assert_eq!(*reader.peek().unwrap().unwrap().0.entry_id(), entry_id(1)); - reader.advance(); - assert_eq!(*reader.peek().unwrap().unwrap().0.entry_id(), entry_id(3)); - - // Splice entries between current position (3) and next existing item (6). 
- let entry4 = entry(4, false, 40, 4); - let entry5 = entry(5, false, 50, 5); - { - let mut txn = rocksdb.transaction(); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry4.0, &entry4.1); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry5.0, &entry5.1); - txn.commit().await.expect("commit should succeed"); - } - - rocksdb - .partition_db() - .flush_memtables(true) - .await - .expect("flush memtables should succeed"); - - // Continue from current position without seek. - reader.advance(); - let mut remaining_ids = Vec::new(); - while let Ok(Some(item)) = reader.peek() { - remaining_ids.push(*item.0.entry_id()); - reader.advance(); - } - - assert_eq!( - remaining_ids, - vec![entry_id(4), entry_id(5), entry_id(6), entry_id(9)], - "Spliced flushed items should be visible and not skipped without re-seek" - ); -} - -/// Test: Seeked tailing iterator sees spliced insertions after a pre-insert flush. -/// -/// Scenario: perform one seek to position the cursor on the item immediately -/// before the splice point, flush memtables, then insert a new item in the middle. -/// Advancing from that seeked position should return the new spliced item first. 
-async fn seeked_tailing_iterator_sees_spliced_insertions_after_preflush( - rocksdb: &mut PartitionStore, -) { - let qid = VQueueId::custom(7_255, "1"); - - let entry1 = entry(1, false, 10, 1); - let entry3 = entry(3, false, 30, 3); - let entry6 = entry(6, false, 60, 6); - let entry9 = entry(9, false, 90, 9); - { - let mut txn = rocksdb.transaction(); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry1.0, &entry1.1); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry3.0, &entry3.1); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry6.0, &entry6.1); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry9.0, &entry9.1); - txn.commit().await.expect("commit should succeed"); - } - - let db = rocksdb.partition_db(); - let mut reader = db.new_inbox_reader(&qid); - - rocksdb - .partition_db() - .flush_memtables(true) - .await - .expect("flush memtables should succeed"); - - // Single seek before inserting new items: land on entry3. - reader.seek_after(&qid, &entry1.0); - assert_eq!(*reader.peek().unwrap().unwrap().0.entry_id(), entry_id(3)); - - // Splice one entry between current position (3) and next existing item (6). - let entry4 = entry(4, false, 40, 4); - { - let mut txn = rocksdb.transaction(); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry4.0, &entry4.1); - txn.commit().await.expect("commit should succeed"); - } - - // Continue from the seeked position without another seek. - reader.advance(); - - // This is the proof that we need to re-seek because the iterator will be blind - // to the newly added item if the seek happened after the mutable memtable flush. - assert_ne!( - *reader.peek().unwrap().unwrap().0.entry_id(), - entry_id(4), - "advance from seeked predecessor should surface the spliced item first" - ); - - // Re-seeking should make it visible again. 
- reader.seek_after(&qid, &entry1.0); - reader.advance(); - assert_eq!( - *reader.peek().unwrap().unwrap().0.entry_id(), - entry_id(4), - "advance from seeked predecessor should surface the spliced item first" - ); - - reader.advance(); - let mut tail_ids = Vec::new(); - while let Ok(Some(item)) = reader.peek() { - tail_ids.push(*item.0.entry_id()); - reader.advance(); - } - - assert_eq!( - tail_ids, - vec![entry_id(6), entry_id(9)], - "Remaining tail after the spliced item should keep original order" - ); -} - -/// Test: Seeked tailing iterator sees appended insertions after a pre-insert flush. -/// -/// Scenario: perform one seek, flush memtables, then insert new items that are strictly -/// after the existing tail. Continuing with `advance()`/`peek()` from that seeked -/// position should eventually surface the appended items. -async fn seeked_tailing_iterator_sees_appended_insertions_after_preflush( - rocksdb: &mut PartitionStore, -) { - let qid = VQueueId::custom(7_256, "1"); - - let entry1 = entry(1, false, 10, 1); - let entry3 = entry(3, false, 30, 3); - let entry6 = entry(6, false, 60, 6); - let entry9 = entry(9, false, 90, 9); + // Append a new item. + let entry3 = default_entry(3); { let mut txn = rocksdb.transaction(); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry1.0, &entry1.1); txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry3.0, &entry3.1); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry6.0, &entry6.1); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry9.0, &entry9.1); txn.commit().await.expect("commit should succeed"); } + // A fresh reader sees all three items. let db = rocksdb.partition_db(); - let mut reader = db.new_inbox_reader(&qid); - - // Single seek before appending new tail items: land on entry3. 
- reader.seek_after(&qid, &entry1.0); - assert_eq!(*reader.peek().unwrap().unwrap().0.entry_id(), entry_id(3)); - - rocksdb - .partition_db() - .flush_memtables(true) - .await - .expect("flush memtables should succeed"); - - let entry10 = entry(10, false, 100, 10); - let entry11 = entry(11, false, 110, 11); - { - let mut txn = rocksdb.transaction(); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry10.0, &entry10.1); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry11.0, &entry11.1); - txn.commit().await.expect("commit should succeed"); - } - - // Continue from the seeked position without another seek. - reader.advance(); - let mut remaining_ids = Vec::new(); - while let Ok(Some(item)) = reader.peek() { - remaining_ids.push(*item.0.entry_id()); - reader.advance(); - } - + let mut reader = inbox_reader(db, &qid); + let items = collect_cursor(&mut reader); assert_eq!( - remaining_ids, - vec![entry_id(6), entry_id(9), entry_id(10), entry_id(11)], - "Appended flushed items should be visible after traversing existing tail" + collect_ids(&items), + vec![entry_id(1), entry_id(2), entry_id(3)] ); } -/// Test: Tailing iterator sees items inserted after reaching the end without re-seek. -/// -/// Scenario (2): after the cursor reaches end-of-iteration, if a new item is inserted, -/// continuing with `advance()`/`peek()` (without seek) should surface the new item. -async fn tailing_iterator_sees_item_added_after_end_without_reseek(rocksdb: &mut PartitionStore) { - let qid = VQueueId::custom(7_300, "1"); +/// Test: an existing reader holds a snapshot — writes after creation are not +/// observable, even after `seek_to_first` or `seek_after`. 
+async fn existing_reader_does_not_see_post_snapshot_writes(rocksdb: &mut PartitionStore) { + let qid = VQueueId::custom(6_100, "1"); - let entry1 = entry(1, false, 10, 1); + let entry1 = default_entry(1); + let entry2 = default_entry(2); { let mut txn = rocksdb.transaction(); txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry1.0, &entry1.1); + txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry2.0, &entry2.1); txn.commit().await.expect("commit should succeed"); } let db = rocksdb.partition_db(); - let mut reader = db.new_inbox_reader(&qid); + let mut reader = inbox_reader(db, &qid); reader.seek_to_first(); - assert_eq!(*reader.peek().unwrap().unwrap().0.entry_id(), entry_id(1)); - reader.advance(); - assert!(reader.peek().unwrap().is_none(), "Reader should be at end"); - - let entry2 = entry(2, false, 20, 2); - { - let mut txn = rocksdb.transaction(); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry2.0, &entry2.1); - txn.commit().await.expect("commit should succeed"); - } - - // Stay in the same iterator path, no re-seek. - reader.advance(); - assert!( - reader.peek().unwrap().is_none(), - "Inserted item after end is not visible without re-seek" - ); - - // Re-seeking makes the newly inserted item visible. - reader.seek_after(&qid, &entry1.0); - assert_eq!(*reader.peek().unwrap().unwrap().0.entry_id(), entry_id(2)); -} - -/// Test: Deleted items don't appear after reseek. -/// -/// This verifies that when an item is deleted while the reader is open, -/// re-seeking will not return the deleted item. -async fn deleted_items_not_visible_after_reseek(rocksdb: &mut PartitionStore) { - let qid = VQueueId::custom(8000, "1"); - // Insert initial entries - let entry1 = default_entry(1); - let entry2 = default_entry(2); + // Insert a new item after the reader has taken its snapshot. 
let entry3 = default_entry(3); { let mut txn = rocksdb.transaction(); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry1.0, &entry1.1); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry2.0, &entry2.1); txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry3.0, &entry3.1); txn.commit().await.expect("commit should succeed"); } - // Create reader and verify we see all 3 - let db = rocksdb.partition_db(); - let mut reader = db.new_inbox_reader(&qid); - let items = collect_cursor(&mut reader); - assert_eq!(items.len(), 3); - - // Delete the middle entry while the reader is still open - { - let mut txn = rocksdb.transaction(); - assert!( - txn.get_vqueue_inbox(&qid, Stage::Inbox, &entry2.0) - .unwrap() - .is_some() - ); - txn.delete_vqueue_inbox(&qid, Stage::Inbox, &entry2.0); - txn.commit().await.expect("commit should succeed"); - } - - // Re-seek and verify we only see entries 1 and 3 + // Re-seeking the same reader still only shows the snapshot's two items. reader.seek_to_first(); let items = { let mut items = Vec::new(); @@ -917,123 +526,68 @@ async fn deleted_items_not_visible_after_reseek(rocksdb: &mut PartitionStore) { } items }; - - assert_eq!(items.len(), 2, "Should only see 2 items after deletion"); - assert_eq!(collect_ids(&items), vec![entry_id(1), entry_id(3)]); + assert_eq!(collect_ids(&items), vec![entry_id(1), entry_id(2)]); } -/// Test: Deleted items don't appear after seek_after. +/// Test: Stage scan reads only the requested stage key kind. /// -/// This verifies that deleted items are not returned when using seek_after -/// to resume iteration past a certain point. -async fn deleted_items_not_visible_after_seek_after(rocksdb: &mut PartitionStore) { - let qid = VQueueId::custom(8500, "1"); +/// This validates the datafusion-oriented scan API and ensures stage-specific +/// scans do not leak rows from adjacent stage key kinds or partition keys. 
+async fn stage_scan_is_filtered_by_stage(rocksdb: &mut PartitionStore) { + let target_partition_key = PartitionKey::from(9_300u64); + let other_partition_key = PartitionKey::from(9_301u64); + let target_qid = VQueueId::custom(target_partition_key, "scan-target"); + let other_qid = VQueueId::custom(other_partition_key, "scan-target"); + + let stages = [ + Stage::Inbox, + Stage::Running, + Stage::Suspended, + Stage::Paused, + Stage::Finished, + ]; - // Insert entries - let entry1 = default_entry(1); - let entry2 = default_entry(2); - let entry3 = default_entry(3); { let mut txn = rocksdb.transaction(); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry1.0, &entry1.1); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry2.0, &entry2.1); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry3.0, &entry3.1); - txn.commit().await.expect("commit should succeed"); - } - - // Create reader - let db = rocksdb.partition_db(); - let mut reader = db.new_inbox_reader(&qid); - reader.seek_to_first(); - - // Read first item - let first = reader.peek().unwrap().unwrap(); - assert_eq!(*first.0.entry_id(), entry_id(1)); + for (index, stage) in stages.into_iter().enumerate() { + let entry_id = 100 + index as u8; + let target_entry = default_entry(entry_id); + let other_entry = default_entry(entry_id + 10); - // Delete entries 2 and 3 while reader is open - { - let mut txn = rocksdb.transaction(); - assert!( - txn.get_vqueue_inbox(&qid, Stage::Inbox, &entry2.0) - .unwrap() - .is_some() - ); - txn.delete_vqueue_inbox(&qid, Stage::Inbox, &entry2.0); - assert!( - txn.get_vqueue_inbox(&qid, Stage::Inbox, &entry3.0) - .unwrap() - .is_some() - ); - txn.delete_vqueue_inbox(&qid, Stage::Inbox, &entry3.0); + txn.put_vqueue_inbox(&target_qid, stage, &target_entry.0, &target_entry.1); + txn.put_vqueue_inbox(&other_qid, stage, &other_entry.0, &other_entry.1); + } txn.commit().await.expect("commit should succeed"); } - // Seek after first - should see nothing since 2 and 3 are deleted - 
reader.seek_after(&qid, &first.0); - assert!( - reader.peek().unwrap().is_none(), - "Should see no items after seek_after when remaining items are deleted" - ); -} + let range = KeyRange::from(target_partition_key..=target_partition_key); -/// Test: Concurrent enqueue and delete operations are handled correctly. -/// -/// This tests a more complex scenario where items are both added and removed -/// while the reader is open. -async fn concurrent_enqueue_and_delete(rocksdb: &mut PartitionStore) { - let qid = VQueueId::custom(9000, "1"); - - // Insert initial entries in deterministic key order - let entry_high = entry(10, true, 10, 10); - let entry_mid = entry(20, false, 20, 20); - let entry_low = entry(30, false, 30, 30); - { - let mut txn = rocksdb.transaction(); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry_high.0, &entry_high.1); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry_mid.0, &entry_mid.1); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry_low.0, &entry_low.1); - txn.commit().await.expect("commit should succeed"); - } + for (index, stage) in stages.into_iter().enumerate() { + let expected_key = default_entry(100 + index as u8).0; + let rows = std::sync::Arc::new(std::sync::Mutex::new(Vec::new())); + let rows_for_scan = rows.clone(); - // Create reader and read first item - let db = rocksdb.partition_db(); - let mut reader = db.new_inbox_reader(&qid); - reader.seek_to_first(); + rocksdb + .for_each_vqueue_inbox_entry(range, stage, move |(qid, got_stage, key, _)| { + rows_for_scan + .lock() + .expect("stage scan lock should not be poisoned") + .push((qid.clone(), got_stage, *key)); + std::ops::ControlFlow::Continue(()) + }) + .expect("stage scan setup should succeed") + .await + .expect("stage scan should succeed"); - let first = reader.peek().unwrap().unwrap(); - assert_eq!(*first.0.entry_id(), entry_id(10)); - - // Simultaneously: delete entry_mid, add a new entry that sorts first - let entry_new_first = entry(5, true, 5, 5); - { - let mut txn = 
rocksdb.transaction(); - assert!( - txn.get_vqueue_inbox(&qid, Stage::Inbox, &entry_mid.0) - .unwrap() - .is_some() - ); - txn.delete_vqueue_inbox(&qid, Stage::Inbox, &entry_mid.0); - txn.put_vqueue_inbox(&qid, Stage::Inbox, &entry_new_first.0, &entry_new_first.1); - txn.commit().await.expect("commit should succeed"); + let rows = rows + .lock() + .expect("stage scan lock should not be poisoned") + .clone(); + assert_eq!(rows.len(), 1, "stage {stage} should return one row"); + assert_eq!(rows[0].0, target_qid, "stage {stage} returned wrong qid"); + assert_eq!(rows[0].1, stage, "stage {stage} returned wrong stage"); + assert_eq!(rows[0].2, expected_key, "stage {stage} returned wrong key"); } - - // Re-seek from start - should see: id=5, id=10, id=30 - // (id=20 is deleted, id=5 is added) - reader.seek_to_first(); - let items = { - let mut items = Vec::new(); - while let Ok(Some(item)) = reader.peek() { - items.push(item); - reader.advance(); - } - items - }; - - assert_eq!(items.len(), 3, "Should see 3 items"); - assert_eq!( - collect_ids(&items), - vec![entry_id(5), entry_id(10), entry_id(30)] - ); } pub(crate) async fn run_tests(mut rocksdb: PartitionStore) { @@ -1062,16 +616,9 @@ pub(crate) async fn run_tests(mut rocksdb: PartitionStore) { verify_waiting_cursor_boundary_is_respected(db); verify_waiting_cursor_partition_prefix_boundary_is_respected(db); - // Tailing iterator tests - these need mutable access to rocksdb for writes - tailing_iterator_sees_new_items_on_reseek(&mut rocksdb).await; - reseek_shows_new_higher_order_items(&mut rocksdb).await; - tailing_iterator_sees_new_items_on_seek_after(&mut rocksdb).await; - tailing_iterator_sees_inserted_ahead_without_reseek(&mut rocksdb).await; - tailing_iterator_sees_flushed_insertions_mid_iteration(&mut rocksdb).await; - seeked_tailing_iterator_sees_spliced_insertions_after_preflush(&mut rocksdb).await; - seeked_tailing_iterator_sees_appended_insertions_after_preflush(&mut rocksdb).await; - 
tailing_iterator_sees_item_added_after_end_without_reseek(&mut rocksdb).await; - deleted_items_not_visible_after_reseek(&mut rocksdb).await; - deleted_items_not_visible_after_seek_after(&mut rocksdb).await; - concurrent_enqueue_and_delete(&mut rocksdb).await; + stage_scan_is_filtered_by_stage(&mut rocksdb).await; + // Snapshot-iterator tests — exercise the contract that a fresh reader + // sees current storage and that an existing reader holds a fixed view. + fresh_reader_sees_current_state(&mut rocksdb).await; + existing_reader_does_not_see_post_snapshot_writes(&mut rocksdb).await; } diff --git a/crates/partition-store/src/vqueue_table/mod.rs b/crates/partition-store/src/vqueue_table/mod.rs index 57f115ac79..c1283c2368 100644 --- a/crates/partition-store/src/vqueue_table/mod.rs +++ b/crates/partition-store/src/vqueue_table/mod.rs @@ -18,6 +18,7 @@ mod running_reader; mod waiting_reader; use std::io::Cursor; +use std::pin::Pin; pub use entry::{EntryStatusKey, EntryStatusKeyRef, StatusHeaderRaw}; pub use inbox::InboxKey; @@ -32,12 +33,12 @@ use tracing::error; use restate_rocksdb::Priority; use restate_storage_api::StorageError; -use restate_storage_api::vqueue_table::ScanVQueueMetaTable; use restate_storage_api::vqueue_table::metadata::{VQueueMeta, VQueueMetaRef, VQueueMetaUpdates}; use restate_storage_api::vqueue_table::{ EntryKey, EntryMetadata, EntryStatusHeader, EntryValue, LazyEntryStatus, ReadVQueueTable, ScanVQueueTable, Stage, Status, WriteVQueueTable, stats::EntryStatistics, }; +use restate_storage_api::vqueue_table::{ScanVQueueInboxStages, ScanVQueueMetaTable}; use restate_types::sharding::{KeyRange, PartitionKey}; use restate_types::vqueues::{EntryId, Seq, VQueueId}; @@ -474,6 +475,118 @@ impl ScanVQueueMetaTable for PartitionStore { } } +fn stage_from_key_kind(key_kind: KeyKind) -> Option { + match key_kind { + KeyKind::VQueueInboxStage => Some(Stage::Inbox), + KeyKind::VQueueRunningStage => Some(Stage::Running), + KeyKind::VQueueSuspendedStage => 
Some(Stage::Suspended), + KeyKind::VQueuePausedStage => Some(Stage::Paused), + KeyKind::VQueueFinishedStage => Some(Stage::Finished), + _ => None, + } +} + +fn scan_vqueue_inbox_stage<'store, K, F>( + partition_store: &'store PartitionStore, + scanner_name: &'static str, + range: KeyRange, + stage: Stage, + mut f: F, +) -> Result> + Send + 'store>>> +where + K: EncodeTableKeyPrefix + 'store, + F: for<'row> FnMut( + (&'row VQueueId, Stage, &'row EntryKey, &'row EntryValue), + ) -> std::ops::ControlFlow<()> + + Send + + Sync + + 'static, +{ + let future = partition_store + .iterator_for_each( + scanner_name, + Priority::Low, + TableScan::FullScanPartitionKeyRange::(range), + move |(mut key, mut value)| { + let key_kind = break_on_err(KeyKind::deserialize(&mut key))?; + let key_stage = break_on_err( + stage_from_key_kind(key_kind).ok_or(StorageError::DataIntegrityError), + )?; + if key_stage != stage { + return std::ops::ControlFlow::Break(Err(StorageError::DataIntegrityError)); + } + + let qid: VQueueId = break_on_err(crate::keys::deserialize(&mut key))?; + let entry_key: EntryKey = break_on_err(crate::keys::deserialize(&mut key))?; + let entry = break_on_err( + EntryValue::decode(&mut value).map_err(StorageError::BilrostDecode), + )?; + + f((&qid, stage, &entry_key, &entry)).map_break(Ok) + }, + ) + .map_err(|_| StorageError::OperationalError)?; + + Ok(Box::pin(future)) +} + +impl ScanVQueueInboxStages for PartitionStore { + fn for_each_vqueue_inbox_entry< + F: for<'a> FnMut( + (&'a VQueueId, Stage, &'a EntryKey, &'a EntryValue), + ) -> std::ops::ControlFlow<()> + + Send + + Sync + + 'static, + >( + &self, + range: KeyRange, + stage: Stage, + f: F, + ) -> Result> + Send> { + match stage { + Stage::Unknown => Err(StorageError::Generic(anyhow::anyhow!( + "Unknown stage can't be scanned" + ))), + Stage::Inbox => scan_vqueue_inbox_stage::( + self, + "df-vqueue-inbox", + range, + stage, + f, + ), + Stage::Running => scan_vqueue_inbox_stage::( + self, + 
"df-vqueue-running", + range, + stage, + f, + ), + Stage::Suspended => scan_vqueue_inbox_stage::( + self, + "df-vqueue-suspended", + range, + stage, + f, + ), + Stage::Paused => scan_vqueue_inbox_stage::( + self, + "df-vqueue-paused", + range, + stage, + f, + ), + Stage::Finished => scan_vqueue_inbox_stage::( + self, + "df-vqueue-finished", + range, + stage, + f, + ), + } + } +} + // ## Safety // The iterator is guaranteed to be dropped before the database is dropped, we hold to the // PartitionDb in this struct for as long as the iterator is alive. diff --git a/crates/partition-store/src/vqueue_table/reader.rs b/crates/partition-store/src/vqueue_table/reader.rs index 99a1870f0a..ab8bfface0 100644 --- a/crates/partition-store/src/vqueue_table/reader.rs +++ b/crates/partition-store/src/vqueue_table/reader.rs @@ -8,7 +8,7 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. -use restate_storage_api::vqueue_table::VQueueStore; +use restate_storage_api::vqueue_table::{Options, VQueueStore}; use restate_types::vqueues::VQueueId; use crate::PartitionDb; @@ -24,7 +24,7 @@ impl VQueueStore for PartitionDb { VQueueRunningReader::new(self, qid) } - fn new_inbox_reader(&self, qid: &VQueueId) -> Self::InboxReader { - VQueueWaitingReader::new(self, qid) + fn new_inbox_reader(&self, qid: &VQueueId, opts: Options) -> Self::InboxReader { + VQueueWaitingReader::new(self, qid, opts) } } diff --git a/crates/partition-store/src/vqueue_table/running_reader.rs b/crates/partition-store/src/vqueue_table/running_reader.rs index cc21bfba67..f8d44aa5a5 100644 --- a/crates/partition-store/src/vqueue_table/running_reader.rs +++ b/crates/partition-store/src/vqueue_table/running_reader.rs @@ -13,7 +13,7 @@ use bilrost::OwnedMessage; use rocksdb::DBRawIteratorWithThreadMode; use restate_storage_api::StorageError; -use restate_storage_api::vqueue_table::{EntryKey, EntryValue, VQueueCursor}; +use 
restate_storage_api::vqueue_table::{EntryKey, EntryValue, VQueueRunningCursor}; use restate_types::vqueues::VQueueId; use crate::PartitionDb; @@ -60,57 +60,15 @@ impl VQueueRunningReader { } } -impl VQueueCursor for VQueueRunningReader { +impl VQueueRunningCursor for VQueueRunningReader { fn seek_to_first(&mut self) { self.it.seek_to_first(); } - fn seek_after(&mut self, _qid: &VQueueId, _key: &EntryKey) { - panic!("seek_after is not supported for running snapshot reader"); - } - fn advance(&mut self) { self.it.next(); } - /// Returns the current key under cursor - fn current_key(&mut self) -> Result, StorageError> { - if let Some(key) = self.it.key() { - debug_assert_eq!(key.len(), RunningKey::serialized_length_fixed()); - - // The portion we are interested in is everything that represents the EntryKey - let entry_key = - ::decode(&mut &key[RunningKey::offset_of_entry_key()..])?; - Ok(Some(entry_key)) - } else { - // we reached the end (or an error). We cannot recover from this without seek. - // todo: add support for iterator refresh(). - self.it - .status() - .context("peek into vqueue snapshot iterator") - .map_err(StorageError::Generic)?; - // iterator is beyond the end, we can't peek - Ok(None) - } - } - - /// Returns the current value under cursor - fn current_value(&mut self) -> Result, StorageError> { - if let Some(mut value) = self.it.value() { - let value = EntryValue::decode(&mut value)?; - Ok(Some(value)) - } else { - // we reached the end (or an error). We cannot recover from this without seek. - // todo: add support for iterator refresh(). 
- self.it - .status() - .context("peek into vqueue snapshot iterator") - .map_err(StorageError::Generic)?; - // iterator is beyond the end, we can't peek - Ok(None) - } - } - fn peek(&mut self) -> Result, StorageError> { if let Some((key, mut value)) = self.it.item() { debug_assert_eq!(key.len(), RunningKey::serialized_length_fixed()); diff --git a/crates/partition-store/src/vqueue_table/waiting_reader.rs b/crates/partition-store/src/vqueue_table/waiting_reader.rs index 48a91b4a35..0f81d9f268 100644 --- a/crates/partition-store/src/vqueue_table/waiting_reader.rs +++ b/crates/partition-store/src/vqueue_table/waiting_reader.rs @@ -8,12 +8,16 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. -use anyhow::Context; +use std::sync::Arc; + use bilrost::OwnedMessage; use rocksdb::DBRawIteratorWithThreadMode; +use restate_rocksdb::RocksDb; use restate_storage_api::StorageError; -use restate_storage_api::vqueue_table::{EntryKey, EntryValue, Stage, VQueueCursor}; +use restate_storage_api::vqueue_table::{ + CursorError, EntryKey, EntryValue, Options, Stage, VQueueCursor, +}; use restate_types::vqueues::VQueueId; use crate::PartitionDb; @@ -22,20 +26,34 @@ use crate::vqueue_table::InboxKey; use crate::vqueue_table::inbox::InboxKeyRef; pub struct VQueueWaitingReader { + qid: VQueueId, it: DBRawIteratorWithThreadMode<'static, rocksdb::DB>, + // Safety: This must be dropped last since the iterator references memory allocated by it. + // This is only set to Some if the iterator is configured to run with blocking IO. The + // assumption is that blocking iterators will run in background threads, so we need to pin + // the database until the iterator is dropped. 
+ _db: Option>, } impl VQueueWaitingReader { - pub(crate) fn new(db: &PartitionDb, qid: &VQueueId) -> Self { + pub(crate) fn new(db: &PartitionDb, qid: &VQueueId, opts: Options) -> Self { let mut readopts = rocksdb::ReadOptions::default(); readopts.set_async_io(true); // this is not the place to be concerned about corruption, we favor speed // over safety for this particular use-case. readopts.set_verify_checksums(false); - readopts.set_tailing(true); + // We re-create this reader on every refill, so a fresh snapshot is what + // we want. A tailing iterator would see new writes but is unsafe across + // memtable flushes. // use prefix extractors for efficient filtering. readopts.set_prefix_same_as_start(true); + if opts.allow_blocking_io { + readopts.set_read_tier(rocksdb::ReadTier::All); + } else { + readopts.set_read_tier(rocksdb::ReadTier::BlockCache); + } + // we know how big the prefix is let mut key_buf = [0u8; InboxKey::by_qid_prefix_len()]; InboxKeyRef::builder() @@ -54,10 +72,14 @@ impl VQueueWaitingReader { .raw_iterator_cf_opt(db.cf_handle(), readopts); Self { + qid: qid.clone(), // Safety: // The iterator is guaranteed to be dropped before the database is dropped, we hold to the // PartitionDb in the scheduler which pins the database and the column family. + // + // We also pin the database if blocking IO is configured. 
it: unsafe { super::ignore_iterator_lifetime(it) }, + _db: opts.allow_blocking_io.then(|| db.rocksdb().clone()), } } } @@ -67,9 +89,8 @@ impl VQueueCursor for VQueueWaitingReader { self.it.seek_to_first(); } - fn seek_after(&mut self, qid: &VQueueId, key: &EntryKey) { - tracing::trace!("Seeking after {key:?}"); - let mut key_buf = super::inbox::encode_stage_key(Stage::Inbox, qid, key); + fn seek_after(&mut self, key: &EntryKey) { + let mut key_buf = super::inbox::encode_stage_key(Stage::Inbox, &self.qid, key); let success = crate::convert_to_upper_bound(&mut key_buf); debug_assert!(success); self.it.seek(key_buf); @@ -79,57 +100,20 @@ impl VQueueCursor for VQueueWaitingReader { self.it.next(); } - /// Returns the current key under cursor - fn current_key(&mut self) -> Result, StorageError> { - if let Some(key) = self.it.key() { - debug_assert_eq!(key.len(), InboxKey::serialized_length_fixed()); - - // The portion we are interested in is everything that represents the EntryKey - let entry_key = - ::decode(&mut &key[InboxKey::offset_of_entry_key()..])?; - Ok(Some(entry_key)) - } else { - // we reached the end (or an error). We cannot recover from this without seek. - // todo: add support for iterator refresh(). - self.it - .status() - .context("peek into vqueue snapshot iterator") - .map_err(StorageError::Generic)?; - // iterator is beyond the end, we can't peek - Ok(None) - } - } - /// Returns the current value under cursor - fn current_value(&mut self) -> Result, StorageError> { - if let Some(mut value) = self.it.value() { - let value = EntryValue::decode(&mut value)?; - Ok(Some(value)) - } else { - // we reached the end (or an error). We cannot recover from this without seek. - // todo: add support for iterator refresh(). 
- self.it - .status() - .context("peek into vqueue snapshot iterator") - .map_err(StorageError::Generic)?; - // iterator is beyond the end, we can't peek - Ok(None) - } - } - - fn peek(&mut self) -> Result, StorageError> { + fn peek(&mut self) -> Result, CursorError> { if let Some((key, mut value)) = self.it.item() { debug_assert_eq!(key.len(), InboxKey::serialized_length_fixed()); let entry_key = ::decode(&mut &key[InboxKey::offset_of_entry_key()..])?; - let value = EntryValue::decode(&mut value)?; + + let value = EntryValue::decode(&mut value).map_err(StorageError::BilrostDecode)?; Ok(Some((entry_key, value))) } else { - // We reached the end (or an error). We cannot recover from this without seek. - self.it - .status() - .context("peek into vqueue iterator") - .map_err(StorageError::Generic)?; + self.it.status().map_err(|err| match err.kind() { + rocksdb::ErrorKind::Incomplete => CursorError::WouldBlock, + _ => CursorError::Other(StorageError::Generic(err.into())), + })?; // iterator is beyond the end, we can't peek Ok(None) } diff --git a/crates/rocksdb/src/background.rs b/crates/rocksdb/src/background.rs index 5252543a07..6584d70fa0 100644 --- a/crates/rocksdb/src/background.rs +++ b/crates/rocksdb/src/background.rs @@ -64,7 +64,7 @@ where ) .increment(1); - let span = tracing::Span::current().clone(); + let span = tracing::Span::current(); move || span.in_scope(|| self.run()) } @@ -75,7 +75,7 @@ where OP_TYPE => self.kind.as_static_str(), ) .increment(1); - let span = tracing::Span::current().clone(); + let span = tracing::Span::current(); move || { span.in_scope(|| { diff --git a/crates/storage-api/src/journal_table_v2/mod.rs b/crates/storage-api/src/journal_table_v2/mod.rs index 6eb68de893..00233f01c4 100644 --- a/crates/storage-api/src/journal_table_v2/mod.rs +++ b/crates/storage-api/src/journal_table_v2/mod.rs @@ -113,14 +113,14 @@ pub trait WriteJournalTable { /// Related completion ids to this RawEntry, used to build the internal index fn 
put_journal_entry( &mut self, - invocation_id: InvocationId, + invocation_id: &InvocationId, index: EntryIndex, entry: &StoredRawEntry, related_completion_ids: &[CompletionId], ) -> Result<()>; /// When length is available, it is suggested to provide it as it makes the delete more efficient. - fn delete_journal(&mut self, invocation_id: InvocationId, length: EntryIndex) -> Result<()>; + fn delete_journal(&mut self, invocation_id: &InvocationId, length: EntryIndex) -> Result<()>; } #[derive(Debug, Clone)] diff --git a/crates/storage-api/src/vqueue_table/entry.rs b/crates/storage-api/src/vqueue_table/entry.rs index 4e7990f62f..e2b40475fb 100644 --- a/crates/storage-api/src/vqueue_table/entry.rs +++ b/crates/storage-api/src/vqueue_table/entry.rs @@ -205,7 +205,7 @@ impl<'a> From<&'a EntryMetadata> for EntryMetadataRef<'a> { pub struct EntryMetadata { // todo: This is temporary placeholder, type and name _will_ change. #[bilrost(tag(1))] - deployment: Option, + pub deployment: Option, } #[cfg(test)] diff --git a/crates/storage-api/src/vqueue_table/store.rs b/crates/storage-api/src/vqueue_table/store.rs index d882e6103f..eca01a30dd 100644 --- a/crates/storage-api/src/vqueue_table/store.rs +++ b/crates/storage-api/src/vqueue_table/store.rs @@ -10,30 +10,54 @@ use restate_types::vqueues::VQueueId; +use crate::StorageError; + use super::{EntryKey, EntryValue}; -use crate::Result; + +pub struct Options { + /// Allows blocking IO when we need to read from the storage; + /// operations that block should be performed on IO threads. + /// + /// When set to false, seek and read operations may return `WouldBlock` if + /// the operation cannot be performed without blocking. + pub allow_blocking_io: bool, +} /// Storage for vqueue heads (e.g., RocksDB `ready_idx`). 
pub trait VQueueStore { - type RunningReader: VQueueCursor + Send + Unpin; - type InboxReader: VQueueCursor + Send + Unpin; + type RunningReader: VQueueRunningCursor + Send + Unpin; + type InboxReader: VQueueCursor + Send + Unpin + 'static; fn new_run_reader(&self, qid: &VQueueId) -> Self::RunningReader; - fn new_inbox_reader(&self, qid: &VQueueId) -> Self::InboxReader; + fn new_inbox_reader(&self, qid: &VQueueId, opts: Options) -> Self::InboxReader; } -/// Iterator over vqueue entries +/// Iterator over "waiting inbox" vqueue entries pub trait VQueueCursor { /// Moves the cursor to the beginning (min key) of the vqueue fn seek_to_first(&mut self); /// Moves the cursor to point strictly after `item` - fn seek_after(&mut self, qid: &VQueueId, item: &EntryKey); - /// Returns the current key under cursor - fn current_key(&mut self) -> Result>; - /// Returns the current value under cursor - fn current_value(&mut self) -> Result>; + fn seek_after(&mut self, item: &EntryKey); + /// Advancing the cursor. + fn advance(&mut self); /// Peek item without advancing the cursor - fn peek(&mut self) -> Result>; + fn peek(&mut self) -> Result, CursorError>; +} + +/// Iterator over already running vqueue entries +pub trait VQueueRunningCursor { + /// Moves the cursor to the beginning (min key) of the vqueue + fn seek_to_first(&mut self); + /// Peek item without advancing the cursor + fn peek(&mut self) -> Result, StorageError>; /// Advancing the cursor. 
 If this fails, the error is returned on the next call to peek() fn advance(&mut self); } + +#[derive(Debug, thiserror::Error)] +pub enum CursorError { + #[error("operation cannot be completed without blocking this thread")] + WouldBlock, + #[error(transparent)] + Other(#[from] StorageError), +} diff --git a/crates/storage-api/src/vqueue_table/tables.rs b/crates/storage-api/src/vqueue_table/tables.rs index 87142d2bfb..4984c07164 100644 --- a/crates/storage-api/src/vqueue_table/tables.rs +++ b/crates/storage-api/src/vqueue_table/tables.rs @@ -221,3 +221,20 @@ pub trait ScanVQueueMetaTable { f: F, ) -> Result> + Send>; } + +pub trait ScanVQueueInboxStages { + /// Used for data-fusion queries + fn for_each_vqueue_inbox_entry< + F: for<'a> FnMut( + (&'a VQueueId, Stage, &'a EntryKey, &'a EntryValue), + ) -> std::ops::ControlFlow<()> + + Send + + Sync + + 'static, + >( + &self, + range: KeyRange, + stage: Stage, + f: F, + ) -> Result> + Send>; +} diff --git a/crates/storage-query-datafusion/src/context.rs b/crates/storage-query-datafusion/src/context.rs index 162a9fff55..a85b830f30 100644 --- a/crates/storage-query-datafusion/src/context.rs +++ b/crates/storage-query-datafusion/src/context.rs @@ -282,6 +282,12 @@ where self.partition_store_manager.clone(), &self.remote_scanner_manager, )?; + crate::vqueues::register_self( + ctx, + self.partition_selector.clone(), + self.partition_store_manager.clone(), + &self.remote_scanner_manager, + )?; ctx.datafusion_context.sql(SYS_INVOCATION_VIEW).await?; diff --git a/crates/storage-query-datafusion/src/filter.rs b/crates/storage-query-datafusion/src/filter.rs index eb2b740172..fe65313c7e 100644 --- a/crates/storage-query-datafusion/src/filter.rs +++ b/crates/storage-query-datafusion/src/filter.rs @@ -22,6 +22,7 @@ use datafusion::physical_expr_common::physical_expr::snapshot_physical_expr; use datafusion::physical_plan::PhysicalExpr; use datafusion::physical_plan::expressions::{BinaryExpr, Column, InListExpr, Literal}; +use 
restate_storage_api::vqueue_table::Stage; use restate_types::PartitionedResourceId; use restate_types::identifiers::partitioner::HashPartitioner; use restate_types::identifiers::{ @@ -282,6 +283,78 @@ fn extract_column_literal<'a>( Some((col, lit)) } +#[derive(Debug, Clone)] +pub struct VQueueFilter { + pub partition_keys: KeyRange, + pub stages: Option>, +} + +impl ScanLocalPartitionFilter for VQueueFilter { + fn new(range: KeyRange, predicate: Option>) -> Self { + let mut stages: Option> = None; + + if let Some(predicate) = predicate + && let Ok(predicate) = snapshot_physical_expr(predicate) + { + for conjunct in split_conjunction(&predicate) { + let Some(conjunct_stages) = parse_vqueue_stages("stage", conjunct) else { + continue; + }; + + stages = Some(match stages { + Some(current) => current.intersection(&conjunct_stages).copied().collect(), + None => conjunct_stages, + }); + } + } + + Self { + partition_keys: range, + stages, + } + } +} + +fn parse_vqueue_stages( + column_name: &str, + predicate: &Arc, +) -> Option> { + // OR-chain recursion depth; enough for `stage = a OR stage = b OR ...` over all stages. 
+ let in_list = InList::parse(predicate, 5)?; + + if in_list.col.name() != column_name || in_list.negated { + return None; + } + + let mut stages = BTreeSet::new(); + for literal in in_list.list { + let Some(Some(stage_str)) = literal.try_as_str() else { + continue; + }; + + if let Some(stage) = parse_stage_literal(stage_str) { + stages.insert(stage); + } + } + + if stages.is_empty() { + None + } else { + Some(stages) + } +} + +fn parse_stage_literal(value: &str) -> Option { + match value.to_ascii_lowercase().as_str() { + "inbox" => Some(Stage::Inbox), + "run" | "running" => Some(Stage::Running), + "suspended" => Some(Stage::Suspended), + "paused" => Some(Stage::Paused), + "finished" => Some(Stage::Finished), + _ => None, + } +} + #[derive(Debug, Clone)] pub struct InvocationIdFilter { pub partition_keys: KeyRange, @@ -347,12 +420,13 @@ mod tests { use datafusion::physical_plan::PhysicalExpr; use datafusion::physical_plan::expressions::{BinaryExpr, Column, InListExpr, Literal}; + use restate_storage_api::vqueue_table::Stage; use restate_types::identifiers::{InvocationId, ServiceId, StateMutationId, WithPartitionKey}; use restate_types::invocation::{InvocationTarget, VirtualObjectHandlerType}; use restate_types::sharding::KeyRange; use crate::filter::{ - FirstMatchingPartitionKeyExtractor, InvocationIdFilter, PartitionKeyExtractor, + FirstMatchingPartitionKeyExtractor, InvocationIdFilter, PartitionKeyExtractor, VQueueFilter, }; use crate::partition_store_scanner::ScanLocalPartitionFilter; @@ -716,4 +790,80 @@ mod tests { let filter = InvocationIdFilter::new(FULL_RANGE, Some(predicate)); assert!(filter.invocation_ids.is_none()); } + + #[test] + fn vqueue_filter_single_stage_eq() { + let predicate = eq(col("stage"), utf8_lit("running")); + + let filter = VQueueFilter::new(FULL_RANGE, Some(predicate)); + assert_eq!( + filter.stages, + Some(std::collections::BTreeSet::from([Stage::Running])) + ); + } + + #[test] + fn vqueue_filter_in_list() { + let predicate = 
in_list("stage", vec![utf8_lit("running"), utf8_lit("paused")]); + + let filter = VQueueFilter::new(FULL_RANGE, Some(predicate)); + assert_eq!( + filter.stages, + Some(std::collections::BTreeSet::from([ + Stage::Running, + Stage::Paused, + ])) + ); + } + + #[test] + fn vqueue_filter_or_expression() { + let predicate = or( + eq(col("stage"), utf8_lit("finished")), + eq(col("stage"), utf8_lit("inbox")), + ); + + let filter = VQueueFilter::new(FULL_RANGE, Some(predicate)); + assert_eq!( + filter.stages, + Some(std::collections::BTreeSet::from([ + Stage::Finished, + Stage::Inbox, + ])) + ); + } + + #[test] + fn vqueue_filter_conjunction_intersection() { + let predicate = and( + in_list( + "stage", + vec![utf8_lit("running"), utf8_lit("paused"), utf8_lit("inbox")], + ), + eq(col("stage"), utf8_lit("paused")), + ); + + let filter = VQueueFilter::new(FULL_RANGE, Some(predicate)); + assert_eq!( + filter.stages, + Some(std::collections::BTreeSet::from([Stage::Paused])) + ); + } + + #[test] + fn vqueue_filter_invalid_stage_falls_back() { + let predicate = eq(col("stage"), utf8_lit("not-a-stage")); + + let filter = VQueueFilter::new(FULL_RANGE, Some(predicate)); + assert!(filter.stages.is_none()); + assert_eq!(filter.partition_keys, FULL_RANGE); + } + + #[test] + fn vqueue_filter_no_predicate() { + let filter = VQueueFilter::new(FULL_RANGE, None); + + assert!(filter.stages.is_none()); + assert_eq!(filter.partition_keys, FULL_RANGE); + } } diff --git a/crates/storage-query-datafusion/src/lib.rs b/crates/storage-query-datafusion/src/lib.rs index 139185dd1f..cd0cd69124 100644 --- a/crates/storage-query-datafusion/src/lib.rs +++ b/crates/storage-query-datafusion/src/lib.rs @@ -41,6 +41,7 @@ mod table_macro; mod table_providers; mod user_limits; mod vqueue_meta; +mod vqueues; pub use table_providers::Scan; pub mod table_util; diff --git a/crates/storage-query-datafusion/src/table_docs.rs b/crates/storage-query-datafusion/src/table_docs.rs index 12c68c198d..7468e484b6 100644 --- 
a/crates/storage-query-datafusion/src/table_docs.rs +++ b/crates/storage-query-datafusion/src/table_docs.rs @@ -10,7 +10,7 @@ use crate::{ deployment, inbox, invocation_state, invocation_status, journal, journal_events, - keyed_service_status, promise, scheduler_status, service, state, vqueue_meta, + keyed_service_status, promise, scheduler_status, service, state, vqueue_meta, vqueues, }; use std::borrow::Cow; @@ -28,6 +28,7 @@ pub const ALL_TABLE_DOCS: &[StaticTableDocs] = &[ service::schema::TABLE_DOCS, state::schema::TABLE_DOCS, vqueue_meta::schema::TABLE_DOCS, + vqueues::schema::TABLE_DOCS, ]; pub trait TableDocs { diff --git a/crates/storage-query-datafusion/src/vqueues/mod.rs b/crates/storage-query-datafusion/src/vqueues/mod.rs new file mode 100644 index 0000000000..9e16f0e6d3 --- /dev/null +++ b/crates/storage-query-datafusion/src/vqueues/mod.rs @@ -0,0 +1,18 @@ +// Copyright (c) 2023 - 2026 Restate Software, Inc., Restate GmbH. +// All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +mod row; +pub(crate) mod schema; +mod table; + +pub(crate) use table::register_self; + +#[cfg(test)] +mod tests; diff --git a/crates/storage-query-datafusion/src/vqueues/row.rs b/crates/storage-query-datafusion/src/vqueues/row.rs new file mode 100644 index 0000000000..8b357e9295 --- /dev/null +++ b/crates/storage-query-datafusion/src/vqueues/row.rs @@ -0,0 +1,101 @@ +// Copyright (c) 2023 - 2026 Restate Software, Inc., Restate GmbH. +// All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. 
+// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use restate_storage_api::vqueue_table::{EntryKey, EntryKind, EntryValue, Stage}; +use restate_types::vqueues::VQueueId; + +use super::schema::SysVqueuesBuilder; + +#[inline] +pub(crate) fn append_vqueues_row<'a>( + builder: &mut SysVqueuesBuilder, + qid: &'a VQueueId, + stage: Stage, + entry_key: &'a EntryKey, + entry: &'a EntryValue, +) { + let mut row = builder.row(); + + row.partition_key(qid.partition_key()); + if row.is_id_defined() { + row.fmt_id(qid); + } + if row.is_stage_defined() { + row.fmt_stage(stage); + } + if row.is_status_defined() { + row.fmt_status(entry.status); + } + + if row.is_has_lock_defined() { + row.has_lock(entry_key.has_lock()); + } + if matches!(stage, Stage::Inbox) && row.is_run_at_defined() { + row.run_at(entry_key.run_at().as_unix_millis().as_u64() as i64); + } + if row.is_sequence_number_defined() { + row.sequence_number(entry_key.seq().as_u64()); + } + + if row.is_entry_id_defined() { + row.fmt_entry_id(entry_key.entry_id().display(qid.partition_key())); + } + + if row.is_entry_kind_defined() { + row.entry_kind(match entry_key.kind() { + EntryKind::Invocation => "invocation", + EntryKind::StateMutation => "state-mutation", + EntryKind::Unknown => "unknown", + }); + } + + if row.is_created_at_defined() { + row.created_at(entry.stats.created_at.to_unix_millis().as_u64() as i64); + } + + if row.is_transitioned_at_defined() { + row.transitioned_at(entry.stats.transitioned_at.to_unix_millis().as_u64() as i64); + } + + if row.is_num_attempts_defined() { + row.num_attempts(entry.stats.num_attempts); + } + if row.is_num_pauses_defined() { + row.num_pauses(entry.stats.num_paused); + } + if row.is_num_suspensions_defined() { + row.num_suspensions(entry.stats.num_suspensions); + } + if row.is_num_yields_defined() { + row.num_yields(entry.stats.num_yields); + 
} + + if row.is_first_attempt_at_defined() + && let Some(first_attempt_at) = entry.stats.first_attempt_at + { + row.first_attempt_at(first_attempt_at.to_unix_millis().as_u64() as i64); + } + + if row.is_latest_attempt_at_defined() + && let Some(latest_attempt_at) = entry.stats.latest_attempt_at + { + row.latest_attempt_at(latest_attempt_at.to_unix_millis().as_u64() as i64); + } + + if row.is_first_runnable_at_defined() { + row.first_runnable_at(entry.stats.first_runnable_at.as_u64() as i64); + } + + if row.is_deployment_defined() + && let Some(deployment) = &entry.metadata.deployment + { + row.fmt_deployment(deployment); + } +} diff --git a/crates/storage-query-datafusion/src/vqueues/schema.rs b/crates/storage-query-datafusion/src/vqueues/schema.rs new file mode 100644 index 0000000000..2bb8452c5b --- /dev/null +++ b/crates/storage-query-datafusion/src/vqueues/schema.rs @@ -0,0 +1,79 @@ +// Copyright (c) 2023 - 2026 Restate Software, Inc., Restate GmbH. +// All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use crate::table_macro::*; + +use datafusion::arrow::datatypes::DataType; + +// Stage scans run concurrently and merge into a shared builder, so only +// `partition_key` is monotone within a single partition's output stream. +define_sort_order!(sys_vqueues(partition_key)); + +define_table!(sys_vqueues( + /// Internal column that is used for partitioning. Can be ignored. + partition_key: DataType::UInt64, + + /// The VQueue Identifier (vq_...). + id: DataType::Utf8, + + /// The stage this entry currently belongs to. Choices are 'inbox', 'running', 'paused', + /// 'suspended', and 'finished'. + stage: DataType::Utf8, + + /// The entry processing status. 
Examples are `new`, `scheduled`, `running`, + /// `backing_off`, `yielded`, `suspended`, `failed`, and `succeeded`. + status: DataType::Utf8, + + /// Whether this entry currently holds a lock. + has_lock: DataType::Boolean, + + /// The entry will be eligible to run after this timestamp. Only present for entries + /// that are in the waiting inbox. + run_at: TimestampMillisecond, + + /// Sequence number encoded in the queue ordering key. + sequence_number: DataType::UInt64, + + /// Identifier of the entry. + entry_id: DataType::Utf8, + + /// Entry kind (`invocation` or `state-mutation`). + entry_kind: DataType::Utf8, + + /// Creation timestamp of the entry. + created_at: TimestampMillisecond, + + /// Timestamp of the latest stage transition. + transitioned_at: TimestampMillisecond, + + /// Number of times this entry has been moved to the run queue. + num_attempts: DataType::UInt32, + + /// Number of times this entry has been moved to the paused stage. + num_pauses: DataType::UInt32, + + /// Number of times this entry has been moved to the suspended stage. + num_suspensions: DataType::UInt32, + + /// Number of times this entry has yielded execution. + num_yields: DataType::UInt32, + + /// Timestamp of the first attempt to run this entry. + first_attempt_at: TimestampMillisecond, + + /// Timestamp of the latest attempt to run this entry. + latest_attempt_at: TimestampMillisecond, + + /// The realistic earliest time at which this entry can run its first attempt. + first_runnable_at: TimestampMillisecond, + + /// If set, the entry's pinned deployment identifier. + deployment: DataType::Utf8, +)); diff --git a/crates/storage-query-datafusion/src/vqueues/table.rs b/crates/storage-query-datafusion/src/vqueues/table.rs new file mode 100644 index 0000000000..a23177a1d4 --- /dev/null +++ b/crates/storage-query-datafusion/src/vqueues/table.rs @@ -0,0 +1,159 @@ +// Copyright (c) 2023 - 2026 Restate Software, Inc., Restate GmbH. +// All rights reserved. 
+// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use std::fmt::Debug; +use std::ops::ControlFlow; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; + +use futures::future::try_join_all; +use parking_lot::Mutex; + +use restate_partition_store::{PartitionStore, PartitionStoreManager}; +use restate_storage_api::StorageError; +use restate_storage_api::vqueue_table::{EntryKey, EntryValue, ScanVQueueInboxStages, Stage}; +use restate_types::identifiers::StateMutationId; +use restate_types::vqueues::VQueueId; + +use crate::context::{QueryContext, SelectPartitions}; +use crate::filter::{FirstMatchingPartitionKeyExtractor, VQueueFilter}; +use crate::partition_store_scanner::{LocalPartitionsScanner, ScanLocalPartition}; +use crate::remote_query_scanner_manager::RemoteScannerManager; +use crate::table_providers::{PartitionedTableProvider, ScanPartition}; +use crate::vqueues::row::append_vqueues_row; +use crate::vqueues::schema::{SysVqueuesBuilder, sys_vqueues_sort_order}; + +const NAME: &str = "sys_vqueues"; + +pub(crate) fn register_self( + ctx: &QueryContext, + partition_selector: impl SelectPartitions, + partition_store_manager: Arc, + remote_scanner_manager: &RemoteScannerManager, +) -> datafusion::common::Result<()> { + let local_scanner = Arc::new(LocalPartitionsScanner::new( + partition_store_manager, + VQueuesScanner, + )) as Arc; + + let table = PartitionedTableProvider::new( + partition_selector, + SysVqueuesBuilder::schema(), + sys_vqueues_sort_order(), + remote_scanner_manager.create_distributed_scanner(NAME, local_scanner), + FirstMatchingPartitionKeyExtractor::default() + .with_partitioned_resource_id::("id") + // entry_id can be an invocation id or state mutation id + .with_invocation_id("entry_id") + 
.with_partitioned_resource_id::("entry_id"), + ); + + ctx.register_partitioned_table(NAME, Arc::new(table)) +} + +#[derive(Debug, Clone)] +struct VQueuesScanner; + +const SCAN_STAGE_ORDER: [Stage; 5] = [ + Stage::Paused, + Stage::Suspended, + Stage::Running, + Stage::Inbox, + Stage::Finished, +]; + +/// Bound on in-flight stage scans per partition scan. Keeps RocksDB iterator +/// pressure and the shared builder's `Mutex` contention in check while still +/// overlapping I/O across stages. +const MAX_CONCURRENT_STAGE_SCANS: usize = 2; + +impl ScanLocalPartition for VQueuesScanner { + type Builder = SysVqueuesBuilder; + type Item<'a> = (&'a VQueueId, Stage, &'a EntryKey, &'a EntryValue); + type ConversionError = std::convert::Infallible; + type Filter = VQueueFilter; + + fn for_each_row< + F: for<'a> FnMut(Self::Item<'a>) -> ControlFlow> + + Send + + Sync + + 'static, + >( + partition_store: &PartitionStore, + filter: VQueueFilter, + f: F, + ) -> Result> + Send, StorageError> { + let range = filter.partition_keys; + let stages: Vec<_> = if let Some(requested_stages) = filter.stages { + SCAN_STAGE_ORDER + .into_iter() + .filter(|stage| requested_stages.contains(stage)) + .collect() + } else { + SCAN_STAGE_ORDER.to_vec() + }; + + let callback = Arc::new(Mutex::new(f)); + let should_stop = Arc::new(AtomicBool::new(false)); + + Ok(async move { + let scan_result = async { + for stage_batch in stages.chunks(MAX_CONCURRENT_STAGE_SCANS) { + if should_stop.load(Ordering::Relaxed) { + break; + } + + let mut scans = Vec::with_capacity(stage_batch.len()); + for stage in stage_batch { + let callback = callback.clone(); + let should_stop = should_stop.clone(); + + scans.push(partition_store.for_each_vqueue_inbox_entry( + range, + *stage, + move |item| { + if should_stop.load(Ordering::Relaxed) { + return ControlFlow::Break(()); + } + + let mut callback = callback.lock(); + let control_flow = (*callback)(item); + if control_flow.is_break() { + should_stop.store(true, 
Ordering::Relaxed); + } + control_flow.map_break(Result::unwrap) + }, + )?); + } + + try_join_all(scans).await?; + } + + Ok(()) + } + .await; + + // The callback can own types with blocking cleanup logic (e.g. BatchSender). + // Drop it off the async runtime worker thread. + let _ = std::thread::spawn(move || drop(callback)).join(); + + scan_result + }) + } + + fn append_row<'a>( + row_builder: &mut Self::Builder, + value: Self::Item<'a>, + ) -> Result<(), Self::ConversionError> { + let (qid, stage, entry_key, entry) = value; + append_vqueues_row(row_builder, qid, stage, entry_key, entry); + Ok(()) + } +} diff --git a/crates/storage-query-datafusion/src/vqueues/tests.rs b/crates/storage-query-datafusion/src/vqueues/tests.rs new file mode 100644 index 0000000000..4192937860 --- /dev/null +++ b/crates/storage-query-datafusion/src/vqueues/tests.rs @@ -0,0 +1,192 @@ +// Copyright (c) 2023 - 2026 Restate Software, Inc., Restate GmbH. +// All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use crate::mocks::*; +use crate::row; + +use datafusion::arrow::array::{StringArray, TimestampMillisecondArray, UInt32Array}; +use datafusion::arrow::record_batch::RecordBatch; +use futures::StreamExt; +use googletest::all; +use googletest::prelude::{assert_that, eq}; + +use restate_storage_api::Transaction; +use restate_storage_api::vqueue_table::{ + EntryId, EntryKey, EntryKind, EntryMetadata, EntryValue, Stage, Status, WriteVQueueTable, + stats::EntryStatistics, +}; +use restate_types::clock::UniqueTimestamp; +use restate_types::time::MillisSinceEpoch; +use restate_types::vqueues::VQueueId; + +#[restate_core::test(flavor = "multi_thread", worker_threads = 2)] +async fn get_vqueue_entry_value_fields() { + let mut engine = MockQueryEngine::create().await; + + let qid = VQueueId::custom(1337, "df-vqueue"); + let key = EntryKey::new( + true, + MillisSinceEpoch::new(1_744_000_001_000), + 7, + EntryId::new(EntryKind::Invocation, [7; 16]), + ); + + let created_at = + UniqueTimestamp::try_from_unix_millis(MillisSinceEpoch::new(1_744_000_000_100)).unwrap(); + let transitioned_at = + UniqueTimestamp::try_from_unix_millis(MillisSinceEpoch::new(1_744_000_000_300)).unwrap(); + let first_attempt_at = + UniqueTimestamp::try_from_unix_millis(MillisSinceEpoch::new(1_744_000_000_400)).unwrap(); + let latest_attempt_at = + UniqueTimestamp::try_from_unix_millis(MillisSinceEpoch::new(1_744_000_000_500)).unwrap(); + + let mut stats = EntryStatistics::new(created_at, key.run_at().to_owned()); + stats.transitioned_at = transitioned_at; + stats.num_attempts = 3; + stats.num_paused = 2; + stats.num_suspensions = 4; + stats.num_yields = 5; + stats.first_attempt_at = Some(first_attempt_at); + stats.latest_attempt_at = Some(latest_attempt_at); + + let value = EntryValue { + status: Status::BackingOff, + metadata: EntryMetadata { + deployment: Some("dp_123".to_string()), + }, + stats, + }; + + // This row should be returned. 
+ let mut tx = engine.partition_store().transaction(); + tx.put_vqueue_inbox(&qid, Stage::Inbox, &key, &value); + + // This row should only be returned when stage filtering selects it. + tx.put_vqueue_inbox(&qid, Stage::Running, &key, &value); + tx.commit().await.unwrap(); + + let records = engine + .execute( + "SELECT stage, status, num_attempts, num_pauses, num_suspensions, num_yields, \ + created_at, transitioned_at, first_attempt_at, latest_attempt_at, first_runnable_at, \ + run_at, deployment FROM sys_vqueues WHERE stage = 'inbox' ORDER BY sequence_number", + ) + .await + .unwrap() + .stream + .collect::>>() + .await + .remove(0) + .unwrap(); + + assert_eq!(records.num_rows(), 1); + + assert_that!( + records, + all!(row!( + 0, + { + "stage" => StringArray: eq("inbox"), + "status" => StringArray: eq("backing_off"), + "num_attempts" => UInt32Array: eq(3), + "num_pauses" => UInt32Array: eq(2), + "num_suspensions" => UInt32Array: eq(4), + "num_yields" => UInt32Array: eq(5), + "created_at" => TimestampMillisecondArray: eq(created_at.to_unix_millis().as_u64() as i64), + "transitioned_at" => TimestampMillisecondArray: eq(transitioned_at.to_unix_millis().as_u64() as i64), + "first_attempt_at" => TimestampMillisecondArray: eq(first_attempt_at.to_unix_millis().as_u64() as i64), + "latest_attempt_at" => TimestampMillisecondArray: eq(latest_attempt_at.to_unix_millis().as_u64() as i64), + "first_runnable_at" => TimestampMillisecondArray: eq(value.stats.first_runnable_at.as_u64() as i64), + "run_at" => TimestampMillisecondArray: eq(key.run_at().as_unix_millis().as_u64() as i64), + "deployment" => StringArray: eq("dp_123"), + } + )) + ); +} + +#[restate_core::test(flavor = "multi_thread", worker_threads = 2)] +async fn vqueue_stage_filter_and_unfiltered_scan() { + let mut engine = MockQueryEngine::create().await; + + let qid = VQueueId::custom(2337, "df-vqueue-stages"); + + let mut tx = engine.partition_store().transaction(); + let stages = [ + Stage::Inbox, + 
Stage::Running, + Stage::Suspended, + Stage::Paused, + Stage::Finished, + ]; + for (index, stage) in stages.into_iter().enumerate() { + let key = EntryKey::new( + false, + MillisSinceEpoch::new(1_744_001_000_000 + index as u64), + (index + 1) as u64, + EntryId::new(EntryKind::Invocation, [index as u8 + 1; 16]), + ); + let value = EntryValue { + status: Status::Started, + metadata: EntryMetadata::default(), + stats: EntryStatistics::new( + UniqueTimestamp::try_from_unix_millis(MillisSinceEpoch::new( + 1_744_001_000_100 + index as u64, + )) + .unwrap(), + key.run_at().to_owned(), + ), + }; + tx.put_vqueue_inbox(&qid, stage, &key, &value); + } + tx.commit().await.unwrap(); + + let all_stages = engine + .execute("SELECT stage FROM sys_vqueues ORDER BY stage") + .await + .unwrap() + .stream + .collect::>>() + .await + .remove(0) + .unwrap(); + + assert_eq!(all_stages.num_rows(), 5); + assert_that!( + all_stages, + all!( + row!(0, { "stage" => StringArray: eq("finished") }), + row!(1, { "stage" => StringArray: eq("inbox") }), + row!(2, { "stage" => StringArray: eq("paused") }), + row!(3, { "stage" => StringArray: eq("running") }), + row!(4, { "stage" => StringArray: eq("suspended") }) + ) + ); + + let filtered_stages = engine + .execute( + "SELECT stage FROM sys_vqueues WHERE stage IN ('running', 'paused') ORDER BY stage", + ) + .await + .unwrap() + .stream + .collect::>>() + .await + .remove(0) + .unwrap(); + + assert_eq!(filtered_stages.num_rows(), 2); + assert_that!( + filtered_stages, + all!( + row!(0, { "stage" => StringArray: eq("paused") }), + row!(1, { "stage" => StringArray: eq("running") }) + ) + ); +} diff --git a/crates/vqueues/Cargo.toml b/crates/vqueues/Cargo.toml index 6ea4b67b02..7394eb4291 100644 --- a/crates/vqueues/Cargo.toml +++ b/crates/vqueues/Cargo.toml @@ -18,9 +18,11 @@ restate-clock = { workspace = true } restate-futures-util = { workspace = true } restate-limiter = { workspace = true } restate-memory = { workspace = true } +restate-platform = { 
workspace = true } restate-rocksdb = { workspace = true } restate-serde-util = { workspace = true } restate-storage-api = { workspace = true } +restate-time-util = { workspace = true } restate-types = { workspace = true } restate-util-string = { workspace = true } restate-worker-api = { workspace = true } diff --git a/crates/vqueues/src/cache.rs b/crates/vqueues/src/cache.rs index e90ea943e1..fcb8139300 100644 --- a/crates/vqueues/src/cache.rs +++ b/crates/vqueues/src/cache.rs @@ -8,19 +8,20 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. -use hashbrown::{HashMap, hash_map}; use slotmap::SlotMap; use tokio::task::JoinHandle; use tracing::{debug, trace}; +use restate_platform::hash::HashMap; use restate_storage_api::StorageError; use restate_storage_api::vqueue_table::metadata::{self, VQueueMeta}; use restate_storage_api::vqueue_table::{ReadVQueueTable, ScanVQueueTable}; +use restate_types::sharding::PartitionKey; use restate_types::vqueues::VQueueId; type Result = std::result::Result; -slotmap::new_key_type! { pub struct VQueueCacheKey; } +slotmap::new_key_type! { pub struct VQueueHandle; } // A read-only view over the in-memory stash of vqueues. #[derive(Copy, Clone)] @@ -34,10 +35,15 @@ impl<'a> VQueuesMeta<'a> { Self { inner: cache } } - pub fn get(&self, key: VQueueCacheKey) -> Option<&Slot> { + pub fn get(&self, key: VQueueHandle) -> Option<&Slot> { self.inner.slab.get(key) } + /// Lookup the cache handle of the vqueue by its id. 
+ pub fn handle_for(&self, qid: &VQueueId) -> Option { + self.inner.queues.get(qid).copied() + } + pub fn get_vqueue(&'a self, qid: &VQueueId) -> Option<&'a VQueueMeta> { self.inner .queues @@ -46,11 +52,13 @@ impl<'a> VQueuesMeta<'a> { .map(|slot| &slot.meta) } - pub fn iter_active_vqueues(&'a self) -> impl Iterator { + pub fn iter_active_vqueues( + &'a self, + ) -> impl Iterator { self.inner .slab - .values() - .filter_map(|Slot { qid, meta }| meta.is_active().then_some((qid, meta))) + .iter() + .filter_map(|(key, Slot { qid, meta })| meta.is_active().then_some((key, qid, meta))) } pub fn num_active(&self) -> usize { @@ -64,6 +72,10 @@ impl<'a> VQueuesMeta<'a> { pub fn report(&self) { self.inner.report(); } + + pub fn capacity(&self) -> usize { + self.inner.slab.capacity() + } } #[derive(Clone)] @@ -78,6 +90,11 @@ impl Slot { &self.qid } + #[inline(always)] + pub fn partition_key(&self) -> PartitionKey { + self.qid.partition_key() + } + #[inline(always)] pub fn meta(&self) -> &VQueueMeta { &self.meta @@ -97,8 +114,12 @@ impl Slot { // Needs rewriting after the workload pattern becomes more clear. #[derive(Clone)] pub struct VQueuesMetaCache { - queues: HashMap, - slab: SlotMap, + queues: HashMap, + slab: SlotMap, + /// Soft cap; partition processor triggers `compact()` once `len()` reaches + /// this number. The slab/hashmap will still grow past this if compaction + /// frees nothing. 
+ target_capacity: usize, } impl VQueuesMetaCache { @@ -106,31 +127,70 @@ impl VQueuesMetaCache { VQueuesMeta::new(self) } - pub fn get(&self, key: VQueueCacheKey) -> Option<&Slot> { + pub fn get(&self, key: VQueueHandle) -> Option<&Slot> { self.slab.get(key) } - pub fn get_mut(&mut self, key: VQueueCacheKey) -> Option<&mut Slot> { + pub fn get_mut(&mut self, key: VQueueHandle) -> Option<&mut Slot> { self.slab.get_mut(key) } + pub fn len(&self) -> usize { + self.slab.len() + } + + pub fn is_empty(&self) -> bool { + self.slab.is_empty() + } + + /// Sweeps the cache and evicts entries whose vqueues are no longer needed. + /// A slot is safe to evict when its meta reports `!is_active()`. The scheduler + /// drops its handle eagerly on the events that flip the meta inactive + /// (`RemovedFromInbox`, `QueuePaused`), so meta inactivity implies the + /// scheduler has already released the handle. Returns the number of evicted + /// entries. + /// + /// Triggered automatically by `insert` once occupancy reaches + /// `target_capacity`, so steady-state operations never pay for compaction. + fn compact(&mut self) -> usize { + let mut evicted = 0; + self.slab.retain(|_handle, slot| { + if slot.meta.is_active() { + true + } else { + self.queues.remove(&slot.qid); + evicted += 1; + false + } + }); + evicted + } + #[cfg(any(test, feature = "test-util"))] - pub fn new_empty() -> Self { + pub fn new_empty(target_capacity: usize) -> Self { Self { - slab: Default::default(), - queues: Default::default(), + slab: SlotMap::with_capacity_and_key(target_capacity), + queues: HashMap::with_capacity(target_capacity), + target_capacity, } } /// Initializes the vqueue cache by loading all active vqueues into the cache. /// + /// `target_capacity` is the soft cap that drives compaction; the cache will + /// still grow past it if compaction frees nothing. + /// /// From this point on, the cache remains in-sync with the storage state by /// using the "apply_updates" method. 
- pub async fn create(storage: S) -> Result { + pub async fn create( + storage: S, + target_capacity: usize, + ) -> Result { let handle: JoinHandle> = tokio::task::spawn_blocking({ move || { - let mut slab = SlotMap::default(); - let mut queues = HashMap::default(); + // Allocation doesn't bump the RSS until we write to the allocated pages. + let mut slab = SlotMap::with_capacity_and_key(target_capacity); + let mut queues = HashMap::with_capacity(target_capacity); // find and load all active vqueues. storage.scan_active_vqueues(|qid, meta| { let key = slab.insert(Slot { @@ -148,35 +208,44 @@ impl VQueuesMetaCache { .await .map_err(|e| StorageError::Generic(e.into()))??; - Ok(Self { slab, queues }) + Ok(Self { + slab, + queues, + target_capacity, + }) } pub async fn load( &mut self, storage: &mut S, qid: &VQueueId, - ) -> Result> { - Ok(match self.queues.entry_ref(qid) { - hash_map::EntryRef::Occupied(entry) => Some(*entry.get()), - hash_map::EntryRef::Vacant(entry) => { - // We don't have it in cache, we must check storage. - match storage.get_vqueue(qid).await? { - None => None, - Some(meta) => { - let key = self.slab.insert(Slot { - qid: qid.clone(), - meta, - }); - entry.insert(key); - Some(key) - } - } - } - }) + ) -> Result> { + if self.queues.contains_key(qid) { + return Ok(self.queues.get(qid).copied()); + } + // Not in cache; consult storage. + match storage.get_vqueue(qid).await? { + None => Ok(None), + Some(meta) => Ok(Some(self.insert(qid.clone(), meta))), + } } - /// Inserts a vqueue metadata unconditionally to the cache. - pub(super) fn insert(&mut self, qid: VQueueId, meta: VQueueMeta) -> VQueueCacheKey { + /// Inserts a vqueue metadata unconditionally to the cache. The single point + /// of entry into the cache; runs compaction when occupancy reaches the + /// configured `target_capacity`. 
+ pub(super) fn insert(&mut self, qid: VQueueId, meta: VQueueMeta) -> VQueueHandle { + if self.slab.len() >= self.target_capacity { + let evicted = self.compact(); + if evicted == 0 { + tracing::info!( + "vqueue cache at {} entries with no inactive queues to evict; cache will grow past target_capacity={}", + self.slab.len(), + self.target_capacity, + ); + } else { + trace!("vqueue cache compaction freed {evicted} entries"); + } + } let key = self.slab.insert(Slot { qid: qid.clone(), meta, @@ -196,3 +265,149 @@ impl VQueuesMetaCache { } } } + +#[cfg(test)] +mod tests { + use restate_clock::time::MillisSinceEpoch; + use restate_limiter::LimitKey; + use restate_storage_api::vqueue_table::Stage; + use restate_storage_api::vqueue_table::metadata::{Action, MoveMetrics, Update, VQueueLink}; + use restate_types::clock::UniqueTimestamp; + + use super::*; + + fn ts(unix_ms: u64) -> UniqueTimestamp { + UniqueTimestamp::from_unix_millis_unchecked(MillisSinceEpoch::new(unix_ms)) + } + + fn empty_meta(at: UniqueTimestamp) -> VQueueMeta { + VQueueMeta::new(at, None, LimitKey::None, VQueueLink::None) + } + + /// Bumps inbox count so `is_active()` returns true. + fn enqueue_to_inbox(meta: &mut VQueueMeta, at: UniqueTimestamp) { + let metrics = MoveMetrics { + last_transition_at: at, + has_started: false, + blocked_on_concurrency_rules_ms: 0, + blocked_on_invoker_throttling_ms: 0, + first_runnable_at: at.to_unix_millis(), + }; + meta.apply_update(&Update::new( + at, + Action::Move { + prev_stage: None, + next_stage: Stage::Inbox, + metrics, + }, + )); + } + + #[test] + fn compact_evicts_inactive_and_keeps_active() { + let now = ts(1_744_000_000_000); + // Use a high cap so insert() does not auto-compact during setup. + let mut cache = VQueuesMetaCache::new_empty(1024); + + let qid_active = VQueueId::custom(1, "alive"); + let qid_inactive = VQueueId::custom(2, "dormant"); + let qid_empty_paused = VQueueId::custom(3, "paused"); + + // Active: has an inbox entry. 
+ let mut active_meta = empty_meta(now); + enqueue_to_inbox(&mut active_meta, now); + let h_active = cache.insert(qid_active.clone(), active_meta); + + // Inactive: brand-new meta has no entries. + let h_inactive = cache.insert(qid_inactive.clone(), empty_meta(now)); + + // Paused queue with empty inbox is also !is_active. + let mut paused_meta = empty_meta(now); + paused_meta.apply_update(&Update::new(now, Action::PauseVQueue {})); + let h_paused = cache.insert(qid_empty_paused.clone(), paused_meta); + + assert_eq!(cache.len(), 3); + + let evicted = cache.compact(); + + assert_eq!(evicted, 2); + assert_eq!(cache.len(), 1); + + // Active stayed; inactive ones gone from both maps. + assert!(cache.get(h_active).is_some()); + assert!(cache.get(h_inactive).is_none()); + assert!(cache.get(h_paused).is_none()); + assert_eq!(cache.view().handle_for(&qid_active), Some(h_active)); + assert_eq!(cache.view().handle_for(&qid_inactive), None); + assert_eq!(cache.view().handle_for(&qid_empty_paused), None); + } + + #[test] + fn compact_on_empty_cache_is_noop() { + let mut cache = VQueuesMetaCache::new_empty(1024); + assert_eq!(cache.compact(), 0); + assert_eq!(cache.len(), 0); + } + + #[test] + fn compact_on_all_active_evicts_nothing() { + let now = ts(1_744_000_000_000); + let mut cache = VQueuesMetaCache::new_empty(1024); + for i in 0..5 { + let qid = VQueueId::custom(i, "q"); + let mut meta = empty_meta(now); + enqueue_to_inbox(&mut meta, now); + cache.insert(qid, meta); + } + assert_eq!(cache.compact(), 0); + assert_eq!(cache.len(), 5); + } + + #[test] + fn insert_auto_compacts_when_capacity_hit() { + let now = ts(1_744_000_000_000); + // Tiny cap so we can drive the auto-compact path in two steps. + let mut cache = VQueuesMetaCache::new_empty(3); + + // Fill with 3 inactive entries. 
+ for i in 0..3 { + cache.insert(VQueueId::custom(i, "stale"), empty_meta(now)); + } + assert_eq!(cache.len(), 3); + + // The next insert should observe `len >= target_capacity` and compact + // the inactive ones before adding the new entry. + let mut active_meta = empty_meta(now); + enqueue_to_inbox(&mut active_meta, now); + cache.insert(VQueueId::custom(99, "fresh"), active_meta); + + assert_eq!(cache.len(), 1); + assert!( + cache + .view() + .handle_for(&VQueueId::custom(99, "fresh")) + .is_some() + ); + } + + #[test] + fn insert_grows_past_capacity_when_nothing_to_evict() { + let now = ts(1_744_000_000_000); + let mut cache = VQueuesMetaCache::new_empty(2); + + // Two active entries fill the cap. + for i in 0..2 { + let mut meta = empty_meta(now); + enqueue_to_inbox(&mut meta, now); + cache.insert(VQueueId::custom(i, "active"), meta); + } + assert_eq!(cache.len(), 2); + + // Third insert: compact attempts but frees nothing; cache grows. + let mut meta = empty_meta(now); + enqueue_to_inbox(&mut meta, now); + cache.insert(VQueueId::custom(2, "active"), meta); + + assert_eq!(cache.len(), 3); + } +} diff --git a/crates/vqueues/src/lib.rs b/crates/vqueues/src/lib.rs index 94ff262348..b5d64fc3cb 100644 --- a/crates/vqueues/src/lib.rs +++ b/crates/vqueues/src/lib.rs @@ -14,13 +14,14 @@ pub mod scheduler; mod util; // Re-exports -pub use cache::{VQueuesMeta, VQueuesMetaCache}; +pub use cache::{VQueueHandle, VQueuesMeta, VQueuesMetaCache}; pub use metric_definitions::describe_metrics; pub use restate_worker_api::{ResourceKind, SchedulingStatus, VQueueSchedulerStatus}; pub use scheduler::{ResourceManager, SchedulerService}; +pub use util::*; + use smallvec::SmallVec; use tracing::debug; -pub use util::*; use restate_clock::RoughTimestamp; use restate_clock::time::MillisSinceEpoch; @@ -33,42 +34,37 @@ use restate_storage_api::vqueue_table::{ WriteVQueueTable, metadata, }; use restate_storage_api::{StorageError, lock_table}; +use restate_types::ServiceName; use 
restate_types::clock::UniqueTimestamp; use restate_types::identifiers::PartitionKey; use restate_types::invocation::InvocationTarget; use restate_types::vqueues::{EntryId, Seq, VQueueId}; -use restate_types::{LockName, Scope, ServiceName}; +#[cfg(test)] +use restate_types::{LockName, Scope}; use restate_util_string::ReString; -use self::cache::VQueueCacheKey; -use self::scheduler::MetaLiteUpdate; - // Token bucket used for throttling over all vqueues type GlobalTokenBucket = gardal::TokenBucket; #[derive(Debug)] pub enum EventDetails { - // A vqueue that had empty inbox and now the scheduler needs to monitor - // - // It's implied that this vqueue is active (not in paused state) - AddVQueue { - scope: Option, - limit_key: LimitKey, - lock_name: Option, - }, - // An inbox enqueue, inbox remove, or pause/unpause of the queue. - InboxUpdate(MetaLiteUpdate), - LockReleased { - scope: Option, - lock_name: LockName, + /// The vqueue was manually paused + QueuePaused, + /// The vqueue was resumed + QueueResumed, + RemovedFromInbox(EntryKey), + EnqueuedToInbox { + key: EntryKey, + value: EntryValue, }, + LockReleased, } #[derive(Debug)] pub struct VQueueEvent { - pub qid: VQueueId, - pub updates: SmallVec<[EventDetails; 2]>, + pub queue: VQueueHandle, + pub updates: SmallVec<[EventDetails; 1]>, } impl VQueueEvent { @@ -76,9 +72,9 @@ impl VQueueEvent { self.updates.is_empty() } - pub const fn new(qid: VQueueId) -> Self { + pub const fn new(queue: VQueueHandle) -> Self { Self { - qid, + queue, updates: SmallVec::new_const(), } } @@ -95,7 +91,7 @@ impl VQueueEvent { pub struct VQueue<'a, A, S> { storage: &'a mut S, cache: &'a mut VQueuesMetaCache, - cache_key: VQueueCacheKey, + handle: VQueueHandle, // action collector is only available if we have a scheduler to notify action_collector: Option<&'a mut Vec>, } @@ -117,8 +113,12 @@ where A: From + 'static, S: WriteVQueueTable + ReadVQueueTable + WriteLockTable, { + pub fn handle(&self) -> VQueueHandle { + self.handle + } + pub 
fn meta(&self) -> &VQueueMeta { - self.cache.get(self.cache_key).unwrap().meta() + self.cache.get(self.handle).unwrap().meta() } /// The entry has completed execution and it needs to be removed from the vqueue. @@ -164,7 +164,7 @@ where Ok(Some(Self { storage, - cache_key, + handle: cache_key, cache, action_collector, })) @@ -202,7 +202,7 @@ where Ok(Self { storage, - cache_key, + handle: cache_key, cache, action_collector, }) @@ -240,7 +240,7 @@ where Ok(Self { storage, - cache_key, + handle: cache_key, cache, action_collector, }) @@ -257,7 +257,7 @@ where entry_id: impl Into, metadata: impl Into, ) { - let meta = self.cache.get_mut(self.cache_key).unwrap(); + let meta = self.cache.get_mut(self.handle).unwrap(); let created_at_unix = created_at.to_unix_millis(); let (run_at, status) = match run_at { @@ -290,7 +290,6 @@ where ); // Update cache - let was_inbox_empty = meta.meta().is_inbox_empty(); let (was_active_before, is_active_now) = meta.apply_update(&update); // Update vqueue meta in storage @@ -329,19 +328,8 @@ where if let Some(collector) = self.action_collector.as_deref_mut() { // Let the scheduler know about the new entry to keep its head-of-line cache of the vqueue // as fresh as possible. 
- let mut event = VQueueEvent::new(meta.vqueue_id().clone()); - if was_inbox_empty { - event.push(EventDetails::AddVQueue { - scope: meta.meta().scope().clone(), - limit_key: meta.meta().limit_key().clone(), - lock_name: meta.meta().lock_name().cloned(), - }); - } - - event.push(EventDetails::InboxUpdate(MetaLiteUpdate::EnqueuedToInbox { - key, - value, - })); + let mut event = VQueueEvent::new(self.handle); + event.push(EventDetails::EnqueuedToInbox { key, value }); collector.push(A::from(event)); } @@ -365,7 +353,7 @@ where ) -> EntryKey { let vqueue_id = header.vqueue_id(); let partition_key = vqueue_id.partition_key(); - let meta = self.cache.get_mut(self.cache_key).unwrap(); + let meta = self.cache.get_mut(self.handle).unwrap(); assert_eq!(vqueue_id, meta.vqueue_id()); assert!(matches!(header.stage(), Stage::Inbox)); @@ -478,7 +466,7 @@ where updated_metadata: Option, ) { let vqueue_id = header.vqueue_id(); - let meta = self.cache.get_mut(self.cache_key).unwrap(); + let meta = self.cache.get_mut(self.handle).unwrap(); assert_eq!(vqueue_id, meta.vqueue_id()); assert!(matches!(header.stage(), Stage::Paused | Stage::Suspended)); @@ -496,7 +484,6 @@ where ); // Update cache - let was_inbox_empty = meta.meta().is_inbox_empty(); let (was_active_before, is_active_now) = meta.apply_update(&update); // Update vqueue meta in storage self.storage.update_vqueue(vqueue_id, &update); @@ -543,20 +530,12 @@ where .put_vqueue_inbox(vqueue_id, Stage::Inbox, &modified_key, &value); if let Some(collector) = self.action_collector.as_deref_mut() { - let mut event = VQueueEvent::new(vqueue_id.clone()); - - if was_inbox_empty { - event.push(EventDetails::AddVQueue { - scope: meta.meta().scope().clone(), - limit_key: meta.meta().limit_key().clone(), - lock_name: meta.meta().lock_name().cloned(), - }); - } + let mut event = VQueueEvent::new(self.handle); - event.push(EventDetails::InboxUpdate(MetaLiteUpdate::EnqueuedToInbox { + event.push(EventDetails::EnqueuedToInbox { key: 
modified_key, value, - })); + }); collector.push(A::from(event)); } } @@ -586,7 +565,7 @@ where next_stage: Stage, ) { let vqueue_id = header.vqueue_id(); - let meta = self.cache.get_mut(self.cache_key).unwrap(); + let meta = self.cache.get_mut(self.handle).unwrap(); assert_eq!(vqueue_id, meta.vqueue_id()); assert!(matches!(next_stage, Stage::Paused | Stage::Suspended)); @@ -649,10 +628,8 @@ where if let Some(collector) = self.action_collector.as_deref_mut() && matches!(header.stage(), Stage::Inbox) { - let mut event = VQueueEvent::new(vqueue_id.clone()); - event.push(EventDetails::InboxUpdate(MetaLiteUpdate::RemovedFromInbox( - *header.entry_key(), - ))); + let mut event = VQueueEvent::new(self.handle); + event.push(EventDetails::RemovedFromInbox(*header.entry_key())); collector.push(A::from(event)); } } @@ -670,7 +647,7 @@ where new_status: Status, ) { let vqueue_id = header.vqueue_id(); - let meta = self.cache.get_mut(self.cache_key).unwrap(); + let meta = self.cache.get_mut(self.handle).unwrap(); assert_eq!(vqueue_id, meta.vqueue_id()); debug!( @@ -693,7 +670,6 @@ where ); // Update cache - let was_inbox_empty = meta.meta().is_inbox_empty(); let (was_active_before, is_active_now) = meta.apply_update(&update); self.storage.update_vqueue(vqueue_id, &update); @@ -741,31 +717,21 @@ where .put_vqueue_inbox(vqueue_id, Stage::Inbox, &modified_key, &value); if let Some(collector) = self.action_collector.as_deref_mut() { - let mut event = VQueueEvent::new(vqueue_id.clone()); - - if was_inbox_empty { - event.push(EventDetails::AddVQueue { - scope: meta.meta().scope().clone(), - limit_key: meta.meta().limit_key().clone(), - lock_name: meta.meta().lock_name().cloned(), - }); - } + let mut event = VQueueEvent::new(self.handle); if matches!(header.stage(), Stage::Inbox) { // Inbox -> Inbox // The scheduler is moving the item from inbox, so we need to confirm its // decision. We only do that if prev_stage is Inbox because it's the only // stage that needs confirmation. 
- event.push(EventDetails::InboxUpdate(MetaLiteUpdate::RemovedFromInbox( - *header.entry_key(), - ))); + event.push(EventDetails::RemovedFromInbox(*header.entry_key())); } // Enqueue the replaced entry now - event.push(EventDetails::InboxUpdate(MetaLiteUpdate::EnqueuedToInbox { + event.push(EventDetails::EnqueuedToInbox { key: modified_key, value, - })); + }); collector.push(A::from(event)); } @@ -780,7 +746,7 @@ where // todo: add a paramter to specify the "scrub time" for this item. ) { let vqueue_id = header.vqueue_id(); - let meta = self.cache.get_mut(self.cache_key).unwrap(); + let meta = self.cache.get_mut(self.handle).unwrap(); assert_eq!(vqueue_id, meta.vqueue_id()); // Remove from the current stage @@ -846,21 +812,14 @@ where } if let Some(collector) = self.action_collector.as_deref_mut() { - let mut event = VQueueEvent::new(vqueue_id.clone()); + let mut event = VQueueEvent::new(self.handle); // Release the lock if this entry has been holding a lock already - if header.has_lock() - && let Some(lock_name) = meta.meta().lock_name() - { - event.push(EventDetails::LockReleased { - scope: meta.meta().scope().clone(), - lock_name: lock_name.clone(), - }); + if header.has_lock() && meta.meta().lock_name().is_some() { + event.push(EventDetails::LockReleased); } if matches!(header.stage(), Stage::Inbox) { - event.push(EventDetails::InboxUpdate(MetaLiteUpdate::RemovedFromInbox( - *header.entry_key(), - ))); + event.push(EventDetails::RemovedFromInbox(*header.entry_key())); } if !event.is_empty() { collector.push(A::from(event)); @@ -908,7 +867,7 @@ where status: Status, ) { let vqueue_id = header.vqueue_id(); - let meta = self.cache.get_mut(self.cache_key).unwrap(); + let meta = self.cache.get_mut(self.handle).unwrap(); assert_eq!(vqueue_id, meta.vqueue_id()); assert!(matches!(header.stage(), Stage::Inbox)); @@ -979,12 +938,10 @@ where ); if let Some(collector) = self.action_collector.as_deref_mut() { - let mut event = VQueueEvent::new(vqueue_id.clone()); + let mut 
event = VQueueEvent::new(self.handle); // In the case of state mutation, we execute it inline and we can // ask the scheduler to drop any pending resources for this entry. - event.push(EventDetails::InboxUpdate(MetaLiteUpdate::RemovedFromInbox( - *header.entry_key(), - ))); + event.push(EventDetails::RemovedFromInbox(*header.entry_key())); collector.push(A::from(event)); } diff --git a/crates/vqueues/src/scheduler.rs b/crates/vqueues/src/scheduler.rs index e35995a0ae..9c6a91b49a 100644 --- a/crates/vqueues/src/scheduler.rs +++ b/crates/vqueues/src/scheduler.rs @@ -23,9 +23,10 @@ use restate_types::vqueues::VQueueId; use restate_worker_api::resources::ReservedResources; use restate_worker_api::{SchedulingStatus, UserLimitCounterEntry, VQueueSchedulerStatus}; -use crate::VQueueEvent; -use crate::VQueuesMetaCache; +use crate::cache::VQueueHandle; use crate::metric_definitions::publish_scheduler_decision_metrics; +use crate::{VQueueEvent, VQueuesMeta}; +use crate::{VQueuesMetaCache, cache}; use self::drr::DRRScheduler; use self::resource_manager::PermitBuilder; @@ -35,18 +36,14 @@ mod clock; mod drr; mod eligible; mod queue; -mod queue_meta; mod resource_manager; mod vqueue_state; // Re-exports -pub use self::queue_meta::{MetaLiteUpdate, VQueueMetaLite}; pub use resource_manager::ResourceManager; type UnconfirmedAssignments = hashbrown::HashMap; -slotmap::new_key_type! 
{ pub(crate) struct VQueueHandle; } - fn status_from_detailed_eligibility(value: DetailedEligibility) -> SchedulingStatus { match value { DetailedEligibility::EligibleRunning | DetailedEligibility::EligibleInbox => { @@ -160,9 +157,9 @@ impl SchedulerService { Ok(Self { state }) } - pub fn on_inbox_event(&mut self, event: VQueueEvent) { + pub fn on_inbox_event(&mut self, metas: VQueuesMeta<'_>, event: VQueueEvent) { if let State::Active(ref mut drr_scheduler) = self.state { - drr_scheduler.as_mut().on_inbox_event(event); + drr_scheduler.as_mut().on_inbox_event(metas, event); } } @@ -172,46 +169,63 @@ impl SchedulerService { /// Resources will not be returned if the unconfirmed assignment was rejected or removed. pub fn confirm_run_attempt( &mut self, - qid: &VQueueId, + handle: VQueueHandle, + slot: &cache::Slot, key: &EntryKey, ) -> Option { if let State::Active(ref mut drr_scheduler) = self.state { - drr_scheduler.as_mut().confirm_run_attempt(qid, key) + drr_scheduler + .as_mut() + .confirm_run_attempt(handle, slot, key) } else { None } } - pub async fn schedule_next(&mut self) -> Result { - poll_fn(|cx| self.poll_schedule_next(cx)).await + pub async fn schedule_next( + &mut self, + metas: VQueuesMeta<'_>, + ) -> Result { + poll_fn(|cx| self.poll_schedule_next(metas, cx)).await } pub fn poll_schedule_next( &mut self, + metas: VQueuesMeta<'_>, cx: &mut std::task::Context<'_>, ) -> Poll> { match self.state { // if scheduler is disabled, we always return pending. State::Disabled => Poll::Pending, - State::Active(ref mut drr_scheduler) => drr_scheduler.as_mut().poll_schedule_next(cx), + State::Active(ref mut drr_scheduler) => { + drr_scheduler.as_mut().poll_schedule_next(metas, cx) + } } } /// Returns the scheduling status for a specific vqueue. 
- pub fn get_status(&self, qid: &VQueueId) -> VQueueSchedulerStatus { + #[cfg(test)] + pub fn get_status( + &self, + metas: VQueuesMeta<'_>, + handle: VQueueHandle, + ) -> VQueueSchedulerStatus { match self.state { State::Disabled => VQueueSchedulerStatus::default(), - State::Active(ref drr_scheduler) => drr_scheduler.get_status(qid), + State::Active(ref drr_scheduler) => drr_scheduler.get_status(metas, handle), } } /// Returns an iterator over the scheduling status of all tracked vqueues. /// /// Returns `None` if the scheduler is disabled. - pub fn iter_status(&self) -> Option> { + pub fn iter_status( + &self, + metas: VQueuesMeta<'_>, + ) -> Option> { match self.state { State::Disabled => None, - State::Active(ref drr_scheduler) => Some(drr_scheduler.iter_status()), + State::Active(ref drr_scheduler) => Some(drr_scheduler.iter_status(metas)), } } diff --git a/crates/vqueues/src/scheduler/drr.rs b/crates/vqueues/src/scheduler/drr.rs index 2d7d50ba4c..5159370647 100644 --- a/crates/vqueues/src/scheduler/drr.rs +++ b/crates/vqueues/src/scheduler/drr.rs @@ -14,10 +14,9 @@ use std::task::Poll; use std::task::Waker; use std::time::Duration; -use hashbrown::HashMap; use metrics::counter; use pin_project::pin_project; -use slotmap::SlotMap; +use slotmap::SecondaryMap; use tokio::time::Instant; use tracing::{info, trace, warn}; @@ -32,16 +31,16 @@ use restate_worker_api::UserLimitCounterEntry; use crate::EventDetails; use crate::VQueueEvent; use crate::VQueuesMeta; +use crate::cache; +use crate::cache::VQueueHandle; +use crate::metric_definitions::VQUEUE_ENQUEUE; use crate::metric_definitions::VQUEUE_RUN_CONFIRMED; -use crate::scheduler::MetaLiteUpdate; use crate::scheduler::eligible::EligibilityTracker; use crate::scheduler::vqueue_state::Pop; use super::Decisions; use super::ReservedResources; use super::ResourceManager; -use super::VQueueHandle; -use super::VQueueMetaLite; use super::VQueueSchedulerStatus; use super::vqueue_state::VQueueState; @@ -50,9 +49,7 @@ pub 
struct DRRScheduler { resource_manager: ResourceManager, // sorted by queue_id eligible: EligibilityTracker, - /// Mapping of vqueue_id -> handle for active vqueues - id_lookup: HashMap, - q: SlotMap>, + q: SecondaryMap>, /// Waker to be notified when scheduler is potentially able to scheduler more work waker: Waker, /// Time of the last memory reporting and memory compaction @@ -68,7 +65,6 @@ pub struct DRRScheduler { } impl DRRScheduler { - #[allow(clippy::too_many_arguments)] pub fn new( limit_qid_per_poll: NonZeroU16, max_items_per_decision: NonZeroU16, @@ -79,23 +75,14 @@ impl DRRScheduler { let mut total_running = 0; let mut total_waiting = 0; - let num_active = vqueues.num_active(); - let mut q = SlotMap::with_capacity_and_key(num_active); - let mut id_lookup = HashMap::with_capacity(num_active); - let mut eligible: EligibilityTracker = EligibilityTracker::with_capacity(num_active); + let mut q = SecondaryMap::with_capacity(vqueues.capacity()); + let mut eligible: EligibilityTracker = + EligibilityTracker::with_capacity(vqueues.capacity()); - for (qid, meta) in vqueues.iter_active_vqueues() { + for (handle, qid, meta) in vqueues.iter_active_vqueues() { total_running += meta.num_running(); total_waiting += meta.total_waiting(); - let handle = q.insert_with_key(|handle| { - VQueueState::new( - qid.clone(), - handle, - VQueueMetaLite::new(meta), - meta.num_running(), - ) - }); - id_lookup.insert(qid.clone(), handle); + q.insert(handle, VQueueState::new(qid, &storage, meta.num_running())); // We init all active vqueues as eligible first eligible.insert_eligible(handle); } @@ -107,7 +94,6 @@ impl DRRScheduler { Self { resource_manager, - id_lookup, q, eligible, waker: Waker::noop().clone(), @@ -128,6 +114,7 @@ impl DRRScheduler { pub fn poll_schedule_next( mut self: Pin<&mut Self>, + metas: VQueuesMeta<'_>, cx: &mut std::task::Context<'_>, ) -> Poll> { let mut decisions = Decisions::default(); @@ -156,32 +143,36 @@ impl DRRScheduler { break; } - let Some(handle) 
= this.eligible.next_eligible(this.storage, this.q)? else { + let Some(handle) = this + .eligible + .next_eligible(cx, metas, this.storage, this.q)? + else { break; }; let qstate = this.q.get_mut(handle).unwrap(); + let slot = metas.get(handle).unwrap(); - match qstate.try_pop(cx, this.storage, this.resource_manager)? { + match qstate.try_pop(cx, handle, slot, this.resource_manager)? { Pop::NeedsCredit => { this.eligible.rotate_one(); } Pop::Run(action) => { coop.made_progress(); - decisions.push(&qstate.qid, action); + decisions.push(slot.vqueue_id(), action); // We need to set the state so we check eligibility and setup // necessary schedules when we poll the queue again. this.eligible.front_needs_poll(); } Pop::Yield(action) => { coop.made_progress(); - decisions.push(&qstate.qid, action); + decisions.push(slot.vqueue_id(), action); // We need to set the state so we check eligibility and setup // necessary schedules when we poll the queue again. this.eligible.front_needs_poll(); } Pop::Blocked(resource) => { - trace!("VQueue {:?} is blocked on {resource:?}", qstate.qid); + trace!("VQueue {} is blocked on {resource:?}", slot.vqueue_id()); this.eligible.front_blocked(resource); } } @@ -214,7 +205,6 @@ impl DRRScheduler { .remove_vqueue(handle, &blocked_resource); } - self.id_lookup.remove(qid); self.q.remove(handle); self.eligible.remove(handle); } @@ -222,21 +212,21 @@ impl DRRScheduler { // To be replaced by an invoke event that runs the invocation within the scheduler itself. pub fn confirm_run_attempt( mut self: Pin<&mut Self>, - qid: &VQueueId, + handle: VQueueHandle, + slot: &cache::Slot, key: &EntryKey, ) -> Option { - let handle = self.id_lookup.get(qid).copied()?; let qstate = self.q.get_mut(handle)?; // I've the resources. Let's run it. let permit = qstate.remove_from_unconfirmed_assignments(key)?; - // To make sure the num_waiting counter is updated. 
- qstate.apply_meta_update(&MetaLiteUpdate::RemovedFromInbox(*key)); counter!(VQUEUE_RUN_CONFIRMED).increment(1); - if qstate.is_dormant() { - trace!("VQueue {qid} is no longer observed by the scheduler"); - self.id_lookup.remove(qid); + if qstate.is_dormant(slot.meta()) { + trace!( + "VQueue {} is no longer observed by the scheduler", + slot.vqueue_id() + ); self.q.remove(handle); self.eligible.remove(handle); } @@ -245,79 +235,51 @@ impl DRRScheduler { #[tracing::instrument(skip_all)] #[track_caller] - pub fn on_inbox_event(&mut self, event: VQueueEvent) { - let qid = event.qid; - let mut maybe_handle = self.id_lookup.get(&qid).copied(); - debug_assert!(maybe_handle.map(|h| self.q.contains_key(h)).unwrap_or(true)); + pub fn on_inbox_event(&mut self, metas: VQueuesMeta<'_>, event: VQueueEvent) { + let slot = metas + .get(event.queue) + .expect("vqueue meta must be in cache"); for update in event.updates { match update { - EventDetails::AddVQueue { - scope, - limit_key, - lock_name, - } => { - if maybe_handle.is_none() { - // add it. - trace!("VQueue {qid} is added to the scheduler"); - maybe_handle = Some(self.q.insert_with_key(|h| { - VQueueState::new_empty( - qid.clone(), - h, - VQueueMetaLite::new_empty(scope, limit_key, lock_name), - ) - })); - self.id_lookup.insert(qid.clone(), maybe_handle.unwrap()); - // Note: We don't make this eligible yet. We'll do that when we see - // an actual enqueue because we initialize the vqueue metadata with - // length 0. 
- } - } - EventDetails::LockReleased { scope, lock_name } => { - self.release_lock(&scope, &lock_name); + EventDetails::LockReleased => { + self.release_lock(slot.meta().scope(), slot.meta().lock_name().unwrap()); } - EventDetails::InboxUpdate(ref update @ MetaLiteUpdate::QueuePaused) => { - let Some(handle) = maybe_handle else { - continue; - }; - let qstate = self.q.get_mut(handle).unwrap(); - qstate.apply_meta_update(update); - if qstate.is_dormant() { - self.mark_vqueue_as_dormant(&qid, handle); + EventDetails::QueuePaused => { + let qstate = self.q.get_mut(event.queue).unwrap(); + if qstate.is_dormant(slot.meta()) { + self.mark_vqueue_as_dormant(slot.vqueue_id(), event.queue); } } - EventDetails::InboxUpdate(ref update @ MetaLiteUpdate::QueueResumed { .. }) => { - let Some(handle) = maybe_handle else { - continue; - }; - let qstate = self.q.get_mut(handle).unwrap(); - qstate.apply_meta_update(update); - if self.eligible.refresh_membership(qstate) { + EventDetails::QueueResumed => { + let qstate = self.q.get_mut(event.queue).unwrap(); + if self + .eligible + .refresh_membership(event.queue, slot.meta(), qstate) + { self.wake_up(); } } - EventDetails::InboxUpdate( - ref update @ MetaLiteUpdate::EnqueuedToInbox { ref key, ref value }, - ) => { - assert!( - maybe_handle.is_some(), - "scheduler must know about queue {qid} before an inbox enqueue", - ); - let handle = maybe_handle.unwrap(); - let qstate = self.q.get_mut(handle).unwrap(); - - // keep inbox size in sync - qstate.apply_meta_update(update); + EventDetails::EnqueuedToInbox { ref key, ref value } => { + let qstate = self.q.entry(event.queue).unwrap().or_insert_with(|| { + trace!("VQueue {} is added to the scheduler", slot.vqueue_id()); + VQueueState::new_empty() + }); + + counter!(VQUEUE_ENQUEUE).increment(1); if let Some(permit_builder) = qstate.notify_enqueued(key, value) { // The newly enqueued item became the head of the queue. 
// If the vqueue was blocked we need to place it back // on the ready ring if it wasn't already there. let mut wake_up = false; - if let Some(resource) = self.eligible.mark_queue_unblocked(handle) { - self.resource_manager.remove_vqueue(handle, &resource); + if let Some(resource) = self.eligible.mark_queue_unblocked(event.queue) { + self.resource_manager.remove_vqueue(event.queue, &resource); wake_up |= true; - } else if self.eligible.refresh_membership(qstate) { + } else if self + .eligible + .refresh_membership(event.queue, slot.meta(), qstate) + { wake_up |= true; } @@ -333,13 +295,10 @@ impl DRRScheduler { } } } - EventDetails::InboxUpdate( - ref update @ MetaLiteUpdate::RemovedFromInbox(ref key), - ) => { - let Some(handle) = maybe_handle else { + EventDetails::RemovedFromInbox(ref key) => { + let Some(qstate) = self.q.get_mut(event.queue) else { continue; }; - let qstate = self.q.get_mut(handle).unwrap(); // Three cases: // 1. The item was pending confirmation (we hold resources that should be @@ -347,11 +306,7 @@ impl DRRScheduler { // 2. The item was the head of the queue (not scheduled yet). We _may_ have a // permit builder in-flight that we can release. // 3. None of the above, removing only changes the vqueue metadata. - let mut wake_up = false; - // keep inbox size in sync - qstate.apply_meta_update(update); - // If we have been holding a concurrency permit for this item, we release it. if let Some(permit) = qstate.remove_from_unconfirmed_assignments(key) { // Case 1: @@ -363,15 +318,15 @@ impl DRRScheduler { // Case 2: // This means it might be the current head. Let's invalidate it if // that's the case. - if let Some(resource) = self.eligible.find_blocking_resource(handle) { + if let Some(resource) = self.eligible.find_blocking_resource(event.queue) { // The head was removed and the queue was blocked on a resource. 
- self.resource_manager.remove_vqueue(handle, resource); + self.resource_manager.remove_vqueue(event.queue, resource); } - if !qstate.is_dormant() { + if !qstate.is_dormant(slot.meta()) { // force the queue to be polled again since the head // will definitely be unknown at this point. - self.eligible.ensure_queue_needs_polling(handle); + self.eligible.ensure_queue_needs_polling(event.queue); wake_up |= true; } // let the other queues that were blocked on my partial permit @@ -382,9 +337,9 @@ impl DRRScheduler { .revert_permit_builder(&mut self.eligible, permit_builder); } - if qstate.is_dormant() { + if qstate.is_dormant(slot.meta()) { // the removal makes the queue dormant. Remove it from everything - self.mark_vqueue_as_dormant(&qid, handle); + self.mark_vqueue_as_dormant(slot.vqueue_id(), event.queue); } if wake_up { @@ -404,29 +359,39 @@ impl DRRScheduler { } } - pub fn iter_status(&self) -> impl Iterator { + pub fn iter_status( + &self, + metas: VQueuesMeta<'_>, + ) -> impl Iterator { // Resolver shared across every queue's status snapshot in this sweep — // the rule store doesn't change while we hold `&self`. 
let resolve_rule = |handle| self.resource_manager.resolve_user_rule(handle); - self.q.iter().map(move |(_handle, qstate)| { + self.q.iter().map(move |(handle, qstate)| { + let slot = metas.get(handle).unwrap(); let status = VQueueSchedulerStatus { wait_stats: qstate.get_head_wait_stats(), remaining_running: qstate.num_remaining_in_running_stage(), - waiting_inbox: qstate.num_waiting_inbox(), - status: self.eligible.get_status(qstate, &resolve_rule), + waiting_inbox: qstate.num_waiting_inbox(slot.meta()), + status: self + .eligible + .get_status(handle, slot.meta(), qstate, &resolve_rule), head_entry_id: qstate.head_entry_id(), }; - (qstate.qid.clone(), status) + (slot.vqueue_id().clone(), status) }) } - pub fn get_status(&self, qid: &VQueueId) -> VQueueSchedulerStatus { - let Some(qstate) = self - .id_lookup - .get(qid) - .and_then(|handle| self.q.get(*handle)) - else { + #[cfg(test)] + pub fn get_status( + &self, + metas: VQueuesMeta<'_>, + handle: VQueueHandle, + ) -> VQueueSchedulerStatus { + let Some(qstate) = self.q.get(handle) else { + return VQueueSchedulerStatus::default(); + }; + let Some(slot) = metas.get(handle) else { return VQueueSchedulerStatus::default(); }; @@ -434,8 +399,10 @@ impl DRRScheduler { VQueueSchedulerStatus { wait_stats: qstate.get_head_wait_stats(), remaining_running: qstate.num_remaining_in_running_stage(), - waiting_inbox: qstate.num_waiting_inbox(), - status: self.eligible.get_status(qstate, &resolve_rule), + waiting_inbox: qstate.num_waiting_inbox(slot.meta()), + status: self + .eligible + .get_status(handle, slot.meta(), qstate, &resolve_rule), head_entry_id: qstate.head_entry_id(), } } @@ -486,6 +453,8 @@ mod tests { use crate::cache::VQueuesMetaCache; use crate::{GlobalTokenBucket, SchedulingStatus, VQueue, VQueueEvent}; + const TEST_VQUEUES_CAPACITY: usize = 1024; + use super::*; const BASE_RUN_AT_MS: u64 = 1_744_000_000_000; @@ -653,10 +622,11 @@ mod tests { fn poll_scheduler( mut scheduler: Pin<&mut DRRScheduler>, + metas: 
VQueuesMeta<'_>, ) -> Poll> { let waker = std::task::Waker::noop(); let mut cx = std::task::Context::from_waker(waker); - scheduler.as_mut().poll_schedule_next(&mut cx) + scheduler.as_mut().poll_schedule_next(metas, &mut cx) } fn run_keys(decision: &Decisions) -> Vec { @@ -675,11 +645,13 @@ mod tests { async fn test_empty_scheduler_returns_pending() { let rocksdb = storage_test_environment().await; let db = rocksdb.partition_db(); - let cache = VQueuesMetaCache::create(db.clone()).await.unwrap(); + let cache = VQueuesMetaCache::create(db.clone(), TEST_VQUEUES_CAPACITY) + .await + .unwrap(); let mut scheduler = create_scheduler(db, &cache).await; assert!(matches!( - poll_scheduler(Pin::new(&mut scheduler)), + poll_scheduler(Pin::new(&mut scheduler), cache.view()), Poll::Pending )); } @@ -687,7 +659,7 @@ mod tests { #[restate_core::test] async fn test_poll_yields_enqueued_item() { let mut rocksdb = storage_test_environment().await; - let mut cache = VQueuesMetaCache::new_empty(); + let mut cache = VQueuesMetaCache::new_empty(TEST_VQUEUES_CAPACITY); let qid = test_qid(2000); let mut txn = rocksdb.transaction(); @@ -697,7 +669,7 @@ mod tests { let db = rocksdb.partition_db(); let mut scheduler = create_scheduler(db, &cache).await; - let result = poll_scheduler(Pin::new(&mut scheduler)); + let result = poll_scheduler(Pin::new(&mut scheduler), cache.view()); assert!(matches!(result, Poll::Ready(Ok(ref decision)) if !decision.is_empty())); let Poll::Ready(Ok(decision)) = result else { @@ -711,7 +683,7 @@ mod tests { #[restate_core::test] async fn test_run_at_below_now_preempts_within_inbox() { let mut rocksdb = storage_test_environment().await; - let mut cache = VQueuesMetaCache::new_empty(); + let mut cache = VQueuesMetaCache::new_empty(TEST_VQUEUES_CAPACITY); let qid = test_qid(2_100); let now = MillisSinceEpoch::now().as_u64(); @@ -739,7 +711,8 @@ mod tests { let db = rocksdb.partition_db(); let mut scheduler = create_scheduler(db, &cache).await; - let 
Poll::Ready(Ok(decision)) = poll_scheduler(Pin::new(&mut scheduler)) else { + let Poll::Ready(Ok(decision)) = poll_scheduler(Pin::new(&mut scheduler), cache.view()) + else { panic!("expected decision"); }; @@ -753,7 +726,7 @@ mod tests { #[restate_core::test] async fn test_round_robin_fairness() { let mut rocksdb = storage_test_environment().await; - let mut cache = VQueuesMetaCache::new_empty(); + let mut cache = VQueuesMetaCache::new_empty(TEST_VQUEUES_CAPACITY); let mut txn = rocksdb.transaction(); for i in 1..=3u64 { @@ -781,7 +754,8 @@ mod tests { cache.view(), ); - let Poll::Ready(Ok(decision)) = poll_scheduler(Pin::new(&mut scheduler)) else { + let Poll::Ready(Ok(decision)) = poll_scheduler(Pin::new(&mut scheduler), cache.view()) + else { panic!("expected decision"); }; @@ -804,7 +778,7 @@ mod tests { #[restate_core::test] async fn test_concurrency_limiting() { let mut rocksdb = storage_test_environment().await; - let mut cache = VQueuesMetaCache::new_empty(); + let mut cache = VQueuesMetaCache::new_empty(TEST_VQUEUES_CAPACITY); let qid1 = test_qid(7000); let qid2 = test_qid(7001); @@ -818,10 +792,19 @@ mod tests { let db = rocksdb.partition_db(); let mut scheduler = create_scheduler_with_concurrency(db, &cache, 1).await; - assert_eq!(scheduler.get_status(&qid1).status, SchedulingStatus::Ready,); - assert_eq!(scheduler.get_status(&qid2).status, SchedulingStatus::Ready,); + let h_qid1 = cache.view().handle_for(&qid1).unwrap(); + let h_qid2 = cache.view().handle_for(&qid2).unwrap(); + + assert_eq!( + scheduler.get_status(cache.view(), h_qid1).status, + SchedulingStatus::Ready, + ); + assert_eq!( + scheduler.get_status(cache.view(), h_qid2).status, + SchedulingStatus::Ready, + ); - let result = poll_scheduler(Pin::new(&mut scheduler)); + let result = poll_scheduler(Pin::new(&mut scheduler), cache.view()); assert!(matches!(result, Poll::Ready(Ok(ref d)) if d.total_items() == 1)); let Poll::Ready(Ok(result)) = result else { panic!("expected decision"); @@ -830,29 
+813,34 @@ mod tests { let first_pop_qid = result.qids.keys().next().unwrap().clone(); let first_pop_key = run_keys(&result)[0]; + let h_first_pop = cache.view().handle_for(&first_pop_qid).unwrap(); qids.retain(|qid| qid != &first_pop_qid); + let h_remaining = cache.view().handle_for(&qids[0]).unwrap(); assert!(matches!( - poll_scheduler(Pin::new(&mut scheduler)), + poll_scheduler(Pin::new(&mut scheduler), cache.view()), Poll::Pending )); assert_eq!( - scheduler.get_status(&first_pop_qid).status, + scheduler.get_status(cache.view(), h_first_pop).status, SchedulingStatus::Empty, ); assert_eq!( - scheduler.get_status(&qids[0]).status, + scheduler.get_status(cache.view(), h_remaining).status, SchedulingStatus::BlockedOn(BlockedResource::InvokerConcurrency), ); + let metas = cache.view(); + let first_pop_slot = metas.get(h_first_pop).unwrap(); let mut scheduler = Pin::new(&mut scheduler); - let resources = scheduler - .as_mut() - .confirm_run_attempt(&first_pop_qid, &first_pop_key); + let resources = + scheduler + .as_mut() + .confirm_run_attempt(h_first_pop, first_pop_slot, &first_pop_key); assert!(resources.is_some()); drop(resources); - let Poll::Ready(Ok(result2)) = poll_scheduler(scheduler.as_mut()) else { + let Poll::Ready(Ok(result2)) = poll_scheduler(scheduler.as_mut(), cache.view()) else { panic!("expected decision"); }; assert_eq!(result2.num_run(), 1); @@ -861,16 +849,19 @@ mod tests { qids.retain(|qid| qid != &second_pop_qid); assert!(qids.is_empty()); assert_eq!( - scheduler.get_status(&qid1).status, + scheduler.get_status(cache.view(), h_qid1).status, SchedulingStatus::Dormant, ); - assert_eq!(scheduler.get_status(&qid2).status, SchedulingStatus::Empty,); + assert_eq!( + scheduler.get_status(cache.view(), h_qid2).status, + SchedulingStatus::Empty, + ); } #[restate_core::test] async fn test_get_status_reports_invoker_throttling_retry_estimate() { let mut rocksdb = storage_test_environment().await; - let mut cache = VQueuesMetaCache::new_empty(); + let mut 
cache = VQueuesMetaCache::new_empty(TEST_VQUEUES_CAPACITY); let qid1 = test_qid(21_101); let qid2 = test_qid(21_102); @@ -901,7 +892,8 @@ mod tests { cache.view(), ); - let Poll::Ready(Ok(decision)) = poll_scheduler(Pin::new(&mut scheduler)) else { + let Poll::Ready(Ok(decision)) = poll_scheduler(Pin::new(&mut scheduler), cache.view()) + else { panic!("expected decision"); }; assert_eq!(decision.num_run(), 1); @@ -914,7 +906,10 @@ mod tests { .expect("a single queue should be scheduled"); let blocked_qid = if running_qid == qid1 { qid2 } else { qid1 }; - let blocked_status = scheduler.get_status(&blocked_qid).status; + let h_blocked = cache.view().handle_for(&blocked_qid).unwrap(); + let h_running = cache.view().handle_for(&running_qid).unwrap(); + + let blocked_status = scheduler.get_status(cache.view(), h_blocked).status; let SchedulingStatus::BlockedOn(BlockedResource::InvokerThrottling { estimated_retry_at }) = blocked_status else { @@ -923,7 +918,7 @@ mod tests { assert!(estimated_retry_at.is_some()); assert_eq!( - scheduler.get_status(&running_qid).status, + scheduler.get_status(cache.view(), h_running).status, SchedulingStatus::Empty ); } @@ -931,7 +926,7 @@ mod tests { #[restate_core::test] async fn test_waiters_outside_throttling_window_report_no_estimate() { let mut rocksdb = storage_test_environment().await; - let mut cache = VQueuesMetaCache::new_empty(); + let mut cache = VQueuesMetaCache::new_empty(TEST_VQUEUES_CAPACITY); let qids = [ test_qid(22_001), @@ -977,7 +972,8 @@ mod tests { cache.view(), ); - let Poll::Ready(Ok(_decision)) = poll_scheduler(Pin::new(&mut scheduler)) else { + let Poll::Ready(Ok(_decision)) = poll_scheduler(Pin::new(&mut scheduler), cache.view()) + else { panic!("expected decision"); }; @@ -985,9 +981,10 @@ mod tests { let mut none_estimate = 0; for qid in &qids { + let handle = cache.view().handle_for(qid).unwrap(); if let SchedulingStatus::BlockedOn(BlockedResource::InvokerThrottling { estimated_retry_at, - }) = 
scheduler.get_status(qid).status + }) = scheduler.get_status(cache.view(), handle).status { if estimated_retry_at.is_some() { some_estimate += 1; @@ -1004,7 +1001,7 @@ mod tests { #[restate_core::test] async fn test_max_items_per_decision_limit() { let mut rocksdb = storage_test_environment().await; - let mut cache = VQueuesMetaCache::new_empty(); + let mut cache = VQueuesMetaCache::new_empty(TEST_VQUEUES_CAPACITY); let mut txn = rocksdb.transaction(); for i in 1..=5u64 { @@ -1022,7 +1019,7 @@ mod tests { cache.view(), ); - if let Poll::Ready(Ok(decision)) = poll_scheduler(Pin::new(&mut scheduler)) { + if let Poll::Ready(Ok(decision)) = poll_scheduler(Pin::new(&mut scheduler), cache.view()) { assert!(!decision.is_empty()); assert!(decision.total_items() <= 2); } @@ -1031,7 +1028,7 @@ mod tests { #[restate_core::test] async fn test_higher_priority_preempts_between_polls() { let mut rocksdb = storage_test_environment().await; - let mut cache = VQueuesMetaCache::new_empty(); + let mut cache = VQueuesMetaCache::new_empty(TEST_VQUEUES_CAPACITY); let qid = test_qid(14_000); let mut events = Vec::new(); @@ -1050,7 +1047,8 @@ mod tests { cache.view(), ); - let Poll::Ready(Ok(decision)) = poll_scheduler(Pin::new(&mut scheduler)) else { + let Poll::Ready(Ok(decision)) = poll_scheduler(Pin::new(&mut scheduler), cache.view()) + else { panic!("expected decision"); }; let mut in_flight = run_keys(&decision); @@ -1063,10 +1061,11 @@ mod tests { enqueue_entry(&mut txn, &mut cache, &qid, 125, 0, Some(&mut events)).await; txn.commit().await.expect("commit should succeed"); for event in events.drain(..) 
{ - scheduler.on_inbox_event(event); + scheduler.on_inbox_event(cache.view(), event); } - let Poll::Ready(Ok(decision)) = poll_scheduler(Pin::new(&mut scheduler)) else { + let Poll::Ready(Ok(decision)) = poll_scheduler(Pin::new(&mut scheduler), cache.view()) + else { panic!("expected decision"); }; let next_keys = run_keys(&decision); @@ -1082,10 +1081,11 @@ mod tests { } txn.commit().await.expect("commit should succeed"); for event in events.drain(..) { - scheduler.on_inbox_event(event); + scheduler.on_inbox_event(cache.view(), event); } - let Poll::Ready(Ok(decision)) = poll_scheduler(Pin::new(&mut scheduler)) else { + let Poll::Ready(Ok(decision)) = poll_scheduler(Pin::new(&mut scheduler), cache.view()) + else { panic!("expected decision"); }; let keys = run_keys(&decision); @@ -1097,7 +1097,7 @@ mod tests { #[restate_core::test] async fn test_get_status_reflects_scheduling_states() { let mut rocksdb = storage_test_environment().await; - let mut cache = VQueuesMetaCache::new_empty(); + let mut cache = VQueuesMetaCache::new_empty(TEST_VQUEUES_CAPACITY); let qid1 = test_qid(21_001); let qid2 = test_qid(21_002); @@ -1120,16 +1120,26 @@ mod tests { cache.view(), ); - assert_eq!(scheduler.get_status(&qid1).status, SchedulingStatus::Ready); - assert_eq!(scheduler.get_status(&qid2).status, SchedulingStatus::Ready); + let h_qid1 = cache.view().handle_for(&qid1).unwrap(); + let h_qid2 = cache.view().handle_for(&qid2).unwrap(); - let Poll::Ready(Ok(decision)) = poll_scheduler(Pin::new(&mut scheduler)) else { + assert_eq!( + scheduler.get_status(cache.view(), h_qid1).status, + SchedulingStatus::Ready + ); + assert_eq!( + scheduler.get_status(cache.view(), h_qid2).status, + SchedulingStatus::Ready + ); + + let Poll::Ready(Ok(decision)) = poll_scheduler(Pin::new(&mut scheduler), cache.view()) + else { panic!("expected decision"); }; assert_eq!(decision.total_items(), 3); assert_eq!(decision.num_queues(), 2); - let status = scheduler.get_status(&qid1); + let status = 
scheduler.get_status(cache.view(), h_qid1); assert_eq!( status.status, SchedulingStatus::BlockedOn(BlockedResource::InvokerConcurrency) @@ -1137,7 +1147,7 @@ mod tests { assert_eq!(status.waiting_inbox, 1); assert_eq!(status.remaining_running, 0); - let status = scheduler.get_status(&qid2); + let status = scheduler.get_status(cache.view(), h_qid2); assert_eq!(status.status, SchedulingStatus::Empty); assert_eq!(status.waiting_inbox, 0); assert_eq!(status.remaining_running, 0); @@ -1153,22 +1163,32 @@ mod tests { }) .expect("qid2 run action should exist"); + let metas = cache.view(); + let qid2_slot = metas.get(h_qid2).unwrap(); let mut scheduler = Pin::new(&mut scheduler); - let resources = scheduler.as_mut().confirm_run_attempt(&qid2, &qid2_key); + let resources = scheduler + .as_mut() + .confirm_run_attempt(h_qid2, qid2_slot, &qid2_key); assert!(resources.is_some()); drop(resources); - let Poll::Ready(Ok(decision)) = poll_scheduler(scheduler.as_mut()) else { + let Poll::Ready(Ok(decision)) = poll_scheduler(scheduler.as_mut(), cache.view()) else { panic!("expected decision"); }; assert_eq!(decision.total_items(), 1); assert_eq!(decision.num_queues(), 1); - assert_eq!(scheduler.get_status(&qid1).status, SchedulingStatus::Empty); assert_eq!( - scheduler.get_status(&qid2).status, + scheduler.get_status(cache.view(), h_qid1).status, + SchedulingStatus::Empty + ); + assert_eq!( + scheduler.get_status(cache.view(), h_qid2).status, SchedulingStatus::Dormant ); - assert!(matches!(poll_scheduler(scheduler.as_mut()), Poll::Pending)); + assert!(matches!( + poll_scheduler(scheduler.as_mut(), cache.view()), + Poll::Pending + )); } } diff --git a/crates/vqueues/src/scheduler/eligible.rs b/crates/vqueues/src/scheduler/eligible.rs index dc4d281212..5aaf79840b 100644 --- a/crates/vqueues/src/scheduler/eligible.rs +++ b/crates/vqueues/src/scheduler/eligible.rs @@ -11,8 +11,9 @@ use std::collections::VecDeque; use std::task::Poll; +use 
restate_storage_api::vqueue_table::metadata::VQueueMeta; +use slotmap::SecondaryMap; use slotmap::secondary::Entry; -use slotmap::{SecondaryMap, SlotMap}; use tokio_util::time::{DelayQueue, delay_queue}; use restate_limiter::RuleHandle; @@ -22,9 +23,10 @@ use restate_types::time::MillisSinceEpoch; use restate_util_string::ReString; use restate_worker_api::{ResourceKind, SchedulingStatus}; -use super::VQueueHandle; use super::clock::SchedulerClock; use super::vqueue_state::VQueueState; +use crate::VQueuesMeta; +use crate::cache::VQueueHandle; use crate::scheduler::vqueue_state::Eligibility; #[derive(Debug, Copy, Clone)] @@ -71,14 +73,20 @@ impl EligibilityTracker { self.ready_ring.push_back(handle); } - pub fn get_status(&self, qstate: &VQueueState, resolve_rule: &R) -> SchedulingStatus + pub fn get_status( + &self, + handle: VQueueHandle, + meta: &VQueueMeta, + qstate: &VQueueState, + resolve_rule: &R, + ) -> SchedulingStatus where S: VQueueStore, R: Fn(RuleHandle) -> Option, { - match self.states.get(qstate.handle) { + match self.states.get(handle) { None | Some(State::NeedsPoll) | Some(State::Ready) => { - super::status_from_detailed_eligibility(qstate.check_eligibility()) + super::status_from_detailed_eligibility(qstate.check_eligibility(meta)) } Some(State::Scheduled { wake_up }) => SchedulingStatus::Scheduled { at: wake_up.ts.into(), @@ -110,10 +118,14 @@ impl EligibilityTracker { pub fn next_eligible( &mut self, + cx: &mut std::task::Context<'_>, + metas: VQueuesMeta<'_>, storage: &S, - vqueues: &mut SlotMap>, + vqueues: &mut SecondaryMap>, ) -> Result, StorageError> { - loop { + let n = self.ready_ring.len(); + // avoid rescanning the ready ring multiple rounds + for _ in 0..n { // what is my current status let Some(handle) = self.ready_ring.front().copied() else { return Ok(None); @@ -134,15 +146,17 @@ impl EligibilityTracker { continue; }; + let slot = metas.get(handle).unwrap(); + match current_state { State::NeedsPoll => { // update the state based on 
eligibility. - match qstate.poll_eligibility(storage)?.as_compact() { - Eligibility::Eligible => { + match qstate.poll_eligibility(cx, slot, storage) { + Poll::Ready(Ok(Eligibility::Eligible)) => { *current_state = State::Ready; return Ok(Some(handle)); } - Eligibility::EligibleAt(ts) => { + Poll::Ready(Ok(Eligibility::EligibleAt(ts))) => { let ts = ts.as_unix_millis(); let duration = ts.duration_since(SchedulerClock.now_millis()); let timer_key = self.delayed_eligibility.insert(handle, duration); @@ -152,24 +166,29 @@ impl EligibilityTracker { self.ready_ring.pop_front(); continue; } - Eligibility::NotEligible => { + Poll::Ready(Ok(Eligibility::NotEligible)) => { self.ready_ring.pop_front(); self.remove(handle); continue; } + Poll::Ready(Err(err)) => { + return Err(err); + } + Poll::Pending => { + continue; + } } } - State::BlockedOn(_) => { + State::BlockedOn(_) | State::Scheduled { .. } => { self.ready_ring.pop_front(); } State::Ready => { return Ok(Some(handle)); } - State::Scheduled { .. } => { - self.ready_ring.pop_front(); - } } } + + Ok(None) } pub fn remove(&mut self, handle: VQueueHandle) { @@ -254,8 +273,13 @@ impl EligibilityTracker { /// Returns true if scheduler should be woken up /// /// If the vqueue is blocked on a resource, this function will not touch it. - pub fn refresh_membership(&mut self, vqueue: &VQueueState) -> bool { - let Some(current_state) = self.states.entry(vqueue.handle) else { + pub fn refresh_membership( + &mut self, + handle: VQueueHandle, + meta: &VQueueMeta, + vqueue: &VQueueState, + ) -> bool { + let Some(current_state) = self.states.entry(handle) else { // the vqueue handle was removed from the original slot map. 
return false; }; @@ -263,11 +287,11 @@ impl EligibilityTracker { match current_state { Entry::Vacant(vacant_entry) => { vacant_entry.insert(State::NeedsPoll); - self.ready_ring.push_back(vqueue.handle); + self.ready_ring.push_back(handle); true } Entry::Occupied(mut occupied_entry) => { - let eligibility = vqueue.check_eligibility().as_compact(); + let eligibility = vqueue.check_eligibility(meta).as_compact(); match (occupied_entry.get(), eligibility) { (State::NeedsPoll, _) => { // do nothing. @@ -282,14 +306,14 @@ impl EligibilityTracker { // happens, we err on the safe side and switch to polling. self.delayed_eligibility.remove(&wake_up.timer_key); occupied_entry.insert(State::NeedsPoll); - self.ready_ring.push_back(vqueue.handle); + self.ready_ring.push_back(handle); true } (State::Scheduled { wake_up }, Eligibility::Eligible) => { // wake up now self.delayed_eligibility.remove(&wake_up.timer_key); - self.states.insert(vqueue.handle, State::NeedsPoll); - self.ready_ring.push_back(vqueue.handle); + self.states.insert(handle, State::NeedsPoll); + self.ready_ring.push_back(handle); true } (State::Scheduled { wake_up }, Eligibility::EligibleAt(eligible_at_ts)) => { diff --git a/crates/vqueues/src/scheduler/queue.rs b/crates/vqueues/src/scheduler/queue.rs index f381f3f5ac..53c60184ed 100644 --- a/crates/vqueues/src/scheduler/queue.rs +++ b/crates/vqueues/src/scheduler/queue.rs @@ -8,20 +8,197 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. 
-use restate_storage_api::StorageError; -use restate_storage_api::vqueue_table::{EntryKey, EntryValue, VQueueCursor, VQueueStore}; +use std::collections::VecDeque; +use std::num::NonZeroU32; +use std::pin::Pin; +use std::task::{Poll, ready}; + +use itertools::{EitherOrBoth, Itertools as _}; +use tokio::time::Instant; +use tracing::trace; + +use restate_storage_api::vqueue_table::CursorError; +use restate_storage_api::vqueue_table::{ + EntryKey, EntryValue, VQueueCursor, VQueueRunningCursor, VQueueStore, +}; +use restate_storage_api::{StorageError, vqueue_table}; +use restate_time_util::DurationExt; use restate_types::vqueues::VQueueId; use super::UnconfirmedAssignments; -#[derive(Debug)] -enum Head { - /// We need a seek+read to know the head. - Unknown, - /// The current cursor's head - Known { key: EntryKey, value: EntryValue }, - /// We know that we've reached the end of the vqueue - Empty, +/// The number of entries we are willing to keep in cache +const INBOX_CACHE_CAPACITY: usize = 24; + +enum Overlay { + Add(EntryValue), + Tombstone, +} + +/// In-flight async refill, plus an overlay of `notify_enqueued` / +/// `notify_removed` events that arrived while the task was running. +/// +/// # Storage-write-before-notify invariant +/// +/// The partition processor commits inbox writes/deletes to RocksDB *before* +/// dispatching `EnqueuedToInbox` / `RemovedFromInbox` events to the +/// scheduler. This ordering rule underpins the following overlay reasoning: +/// +/// 1. **Why tombstones are needed at all.** The refill task's RocksDB +/// snapshot is fixed at task-start time. A `notify_removed` that arrives +/// while the task is in flight may correspond to a delete that committed +/// *before* the task started its scan (snapshot still sees the row) or +/// *after* (snapshot doesn't). We can't tell from the notification +/// alone, so we record a tombstone and let `merge_join_by` suppress the +/// row if it shows up in the result. +/// +/// 2. 
**Why `push_added_item` panics on a key it already tombstoned.** A +/// later add of a key we previously tombstoned would imply the +/// notifications arrived out-of-order with respect to storage commits, +/// which the invariant rules out. +/// +/// 3. **Why cancelling an in-flight task discards the overlay safely** (see +/// [`RefillState::update_anchor`]). When cancellation kicks in, every +/// pending notification has already had its storage write committed. +/// The next fresh refill takes a new snapshot and sees post-commit +/// state directly, so the overlay would be redundant. +struct RefillTask { + started_at: Instant, + refill_anchor: Option, + /// Items that are known to be added or removed while the refill was in-flight + /// Sorted ascending by EntryKey. + overlay: VecDeque<(EntryKey, Overlay)>, + handle: tokio::task::JoinHandle, StorageError>>, + /// Exclusive upper bound on the overlay's coverage. `None` until the + /// first at-capacity event; once set, only shrinks. + /// + /// On overflow we set this (exclusive upper bound) instead of + /// evicting from the overlay. Overlay events and merge-time storage rows + /// with `key >= horizon` are then dropped; the next refill rediscovers + /// them via `seek_after(cache.back)` (always below `horizon`). Avoids + /// the silent-drop hazard of back-eviction — a popped tombstone could + /// re-admit a deleted row — at the cost of re-fetching some + /// already-read storage rows. + horizon: Option, +} +impl RefillTask { + /// Capacity/horizon handling shared by the `push_*` methods. Caller resolves + /// duplicates first; `pos` is insertion position. + /// + /// Returns `Some(pos)` to insert there, or `None` if the event was dropped. 
+ fn prepare_overlay_insert(&mut self, key: EntryKey, pos: usize) -> Option { + if self.horizon.is_some_and(|h| key >= h) { + return None; + } + if self.overlay.len() < INBOX_CACHE_CAPACITY { + return Some(pos); + } + // At capacity: shrink the horizon to seal the affected range. + if pos == self.overlay.len() { + // New key sorts past every overlay entry — it becomes the + // horizon itself and is not tracked. + self.horizon = Some(self.horizon.map_or(key, |h| h.min(key))); + None + } else { + // New key displaces the back; the back's key becomes the + // horizon (anything at-or-above it is now uncertain). + let back_key = self + .overlay + .back() + .expect("overlay at capacity must have a back") + .0; + self.horizon = Some(self.horizon.map_or(back_key, |h| h.min(back_key))); + self.overlay.pop_back(); + Some(pos) + } + } + + fn push_tombstone(&mut self, key_to_remove: &EntryKey) { + let pos = match self + .overlay + .binary_search_by_key(key_to_remove, |&(k, _)| k) + { + Ok(pos) => { + // Existing overlay entry → upgrade to Tombstone. + self.overlay.get_mut(pos).unwrap().1 = Overlay::Tombstone; + return; + } + Err(pos) => pos, + }; + if let Some(pos) = self.prepare_overlay_insert(*key_to_remove, pos) { + self.overlay + .insert(pos, (*key_to_remove, Overlay::Tombstone)); + } + } + + fn push_added_item(&mut self, key_to_add: &EntryKey, value: &EntryValue) { + let pos = match self.overlay.binary_search_by_key(key_to_add, |&(k, _)| k) { + Ok(_) => panic!( + "Something went wrong here. 
We should not see duplicate enqueues or an enqueue after a removal of the same key: {key_to_add:?}" + ), + Err(pos) => pos, + }; + if let Some(pos) = self.prepare_overlay_insert(*key_to_add, pos) { + self.overlay + .insert(pos, (*key_to_add, Overlay::Add(value.clone()))); + } + } +} + +#[derive(derive_more::Debug)] +enum RefillState { + #[debug("standby")] + Standby { + /// `seek_after` anchor for the next refill, and the upper bound used to + /// decide whether to accept a `notify_enqueued`. Items currently in the + /// cache, items previously consumed from the cache, and items in the + /// caller's skip set together account for every inbox key + /// `<= refill_anchor`. Keys `> refill_anchor` are undiscovered and will + /// appear on the next refill via `seek_after(refill_anchor)`. + refill_anchor: Option, + }, + #[debug("in-flight (age: {})", _0.started_at.elapsed().friendly())] + InFlight(Box), +} + +impl Default for RefillState { + fn default() -> Self { + Self::Standby { + refill_anchor: None, + } + } +} + +impl RefillState { + /// Updates the standby anchor, or cancels an in-flight task and resets + /// to standby with the given anchor. + /// + /// Dropping an in-flight `RefillTask` here also drops its overlay + /// (pending `Add` / `Tombstone` records). This is safe per the + /// storage-write-before-notify invariant documented on [`RefillTask`]: + /// every notification we have observed (and thus every overlay entry) + /// has its storage write already committed, so the next fresh refill's + /// snapshot reflects them directly without needing the overlay. + fn update_anchor(&mut self, new_anchor: Option) { + match self { + RefillState::Standby { refill_anchor } => { + *refill_anchor = new_anchor; + } + RefillState::InFlight(_) => { + trace!("Refill task was in-flight but no longer needed, cancelling it"); + // Drop the task; the receiver going away signals the worker + // thread to bail. See `RefillTask` doc for why discarding + // the overlay is safe here. 
+ *self = RefillState::Standby { + refill_anchor: new_anchor, + } + } + } + } + + fn is_in_flight(&self) -> bool { + matches!(self, Self::InFlight(_)) + } } #[derive(Debug)] @@ -38,683 +215,639 @@ pub enum QueueItem<'a> { } #[derive(derive_more::Debug)] -pub(crate) enum Reader { - /// Reader was never opened and might need to scan running items - New { already_running: u32 }, +enum Stage { + /// Brand-new queue; running items still need to be drained first. + New { + already_running: NonZeroU32, + reader: Option, + }, + /// In running stage. Single-item head, single-shot reader. #[debug("Running")] Running { - remaining: u32, + head: (EntryKey, EntryValue), reader: S::RunningReader, + remaining: u32, }, - #[debug("Inbox")] - Inbox(S::InboxReader), - // We can transition back to Reader::Inbox if new items have been added to the inbox - // but we should never return to `Running`. - #[debug("Closed")] - Closed, + /// In inbox stage. The queue's `inbox_cache` is the source of truth + /// between refills. + Inbox, + /// Inbox is fully drained. + Empty, } pub(crate) struct Queue { - head: Head, - reader: Reader, + stage: Stage, + /// Backing cache for the inbox stage. + /// Meaningful only when `stage` is `Inbox` or `Empty`; for + /// other stages it must be empty (invariant maintained by `advance` / + /// `remove` / `enqueue`). + /// + /// Sorted ascending by EntryKey. Front = current head. + items: VecDeque<(EntryKey, EntryValue)>, + refill_state: RefillState, } impl Queue { /// Creates a new queue that must first go through the given number of running items /// before it switches to reading the waiting inbox. 
- pub fn new(num_running: u32) -> Self { + pub fn new(num_running: u32, storage: &S, qid: &VQueueId) -> Self { + let stage = if num_running > 0 { + Stage::New { + already_running: NonZeroU32::new(num_running).unwrap(), + reader: Some(storage.new_run_reader(qid)), + } + } else { + Stage::Inbox + }; + Self { - head: Head::Unknown, - reader: Reader::New { - already_running: num_running, - }, + stage, + items: VecDeque::with_capacity(INBOX_CACHE_CAPACITY), + refill_state: Default::default(), } } /// Creates an empty queue pub fn new_closed() -> Self { Self { - head: Head::Empty, - reader: Reader::Closed, + stage: Stage::Empty, + items: VecDeque::with_capacity(INBOX_CACHE_CAPACITY), + refill_state: Default::default(), } } /// If the queue is known to be empty (no more items to dequeue) pub fn is_empty(&self) -> bool { - matches!(self.head, Head::Empty) + matches!(self.stage, Stage::Empty) } + /// Returns `true` iff the removed key was the head of the queue. + /// + /// While the queue is still in the running stage, this is a no-op: the + /// scheduler may still yield a running item after the state machine has + /// declared it removed, and the state machine must ignore that yield. pub fn remove(&mut self, key_to_remove: &EntryKey) -> bool { - // Can this be the known head? - // Yes. Perhaps it expired/ended externally. - // We do not do anything if the reader is still at the running stage, - // - // This means that the scheduler might still yield the "running" item after - // the state machine has declared it as completed/removed. The state machine - // must be able to handle this case and "ignore" the yield command of this item. - if matches!(self.reader, Reader::Closed | Reader::Inbox(..)) - && let Head::Known { ref key, .. } = self.head - && key == key_to_remove - { - self.head = Head::Unknown; - // Ensure that next advance would re-seek to the newly added item - self.reader = Reader::Closed; - true - } else { - false + if matches!( + self.stage, + Stage::New { .. 
} | Stage::Running { .. } | Stage::Empty + ) { + return false; } + + // We are in Inbox/Waiting stage. + let Ok(pos) = self.items.binary_search_by_key(key_to_remove, |&(k, _)| k) else { + // This branch handles when we don't have the item in cache. + // + // The removed item can be: + // - Cached + // - Unconfirmed (not in cache) || < refill anchor + // - > refill anchor + // - We have in-flight refill + // - No in-inflight refills + // + // Scenarios: + // - We have empty cache (or does it matter?), maybe what matters is if we have it in cache or not. + // - We have waiting inbox (inbox > 0) + // - We have waiting inbox but with unconfirmed items in flights, we even out at zero. + // which means that we should not see any removals outside what we have already + // decided to start. We are effectively (empty). + // - No inflight refill yet. Ignore. + // - In flight refill exists. --> Maybe tombstone it. + // - We have cached, no in-flight refill. + // - Remove if in cache, otherwise, ignore. + // - We have cached items, removal is < the refill anchor, we don't have it in cache. Ignore. + // - We have cached items, but the removal is > the refill anchor --> Maybe tombstone it. + // + // Removed item is beyond the refill anchor + match self.refill_state { + RefillState::Standby { .. } => {} + RefillState::InFlight(ref mut task) + if task + .refill_anchor + .is_none_or(|ref refill| key_to_remove > refill) => + { + // The refill task is in flight. We cannot determine if the key will impact + // its result or not, we'll keep at most INBOX_CACHE_CAPACITY tombstones to + // use as overlay when the refill is complete. + task.push_tombstone(key_to_remove); + } + RefillState::InFlight(_) => { + // Ignore it. This item has either been shipped (unconfirmed assignment) + // or was never in the inbox to start with. 
+ } + } + + return false; + }; + self.items.remove(pos); + // The anchor stays put — even if we removed the back, the entry was + // also removed from storage, so `seek_after(anchor)` will skip it + // naturally on the next refill. + pos == 0 } - /// Returns true if the head was changed + /// Returns `true` iff the new item became the head of the queue. pub fn enqueue(&mut self, key: &EntryKey, value: &EntryValue) -> bool { - match (&self.head, &self.reader) { - // we are only unknown if we are new and didn't read the running list yet, - // we might also be in a limbo state if advance() failed. - (_, Reader::New { .. } | Reader::Running { .. }) => { /* do nothing */ } - (Head::Unknown, _) => { /* do nothing */ } - (Head::Empty, _) => { - self.reader = Reader::Closed; - self.head = Head::Known { - key: *key, - value: value.clone(), - }; + match self.stage { + Stage::New { .. } | Stage::Running { .. } => return false, + Stage::Empty => { + // The cache is already allocated and empty (kept around + // across the previous `Inbox -> Empty` transition). Just + // re-seed it and flip the marker. + assert!(self.items.is_empty()); + assert!(!self.refill_state.is_in_flight()); + self.items.push_back((*key, value.clone())); + self.refill_state.update_anchor(Some(*key)); + self.stage = Stage::Inbox; return true; } - ( - Head::Known { - key: current_key, .. - }, - Reader::Inbox(_) | Reader::Closed, - ) => { - if key < current_key { - self.head = Head::Known { - key: *key, - value: value.clone(), - }; - // Ensure that next advance would re-seek to the newly added item - self.reader = Reader::Closed; - return true; - } else { - // This is a temporary fix to ensure that we perform a re-seek - // to fix the issue where the iterator wouldn't see the newly added - // items if the memtable was flushed prior the seek. - self.reader = Reader::Closed; - } + Stage::Inbox => { /* fall-through */ } + } + + // Insert `(key, value)` into the sorted cache. 
+ // + // Returns `true` iff the item became the new front (head). Returns + // `false` if the item was rejected because its key falls strictly above + // the cache's coverage zone (`> refill_anchor`); in that case the item + // will be discovered on the next refill. + + // Priority-queue rule: ignore items strictly above our coverage zone. + // The bound is `refill_anchor`, NOT `cache.back`. After consuming the + // back of the cache, `cache.back` can be lower than `refill_anchor`, + // and items in `(cache.back, refill_anchor]` would be lost (the next + // refill seek_after(refill_anchor) only returns keys strictly greater + // than the anchor). + match self.refill_state { + RefillState::Standby { refill_anchor, .. } + if refill_anchor.is_none_or(|ref refill| key > refill) => + { + // We cannot accept items beyond our coverage zone. + return false; + } + RefillState::Standby { .. } => { /* in coverage zone */ } + // This enqueue may or may not appear in the in-flight operation and + // there is no way to determine that. So, we stage it on the in-flight + // task overlay until we receive the result. + RefillState::InFlight(ref mut task) + if task.refill_anchor.is_none_or(|ref refill| key > refill) => + { + task.push_added_item(key, value); + return false; + } + RefillState::InFlight(ref task) => { + // the new item is < than what the refill task is interested in. Therefore, + // we add it right now. + assert!(task.refill_anchor.is_some_and(|ref refill| key <= refill)); } } - false - } - /// Returns the head if known, or None if the queue needs advancing - pub fn head(&self) -> Option> { - match (&self.head, &self.reader) { - (Head::Unknown, _) => None, - (_, Reader::New { .. }) => None, - (Head::Known { key, value }, Reader::Running { .. 
}) => { - Some(QueueItem::Running { key, value }) + let pos = match self.items.binary_search_by_key(key, |&(k, _)| k) { + Ok(_) => { + // We already know about this key which means that the head has not changed + // as a result of this enqueue. + return false; } - (Head::Known { key, value }, Reader::Inbox(_) | Reader::Closed) => { - Some(QueueItem::Inbox { key, value }) + Err(pos) => pos, + }; + + // We need to be careful about moving the refill anchor if there is an + // in-flight refill task. + // + // let's say we have a refill task in flight, and we are enqueueing smaller + // than it's anchor point, we are confident that the task will never return + // this item. If as a result of the enqueue we exceeded the cache capacity. + // + // If there was an in-flight refill, we must cancel it and reset our refill + // anchor to point to the back of the cache so that the next refill can + // re-discover it. This is an acceptable trade-off because we must be here + // because we have sufficiently populated the cache with recently enqueued + // items and we wouldn't need the refill immediately anyway. + + // At-cap fast path: avoid the VecDeque grow/shrink that would happen + // if we inserted first and evicted afterwards. + if self.items.len() >= INBOX_CACHE_CAPACITY { + if pos == self.items.len() { + // The new key would land at the back and be evicted on the + // very next step. Skip the insert, but still lower the anchor + // to the current back so the next refill can re-discover `key` + // via `seek_after(anchor)`. + // + // Cancels the in-flight task if exists + self.refill_state + .update_anchor(self.items.back().map(|(k, _)| *k)); + return false; } - (Head::Empty, _) => Some(QueueItem::None), + // Make room first; `pos` is unaffected because pos < old_len in + // this branch (we shift elements right of `pos` regardless). 
+ debug_assert!(pos < self.items.len()); + self.items.pop_back(); + self.items.insert(pos, (*key, value.clone())); + + // Cancels the in-flight task if exists + self.refill_state + .update_anchor(self.items.back().map(|(k, _)| *k)); + return pos == 0; } + + // Normal path: cache below capacity. + self.items.insert(pos, (*key, value.clone())); + // Anchor maintenance: if it was None (very first insert), set it to + // this key. Otherwise the precondition above guarantees + // `key <= anchor`, so the anchor stays put. + if let RefillState::Standby { refill_anchor } = &mut self.refill_state + && refill_anchor.is_none() + { + *refill_anchor = Some(*key); + } + pos == 0 } - pub fn advance_if_needed( + /// Returns the head if known, or `None` if the queue needs advancing. + pub fn head(&self) -> Option> { + match &self.stage { + Stage::New { .. } => None, + Stage::Running { head: (k, v), .. } => Some(QueueItem::Running { key: k, value: v }), + Stage::Inbox => self + .items + .front() + .map(|(k, v)| QueueItem::Inbox { key: k, value: v }), + Stage::Empty => Some(QueueItem::None), + } + } + + pub fn poll_advance_if_needed( &mut self, + cx: &mut std::task::Context<'_>, storage: &S, skip: &UnconfirmedAssignments, qid: &VQueueId, - ) -> Result, StorageError> { - // Keep advancing until the head is known - while matches!(self.head, Head::Unknown) { - self.advance(storage, skip, qid)?; - } - - match (&self.head, &self.reader) { - (Head::Unknown, _) => unreachable!("head must be known"), - (_, Reader::New { .. }) => unreachable!("reader cannot be new after poll"), - (Head::Known { key, value }, Reader::Running { .. }) => { - Ok(QueueItem::Running { key, value }) + effectively_empty: bool, + allow_blocking_io: bool, + ) -> Poll, StorageError>> { + loop { + let needs_advance = match self.stage { + Stage::New { .. 
} => true, + Stage::Inbox => self.items.is_empty(), + _ => false, + }; + if !needs_advance { + break; } - (Head::Known { key, value }, Reader::Inbox(_) | Reader::Closed) => { - Ok(QueueItem::Inbox { key, value }) + if !self.try_advance()? { + // We cannot advance without a refill + if self.refill_state.is_in_flight() { + ready!(self.poll_refill_task(cx, skip)); + // If we ended up also being empty and we can't determine if we are + // empty or not, we should start another refill task and return Pending. + // This happens automatically because in that case we'll "continue" + } else { + // Do we need to refill? + // We don't need to refill if we are at Inbox stage and we know no more + // inbox entries are available. + if effectively_empty && matches!(self.stage, Stage::Inbox) { + self.stage = Stage::Empty; + break; + } + // A) try refill immediate refill if allowed + match self.try_refill(storage, qid, skip, allow_blocking_io) { + Ok(_) => {} + Err(CursorError::WouldBlock) => { + // B) start an async refill task + self.start_refill_task(storage, qid); + } + Err(CursorError::Other(e)) => { + // C) fail miserably + return Poll::Ready(Err(e)); + } + } + } } - (Head::Empty, _) => Ok(QueueItem::None), } + + Poll::Ready(Ok(match &self.stage { + Stage::New { .. } => unreachable!("head must be resolved after advance_if_needed"), + Stage::Running { head: (k, v), .. } => QueueItem::Running { key: k, value: v }, + Stage::Inbox => { + let (k, v) = self + .items + .front() + .expect("inbox cache must have a head after advance_if_needed"); + QueueItem::Inbox { key: k, value: v } + } + Stage::Empty => QueueItem::None, + })) } /// Advances the queue to the next item. /// - /// The queue reader will skip over items in `skip` when reading the inbox stage. When reading - /// the running stage, the `skip` set is ignored. 
- pub fn advance( - &mut self, - storage: &S, - skip: &UnconfirmedAssignments, - qid: &VQueueId, - ) -> Result<(), StorageError> { + /// In the inbox stage this consumes the current head (if any) and exposes + /// the next cached item; the cache is refilled from storage in batches of + /// up to [`INBOX_CACHE_CAPACITY`] when it empties. The `skip` set is + /// consulted only when reading the inbox stage; it is ignored when + /// reading the running stage. + /// + /// + /// Returns false if advance was not possible and we need to perform a refill. + pub fn try_advance(&mut self) -> Result { + // Split into disjoint borrows so the `Stage::Inbox` arm below can + // mutate both `stage` and `inbox_cache` without fighting the + // borrow checker. + let Self { stage, items, .. } = self; loop { - match self.reader { - Reader::New { already_running } if already_running > 0 => { - let mut reader = storage.new_run_reader(qid); + match stage { + Stage::New { + already_running, + reader, + } => { + let mut reader = reader.take().unwrap(); reader.seek_to_first(); - let item = reader.peek()?; - if let Some((key, value)) = item { - self.head = Head::Known { key, value }; - self.reader = Reader::Running { - remaining: already_running, + if let Some((key, value)) = reader.peek()? { + *stage = Stage::Running { + head: (key, value), reader, + remaining: already_running.get(), }; - break; - } else { - assert!( - already_running > 0, - "vqueue {qid:?} has no running items but its metadata says that it has {already_running} running items", - ); - // move to inbox reading - self.head = Head::Unknown; - self.reader = Reader::Closed; + return Ok(true); } + debug_assert!( + false, + "vqueue has no running items but its metadata says it has {}", + already_running.get(), + ); + *stage = Stage::Inbox; } - Reader::New { .. 
} => { - // create new inbox reader - self.reader = Reader::Closed; - } - Reader::Running { - ref mut reader, - ref mut remaining, + Stage::Running { + reader, + remaining, + head, } => { reader.advance(); *remaining = remaining.saturating_sub(1); - let item = reader.peek()?; - if let Some((key, value)) = item { - debug_assert!(*remaining > 0); - self.head = Head::Known { key, value }; - break; - } else { - debug_assert_eq!(0, *remaining); - // move to inbox reading - self.head = Head::Unknown; - self.reader = Reader::Closed; - } - } - Reader::Inbox(ref mut reader) => { - reader.advance(); - let key = reader.current_key()?; - if let Some(key) = key { - if skip.contains_key(&key) { - continue; - } - self.head = Head::Known { - key, - value: reader.current_value()?.unwrap(), - }; - break; - } else { - // we are done reading inbox - self.head = Head::Empty; - self.reader = Reader::Closed; - break; - } - } - Reader::Closed => { - match self.head { - Head::Unknown => { - let mut reader = storage.new_inbox_reader(qid); - reader.seek_to_first(); - let key = reader.current_key()?; - if let Some(key) = key { - if skip.contains_key(&key) { - self.reader = Reader::Inbox(reader); - continue; - } - self.head = Head::Known { - key, - value: reader.current_value()?.unwrap(), - }; - self.reader = Reader::Inbox(reader); - break; - } else { - self.head = Head::Empty; - self.reader = Reader::Closed; - } - } - Head::Known { ref key, .. } => { - // seek to known head first, then advance. - let mut reader = storage.new_inbox_reader(qid); - reader.seek_after(qid, key); - let next_key = reader.current_key()?; - if let Some(next_key) = next_key { - if skip.contains_key(&next_key) { - self.reader = Reader::Inbox(reader); - continue; - } - self.head = Head::Known { - key: next_key, - value: reader.current_value()?.unwrap(), - }; - self.reader = Reader::Inbox(reader); - break; - } else { - self.head = Head::Empty; - self.reader = Reader::Closed; - } + match reader.peek()? 
{ + Some(next) => { + debug_assert!(*remaining > 0); + *head = next; + return Ok(true); } - Head::Empty => { - // do nothing. - return Ok(()); + None => { + debug_assert_eq!(0, *remaining); + *stage = Stage::Inbox; } } } + Stage::Inbox => return Ok(items.pop_front().is_some()), + Stage::Empty => return Ok(true), } } - Ok(()) } - pub(crate) fn remaining_in_running_stage(&self) -> u32 { - match self.reader { - Reader::New { already_running } => already_running, - Reader::Running { remaining, .. } => remaining, - Reader::Inbox(..) => 0, - Reader::Closed => 0, - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - use restate_clock::RoughTimestamp; - use restate_clock::time::MillisSinceEpoch; - use restate_core::TaskCenter; - use restate_partition_store::{PartitionDb, PartitionStore, PartitionStoreManager}; - use restate_rocksdb::RocksDbManager; - use restate_storage_api::Transaction; - use restate_storage_api::vqueue_table::{ - EntryId, EntryKey, EntryKind, EntryMetadata, EntryValue, Stage, Status, WriteVQueueTable, - stats::EntryStatistics, - }; - use restate_types::clock::UniqueTimestamp; - use restate_types::identifiers::{PartitionId, PartitionKey}; - use restate_types::partitions::Partition; - use restate_types::sharding::KeyRange; - use restate_types::vqueues::VQueueId; - - const BASE_RUN_AT_MS: u64 = 1_744_000_000_000; - - fn test_entry_at(id: u8, has_lock: bool, run_at: MillisSinceEpoch) -> (EntryKey, EntryValue) { - let run_at = RoughTimestamp::from_unix_millis_clamped(run_at); - let created_at = UniqueTimestamp::try_from(1_000u64 + id as u64).unwrap(); - let entry_id = EntryId::new(EntryKind::Invocation, [id; EntryId::REMAINDER_LEN]); - let key = EntryKey::new(has_lock, run_at, id as u64, entry_id); - let stats = EntryStatistics::new(created_at, run_at); - let value = EntryValue { - status: if stats.first_runnable_at > created_at.to_unix_millis() { - Status::Scheduled - } else { - Status::New - }, - metadata: EntryMetadata::default(), - stats, + /// 
Refills `cache` with up to [`INBOX_CACHE_CAPACITY`] items from storage if + /// it's possible to do so without blocking the thread. + /// + /// Starting at `cache.refill_anchor` (or the very first key if no anchor is + /// set yet). Returns true if items were added. Items in the `skip` set are + /// not added to the cache, but the anchor advances past them so they are not + /// reconsidered on the next refill. + fn try_refill( + &mut self, + storage: &S, + qid: &VQueueId, + skip: &UnconfirmedAssignments, + allow_blocking_io: bool, + ) -> Result<(), CursorError> { + let start = Instant::now(); + let mut reader = storage.new_inbox_reader(qid, vqueue_table::Options { allow_blocking_io }); + let RefillState::Standby { refill_anchor } = &mut self.refill_state else { + panic!("refill state must be standby"); }; - (key, value) - } - - fn test_entry(id: u8, has_lock: bool, run_at_ms: u64) -> (EntryKey, EntryValue) { - test_entry_at( - id, - has_lock, - MillisSinceEpoch::new(BASE_RUN_AT_MS + run_at_ms), - ) - } - - fn default_entry(id: u8) -> (EntryKey, EntryValue) { - test_entry(id, false, 0) - } - - fn test_qid(partition_key: u64) -> VQueueId { - VQueueId::custom(partition_key, "1") - } - - async fn storage_test_environment() -> PartitionStore { - let rocksdb_manager = RocksDbManager::init(); - TaskCenter::set_on_shutdown(Box::pin(async { - rocksdb_manager.shutdown().await; - })); - - let manager = PartitionStoreManager::create() - .await - .expect("DB storage creation succeeds"); - manager - .open( - &Partition::new(PartitionId::MIN, KeyRange::new(0, PartitionKey::MAX - 1)), - None, - ) - .await - .expect("DB storage creation succeeds") - } - - async fn insert_entries( - rocksdb: &mut PartitionStore, - qid: &VQueueId, - stage: Stage, - entries: &[(EntryKey, EntryValue)], - ) { - let mut txn = rocksdb.transaction(); - for (key, value) in entries { - txn.put_vqueue_inbox(qid, stage, key, value); + match refill_anchor { + Some(anchor) => reader.seek_after(anchor), + None => 
reader.seek_to_first(), } - txn.commit().await.expect("commit should succeed"); - } - #[restate_core::test] - async fn test_queue_running_to_inbox_to_empty() { - let mut rocksdb = storage_test_environment().await; - - let qid = test_qid(1000); - let running_entry = default_entry(1); - let inbox_entry = default_entry(2); - - insert_entries( - &mut rocksdb, - &qid, - Stage::Running, - std::slice::from_ref(&running_entry), - ) - .await; - insert_entries( - &mut rocksdb, - &qid, - Stage::Inbox, - std::slice::from_ref(&inbox_entry), - ) - .await; - - let db = rocksdb.partition_db(); - let mut queue: Queue = Queue::new(1); - let mut skip = UnconfirmedAssignments::new(); - - assert!(!queue.is_empty()); - - let head = queue.advance_if_needed(db, &skip, &qid).unwrap(); - assert!(matches!(head, QueueItem::Running { key, .. } if *key == running_entry.0)); - assert!( - matches!(queue.head(), Some(QueueItem::Running { key, .. }) if *key == running_entry.0) - ); + while self.items.len() < INBOX_CACHE_CAPACITY { + let Some((key, value)) = reader.peek()? else { + if self.items.is_empty() { + self.stage = Stage::Empty; + } + break; + }; - queue.advance(db, &skip, &qid).unwrap(); - assert!( - matches!(queue.head(), Some(QueueItem::Inbox { key, .. }) if *key == inbox_entry.0) + if !skip.contains_key(&key) { + self.items.push_back((key, value)); + } + *refill_anchor = Some(refill_anchor.map_or(key, |a| a.max(key))); + reader.advance(); + } + tracing::debug!( + "non-blocking refill finished in {}, cache has {} items", + start.elapsed().friendly(), + self.items.len() ); - let Some(QueueItem::Inbox { key, .. 
}) = queue.head() else { - panic!("expected inbox head"); - }; - skip.insert(*key, Default::default()); - assert!(!queue.is_empty()); - - queue.advance(db, &skip, &qid).unwrap(); - assert!(queue.is_empty()); - - let higher = default_entry(0); - assert!(queue.enqueue(&higher.0, &higher.1)); - let head = queue.advance_if_needed(db, &skip, &qid).unwrap(); - assert!(matches!(head, QueueItem::Inbox { key, .. } if *key == higher.0)); + Ok(()) } - #[restate_core::test] - async fn test_entry_key_ordering() { - let mut rocksdb = storage_test_environment().await; - - let qid = test_qid(2000); - let low = test_entry(1, false, 3_000); - let high = test_entry(2, false, 2_000); - let highest = test_entry(3, true, 9_000); - - insert_entries( - &mut rocksdb, - &qid, - Stage::Inbox, - &[low.clone(), high.clone(), highest.clone()], - ) - .await; - - let db = rocksdb.partition_db(); - let mut queue: Queue = Queue::new(0); - let skip = UnconfirmedAssignments::new(); - - queue.advance(db, &skip, &qid).unwrap(); - assert!(matches!(queue.head(), Some(QueueItem::Inbox { key, .. }) if *key == highest.0)); - - queue.advance(db, &skip, &qid).unwrap(); - assert!(matches!(queue.head(), Some(QueueItem::Inbox { key, .. }) if *key == high.0)); - - queue.advance(db, &skip, &qid).unwrap(); - assert!(matches!(queue.head(), Some(QueueItem::Inbox { key, .. 
}) if *key == low.0)); - } + fn start_refill_task(&mut self, storage: &S, qid: &VQueueId) { + assert!(!self.refill_state.is_in_flight()); + let RefillState::Standby { refill_anchor } = self.refill_state else { + panic!("refill state must be standby"); + }; - #[restate_core::test] - async fn test_run_at_below_now_bumps_entry_higher_in_inbox() { - let mut rocksdb = storage_test_environment().await; - - let qid = test_qid(2_500); - let now = MillisSinceEpoch::now().as_u64(); - let future_entry = - test_entry_at(1, false, MillisSinceEpoch::new(now.saturating_add(60_000))); - let overdue_entry = - test_entry_at(2, false, MillisSinceEpoch::new(now.saturating_sub(1_000))); - - insert_entries( - &mut rocksdb, - &qid, - Stage::Inbox, - &[future_entry.clone(), overdue_entry.clone()], - ) - .await; - - let db = rocksdb.partition_db(); - let mut queue: Queue = Queue::new(0); - let skip = UnconfirmedAssignments::new(); - - queue.advance(db, &skip, &qid).unwrap(); - assert!( - matches!(queue.head(), Some(QueueItem::Inbox { key, .. }) if *key == overdue_entry.0) + let mut reader = storage.new_inbox_reader( + qid, + vqueue_table::Options { + allow_blocking_io: true, + }, ); - queue.advance(db, &skip, &qid).unwrap(); - assert!( - matches!(queue.head(), Some(QueueItem::Inbox { key, .. 
}) if *key == future_entry.0) - ); - } + let handle = tokio::task::spawn_blocking(move || { + // collect and send the results at the end + let mut results = Vec::with_capacity(INBOX_CACHE_CAPACITY); - #[restate_core::test] - async fn test_enqueue_replaces_head_on_smaller_key() { - let mut rocksdb = storage_test_environment().await; - - let qid = test_qid(3000); - let initial = test_entry(1, false, 3_000); - - insert_entries( - &mut rocksdb, - &qid, - Stage::Inbox, - std::slice::from_ref(&initial), - ) - .await; - - let db = rocksdb.partition_db(); - let mut queue: Queue = Queue::new(0); - let skip = UnconfirmedAssignments::new(); - - queue.advance(db, &skip, &qid).unwrap(); - assert!(matches!(queue.head(), Some(QueueItem::Inbox { key, .. }) if *key == initial.0)); - - let higher = test_entry(2, false, 2_000); - assert!(queue.enqueue(&higher.0, &higher.1)); - assert!(matches!(queue.head(), Some(QueueItem::Inbox { key, .. }) if *key == higher.0)); - assert!( - matches!(queue.advance_if_needed(db, &skip, &qid).unwrap(), QueueItem::Inbox { key, .. } if *key == higher.0) - ); + match refill_anchor { + Some(anchor) => reader.seek_after(&anchor), + None => reader.seek_to_first(), + } - let lower = test_entry(3, false, 4_000); - assert!(!queue.enqueue(&lower.0, &lower.1)); - assert!(matches!(queue.head(), Some(QueueItem::Inbox { key, .. 
}) if *key == higher.0)); + while results.len() < INBOX_CACHE_CAPACITY { + // In this mode, we don't expect to see WouldBlock + match reader.peek() { + Ok(Some((key, value))) => { + results.push((key, value)); + reader.advance(); + } + Ok(None) => { + // no more items + break; + } + Err(CursorError::WouldBlock) => { + unreachable!("background refill task should never see WouldBlock"); + } + Err(CursorError::Other(e)) => { + tracing::error!("refill task failed: {e}"); + return Err(e); + } + } + } + Ok(results) + }); + + let task = Box::new(RefillTask { + started_at: Instant::now(), + refill_anchor, + horizon: None, + overlay: VecDeque::with_capacity(INBOX_CACHE_CAPACITY), + handle, + }); + + self.refill_state = RefillState::InFlight(task); } - #[restate_core::test] - async fn test_remove() { - let mut rocksdb = storage_test_environment().await; - - let qid = test_qid(4000); - let entry = default_entry(1); - - insert_entries( - &mut rocksdb, - &qid, - Stage::Inbox, - std::slice::from_ref(&entry), - ) - .await; - - let db = rocksdb.partition_db(); - let mut queue: Queue = Queue::new(0); - let mut skip = UnconfirmedAssignments::new(); - - queue.advance(db, &skip, &qid).unwrap(); - let head_key = match queue.head() { - Some(QueueItem::Inbox { key, .. }) => *key, - _ => panic!("expected inbox head"), + fn poll_refill_task( + &mut self, + cx: &mut std::task::Context<'_>, + skip: &UnconfirmedAssignments, + ) -> Poll<()> { + let (items, overlay, mut refill_anchor, horizon) = match self.refill_state { + RefillState::Standby { .. 
} => return Poll::Ready(()), + RefillState::InFlight(ref mut task) => { + match ready!(Pin::new(&mut task.handle).poll(cx)) { + Err(join_err) => { + tracing::error!("refill task panicked: {join_err}"); + self.refill_state = RefillState::Standby { + refill_anchor: task.refill_anchor.take(), + }; + return Poll::Ready(()); + } + Ok(Err(err)) => { + tracing::error!("refill task failed: {err}"); + self.refill_state = RefillState::Standby { + refill_anchor: task.refill_anchor.take(), + }; + return Poll::Ready(()); + } + Ok(Ok(result)) => { + tracing::debug!( + "refill task finished with {} items in {}", + result.len(), + task.started_at.elapsed().friendly() + ); + ( + result, + std::mem::take(&mut task.overlay), + task.refill_anchor.take(), + task.horizon.take(), + ) + } + } + } }; + // If we got less items than what we asked, then this must have been the end of the queue + // at the time of the refill. + let end_of_queue = items.len() < INBOX_CACHE_CAPACITY; - assert!(!queue.remove(&default_entry(99).0)); - assert!(queue.remove(&head_key)); - assert!(queue.head().is_none()); - assert!( - matches!(queue.advance_if_needed(db, &skip, &qid).unwrap(), QueueItem::Inbox { key, .. } if *key == entry.0) - ); + // We now need to merge the items we received, apply the overlays, and deduplicate + // with existing entries in cache. + // At the end, we figure out what's the next anchor to use for the next refill. + // + // + // The strategy here is more complex that the blocking/inline version because + // concurrency is hard, who knew! + // We need to do all that while keeping the cache capacity in check. We don't + // want to re-allocate/resize the cache. + // + // We navigate both the overlays and the items in together in semi-lockstep. + // Technically, this is a LSM-style read/compaction algorithm. 
+ for either in items + .into_iter() + .merge_join_by(overlay, |(key, _), (overlay_key, _)| key.cmp(overlay_key)) + { + // First key at-or-above the horizon ends the merge: overlay + // entries are all below it by construction, and the merge + // walks ascending, so the rest is storage that the next + // refill will rediscover via `seek_after`. + let key = match &either { + EitherOrBoth::Left((k, _)) + | EitherOrBoth::Right((k, _)) + | EitherOrBoth::Both((k, _), _) => *k, + }; + if horizon.is_some_and(|h| key >= h) { + break; + } + // Left is the item from db, right is the overlay + match either { + // Somewhat similar to a normal enqueue + EitherOrBoth::Left((key, value)) + | EitherOrBoth::Right((key, Overlay::Add(value))) + // overlay always wins. + | EitherOrBoth::Both((key, _), (_, Overlay::Add(value))) => { + if skip.contains_key(&key) { + // The key was already dispatched, skip it. + refill_anchor = Some(refill_anchor.map_or(key, |a| a.max(key))); + continue; + } + // Insert sorted in cache and ignore it if we already have it. + // If this item pushes us over the cache capcity, then we ignore it and reset + // the refill anchor to it. + let pos = match self.items.binary_search_by_key(&key, |&(k, _)| k) { + Ok(_) => continue, + Err(pos) => pos, + }; + if self.items.len() >= INBOX_CACHE_CAPACITY { + if pos == self.items.len() { + // beyond capacity, ignore the item. Reset the anchor. + refill_anchor = self.items.back().map(|(k, _)| *k); + break; + } + // Make room first; + self.items.pop_back(); + self.items.insert(pos, (key, value)); + refill_anchor = self.items.back().map(|(k, _)| *k); + break; + } - skip.insert(entry.0, Default::default()); - assert!(queue.remove(&head_key)); - assert!(queue.head().is_none()); - assert!(matches!( - queue.advance_if_needed(db, &skip, &qid).unwrap(), - QueueItem::None - )); - } + // Normal path: cache below capacity. 
+ self.items.insert(pos, (key, value)); + refill_anchor = Some(key); + } + EitherOrBoth::Right((key, Overlay::Tombstone)) + // overlay always wins. + | EitherOrBoth::Both((key, _), (_, Overlay::Tombstone)) => { + // In theory, we should never see a tombstone that impacts the existing + // cache. + debug_assert!(self.items.binary_search_by_key(&key, |&(k, _)| k).is_err()); + // we should push the anchor to this item. + refill_anchor = Some(key); + } + } + } - #[restate_core::test] - async fn test_skip_set() { - let mut rocksdb = storage_test_environment().await; - - let qid = test_qid(5000); - let entry1 = default_entry(1); - let entry2 = default_entry(2); - let entry3 = default_entry(3); - - insert_entries( - &mut rocksdb, - &qid, - Stage::Inbox, - &[entry1.clone(), entry2.clone(), entry3.clone()], - ) - .await; - - let db = rocksdb.partition_db(); - let mut queue: Queue = Queue::new(0); - let mut skip = UnconfirmedAssignments::new(); - skip.insert(entry1.0, Default::default()); - skip.insert(entry2.0, Default::default()); - - queue.advance(db, &skip, &qid).unwrap(); - assert!(matches!(queue.head(), Some(QueueItem::Inbox { key, .. 
}) if *key == entry3.0)); - } + // It's very important is that we must reset the task to standby + self.refill_state = RefillState::Standby { + refill_anchor: refill_anchor.or(self.items.back().map(|(k, _)| *k)), + }; - #[restate_core::test] - async fn test_running_before_inbox_regardless_of_key() { - let mut rocksdb = storage_test_environment().await; - - let qid = test_qid(6000); - let running_entry = default_entry(1); - let inbox_entry = test_entry(2, true, 0); - - insert_entries( - &mut rocksdb, - &qid, - Stage::Running, - std::slice::from_ref(&running_entry), - ) - .await; - insert_entries( - &mut rocksdb, - &qid, - Stage::Inbox, - std::slice::from_ref(&inbox_entry), - ) - .await; - - let db = rocksdb.partition_db(); - let mut queue: Queue = Queue::new(1); - let skip = UnconfirmedAssignments::new(); - - queue.advance(db, &skip, &qid).unwrap(); - assert!(matches!(queue.head(), Some(QueueItem::Running { .. }))); - - queue.advance(db, &skip, &qid).unwrap(); - assert!(matches!(queue.head(), Some(QueueItem::Inbox { .. }))); - } + // at the end, if the cache is empty and end_of_queue is true, then we must + // have exhausted the inbox. + if self.items.is_empty() && end_of_queue { + self.stage = Stage::Empty; + } - #[restate_core::test] - async fn test_enqueue_and_remove_ignored_during_running_stage() { - let mut rocksdb = storage_test_environment().await; - - let qid = test_qid(7000); - let running1 = default_entry(1); - let running2 = default_entry(2); - let inbox_entry = test_entry(10, true, 0); - - insert_entries( - &mut rocksdb, - &qid, - Stage::Running, - &[running1.clone(), running2.clone()], - ) - .await; - insert_entries( - &mut rocksdb, - &qid, - Stage::Inbox, - std::slice::from_ref(&inbox_entry), - ) - .await; - - let db = rocksdb.partition_db(); - let mut queue: Queue = Queue::new(2); - let skip = UnconfirmedAssignments::new(); - - queue.advance(db, &skip, &qid).unwrap(); - assert!(matches!(queue.head(), Some(QueueItem::Running { key, .. 
}) if *key == running1.0)); - - let even_higher = test_entry(11, true, 0); - assert!(!queue.enqueue(&even_higher.0, &even_higher.1)); - assert!(matches!(queue.head(), Some(QueueItem::Running { key, .. }) if *key == running1.0)); - - assert!(!queue.remove(&running2.0)); - assert!(matches!(queue.head(), Some(QueueItem::Running { key, .. }) if *key == running1.0)); - - assert!(!queue.remove(&running1.0)); - assert!(matches!(queue.head(), Some(QueueItem::Running { key, .. }) if *key == running1.0)); - - queue.advance(db, &skip, &qid).unwrap(); - assert!(matches!(queue.head(), Some(QueueItem::Running { key, .. }) if *key == running2.0)); - - queue.advance(db, &skip, &qid).unwrap(); - assert!( - matches!(queue.head(), Some(QueueItem::Inbox { key, .. }) if *key == inbox_entry.0) - ); + Poll::Ready(()) + } - assert!(queue.remove(&inbox_entry.0)); - assert!(queue.head().is_none()); + pub(crate) fn remaining_in_running_stage(&self) -> u32 { + match &self.stage { + Stage::New { + already_running, .. + } => already_running.get(), + Stage::Running { remaining, .. } => *remaining, + Stage::Inbox | Stage::Empty => 0, + } } } + +#[cfg(test)] +#[path = "queue_test.rs"] +mod queue_test; diff --git a/crates/vqueues/src/scheduler/queue_meta.rs b/crates/vqueues/src/scheduler/queue_meta.rs deleted file mode 100644 index 712773166a..0000000000 --- a/crates/vqueues/src/scheduler/queue_meta.rs +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright (c) 2023 - 2026 Restate Software, Inc., Restate GmbH. -// All rights reserved. -// -// Use of this software is governed by the Business Source License -// included in the LICENSE file. -// -// As of the Change Date specified in that file, in accordance with -// the Business Source License, use of this software will be governed -// by the Apache License, Version 2.0. 
- -use metrics::counter; - -use restate_limiter::LimitKey; -use restate_storage_api::vqueue_table::metadata::VQueueMeta; -use restate_storage_api::vqueue_table::{EntryKey, EntryValue}; -use restate_types::{LockName, Scope}; -use restate_util_string::ReString; - -use crate::metric_definitions::VQUEUE_ENQUEUE; - -/// A smaller version of the vqueue metadata that only contains the information needed for the -/// scheduler to operate. -#[derive(Debug, Clone)] -pub struct VQueueMetaLite { - /// if true, the vqueue is paused, we don't pop entries from it until it's resumed. - queue_is_paused: bool, - - /// the length of the inbox stage in the vqueue - num_waiting: usize, - scope: Option, - limit_key: LimitKey, - lock_name: Option, -} - -impl VQueueMetaLite { - pub fn new(original: &VQueueMeta) -> Self { - Self { - queue_is_paused: original.queue_is_paused(), - num_waiting: original.total_waiting() as usize, - scope: original.scope().clone(), - limit_key: original.limit_key().clone(), - lock_name: original.lock_name().cloned(), - } - } - - pub const fn new_empty( - scope: Option, - limit_key: LimitKey, - lock_name: Option, - ) -> Self { - Self { - queue_is_paused: false, - num_waiting: 0, - scope, - limit_key, - lock_name, - } - } - - pub fn inbox_len(&self) -> usize { - self.num_waiting - } - - pub fn is_inbox_empty(&self) -> bool { - self.num_waiting == 0 - } - - pub fn is_queue_paused(&self) -> bool { - self.queue_is_paused - } - - pub fn scope(&self) -> &Option { - &self.scope - } - - pub fn lock_name(&self) -> Option<&LockName> { - match self.lock_name { - Some(ref lock_name) => Some(lock_name), - _ => None, - } - } - - // pub fn service_name(&self) -> Option<&ServiceName> { - // match self.link { - // VQueueLink::Service(ref service_name) => Some(service_name), - // VQueueLink::Lock(ref lock) => Some(lock.service_name()), - // _ => None, - // } - // } - - pub fn limit_key(&self) -> &LimitKey { - &self.limit_key - } - - pub fn apply_update(&mut self, update: 
&MetaLiteUpdate) { - match update { - MetaLiteUpdate::QueuePaused => self.queue_is_paused = true, - MetaLiteUpdate::QueueResumed { num_waiting } => { - self.queue_is_paused = false; - self.num_waiting = *num_waiting as usize; - } - MetaLiteUpdate::EnqueuedToInbox { .. } => { - counter!(VQUEUE_ENQUEUE).increment(1); - self.num_waiting += 1; - } - MetaLiteUpdate::RemovedFromInbox(_) => { - self.num_waiting = self.num_waiting.saturating_sub(1) - } - } - } -} - -#[derive(Debug)] -pub enum MetaLiteUpdate { - QueuePaused, - QueueResumed { num_waiting: u64 }, - RemovedFromInbox(EntryKey), - EnqueuedToInbox { key: EntryKey, value: EntryValue }, -} diff --git a/crates/vqueues/src/scheduler/queue_test.rs b/crates/vqueues/src/scheduler/queue_test.rs new file mode 100644 index 0000000000..38551902a7 --- /dev/null +++ b/crates/vqueues/src/scheduler/queue_test.rs @@ -0,0 +1,403 @@ +// Copyright (c) 2023 - 2026 Restate Software, Inc., Restate GmbH. +// All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Test harness for the vqueue refill machinery. +//! +//! These tests use a fake [`VQueueStore`] (`GatedStore`) that lets the +//! caller freeze the snapshot a refill task sees and interleave +//! `notify_enqueued` / `notify_removed` events deterministically against +//! the in-flight refill thread. That's what makes overlay and cancellation +//! behaviour testable without a real RocksDB. +//! +//! Each [`GatedReader`] is created with `Options::allow_blocking_io` set +//! by the queue itself: +//! - The first attempt (`try_refill`) passes `allow_blocking_io: false`; +//! the fake responds with [`CursorError::WouldBlock`] which drives the +//! queue down the async refill path. +//! 
- The async path then constructs a second reader with +//! `allow_blocking_io: true`; that reader parks on its first `peek` +//! (signalling the test that the snapshot is frozen) and resumes once +//! the test releases the latch. + +use std::sync::{Arc, Condvar, Mutex}; +use std::task::{Context, Poll, Waker}; + +use restate_clock::RoughTimestamp; +use restate_clock::time::MillisSinceEpoch; +use restate_storage_api::StorageError; +use restate_storage_api::vqueue_table::stats::EntryStatistics; +use restate_storage_api::vqueue_table::{ + CursorError, EntryId, EntryKey, EntryKind, EntryMetadata, EntryValue, Options, Status, + VQueueCursor, VQueueRunningCursor, VQueueStore, +}; +use restate_types::clock::UniqueTimestamp; +use restate_types::vqueues::VQueueId; + +use super::*; + +// ---------- helpers ---------- + +const BASE_RUN_AT_MS: u64 = 1_744_000_000_000; + +/// Builds an entry whose sort position is determined entirely by `seq` +/// (other `EntryKey` components are fixed). +fn entry_at_seq(seq: u64) -> (EntryKey, EntryValue) { + let run_at = RoughTimestamp::from_unix_millis_clamped(MillisSinceEpoch::new(BASE_RUN_AT_MS)); + let created_at = UniqueTimestamp::try_from(1_000u64 + seq).unwrap(); + let entry_id = EntryId::new(EntryKind::Invocation, [0u8; EntryId::REMAINDER_LEN]); + let key = EntryKey::new(false, run_at, seq, entry_id); + let stats = EntryStatistics::new(created_at, run_at); + let value = EntryValue { + status: Status::New, + metadata: EntryMetadata::default(), + stats, + }; + (key, value) +} + +fn test_qid(partition_key: u64) -> VQueueId { + VQueueId::custom(partition_key, "1") +} + +/// Polls the queue once and asserts it returned `Pending`. Used to kick +/// off an in-flight refill task so the test can interleave events while +/// the refill thread is parked at its gate. 
+fn poll_once_expect_pending( + queue: &mut Queue, + storage: &S, + skip: &UnconfirmedAssignments, + qid: &VQueueId, +) { + let mut cx = Context::from_waker(Waker::noop()); + match queue.poll_advance_if_needed(&mut cx, storage, skip, qid, false, false) { + Poll::Pending => {} + Poll::Ready(Ok(item)) => panic!("expected Pending, got Ready({item:?})"), + Poll::Ready(Err(e)) => panic!("expected Pending, got error: {e:?}"), + } +} + +/// Drives `poll_advance_if_needed` until it returns `Ready`. Yields between +/// Pending iterations so the `spawn_blocking` refill task can make progress. +async fn drive_until_ready( + queue: &mut Queue, + storage: &S, + skip: &UnconfirmedAssignments, + qid: &VQueueId, +) { + let mut cx = Context::from_waker(Waker::noop()); + loop { + match queue.poll_advance_if_needed(&mut cx, storage, skip, qid, false, false) { + Poll::Ready(Ok(_)) => return, + Poll::Ready(Err(e)) => panic!("queue error: {e:?}"), + Poll::Pending => tokio::time::sleep(std::time::Duration::from_millis(1)).await, + } + } +} + +/// Pops everything currently in the cache (no refill, no I/O) and returns +/// the keys in head-to-tail order. +fn drain_cache(queue: &mut Queue) -> Vec { + let mut keys = vec![]; + while let Some(QueueItem::Inbox { key, .. }) = queue.head() { + keys.push(*key); + queue.try_advance().unwrap(); + } + keys +} + +// ---------- fake VQueueStore ---------- + +/// In-memory storage with hooks to gate the refill thread at a precise +/// moment, so tests can deterministically interleave queue events against +/// an in-flight refill. +struct GatedStore { + /// Source of truth for what `new_inbox_reader` will snapshot. Sorted + /// ascending by `EntryKey`. + entries: Mutex>, + /// Latch released by the test when it wants the parked refill thread + /// to proceed past its first `peek`. + release: Arc<(Mutex, Condvar)>, + /// Signalled by the reader the moment it parks. 
The test waits on + /// this to know the snapshot is frozen and queue events can now be + /// safely interleaved. + parked: Arc<(Mutex, Condvar)>, +} + +impl GatedStore { + fn new(entries: Vec<(EntryKey, EntryValue)>) -> Self { + Self { + entries: Mutex::new(entries), + release: Arc::new((Mutex::new(false), Condvar::new())), + parked: Arc::new((Mutex::new(false), Condvar::new())), + } + } + + fn release_refill_thread(&self) { + let (lock, cv) = &*self.release; + *lock.lock().unwrap() = true; + cv.notify_all(); + } + + fn wait_until_parked(&self) { + let (lock, cv) = &*self.parked; + let mut parked = lock.lock().unwrap(); + while !*parked { + parked = cv.wait(parked).unwrap(); + } + } +} + +struct GatedReader { + /// Frozen at construction time (matches RocksDB snapshot semantics). + /// Sorted ascending by `EntryKey`. + snapshot: Vec<(EntryKey, EntryValue)>, + cursor: usize, + /// `true` for blocking readers (those constructed via the async refill + /// path with `allow_blocking_io: true`); they park at the gate on the + /// first `peek` so the test can pin the snapshot moment. Non-blocking + /// readers always return `WouldBlock` instead, driving the queue down + /// the async refill path. + blocking: bool, + release: Arc<(Mutex, Condvar)>, + parked: Arc<(Mutex, Condvar)>, + /// Whether this reader has parked yet. Each reader parks at most + /// once, on its first peek/key/value call. + has_parked: bool, +} + +impl GatedReader { + fn block_until_released(&mut self) { + if self.has_parked { + return; + } + self.has_parked = true; + // Tell the test we're parked. + let (lock, cv) = &*self.parked; + *lock.lock().unwrap() = true; + cv.notify_all(); + // Wait for the test to release us. 
+ let (lock, cv) = &*self.release; + let mut released = lock.lock().unwrap(); + while !*released { + released = cv.wait(released).unwrap(); + } + } +} + +impl VQueueCursor for GatedReader { + fn seek_to_first(&mut self) { + self.cursor = 0; + } + + fn seek_after(&mut self, anchor: &EntryKey) { + self.cursor = self + .snapshot + .iter() + .position(|(k, _)| k > anchor) + .unwrap_or(self.snapshot.len()); + } + + fn advance(&mut self) { + if self.cursor < self.snapshot.len() { + self.cursor += 1; + } + } + + fn peek(&mut self) -> Result, CursorError> { + if !self.blocking { + return Err(CursorError::WouldBlock); + } + self.block_until_released(); + Ok(self.snapshot.get(self.cursor).cloned()) + } +} + +/// Stub for the running stage. Tests construct queues with +/// `num_running = 0`, so this is never actually exercised; it only exists +/// to satisfy the [`VQueueStore`] trait bound. +struct StubRunningReader; + +impl VQueueRunningCursor for StubRunningReader { + fn seek_to_first(&mut self) {} + fn peek(&mut self) -> Result, StorageError> { + Ok(None) + } + fn advance(&mut self) {} +} + +impl VQueueStore for GatedStore { + type RunningReader = StubRunningReader; + type InboxReader = GatedReader; + + fn new_run_reader(&self, _qid: &VQueueId) -> Self::RunningReader { + StubRunningReader + } + + fn new_inbox_reader(&self, _qid: &VQueueId, opts: Options) -> Self::InboxReader { + // Snapshot freezes here. + let snapshot = self.entries.lock().unwrap().clone(); + GatedReader { + snapshot, + cursor: 0, + blocking: opts.allow_blocking_io, + release: self.release.clone(), + parked: self.parked.clone(), + has_parked: false, + } + } +} + +// ---------- tests ---------- + +/// Sanity: a single in-flight refill with no concurrent events surfaces +/// every storage row in the cache, in `EntryKey` order. 
+#[restate_core::test] +async fn refill_without_overlay_activity() { + let entries: Vec<_> = (1..=5).map(entry_at_seq).collect(); + let storage = GatedStore::new(entries.clone()); + let qid = test_qid(1); + let mut queue: Queue = Queue::new(0, &storage, &qid); + let skip = UnconfirmedAssignments::new(); + + poll_once_expect_pending(&mut queue, &storage, &skip, &qid); + storage.wait_until_parked(); + storage.release_refill_thread(); + drive_until_ready(&mut queue, &storage, &skip, &qid).await; + + let drained = drain_cache(&mut queue); + assert_eq!( + drained, + entries.iter().map(|(k, _)| *k).collect::>(), + "cache should contain every storage row in order", + ); +} + +/// Tombstones in the overlay correctly suppress the matching storage row +/// during merge — no overflow happens here, so the overlay's information +/// is fully preserved. This is the "happy path" for the +/// commit-before-notify invariant. +#[restate_core::test] +async fn tombstone_in_overlay_suppresses_storage_row() { + let r_target = entry_at_seq(100); + let storage = GatedStore::new(vec![r_target.clone()]); + let qid = test_qid(2); + let mut queue: Queue = Queue::new(0, &storage, &qid); + let skip = UnconfirmedAssignments::new(); + + poll_once_expect_pending(&mut queue, &storage, &skip, &qid); + storage.wait_until_parked(); + // Storage commit (modelled here as the snapshot being frozen with + // r_target visible) lands first; the notify_removed lands second. + queue.remove(&r_target.0); + storage.release_refill_thread(); + drive_until_ready(&mut queue, &storage, &skip, &qid).await; + + let drained = drain_cache(&mut queue); + assert!( + !drained.contains(&r_target.0), + "deleted row should be suppressed by overlay tombstone, got: {drained:?}", + ); +} + +/// **Bug demonstration.** When the overlay is at capacity and the back +/// entry is a tombstone, `push_added_item`'s `pop_back` silently drops +/// that tombstone. The merge then lets the (already-deleted) row into +/// the cache. 
+/// +/// Layout at the moment of overflow: +/// +/// ```text +/// overlay[0] = Tombstone(seq=50) // pre-tombstone +/// overlay[1] = Tombstone(seq=100) // pre-tombstone +/// overlay[2..CAPACITY-1] = Add(seq=150..) // CAPACITY-3 adds +/// overlay[CAPACITY-1] = Tombstone(seq=500) // *r_target* +/// ``` +/// +/// The trigger add (seq=400) sorts at `pos = CAPACITY-1`, which is +/// `< overlay.len() == CAPACITY`, so `push_added_item` evicts the back +/// (`Tombstone(500)`) and inserts the trigger. After this, the overlay +/// holds only `[T(50), T(100), Add(150..), Add(400)]` — the tombstone +/// for `r_target` is gone. +/// +/// Storage is `[r_target=seq500]` (a single row that's been deleted but +/// is still in the in-flight task's snapshot, faithful to the race the +/// invariant doc on `RefillTask` describes). The merge produces: +/// +/// ```text +/// [T(50), T(100), Add(150..), Add(400), Left(seq500)] +/// ``` +/// +/// With the tombstone evicted there's nothing to suppress `Left(seq500)`, +/// so the merge inserts it into the cache. The drain then sees `seq500`, +/// which is the bug. +#[restate_core::test] +async fn tombstone_evicted_on_overlay_overflow_leaks_deleted_row() { + // Overlay layout requires CAPACITY >= 4 (2 front tombstones + at + // least one add + 1 back tombstone). Bail loudly if someone shrinks + // the cache below that. + const { assert!(INBOX_CACHE_CAPACITY >= 4) }; + + // Front-anchor tombstones: low seqs so they sort to the front of the + // overlay and don't influence the eviction we want to trigger. + let pre_tombstone_a = entry_at_seq(50); + let pre_tombstone_b = entry_at_seq(100); + + // The deleted row that the in-flight task's snapshot still sees. + let r_target = entry_at_seq(500); + + // Adds that fit between the front tombstones and r_target's + // tombstone. Together with the three tombstones the overlay reaches + // exactly CAPACITY. We use a contiguous seq range starting at 150. 
+ let n_adds = INBOX_CACHE_CAPACITY - 3; + let pre_adds: Vec<_> = (150..150 + n_adds as u64).map(entry_at_seq).collect(); + + // The trigger: an add that sorts BEFORE r_target's tombstone but + // after every other overlay entry. With `pos < overlay.len()` and + // the overlay at cap, `push_added_item` will `pop_back` — evicting + // `Tombstone(r_target)`. + let trigger_add = entry_at_seq(400); + + let storage = GatedStore::new(vec![r_target.clone()]); + let qid = test_qid(3); + let mut queue: Queue = Queue::new(0, &storage, &qid); + let skip = UnconfirmedAssignments::new(); + + // Kick off the in-flight refill so subsequent enqueue/remove events + // route through the overlay rather than the cache. + poll_once_expect_pending(&mut queue, &storage, &skip, &qid); + storage.wait_until_parked(); + + // Three tombstones; only Tombstone(r_target) matters for the bug. + queue.remove(&pre_tombstone_a.0); + queue.remove(&pre_tombstone_b.0); + queue.remove(&r_target.0); + + // Fill the overlay up to CAPACITY with the front-of-r_target adds. + for (k, v) in &pre_adds { + queue.enqueue(k, v); + } + + // Trigger the eviction. After this, Tombstone(r_target) is gone. + queue.enqueue(&trigger_add.0, &trigger_add.1); + + // Release the refill thread; it returns the snapshot ([r_target]). + storage.release_refill_thread(); + drive_until_ready(&mut queue, &storage, &skip, &qid).await; + + let drained = drain_cache(&mut queue); + + // Bug: r_target shows up in the cache even though we issued + // `notify_removed` for it before the merge ran. The matching + // tombstone was dropped on overlay overflow. 
+ assert!( + !drained.contains(&r_target.0), + "deleted row leaked into cache: {drained:?}", + ); +} diff --git a/crates/vqueues/src/scheduler/resource_manager.rs b/crates/vqueues/src/scheduler/resource_manager.rs index 8a430d8794..a99e8d8aa6 100644 --- a/crates/vqueues/src/scheduler/resource_manager.rs +++ b/crates/vqueues/src/scheduler/resource_manager.rs @@ -28,6 +28,7 @@ use restate_limiter::RuleHandle; use restate_memory::{MemoryPool, NonZeroByteCount}; use restate_storage_api::StorageError; use restate_storage_api::lock_table::LoadLocks; +use restate_storage_api::vqueue_table::metadata::VQueueMeta; use restate_storage_api::vqueue_table::{EntryKey, EntryMetadata}; use restate_types::identifiers::PartitionKey; use restate_types::vqueues::EntryKind; @@ -44,7 +45,6 @@ use self::permit::ProvisionalPermit; use self::user_limiter::UserLimiter; use super::VQueueHandle; use super::eligible::EligibilityTracker; -use super::queue_meta::VQueueMetaLite; use crate::GlobalTokenBucket; // A set of queues waiting on a resource @@ -177,7 +177,7 @@ impl ResourceManager { &mut self, cx: &mut std::task::Context<'_>, vqueue: VQueueHandle, - meta: &VQueueMetaLite, + meta: &VQueueMeta, key: &EntryKey, _metadata: &EntryMetadata, current_permit: &mut PermitBuilder, diff --git a/crates/vqueues/src/scheduler/vqueue_state.rs b/crates/vqueues/src/scheduler/vqueue_state.rs index b83fc73696..bcb3406224 100644 --- a/crates/vqueues/src/scheduler/vqueue_state.rs +++ b/crates/vqueues/src/scheduler/vqueue_state.rs @@ -8,18 +8,21 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. 
+use std::task::{Poll, ready}; use std::time::Duration; use enum_map::{Enum, EnumMap}; use metrics::counter; +use restate_storage_api::StorageError; +use restate_storage_api::vqueue_table::metadata::VQueueMeta; use tokio::time::Instant; use restate_clock::RoughTimestamp; -use restate_storage_api::StorageError; use restate_storage_api::vqueue_table::{EntryKey, EntryValue, VQueueStore, stats::WaitStats}; use restate_types::vqueues::{EntryId, VQueueId}; use restate_worker_api::ResourceKind; +use crate::cache::{self, VQueueHandle}; use crate::metric_definitions::{ VQUEUE_CONCURRENCY_RULES_WAIT_MS, VQUEUE_DEPLOYMENT_CONCURRENCY_WAIT_MS, VQUEUE_INVOKER_CONCURRENCY_WAIT_MS, VQUEUE_INVOKER_MEMORY_WAIT_MS, @@ -29,10 +32,8 @@ use crate::scheduler::queue::QueueItem; use super::clock::SchedulerClock; use super::queue::Queue; -use super::queue_meta::MetaLiteUpdate; -pub use super::queue_meta::VQueueMetaLite; use super::resource_manager::{AcquireOutcome, PermitBuilder}; -use super::{ResourceManager, RunAction, UnconfirmedAssignments, VQueueHandle, YieldAction}; +use super::{ResourceManager, RunAction, UnconfirmedAssignments, YieldAction}; const QUANTUM: i32 = 1; @@ -210,9 +211,6 @@ impl Stats { #[derive(derive_more::Debug)] pub struct VQueueState { - pub handle: VQueueHandle, - pub qid: VQueueId, - meta: VQueueMetaLite, deficit: i32, #[debug(skip)] unconfirmed_assignments: UnconfirmedAssignments, @@ -224,17 +222,9 @@ pub struct VQueueState { } impl VQueueState { - pub fn new( - qid: VQueueId, - handle: VQueueHandle, - meta: VQueueMetaLite, - num_running: u32, - ) -> Self { - let queue = Queue::new(num_running); + pub fn new(qid: &VQueueId, storage: &S, num_running: u32) -> Self { + let queue = Queue::new(num_running, storage, qid); Self { - handle, - qid, - meta, deficit: QUANTUM, unconfirmed_assignments: UnconfirmedAssignments::new(), queue, @@ -243,15 +233,8 @@ impl VQueueState { } } - pub fn apply_meta_update(&mut self, update: &MetaLiteUpdate) { - 
self.meta.apply_update(update) - } - - pub fn new_empty(qid: VQueueId, handle: VQueueHandle, meta: VQueueMetaLite) -> Self { + pub fn new_empty() -> Self { Self { - qid, - handle, - meta, unconfirmed_assignments: UnconfirmedAssignments::new(), queue: Queue::new_closed(), deficit: 0, @@ -263,7 +246,8 @@ impl VQueueState { pub fn try_pop( &mut self, cx: &mut std::task::Context<'_>, - storage: &S, + handle: VQueueHandle, + slot: &cache::Slot, resources: &mut ResourceManager, ) -> Result { let (inbox_head_key, inbox_head_value, is_running) = match self.queue.head() { @@ -271,8 +255,8 @@ impl VQueueState { Some(QueueItem::Running { key, value }) => (key, value, true), e @ Some(QueueItem::None) | e @ None => { unreachable!( - "cannot pop from empty queue or attempted to pop before polling {:?}/{e:?}", - self.handle + "cannot pop from empty queue or attempted to pop before polling {}/{e:?}", + slot.vqueue_id() ) } }; @@ -291,15 +275,14 @@ impl VQueueState { key: *inbox_head_key, next_run_at: None, }; - self.queue - .advance(storage, &self.unconfirmed_assignments, &self.qid)?; + self.queue.try_advance()?; return Ok(Pop::Yield(action)); } match resources.poll_acquire_permit( cx, - self.handle, - &self.meta, + handle, + slot.meta(), inbox_head_key, &inbox_head_value.metadata, &mut self.current_permit, @@ -313,8 +296,7 @@ impl VQueueState { wait_stats: self.head_stats.finalize(), }; - self.queue - .advance(storage, &self.unconfirmed_assignments, &self.qid)?; + self.queue.try_advance()?; Ok(Pop::Run(action)) } AcquireOutcome::BlockedOn(resource) => { @@ -327,22 +309,33 @@ impl VQueueState { } } - pub fn is_dormant(&self) -> bool { + pub fn is_dormant(&self, meta: &VQueueMeta) -> bool { // We hold on to the vqueue until we confirm/reject all pending assignments. If we didn't // do so, we risk revisiting/redequeuing the unconfirmed items if the vqueue popped back to life // (i.e., on enqueue). 
This is the reason why we check for `unconfirmed_assignments` - (self.queue.is_empty() || self.meta.is_inbox_empty() || self.meta.is_queue_paused()) + (self.queue.is_empty() || meta.is_inbox_empty() || meta.queue_is_paused()) && self.unconfirmed_assignments.is_empty() } - pub fn poll_eligibility(&mut self, storage: &S) -> Result { - self.queue - .advance_if_needed(storage, &self.unconfirmed_assignments, &self.qid)?; - - Ok(self.check_eligibility()) + pub fn poll_eligibility( + &mut self, + cx: &mut std::task::Context<'_>, + slot: &cache::Slot, + storage: &S, + ) -> Poll> { + ready!(self.queue.poll_advance_if_needed( + cx, + storage, + &self.unconfirmed_assignments, + slot.vqueue_id(), + self.num_waiting_inbox(slot.meta()) == 0, + false, + ))?; + + Poll::Ready(Ok(self.check_eligibility(slot.meta()).as_compact())) } - pub fn check_eligibility(&self) -> DetailedEligibility { + pub fn check_eligibility(&self, meta: &VQueueMeta) -> DetailedEligibility { let inbox_head_key = match self.queue.head() { Some(QueueItem::Running { .. }) => return DetailedEligibility::EligibleRunning, Some(QueueItem::Inbox { key, .. 
}) => key, @@ -350,7 +343,7 @@ impl VQueueState { None if self.queue.remaining_in_running_stage() > 0 => { return DetailedEligibility::EligibleRunning; } - None if self.meta.inbox_len() > 0 => return DetailedEligibility::EligibleInbox, + None if meta.total_waiting() > 0 => return DetailedEligibility::EligibleInbox, None => return DetailedEligibility::Empty, }; @@ -408,10 +401,9 @@ impl VQueueState { /// Takes into account the number of unconfirmed assignments when calculating /// the inbox length - pub fn num_waiting_inbox(&self) -> u64 { - self.meta - .inbox_len() - .saturating_sub(self.unconfirmed_assignments.len()) as u64 + pub fn num_waiting_inbox(&self, meta: &VQueueMeta) -> u64 { + meta.total_waiting() + .saturating_sub(self.unconfirmed_assignments.len() as u64) } } diff --git a/crates/worker/src/partition/leadership/leader_state.rs b/crates/worker/src/partition/leadership/leader_state.rs index c26110b0f6..3d1eeb9c6f 100644 --- a/crates/worker/src/partition/leadership/leader_state.rs +++ b/crates/worker/src/partition/leadership/leader_state.rs @@ -42,9 +42,9 @@ use restate_types::net::partition_processor::{ }; use restate_types::sharding::KeyRange; use restate_types::{RESTATE_VERSION_1_7_0, SemanticRestateVersion, Version, Versioned, vqueues}; -use restate_vqueues::SchedulerService; use restate_vqueues::VQueueEvent; use restate_vqueues::scheduler::Decisions; +use restate_vqueues::{SchedulerService, VQueuesMeta}; use restate_wal_protocol::Command; use restate_wal_protocol::control::UpsertSchema; use restate_worker_api::invoker::InvokerHandle; @@ -143,9 +143,13 @@ impl LeaderState { } } - pub fn read_scheduler_status(&self, keys: KeyRange) -> Vec { + pub fn read_scheduler_status( + &self, + metas: VQueuesMeta<'_>, + keys: KeyRange, + ) -> Vec { self.scheduler - .iter_status() + .iter_status(metas) .into_iter() .flatten() .filter(|(qid, _)| keys.contains(&qid.partition_key())) @@ -164,7 +168,11 @@ impl LeaderState { /// /// Important: The future needs to be 
cancellation safe since it is polled as a tokio::select /// arm! - pub async fn run(&mut self, state_machine: &StateMachine) -> Result, Error> { + pub async fn run( + &mut self, + state_machine: &StateMachine, + vqueue_metas: VQueuesMeta<'_>, + ) -> Result, Error> { let timer_stream = std::pin::pin!(stream::unfold( &mut self.timer_service, |timer_service| async { @@ -177,7 +185,7 @@ impl LeaderState { // if we have problems with latency let scheduler_stream = std::pin::pin!(stream::unfold(&mut self.scheduler, |scheduler| async { - match scheduler.schedule_next().await { + match scheduler.schedule_next(vqueue_metas).await { Ok(decisions) => Some((ActionEffect::Scheduler(decisions), scheduler)), Err(e) => { error!("Fatal error when polling scheduler: {e}"); @@ -518,7 +526,11 @@ impl LeaderState { } } - pub fn handle_actions(&mut self, actions: impl Iterator) -> Result<(), Error> { + pub fn handle_actions( + &mut self, + vqueues: VQueuesMeta<'_>, + actions: impl Iterator, + ) -> Result<(), Error> { for action in actions { let action_name = action.name(); @@ -532,13 +544,13 @@ impl LeaderState { ) .increment(1); - self.handle_action(action)?; + self.handle_action(vqueues, action)?; } Ok(()) } - fn handle_action(&mut self, action: Action) -> Result<(), Error> { + fn handle_action(&mut self, metas: VQueuesMeta<'_>, action: Action) -> Result<(), Error> { match action { Action::Invoke { invocation_id, @@ -684,25 +696,25 @@ impl LeaderState { } } Action::VQEvent(inbox_event) => { - self.handle_vqueue_inbox_event(inbox_event); + self.handle_vqueue_inbox_event(metas, inbox_event); } Action::VQInvoke { - qid, + vq_handle, key, invocation_target, - limit_key, idempotency_key, } => { + let slot = metas.get(vq_handle).expect("vqueue meta must be in cache"); // state mutations should not create Invoke actions. At least for now. 
assert!(matches!(key.kind(), vqueues::EntryKind::Invocation)); let invocation_id = key .entry_id() - .to_invocation_id(qid.partition_key()) + .to_invocation_id(slot.partition_key()) .unwrap(); - let run_permit = self.scheduler.confirm_run_attempt(&qid, &key).unwrap_or_else(|| { + let run_permit = self.scheduler.confirm_run_attempt(vq_handle, slot, &key).unwrap_or_else(|| { tracing::error!( - vqueue = %qid, + vqueue = %slot.vqueue_id(), restate.invocation.id = %invocation_id, "Cannot find a permit for entry key {key:?} in scheduler. Will not respect the invoker limit for this invocation" ); @@ -710,11 +722,11 @@ impl LeaderState { }); self.invoker_handle .vqueue_invoke( - qid, + slot.vqueue_id().clone(), run_permit, invocation_id, invocation_target, - limit_key, + slot.meta().limit_key().clone(), idempotency_key, ) .map_err(Error::Invoker)? @@ -728,8 +740,8 @@ impl LeaderState { &mut self.invoker_handle } - fn handle_vqueue_inbox_event(&mut self, event: VQueueEvent) { - self.scheduler.on_inbox_event(event); + fn handle_vqueue_inbox_event(&mut self, metas: VQueuesMeta<'_>, event: VQueueEvent) { + self.scheduler.on_inbox_event(metas, event); } } diff --git a/crates/worker/src/partition/leadership/mod.rs b/crates/worker/src/partition/leadership/mod.rs index 768b0b2182..88079c1715 100644 --- a/crates/worker/src/partition/leadership/mod.rs +++ b/crates/worker/src/partition/leadership/mod.rs @@ -64,7 +64,7 @@ use restate_types::{ GenerationalNodeId, RESTATE_VERSION_1_6_0, RESTATE_VERSION_1_7_0, SemanticRestateVersion, }; use restate_vqueues::scheduler::{self}; -use restate_vqueues::{ResourceManager, SchedulerService, VQueuesMetaCache}; +use restate_vqueues::{ResourceManager, SchedulerService, VQueuesMeta, VQueuesMetaCache}; use restate_wal_protocol::control::{AnnounceLeader, PartitionDurability, VersionBarrier}; use restate_wal_protocol::timer::TimerKeyValue; use restate_wal_protocol::{Command, Envelope}; @@ -651,13 +651,17 @@ where } } - pub fn handle_actions(&mut self, 
actions: impl Iterator) -> Result<(), Error> { + pub fn handle_actions( + &mut self, + vqueues: VQueuesMeta<'_>, + actions: impl Iterator, + ) -> Result<(), Error> { match &mut self.state { State::Follower | State::Candidate { .. } => { // nothing to do :-) } State::Leader(leader_state) => { - leader_state.handle_actions(actions)?; + leader_state.handle_actions(vqueues, actions)?; } } @@ -669,7 +673,11 @@ where /// * Follower: Nothing to do /// * Candidate: Monitor appender task /// * Leader: Await action effects and monitor appender task - pub async fn run(&mut self, state_machine: &StateMachine) -> Result, Error> { + pub async fn run( + &mut self, + state_machine: &StateMachine, + vqueues: VQueuesMeta<'_>, + ) -> Result, Error> { match &mut self.state { State::Follower => Ok(futures::future::pending::>().await), State::Candidate { self_proposer, .. } => Err(self_proposer @@ -678,7 +686,7 @@ where .join_on_err() .await .expect_err("never should never be returned")), - State::Leader(leader_state) => leader_state.run(state_machine).await, + State::Leader(leader_state) => leader_state.run(state_machine, vqueues).await, } } @@ -708,13 +716,17 @@ where } impl LeadershipState { - pub fn handle_leader_query(&self, leader_query_cmd: LeaderQueryCommand) { + pub fn handle_leader_query( + &self, + metas: VQueuesMeta<'_>, + leader_query_cmd: LeaderQueryCommand, + ) { let (request, response_tx) = leader_query_cmd.into_inner(); match (&self.state, request) { (State::Leader(leader_state), LeaderQueryRequest::SchedulerStatus { keys }) => { let _ = response_tx.send(LeaderQueryResponse::SchedulerStatus( - leader_state.read_scheduler_status(keys), + leader_state.read_scheduler_status(metas, keys), )); } (State::Leader(leader_state), LeaderQueryRequest::UserLimitCounters { keys }) => { @@ -942,7 +954,7 @@ mod tests { &mut partition_store, &replica_set_states, &Configuration::pinned(), - &mut VQueuesMetaCache::new_empty(), + &mut VQueuesMetaCache::new_empty(1024), ) .await?; diff 
--git a/crates/worker/src/partition/mod.rs b/crates/worker/src/partition/mod.rs index e459cb3e2b..655e91627a 100644 --- a/crates/worker/src/partition/mod.rs +++ b/crates/worker/src/partition/mod.rs @@ -80,7 +80,7 @@ use restate_types::schema::Schema; use restate_types::storage::StorageDecodeError; use restate_types::time::{MillisSinceEpoch, NanosSinceEpoch}; use restate_types::{GenerationalNodeId, SemanticRestateVersion, Version}; -use restate_vqueues::VQueuesMetaCache; +use restate_vqueues::{VQueuesMeta, VQueuesMetaCache}; use restate_wal_protocol::control::{ AnnounceLeader, CurrentReplicaSetConfiguration, NextReplicaSetConfiguration, }; @@ -98,6 +98,11 @@ use crate::partition::leadership::LeadershipState; use crate::partition::state_machine::{ActionCollector, StateMachine}; use crate::partition_processor_manager::PartitionLeaderHandlesRegistry; +// Soft cap for the in-memory vqueue cache; once reached, inactive +// entries are evicted at insert time. The cache will still grow past +// this if compaction frees nothing. +const VQUEUE_CACHE_CAPACITY: usize = 100_000; + /// Information needed to run as leader, including the epoch and partition configurations. 
#[derive(Clone, Debug)] pub struct LeadershipInfo { @@ -515,7 +520,11 @@ where tokio::time::interval(with_jitter(Duration::from_millis(500), 0.5)); status_update_timer.set_missed_tick_behavior(MissedTickBehavior::Skip); - let mut vqueues = VQueuesMetaCache::create(partition_store.partition_db().clone()).await?; + let mut vqueues = VQueuesMetaCache::create( + partition_store.partition_db().clone(), + VQUEUE_CACHE_CAPACITY, + ) + .await?; let mut action_collector = ActionCollector::default(); let mut command_buffer = @@ -687,9 +696,9 @@ where if let Some(lsn) = &self.status.last_applied_log_lsn { self.last_applied_log_lsn_watch.send_replace(*lsn); } - self.leadership_state.handle_actions(action_collector.drain(..))?; + self.leadership_state.handle_actions(vqueues.view(), action_collector.drain(..))?; }, - result = self.leadership_state.run(&self.state_machine) => { + result = self.leadership_state.run(&self.state_machine, vqueues.view()) => { let action_effects = result?; // We process the action_effects not directly in the run future because it // requires the run future to be cancellation safe. 
In the future this could be @@ -697,7 +706,7 @@ where self.leadership_state.handle_action_effects(action_effects).await?; } Some(leader_query_cmd) = self.leader_query_rx.recv() => { - self.on_leader_query(leader_query_cmd); + self.on_leader_query(vqueues.view(), leader_query_cmd); } } // Allow other tasks on this thread to run, but only if we have exhausted the coop @@ -728,8 +737,9 @@ where Ok(()) } - fn on_leader_query(&mut self, leader_query_cmd: LeaderQueryCommand) { - self.leadership_state.handle_leader_query(leader_query_cmd); + fn on_leader_query(&mut self, metas: VQueuesMeta<'_>, leader_query_cmd: LeaderQueryCommand) { + self.leadership_state + .handle_leader_query(metas, leader_query_cmd); } async fn on_pp_rpc_request( diff --git a/crates/worker/src/partition/state_machine/actions.rs b/crates/worker/src/partition/state_machine/actions.rs index dbb4fe32be..f421b6fc6b 100644 --- a/crates/worker/src/partition/state_machine/actions.rs +++ b/crates/worker/src/partition/state_machine/actions.rs @@ -11,7 +11,6 @@ use restate_storage_api::outbox_table::OutboxMessage; use restate_storage_api::timer_table::TimerKey; use restate_storage_api::vqueue_table::EntryKey; -use restate_types::LimitKey; use restate_types::identifiers::{EntryIndex, InvocationId, PartitionProcessorRpcRequestId}; use restate_types::invocation::InvocationTarget; use restate_types::invocation::client::{ @@ -21,9 +20,8 @@ use restate_types::invocation::client::{ use restate_types::journal_v2::{CommandIndex, NotificationId}; use restate_types::message::MessageIndex; use restate_types::time::MillisSinceEpoch; -use restate_types::vqueues::VQueueId; use restate_util_string::ReString; -use restate_vqueues::VQueueEvent; +use restate_vqueues::{VQueueEvent, VQueueHandle}; use restate_wal_protocol::timer::TimerKeyValue; pub type ActionCollector = Vec; @@ -34,10 +32,9 @@ pub enum Action { VQEvent(VQueueEvent), /// Tells invoker to run this invocation (similar to Invoke) but carries more information VQInvoke 
{ - qid: VQueueId, + vq_handle: VQueueHandle, key: EntryKey, invocation_target: InvocationTarget, - limit_key: LimitKey, idempotency_key: Option, }, Invoke { diff --git a/crates/worker/src/partition/state_machine/entries/mod.rs b/crates/worker/src/partition/state_machine/entries/mod.rs index 7cba1a4df7..7af7c3c1ed 100644 --- a/crates/worker/src/partition/state_machine/entries/mod.rs +++ b/crates/worker/src/partition/state_machine/entries/mod.rs @@ -380,7 +380,7 @@ where // Store journal entry WriteJournalTable::put_journal_entry( ctx.storage, - self.invocation_id, + &self.invocation_id, entry_index, // Make sure that a deterministic append time is set based on Bifrost's record creation // time. This ensures that the append time does not depend on the application time of diff --git a/crates/worker/src/partition/state_machine/lifecycle/migrate_journal_table.rs b/crates/worker/src/partition/state_machine/lifecycle/migrate_journal_table.rs index 97de8c1769..4a9b344ae0 100644 --- a/crates/worker/src/partition/state_machine/lifecycle/migrate_journal_table.rs +++ b/crates/worker/src/partition/state_machine/lifecycle/migrate_journal_table.rs @@ -63,7 +63,7 @@ where // Now write the entry in the new table, and remove it from the old one journal_table_v2::WriteJournalTable::put_journal_entry( ctx.storage, - self.invocation_id, + &self.invocation_id, 0, &StoredRawEntry::new( StoredRawEntryHeader::new(ctx.record_created_at), diff --git a/crates/worker/src/partition/state_machine/lifecycle/restart_as_new.rs b/crates/worker/src/partition/state_machine/lifecycle/restart_as_new.rs index 44f957699d..72f11dd890 100644 --- a/crates/worker/src/partition/state_machine/lifecycle/restart_as_new.rs +++ b/crates/worker/src/partition/state_machine/lifecycle/restart_as_new.rs @@ -156,7 +156,7 @@ where // Now copy to the new journal journal_table_v2::WriteJournalTable::put_journal_entry( ctx.storage, - new_invocation_id, + &new_invocation_id, new_journal_index, &entry, 
&related_completion_ids, @@ -176,7 +176,7 @@ where // Now copy to the new journal journal_table_v2::WriteJournalTable::put_journal_entry( ctx.storage, - new_invocation_id, + &new_invocation_id, new_journal_index, &entry, &[], @@ -202,7 +202,7 @@ where // Copy over this notification journal_table_v2::WriteJournalTable::put_journal_entry( ctx.storage, - new_invocation_id, + &new_invocation_id, new_journal_index, &entry, &[], @@ -311,7 +311,7 @@ where }; ctx.on_pre_flight_invocation( - new_invocation_id, + &new_invocation_id, pre_flight_invocation_metadata, None, &limit_key, diff --git a/crates/worker/src/partition/state_machine/mod.rs b/crates/worker/src/partition/state_machine/mod.rs index 699dc1e7df..027114c8c8 100644 --- a/crates/worker/src/partition/state_machine/mod.rs +++ b/crates/worker/src/partition/state_machine/mod.rs @@ -113,7 +113,7 @@ use restate_types::{RESTATE_VERSION_1_6_0, journal_v2}; use restate_types::{RestateVersion, SemanticRestateVersion}; use restate_types::{Versioned, journal::*}; use restate_util_string::ReString; -use restate_vqueues::{VQueue, VQueuesMetaCache}; +use restate_vqueues::{VQueue, VQueueHandle, VQueuesMetaCache}; use restate_wal_protocol::Command; use restate_wal_protocol::timer::TimerKeyDisplay; use restate_wal_protocol::timer::TimerKeyValue; @@ -772,7 +772,7 @@ impl StateMachineApplyContext<'_, S> { ); self.on_pre_flight_invocation( - invocation_id, + &invocation_id, pre_flight_invocation_metadata, submit_notification_sink, &limit_key, @@ -782,7 +782,7 @@ impl StateMachineApplyContext<'_, S> { async fn on_pre_flight_invocation( &mut self, - invocation_id: InvocationId, + invocation_id: &InvocationId, mut pre_flight_invocation_metadata: PreFlightInvocationMetadata, submit_notification_sink: Option, limit_key: &LimitKey, @@ -936,7 +936,7 @@ impl StateMachineApplyContext<'_, S> { // Uses vqueues, replaces on_pre_flight_invocation async fn vqueue_enqueue( &mut self, - invocation_id: InvocationId, + invocation_id: &InvocationId, 
metadata: PreFlightInvocationMetadata, submit_notification_sink: Option, limit_key: &LimitKey, @@ -970,7 +970,7 @@ impl StateMachineApplyContext<'_, S> { record_unique_ts, self.record_lsn, metadata.execution_time, - EntryId::from(&invocation_id), + EntryId::from(invocation_id), vqueue_table::EntryMetadata::default(), ); @@ -994,7 +994,7 @@ impl StateMachineApplyContext<'_, S> { }; self.storage - .put_invocation_status(&invocation_id, &invocation_status) + .put_invocation_status(invocation_id, &invocation_status) .map_err(Error::Storage)?; // Invocation was scheduled, send back the ingress attach notification and return @@ -1054,7 +1054,7 @@ impl StateMachineApplyContext<'_, S> { // Send submit notification self.send_submit_notification_if_needed( - service_invocation.invocation_id, + &service_invocation.invocation_id, previous_invocation_status.execution_time(), // is_new_invocation is true if the RPC ingress request is a duplicate. service_invocation.source @@ -1125,7 +1125,7 @@ impl StateMachineApplyContext<'_, S> { /// Returns the invocation in case the invocation should run immediately fn handle_service_invocation_execution_time( &mut self, - invocation_id: InvocationId, + invocation_id: &InvocationId, metadata: PreFlightInvocationMetadata, ) -> Result, Error> where @@ -1136,13 +1136,13 @@ impl StateMachineApplyContext<'_, S> { debug_if_leader!(self.is_leader, "Store scheduled invocation"); self.register_timer( - TimerKeyValue::neo_invoke(execution_time, invocation_id), + TimerKeyValue::neo_invoke(execution_time, *invocation_id), span_context, )?; self.storage .put_invocation_status( - &invocation_id, + invocation_id, &InvocationStatus::Scheduled( ScheduledInvocation::from_pre_flight_invocation_metadata( metadata, @@ -1161,7 +1161,7 @@ impl StateMachineApplyContext<'_, S> { /// Returns the invocation in case the invocation was not inboxed async fn handle_service_invocation_exclusive_handler( &mut self, - invocation_id: InvocationId, + invocation_id: 
&InvocationId, metadata: PreFlightInvocationMetadata, ) -> Result, Error> where @@ -1186,7 +1186,7 @@ impl StateMachineApplyContext<'_, S> { if let VirtualObjectStatus::Locked(_) = service_status { // If locked, enqueue in inbox and be done with it let inbox_seq_number = self - .enqueue_into_inbox(InboxEntry::Invocation(keyed_service_id, invocation_id)) + .enqueue_into_inbox(InboxEntry::Invocation(keyed_service_id, *invocation_id)) .await?; debug_if_leader!( @@ -1196,7 +1196,7 @@ impl StateMachineApplyContext<'_, S> { ); self.storage .put_invocation_status( - &invocation_id, + invocation_id, &InvocationStatus::Inboxed( InboxedInvocation::from_pre_flight_invocation_metadata( metadata, @@ -1219,7 +1219,7 @@ impl StateMachineApplyContext<'_, S> { self.storage .put_virtual_object_status( &keyed_service_id, - &VirtualObjectStatus::Locked(invocation_id), + &VirtualObjectStatus::Locked(*invocation_id), ) .map_err(Error::Storage)?; } @@ -1229,12 +1229,11 @@ impl StateMachineApplyContext<'_, S> { fn init_journal_and_vqueue_invoke( &mut self, - qid: &VQueueId, + vq_handle: VQueueHandle, key: &EntryKey, - invocation_id: InvocationId, + invocation_id: &InvocationId, mut in_flight_invocation_metadata: InFlightInvocationMetadata, invocation_input: Option, - limit_key: LimitKey, ) -> Result<(), Error> where S: WriteJournalTable + WriteInvocationStatusTable + journal_table_v2::WriteJournalTable, @@ -1261,19 +1260,13 @@ impl StateMachineApplyContext<'_, S> { // Emit the trace anchor span for the invocation. 
if self.is_leader { let _start = instrumentation::create_invocation_start_span( - &invocation_id, + invocation_id, &in_flight_invocation_metadata.invocation_target, &in_flight_invocation_metadata.journal_metadata.span_context, in_flight_invocation_metadata.timestamps.creation_time(), ); } - self.vqueue_invoke( - qid, - key, - invocation_id, - in_flight_invocation_metadata, - limit_key, - ) + self.vqueue_invoke(vq_handle, key, invocation_id, in_flight_invocation_metadata) } /// Inits the journal if invocation_input is `Some` and invokes the invocation. If @@ -1281,24 +1274,13 @@ impl StateMachineApplyContext<'_, S> { /// invoke the invocation. fn init_journal_and_invoke( &mut self, - invocation_id: InvocationId, + invocation_id: &InvocationId, mut in_flight_invocation_metadata: InFlightInvocationMetadata, invocation_input: Option, ) -> Result<(), Error> where S: WriteJournalTable + WriteInvocationStatusTable + journal_table_v2::WriteJournalTable, { - // Usage metering for "actions" should include the Input journal entry - // type, but it gets filtered out before reaching the state machine. - // Therefore we count it here, as a special case. - if self.is_leader { - counter!( - USAGE_LEADER_JOURNAL_ENTRY_COUNT, - "entry" => "Command/Input", - ) - .increment(1); - } - // Only init the journal if we have some invocation input if let Some(invocation_input) = invocation_input { self.init_journal( @@ -1310,8 +1292,17 @@ impl StateMachineApplyContext<'_, S> { // Emit the trace anchor span for the invocation. if self.is_leader { + // Usage metering for "actions" should include the Input journal entry + // type, but it gets filtered out before reaching the state machine. + // Therefore we count it here, as a special case. 
+ counter!( + USAGE_LEADER_JOURNAL_ENTRY_COUNT, + "entry" => "Command/Input", + ) + .increment(1); + let _start = instrumentation::create_invocation_start_span( - &invocation_id, + invocation_id, &in_flight_invocation_metadata.invocation_target, &in_flight_invocation_metadata.journal_metadata.span_context, in_flight_invocation_metadata.timestamps.creation_time(), @@ -1330,7 +1321,7 @@ impl StateMachineApplyContext<'_, S> { // will no longer be needed. fn init_journal( &mut self, - invocation_id: InvocationId, + invocation_id: &InvocationId, in_flight_invocation_metadata: &mut InFlightInvocationMetadata, invocation_input: InvocationInput, ) -> Result<(), Error> @@ -1380,7 +1371,7 @@ impl StateMachineApplyContext<'_, S> { )); journal_table::WriteJournalTable::put_journal_entry( self.storage, - &invocation_id, + invocation_id, 0, &input_entry, ) @@ -1392,11 +1383,10 @@ impl StateMachineApplyContext<'_, S> { fn vqueue_invoke( &mut self, - qid: &VQueueId, + vq_handle: VQueueHandle, key: &EntryKey, - invocation_id: InvocationId, + invocation_id: &InvocationId, in_flight_invocation_metadata: InFlightInvocationMetadata, - limit_key: LimitKey, ) -> Result<(), Error> where S: WriteInvocationStatusTable, @@ -1406,17 +1396,16 @@ impl StateMachineApplyContext<'_, S> { let status = InvocationStatus::Invoked(in_flight_invocation_metadata); self.storage - .put_invocation_status(&invocation_id, &status) + .put_invocation_status(invocation_id, &status) .map_err(Error::Storage)?; if self.is_leader { let invocation_metadata = status.into_invocation_metadata().unwrap(); let invocation_target = invocation_metadata.invocation_target; self.action_collector.push(Action::VQInvoke { - qid: qid.clone(), + vq_handle, key: *key, invocation_target, - limit_key, idempotency_key: invocation_metadata.idempotency_key.map(ReString::new_owned), }); } @@ -1426,7 +1415,7 @@ impl StateMachineApplyContext<'_, S> { fn invoke( &mut self, - invocation_id: InvocationId, + invocation_id: &InvocationId, 
in_flight_invocation_metadata: InFlightInvocationMetadata, ) -> Result<(), Error> where @@ -1434,13 +1423,15 @@ impl StateMachineApplyContext<'_, S> { { debug_if_leader!(self.is_leader, "Invoke"); - self.action_collector.push(Action::Invoke { - invocation_id, - invocation_target: in_flight_invocation_metadata.invocation_target.clone(), - }); + if self.is_leader { + self.action_collector.push(Action::Invoke { + invocation_id: *invocation_id, + invocation_target: in_flight_invocation_metadata.invocation_target.clone(), + }); + } self.storage .put_invocation_status( - &invocation_id, + invocation_id, &InvocationStatus::Invoked(in_flight_invocation_metadata), ) .map_err(Error::Storage)?; @@ -2349,11 +2340,11 @@ impl StateMachineApplyContext<'_, S> { .await?; Ok(()) } - Timer::NeoInvoke(invocation_id) => self.on_neo_invoke_timer(invocation_id).await, + Timer::NeoInvoke(ref invocation_id) => self.on_neo_invoke_timer(invocation_id).await, } } - async fn on_neo_invoke_timer(&mut self, invocation_id: InvocationId) -> Result<(), Error> + async fn on_neo_invoke_timer(&mut self, invocation_id: &InvocationId) -> Result<(), Error> where S: ReadVirtualObjectStatusTable + WriteVirtualObjectStatusTable @@ -2368,7 +2359,7 @@ impl StateMachineApplyContext<'_, S> { self.is_leader, "Handle scheduled invocation timer with invocation id {invocation_id}" ); - let invocation_status = self.get_invocation_status(&invocation_id).await?; + let invocation_status = self.get_invocation_status(invocation_id).await?; if let InvocationStatus::Free = &invocation_status { warn!( @@ -3055,15 +3046,13 @@ impl StateMachineApplyContext<'_, S> { .unwrap(); vqueue.run_entry(record_unique_ts, &header, wait_stats); - - let limit_key = vqueue.meta().limit_key().clone(); + let vq_handle = vqueue.handle(); if self.is_leader { self.action_collector.push(Action::VQInvoke { - qid: qid.clone(), + vq_handle, key: *entry_key, invocation_target: status.invocation_target().unwrap().clone(), - limit_key, // 
todo(tillrohrmann) avoid the transformation from ByteString to ReString by // storing the idempotency key as ReString in the first place idempotency_key: status @@ -3131,16 +3120,14 @@ impl StateMachineApplyContext<'_, S> { .unwrap(); vqueue.run_entry(record_unique_ts, &header, wait_stats); - - let limit_key = vqueue.meta().limit_key().clone(); + let vq_handle = vqueue.handle(); self.init_journal_and_vqueue_invoke( - qid, + vq_handle, entry_key, - invocation_id, + &invocation_id, metadata, invocation_input, - limit_key, )?; } _ => { @@ -3214,7 +3201,7 @@ impl StateMachineApplyContext<'_, S> { self.record_created_at, ); self.init_journal_and_invoke( - invocation_id, + &invocation_id, in_flight_invocation_meta, invocation_input, )?; @@ -4432,7 +4419,7 @@ impl StateMachineApplyContext<'_, S> { fn send_submit_notification_if_needed( &mut self, - invocation_id: InvocationId, + invocation_id: &InvocationId, execution_time: Option, is_new_invocation: bool, submit_notification_sink: Option, @@ -4880,7 +4867,7 @@ impl StateMachineApplyContext<'_, S> { if pinned_protocol_version.is_none_or(|sp| sp >= ServiceProtocolVersion::V4) { journal_table_v2::WriteJournalTable::delete_journal( self.storage, - invocation_id, + &invocation_id, journal_length, ) .map_err(Error::Storage)? 
diff --git a/crates/worker/src/partition/state_machine/tests/mod.rs b/crates/worker/src/partition/state_machine/tests/mod.rs index 9743d6d51e..e04d3ecf03 100644 --- a/crates/worker/src/partition/state_machine/tests/mod.rs +++ b/crates/worker/src/partition/state_machine/tests/mod.rs @@ -145,7 +145,7 @@ impl TestEnv { pub async fn apply(&mut self, command: Command) -> Vec { let mut transaction = self.storage.transaction(); let mut action_collector = ActionCollector::default(); - let mut vqueues = VQueuesMetaCache::new_empty(); + let mut vqueues = VQueuesMetaCache::new_empty(1024); self.state_machine .apply( command, @@ -167,7 +167,7 @@ impl TestEnv { pub async fn apply_fallible(&mut self, command: Command) -> Result, Error> { let mut transaction = self.storage.transaction(); let mut action_collector = ActionCollector::default(); - let mut vqueues = VQueuesMetaCache::new_empty(); + let mut vqueues = VQueuesMetaCache::new_empty(1024); self.state_machine .apply( command, diff --git a/tools/pp-bench/src/workload.rs b/tools/pp-bench/src/workload.rs index f55a119794..06f4d751e8 100644 --- a/tools/pp-bench/src/workload.rs +++ b/tools/pp-bench/src/workload.rs @@ -104,7 +104,8 @@ pub async fn run( // Initialize the vqueue cache from the (empty) partition store — this follows // the production init path and avoids requiring test-util features. - let mut vq_cache = VQueuesMetaCache::create(partition_store.partition_db().clone()).await?; + let mut vq_cache = + VQueuesMetaCache::create(partition_store.partition_db().clone(), 100_000).await?; let workload_name = format!("{:?}", opts.spec.workload); let batch_size = opts.batch_size;