From a249e8034c09bb710a8c458ccdc185df0ad74681 Mon Sep 17 00:00:00 2001 From: Aparna Naik Date: Fri, 16 Jan 2026 16:31:51 -0800 Subject: [PATCH 01/46] CEP-45: Incremental Repair Blocking Wait for offsets --- ...MutationTrackingIncrementalRepairTask.java | 207 ++++++++++++++++ .../cassandra/repair/RepairCoordinator.java | 10 +- .../replication/BroadcastLogOffsets.java | 6 + .../replication/MutationTrackingService.java | 54 +++++ .../MutationTrackingSyncCoordinator.java | 229 ++++++++++++++++++ .../apache/cassandra/replication/Shard.java | 18 ++ ...tionTrackingIncrementalRepairTaskTest.java | 171 +++++++++++++ .../MutationTrackingSyncCoordinatorTest.java | 196 +++++++++++++++ 8 files changed, 890 insertions(+), 1 deletion(-) create mode 100644 src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java create mode 100644 src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java create mode 100644 test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingIncrementalRepairTaskTest.java create mode 100644 test/distributed/org/apache/cassandra/distributed/test/replication/MutationTrackingSyncCoordinatorTest.java diff --git a/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java b/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java new file mode 100644 index 000000000000..d70746408adf --- /dev/null +++ b/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java @@ -0,0 +1,207 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.repair; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.concurrent.TimeUnit; + +import org.apache.cassandra.concurrent.ExecutorPlus; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.replication.MutationTrackingSyncCoordinator; +import org.apache.cassandra.schema.KeyspaceMetadata; +import org.apache.cassandra.service.replication.migration.KeyspaceMigrationInfo; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.utils.TimeUUID; +import org.apache.cassandra.utils.concurrent.AsyncPromise; +import org.apache.cassandra.utils.concurrent.Future; + +/** Incremental repair task for keyspaces using mutation tracking */ +public class MutationTrackingIncrementalRepairTask extends AbstractRepairTask +{ + private static final long SYNC_TIMEOUT_MINUTES = 30; + + private final TimeUUID parentSession; + private final RepairCoordinator.NeighborsAndRanges neighborsAndRanges; + private final String[] cfnames; + + protected MutationTrackingIncrementalRepairTask(RepairCoordinator coordinator, + TimeUUID parentSession, + RepairCoordinator.NeighborsAndRanges neighborsAndRanges, + String[] cfnames) + { + super(coordinator); + this.parentSession = parentSession; + this.neighborsAndRanges = neighborsAndRanges; + this.cfnames = cfnames; + } + + @Override + public String name() + { + return "MutationTrackingIncrementalRepair"; + } + + @Override + public Future performUnsafe(ExecutorPlus executor, Scheduler 
validationScheduler) + { + List allRanges = neighborsAndRanges.filterCommonRanges(keyspace, cfnames); + + if (allRanges.isEmpty()) + { + logger.info("No common ranges to repair for keyspace {}", keyspace); + return new AsyncPromise().setSuccess(CoordinatedRepairResult.create(List.of(), List.of())); + } + + List syncCoordinators = new ArrayList<>(); + List>> rangeCollections = new ArrayList<>(); + + for (CommonRange commonRange : allRanges) + { + for (Range range : commonRange.ranges) + { + MutationTrackingSyncCoordinator syncCoordinator = new MutationTrackingSyncCoordinator(keyspace, range); + syncCoordinator.start(); + syncCoordinators.add(syncCoordinator); + rangeCollections.add(List.of(range)); + + logger.info("Started mutation tracking sync for range {}", range); + } + } + + coordinator.notifyProgress("Started mutation tracking sync for " + syncCoordinators.size() + " ranges"); + + AsyncPromise resultPromise = new AsyncPromise<>(); + + executor.execute(() -> { + try + { + waitForSyncCompletion(syncCoordinators, executor, validationScheduler, allRanges, rangeCollections, resultPromise); + } + catch (Exception e) + { + logger.error("Error during mutation tracking repair", e); + resultPromise.tryFailure(e); + } + }); + + return resultPromise; + } + + private void waitForSyncCompletion(List syncCoordinators, + ExecutorPlus executor, + Scheduler validationScheduler, + List allRanges, + List>> rangeCollections, + AsyncPromise resultPromise) throws InterruptedException + { + boolean allSucceeded = true; + for (MutationTrackingSyncCoordinator syncCoordinator : syncCoordinators) + { + boolean completed = syncCoordinator.awaitCompletion(SYNC_TIMEOUT_MINUTES, TimeUnit.MINUTES); + if (!completed) + { + logger.warn("Mutation tracking sync timed out for keyspace {} range {}", + keyspace, syncCoordinator.getRange()); + syncCoordinator.cancel(); + allSucceeded = false; + } + } + + if (!allSucceeded) + { + resultPromise.tryFailure(new RuntimeException("Mutation tracking sync 
timed out for some ranges")); + return; + } + + coordinator.notifyProgress("Mutation tracking sync completed for all ranges"); + + if (requiresTraditionalRepair(keyspace)) + { + runTraditionalRepairForMigration(executor, validationScheduler, allRanges, resultPromise); + } + else + { + // Pure mutation tracking - create successful result + resultPromise.trySuccess(CoordinatedRepairResult.create(rangeCollections, List.of())); + } + } + + private void runTraditionalRepairForMigration(ExecutorPlus executor, + Scheduler validationScheduler, + List allRanges, + AsyncPromise resultPromise) + { + coordinator.notifyProgress("Running traditional repair for migration"); + + // Use the inherited runRepair method from AbstractRepairTask + Future traditionalRepair = runRepair(parentSession, true, executor, + validationScheduler, allRanges, + neighborsAndRanges.shouldExcludeDeadParticipants, + cfnames); + + traditionalRepair.addListener(f -> { + try + { + CoordinatedRepairResult result = (CoordinatedRepairResult) f.get(); + resultPromise.setSuccess(result); + } + catch (Exception e) + { + resultPromise.setFailure(e); + } + }); + } + + /** + * Determines if this keyspace should use mutation tracking incremental repair. + * Returns true if: + * - Keyspace uses mutation tracking replication, OR + * - Keyspace is currently migrating (either direction) + */ + public static boolean shouldUseMutationTrackingRepair(String keyspace) + { + ClusterMetadata metadata = ClusterMetadata.current(); + KeyspaceMetadata ksm = metadata.schema.maybeGetKeyspaceMetadata(keyspace).orElse(null); + if (ksm == null) + return false; + + // Check if keyspace uses mutation tracking + if (ksm.useMutationTracking()) + return true; + + // Check if keyspace is in migration (either direction) + KeyspaceMigrationInfo migrationInfo = metadata.mutationTrackingMigrationState.getKeyspaceInfo(keyspace); + return migrationInfo != null; + } + + /** + * Determines if we also need to run traditional repair. 
+ * Returns true during migration: + * - Migrating TO mutation tracking: need traditional repair to sync pre-migration data + * - Migrating FROM mutation tracking: need traditional repair for post-migration consistency + */ + public static boolean requiresTraditionalRepair(String keyspace) + { + ClusterMetadata metadata = ClusterMetadata.current(); + KeyspaceMigrationInfo migrationInfo = metadata.mutationTrackingMigrationState.getKeyspaceInfo(keyspace); + return migrationInfo != null; + } +} diff --git a/src/java/org/apache/cassandra/repair/RepairCoordinator.java b/src/java/org/apache/cassandra/repair/RepairCoordinator.java index b511d081c984..55274dd7b996 100644 --- a/src/java/org/apache/cassandra/repair/RepairCoordinator.java +++ b/src/java/org/apache/cassandra/repair/RepairCoordinator.java @@ -503,7 +503,15 @@ private Future>> repair(String[] } else if (state.options.isIncremental()) { - task = new IncrementalRepairTask(this, state.id, neighborsAndRanges, cfnames); + // For keyspaces using mutation tracking, use the mutation tracking repair task + if (MutationTrackingIncrementalRepairTask.shouldUseMutationTrackingRepair(state.keyspace)) + { + task = new MutationTrackingIncrementalRepairTask(this, state.id, neighborsAndRanges, cfnames); + } + else + { + task = new IncrementalRepairTask(this, state.id, neighborsAndRanges, cfnames); + } } else { diff --git a/src/java/org/apache/cassandra/replication/BroadcastLogOffsets.java b/src/java/org/apache/cassandra/replication/BroadcastLogOffsets.java index a9832f90c65d..7e2b7b485e57 100644 --- a/src/java/org/apache/cassandra/replication/BroadcastLogOffsets.java +++ b/src/java/org/apache/cassandra/replication/BroadcastLogOffsets.java @@ -19,6 +19,7 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import org.apache.cassandra.db.TypeSizes; @@ -51,6 +52,11 @@ boolean isEmpty() return replicatedOffsets.isEmpty(); } + public List getOffsets() + { + return 
Collections.unmodifiableList(replicatedOffsets); + } + @Override public String toString() { diff --git a/src/java/org/apache/cassandra/replication/MutationTrackingService.java b/src/java/org/apache/cassandra/replication/MutationTrackingService.java index e890b9c0345b..b77f58f8445b 100644 --- a/src/java/org/apache/cassandra/replication/MutationTrackingService.java +++ b/src/java/org/apache/cassandra/replication/MutationTrackingService.java @@ -192,6 +192,8 @@ public static void shutdown() throws InterruptedException private final IncomingMutations incomingMutations = new IncomingMutations(); private final OutgoingMutations outgoingMutations = new OutgoingMutations(); + private final Map> syncCoordinatorsByKeyspace = new ConcurrentHashMap<>(); + private volatile boolean started = false; private MutationTrackingService() @@ -395,6 +397,19 @@ public void updateReplicatedOffsets(String keyspace, Range range, List coordinators = syncCoordinatorsByKeyspace.get(keyspace); + if (coordinators != null) + { + for (MutationTrackingSyncCoordinator coordinator : coordinators) + { + if (range.intersects(coordinator.getRange())) + { + coordinator.onOffsetsReceived(); + } + } + } } finally { @@ -457,6 +472,30 @@ public boolean registerMutationCallback(ShortMutationId mutationId, IncomingMuta return incomingMutations.subscribe(mutationId, callback); } + /** + * Register a sync coordinator to be notified when offset updates arrive. + */ + public void registerSyncCoordinator(MutationTrackingSyncCoordinator coordinator) + { + syncCoordinatorsByKeyspace.computeIfAbsent(coordinator.getKeyspace(), k -> ConcurrentHashMap.newKeySet()) + .add(coordinator); + } + + /** + * Unregister a sync coordinator. 
+ */ + public void unregisterSyncCoordinator(MutationTrackingSyncCoordinator coordinator) + { + Set coordinators = syncCoordinatorsByKeyspace.get(coordinator.getKeyspace()); + if (coordinators != null) + { + coordinators.remove(coordinator); + + if (coordinators.isEmpty()) + syncCoordinatorsByKeyspace.remove(coordinator.getKeyspace(), coordinators); + } + } + public void executeTransfers(String keyspace, Set sstables, ConsistencyLevel cl) { shardLock.readLock().lock(); @@ -614,6 +653,21 @@ public Iterable getShards() return shards; } + public void forEachShardInKeyspace(String keyspace, Consumer consumer) + { + shardLock.readLock().lock(); + try + { + KeyspaceShards ksShards = keyspaceShards.get(keyspace); + if (ksShards != null) + ksShards.forEachShard(consumer); + } + finally + { + shardLock.readLock().unlock(); + } + } + public void collectLocallyMissingMutations(MutationSummary remoteSummary, Log2OffsetsMap.Mutable into) { shardLock.readLock().lock(); diff --git a/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java b/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java new file mode 100644 index 000000000000..09a33c59a8f4 --- /dev/null +++ b/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java @@ -0,0 +1,229 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.replication; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.utils.concurrent.AsyncPromise; +import org.apache.cassandra.utils.concurrent.Future; + +public class MutationTrackingSyncCoordinator +{ + private static final Logger logger = LoggerFactory.getLogger(MutationTrackingSyncCoordinator.class); + + private final String keyspace; + private final Range range; + private final AsyncPromise completionFuture = new AsyncPromise<>(); + + // Per-shard state: tracks what each node has reported for that shard + private final Map, ShardSyncState> shardStates = new ConcurrentHashMap<>(); + + private final AtomicBoolean started = new AtomicBoolean(false); + private final AtomicBoolean completed = new AtomicBoolean(false); + + public MutationTrackingSyncCoordinator(String keyspace, Range range) + { + this.keyspace = keyspace; + this.range = range; + } + + public void start() + { + if (!started.compareAndSet(false, true)) + throw new IllegalStateException("Sync coordinator already started"); + + List overlappingShards; + + overlappingShards = new ArrayList<>(); + MutationTrackingService.instance.forEachShardInKeyspace(keyspace, shard -> { + if (shard.range.intersects(range)) + 
overlappingShards.add(shard); + }); + + if (overlappingShards.isEmpty()) + { + completionFuture.setSuccess(null); + return; + } + + // Register to receive offset updates + MutationTrackingService.instance.registerSyncCoordinator(this); + + // Initialize state for each shard and capture targets + for (Shard shard : overlappingShards) + { + ShardSyncState state = new ShardSyncState(shard); + state.captureTargets(); + shardStates.put(shard.range, state); + } + + if (checkIfComplete()) + { + complete(); + return; + } + + logger.info("Sync coordinator started for keyspace {} range {}, tracking {} shards", + keyspace, range, overlappingShards.size()); + } + + private void complete() + { + if (!completed.compareAndSet(false, true)) + return; + MutationTrackingService.instance.unregisterSyncCoordinator(this); + completionFuture.setSuccess(null); + } + + private boolean checkIfComplete() + { + for (ShardSyncState state : shardStates.values()) + { + if (!state.isComplete()) + return false; + } + return true; + } + + public void onOffsetsReceived() + { + if (completed.get()) + return; + + // The underlying CoordinatorLog already updates its reconciled offsets. + // We just need to re-check if we're now complete. + if (checkIfComplete()) + { + complete(); + } + } + + public String getKeyspace() + { + return keyspace; + } + + public Range getRange() + { + return range; + } + + public Future awaitCompletion() + { + return completionFuture; + } + + /** + * Blocks until sync completes or timeout is reached. 
+ * + * @param timeout Maximum time to wait + * @param unit Time unit + * @return true if completed, false if timed out + */ + public boolean awaitCompletion(long timeout, TimeUnit unit) throws InterruptedException + { + try + { + completionFuture.get(timeout, unit); + return true; + } + catch (java.util.concurrent.TimeoutException e) + { + return false; + } + catch (java.util.concurrent.ExecutionException e) + { + throw new RuntimeException(e.getCause()); + } + } + + public void cancel() + { + if (completed.compareAndSet(false, true)) + { + MutationTrackingService.instance.unregisterSyncCoordinator(this); + completionFuture.setFailure(new RuntimeException("Sync cancelled")); + } + } + + /** + * Tracks sync state for a single shard. + */ + private static class ShardSyncState + { + private final Shard shard; + + // Target offsets: LogId -> the offsets we're waiting for all nodes to have + private final Map targets = new ConcurrentHashMap<>(); + + ShardSyncState(Shard shard) + { + this.shard = shard; + } + + void captureTargets() + { + BroadcastLogOffsets current = shard.collectReplicatedOffsets(false); + for (Offsets.Immutable logOffsets : current.getOffsets()) + { + targets.put(logOffsets.logId(), logOffsets); + } + } + + boolean isComplete() + { + Map currentReconciled = shard.collectReconciledOffsetsPerLog(); + + for (Map.Entry entry : targets.entrySet()) + { + CoordinatorLogId logId = entry.getKey(); + Offsets.Immutable target = entry.getValue(); + + Offsets.Immutable reconciled = currentReconciled.get(logId); + if (reconciled == null) + return false; + + // Check if reconciled contains all offsets in target + if (!containsAll(reconciled, target)) + return false; + } + return true; + } + + private boolean containsAll(Offsets reconciled, Offsets target) + { + for (ShortMutationId id : target) + { + if (!reconciled.contains(id.offset())) + return false; + } + return true; + } + } +} diff --git a/src/java/org/apache/cassandra/replication/Shard.java 
b/src/java/org/apache/cassandra/replication/Shard.java index f282edf4262c..1893f0ab798f 100644 --- a/src/java/org/apache/cassandra/replication/Shard.java +++ b/src/java/org/apache/cassandra/replication/Shard.java @@ -20,7 +20,9 @@ import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.Set; import java.util.SortedMap; import java.util.TreeMap; @@ -407,6 +409,22 @@ void collectShardReconciledOffsetsToBuilder(ReconciledKeyspaceOffsets.Builder ke logs.values().forEach(log -> keyspaceBuilder.put(log.logId, log.collectReconciledOffsets(), range)); } + /** + * Returns the reconciled offsets for each coordinator log in this shard. + * Reconciled offsets are the intersection of what all participants have. + */ + public Map collectReconciledOffsetsPerLog() + { + Map result = new HashMap<>(); + for (CoordinatorLog log : logs.values()) + { + Offsets.Immutable reconciled = log.collectReconciledOffsets(); + if (reconciled != null && !reconciled.isEmpty()) + result.put(log.logId, reconciled); + } + return result; + } + @Override public String toString() { diff --git a/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingIncrementalRepairTaskTest.java b/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingIncrementalRepairTaskTest.java new file mode 100644 index 000000000000..5383fea0d79e --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingIncrementalRepairTaskTest.java @@ -0,0 +1,171 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.distributed.test.repair; + +import java.io.IOException; +import java.util.concurrent.atomic.AtomicInteger; + +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; + +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.Feature; +import org.apache.cassandra.distributed.test.TestBaseImpl; +import org.apache.cassandra.repair.MutationTrackingIncrementalRepairTask; + +import static org.junit.Assert.*; + +/** + * Tests for MutationTrackingIncrementalRepairTask. + * Tests the decision logic for when to use mutation tracking repair. + * + * Uses a shared cluster across all tests to minimize overhead. 
+ */ +public class MutationTrackingIncrementalRepairTaskTest extends TestBaseImpl +{ + private static Cluster CLUSTER; + private static final AtomicInteger ksCounter = new AtomicInteger(); + + @BeforeClass + public static void setupCluster() throws IOException + { + CLUSTER = Cluster.build() + .withNodes(3) + .withConfig(cfg -> cfg.with(Feature.NETWORK, Feature.GOSSIP) + .set("mutation_tracking_enabled", true)) + .start(); + } + + @AfterClass + public static void teardownCluster() + { + if (CLUSTER != null) + CLUSTER.close(); + } + + private static String nextKsName() + { + return "mtirt_ks" + ksCounter.incrementAndGet(); + } + + @Test + public void testShouldUseMutationTrackingRepairForTrackedKeyspace() throws Throwable + { + String ksName = nextKsName(); + CLUSTER.schemaChange("CREATE KEYSPACE " + ksName + " WITH replication = " + + "{'class': 'SimpleStrategy', 'replication_factor': 3} " + + "AND replication_type='tracked'"); + + Boolean shouldUse = CLUSTER.get(1).callOnInstance(() -> MutationTrackingIncrementalRepairTask.shouldUseMutationTrackingRepair(ksName)); + + assertTrue("Tracked keyspace should use mutation tracking repair", shouldUse); + } + + @Test + public void testShouldNotUseMutationTrackingRepairForUntrackedKeyspace() throws Throwable + { + String ksName = nextKsName(); + CLUSTER.schemaChange("CREATE KEYSPACE " + ksName + " WITH replication = " + + "{'class': 'SimpleStrategy', 'replication_factor': 3} " + + "AND replication_type='untracked'"); + + Boolean shouldUse = CLUSTER.get(1).callOnInstance(() -> MutationTrackingIncrementalRepairTask.shouldUseMutationTrackingRepair(ksName)); + + assertFalse("Untracked keyspace should not use mutation tracking repair", shouldUse); + } + + @Test + public void testRequiresTraditionalRepairReturnsFalseForNonMigratingKeyspace() throws Throwable + { + String ksName = nextKsName(); + CLUSTER.schemaChange("CREATE KEYSPACE " + ksName + " WITH replication = " + + "{'class': 'SimpleStrategy', 'replication_factor': 3} " + 
+ "AND replication_type='tracked'"); + + Boolean requiresTraditional = CLUSTER.get(1).callOnInstance(() -> MutationTrackingIncrementalRepairTask.requiresTraditionalRepair(ksName)); + + assertFalse("Non-migrating keyspace should not require traditional repair", requiresTraditional); + } + + @Test + public void testShouldUseMutationTrackingRepairForNonexistentKeyspace() throws Throwable + { + Boolean shouldUse = CLUSTER.get(1).callOnInstance(() -> MutationTrackingIncrementalRepairTask.shouldUseMutationTrackingRepair("nonexistent_ks_xyz")); + + assertFalse("Nonexistent keyspace should return false", shouldUse); + } + + @Test + public void testMigrationFromUntrackedToTracked() throws Throwable + { + String ksName = nextKsName(); + CLUSTER.schemaChange("CREATE KEYSPACE " + ksName + " WITH replication = " + + "{'class': 'SimpleStrategy', 'replication_factor': 3} " + + "AND replication_type='untracked'"); + CLUSTER.schemaChange("CREATE TABLE " + ksName + ".tbl (k int PRIMARY KEY, v int)"); + + // Verify initial state + Boolean shouldUseBefore = CLUSTER.get(1).callOnInstance(() -> MutationTrackingIncrementalRepairTask.shouldUseMutationTrackingRepair(ksName)); + assertFalse("Untracked keyspace should not use mutation tracking repair", shouldUseBefore); + + Boolean requiresBefore = CLUSTER.get(1).callOnInstance(() -> MutationTrackingIncrementalRepairTask.requiresTraditionalRepair(ksName)); + assertFalse("Non-migrating keyspace should not require traditional repair", requiresBefore); + + // Trigger migration by altering to tracked + CLUSTER.schemaChange("ALTER KEYSPACE " + ksName + " WITH replication = " + + "{'class': 'SimpleStrategy', 'replication_factor': 3} " + + "AND replication_type='tracked'"); + + // Verify migration state - both methods should now return true + Boolean shouldUseAfter = CLUSTER.get(1).callOnInstance(() -> MutationTrackingIncrementalRepairTask.shouldUseMutationTrackingRepair(ksName)); + assertTrue("Migrating keyspace should use mutation tracking 
repair", shouldUseAfter); + + Boolean requiresAfter = CLUSTER.get(1).callOnInstance(() -> MutationTrackingIncrementalRepairTask.requiresTraditionalRepair(ksName)); + assertTrue("Migrating keyspace should require traditional repair", requiresAfter); + } + + @Test + public void testMigrationFromTrackedToUntracked() throws Throwable + { + String ksName = nextKsName(); + CLUSTER.schemaChange("CREATE KEYSPACE " + ksName + " WITH replication = " + + "{'class': 'SimpleStrategy', 'replication_factor': 3} " + + "AND replication_type='tracked'"); + CLUSTER.schemaChange("CREATE TABLE " + ksName + ".tbl (k int PRIMARY KEY, v int)"); + + // Verify initial state + Boolean shouldUseBefore = CLUSTER.get(1).callOnInstance(() -> MutationTrackingIncrementalRepairTask.shouldUseMutationTrackingRepair(ksName)); + assertTrue("Tracked keyspace should use mutation tracking repair", shouldUseBefore); + + Boolean requiresBefore = CLUSTER.get(1).callOnInstance(() -> MutationTrackingIncrementalRepairTask.requiresTraditionalRepair(ksName)); + assertFalse("Non-migrating tracked keyspace should not require traditional repair", requiresBefore); + + // Migrate back to untracked + CLUSTER.schemaChange("ALTER KEYSPACE " + ksName + " WITH replication = " + + "{'class': 'SimpleStrategy', 'replication_factor': 3} " + + "AND replication_type='untracked'"); + + // During reverse migration, both should still apply + Boolean shouldUseAfter = CLUSTER.get(1).callOnInstance(() -> MutationTrackingIncrementalRepairTask.shouldUseMutationTrackingRepair(ksName)); + assertTrue("Keyspace migrating from tracked should still use mutation tracking repair", shouldUseAfter); + + Boolean requiresAfter = CLUSTER.get(1).callOnInstance(() -> MutationTrackingIncrementalRepairTask.requiresTraditionalRepair(ksName)); + assertTrue("Keyspace migrating from tracked should require traditional repair", requiresAfter); + } +} diff --git 
a/test/distributed/org/apache/cassandra/distributed/test/replication/MutationTrackingSyncCoordinatorTest.java b/test/distributed/org/apache/cassandra/distributed/test/replication/MutationTrackingSyncCoordinatorTest.java new file mode 100644 index 000000000000..7c29ccc121d2 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/replication/MutationTrackingSyncCoordinatorTest.java @@ -0,0 +1,196 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.distributed.test.replication; + +import java.util.concurrent.TimeUnit; + +import org.junit.Test; + +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.distributed.test.TestBaseImpl; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.replication.MutationTrackingService; +import org.apache.cassandra.replication.MutationTrackingSyncCoordinator; + +import static org.junit.Assert.*; + +/** + * Distributed tests for MutationTrackingSyncCoordinator. 
+ * + * Tests that the sync coordinator correctly waits for offset convergence + * across all nodes in a cluster. + */ +public class MutationTrackingSyncCoordinatorTest extends TestBaseImpl +{ + private static final String KS_NAME = "sync_test_ks"; + private static final String TBL_NAME = "sync_test_tbl"; + + @Test + public void testSyncCoordinatorCompletesWhenNoShards() throws Throwable + { + try (Cluster cluster = builder().withNodes(3).start()) + { + // Create a tracked keyspace + cluster.schemaChange("CREATE KEYSPACE " + KS_NAME + " WITH replication = " + + "{'class': 'SimpleStrategy', 'replication_factor': 3} " + + "AND replication_type='tracked'"); + cluster.schemaChange("CREATE TABLE " + KS_NAME + '.' + TBL_NAME + " (k int PRIMARY KEY, v int)"); + + // Create a sync coordinator for a range that has no data + // It should complete immediately since there are no offsets to sync + Boolean completed = cluster.get(1).callOnInstance(() -> { + Range fullRange = new Range<>( + new Murmur3Partitioner.LongToken(Long.MIN_VALUE), + new Murmur3Partitioner.LongToken(Long.MAX_VALUE) + ); + + MutationTrackingSyncCoordinator coordinator = new MutationTrackingSyncCoordinator(KS_NAME, fullRange); + coordinator.start(); + + try + { + return coordinator.awaitCompletion(5, TimeUnit.SECONDS); + } + catch (InterruptedException e) + { + Thread.currentThread().interrupt(); + return false; + } + }); + + assertTrue("Sync coordinator should complete when there are no pending offsets", completed); + } + } + + @Test + public void testSyncCoordinatorCompletesAfterDataSync() throws Throwable + { + try (Cluster cluster = builder().withNodes(6).start()) + { + // Create a tracked keyspace + cluster.schemaChange("CREATE KEYSPACE " + KS_NAME + "2 WITH replication = " + + "{'class': 'SimpleStrategy', 'replication_factor': 3} " + + "AND replication_type='tracked'"); + cluster.schemaChange("CREATE TABLE " + KS_NAME + "2.tbl (k int PRIMARY KEY, v int)"); + + // Insert some data to create mutations + 
for (int i = 0; i < 10000; i++) + { + cluster.coordinator(1).execute( + "INSERT INTO " + KS_NAME + "2.tbl (k, v) VALUES (?, ?)", + ConsistencyLevel.ALL, i, i + ); + } + + Thread.sleep(500); // Wait for offset broadcasts to propagate + + // Create a sync coordinator - should complete since all data is synced (CL.ALL) + Boolean completed = cluster.get(1).callOnInstance(() -> { + Range fullRange = new Range<>( + new Murmur3Partitioner.LongToken(Long.MIN_VALUE), + new Murmur3Partitioner.LongToken(Long.MAX_VALUE) + ); + + MutationTrackingSyncCoordinator coordinator = new MutationTrackingSyncCoordinator(KS_NAME + '2', fullRange); + coordinator.start(); + + try + { + // Give it enough time for broadcasts to arrive + return coordinator.awaitCompletion(15, TimeUnit.SECONDS); + } + catch (InterruptedException e) + { + Thread.currentThread().interrupt(); + return false; + } + }); + + assertTrue("Sync coordinator should complete after data is fully replicated", completed); + } + } + + @Test + public void testSyncCoordinatorCancel() throws Throwable + { + try (Cluster cluster = builder().withNodes(3).start()) + { + // Create a tracked keyspace with data so there are shards to sync + cluster.schemaChange("CREATE KEYSPACE cancel_test_ks WITH replication = " + + "{'class': 'SimpleStrategy', 'replication_factor': 3} " + + "AND replication_type='tracked'"); + cluster.schemaChange("CREATE TABLE cancel_test_ks.tbl (k int PRIMARY KEY, v int)"); + + // Pause offset broadcasts on all nodes to prevent sync from completing + for (int i = 1; i <= 3; i++) + { + cluster.get(i).runOnInstance(() -> MutationTrackingService.instance.pauseOffsetBroadcast(true)); + } + + for (int i = 0; i < 100; i++) + { + cluster.coordinator(1).execute( + "INSERT INTO cancel_test_ks.tbl (k, v) VALUES (?, ?)", + ConsistencyLevel.ONE, i, i); + } + + // Start coordinator - it will be stuck waiting for offsets + Boolean wasCancelled = cluster.get(1).callOnInstance(() -> { + Range fullRange = new Range<>( + new 
Murmur3Partitioner.LongToken(Long.MIN_VALUE), + new Murmur3Partitioner.LongToken(Long.MAX_VALUE) + ); + + MutationTrackingSyncCoordinator coordinator = new MutationTrackingSyncCoordinator("cancel_test_ks", fullRange); + coordinator.start(); + + try + { + Thread.sleep(100); + } + catch (InterruptedException e) + { + Thread.currentThread().interrupt(); + return false; + } + + coordinator.cancel(); // Cancel it + + // Verify it was cancelled + try + { + coordinator.awaitCompletion(1, TimeUnit.SECONDS); + return false; // Should have thrown + } + catch (InterruptedException e) + { + Thread.currentThread().interrupt(); + return false; + } + catch (RuntimeException e) + { + return e.getMessage() != null && e.getMessage().contains("cancelled"); + } + }); + assertTrue("Sync coordinator should be cancelled", wasCancelled); + } + } +} From 9e750e40ce67da6f31e00ad6287df54b89e2c734 Mon Sep 17 00:00:00 2001 From: Aparna Naik Date: Wed, 21 Jan 2026 15:27:25 -0800 Subject: [PATCH 02/46] end--no-edit --- .../cassandra/replication/CoordinatorLog.java | 18 ++++++++++++++ .../replication/MutationTrackingService.java | 24 +++++++++---------- .../MutationTrackingSyncCoordinator.java | 13 ++-------- .../replication/Node2OffsetsMap.java | 13 ++++++++++ .../apache/cassandra/replication/Shard.java | 16 +++++++++++++ 5 files changed, 61 insertions(+), 23 deletions(-) diff --git a/src/java/org/apache/cassandra/replication/CoordinatorLog.java b/src/java/org/apache/cassandra/replication/CoordinatorLog.java index 7db405929601..68ad16e42a20 100644 --- a/src/java/org/apache/cassandra/replication/CoordinatorLog.java +++ b/src/java/org/apache/cassandra/replication/CoordinatorLog.java @@ -284,6 +284,24 @@ Offsets.Immutable collectReconciledOffsets() } } + /** + * Returns the UNION of all witnessed offsets from all participants. + * This represents all offsets that ANY replica has witnessed. 
+ */ + Offsets.Immutable collectUnionOfWitnessedOffsets() + { + lock.readLock().lock(); + try + { + Offsets.Mutable union = witnessedOffsets.union(); + return union.isEmpty() ? null : Offsets.Immutable.copy(union); + } + finally + { + lock.readLock().unlock(); + } + } + public long getUnreconciledCount() { lock.readLock().lock(); diff --git a/src/java/org/apache/cassandra/replication/MutationTrackingService.java b/src/java/org/apache/cassandra/replication/MutationTrackingService.java index b77f58f8445b..68cbe1c4cf9c 100644 --- a/src/java/org/apache/cassandra/replication/MutationTrackingService.java +++ b/src/java/org/apache/cassandra/replication/MutationTrackingService.java @@ -397,24 +397,24 @@ public void updateReplicatedOffsets(String keyspace, Range range, List coordinators = syncCoordinatorsByKeyspace.get(keyspace); - if (coordinators != null) + // Notify any registered sync coordinators about the offset update + Set coordinators = syncCoordinatorsByKeyspace.get(keyspace); + if (coordinators != null) + { + for (MutationTrackingSyncCoordinator coordinator : coordinators) { - for (MutationTrackingSyncCoordinator coordinator : coordinators) + if (range.intersects(coordinator.getRange())) { - if (range.intersects(coordinator.getRange())) - { - coordinator.onOffsetsReceived(); - } + coordinator.onOffsetsReceived(); } } } - finally - { - shardLock.readLock().unlock(); - } } public void recordFullyReconciledOffsets(ReconciledLogSnapshot reconciledSnapshot) diff --git a/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java b/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java index 09a33c59a8f4..4210d4638306 100644 --- a/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java +++ b/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java @@ -31,7 +31,6 @@ import org.apache.cassandra.dht.Range; import org.apache.cassandra.dht.Token; import 
org.apache.cassandra.utils.concurrent.AsyncPromise; -import org.apache.cassandra.utils.concurrent.Future; public class MutationTrackingSyncCoordinator { @@ -134,11 +133,6 @@ public Range getRange() return range; } - public Future awaitCompletion() - { - return completionFuture; - } - /** * Blocks until sync completes or timeout is reached. * @@ -189,11 +183,8 @@ private static class ShardSyncState void captureTargets() { - BroadcastLogOffsets current = shard.collectReplicatedOffsets(false); - for (Offsets.Immutable logOffsets : current.getOffsets()) - { - targets.put(logOffsets.logId(), logOffsets); - } + Map unionOffsets = shard.collectUnionOfWitnessedOffsetsPerLog(); + targets.putAll(unionOffsets); } boolean isComplete() diff --git a/src/java/org/apache/cassandra/replication/Node2OffsetsMap.java b/src/java/org/apache/cassandra/replication/Node2OffsetsMap.java index ac6fcc0dafae..8d943feb1050 100644 --- a/src/java/org/apache/cassandra/replication/Node2OffsetsMap.java +++ b/src/java/org/apache/cassandra/replication/Node2OffsetsMap.java @@ -73,6 +73,19 @@ Offsets.Mutable intersection() return intersection; } + Offsets.Mutable union() + { + Iterator iter = offsetsMap.values().iterator(); + if (offsetsMap.size() == 1) + return Offsets.Mutable.copy(iter.next()); + + Offsets.Mutable union = Offsets.Mutable.copy(iter.next()); + while (iter.hasNext()) + union.addAll(iter.next()); + + return union; + } + public void add(int node, Offsets offsets) { Offsets.Mutable current = offsetsMap.get(node); diff --git a/src/java/org/apache/cassandra/replication/Shard.java b/src/java/org/apache/cassandra/replication/Shard.java index 1893f0ab798f..f319c27524b5 100644 --- a/src/java/org/apache/cassandra/replication/Shard.java +++ b/src/java/org/apache/cassandra/replication/Shard.java @@ -425,6 +425,22 @@ public Map collectReconciledOffsetsPerLog() return result; } + /** + * Returns the UNION of witnessed offsets from all participants for each coordinator log. 
+ * Union = all offsets that ANY replica has witnessed. + */ + public Map collectUnionOfWitnessedOffsetsPerLog() + { + Map result = new HashMap<>(); + for (CoordinatorLog log : logs.values()) + { + Offsets.Immutable union = log.collectUnionOfWitnessedOffsets(); + if (union != null && !union.isEmpty()) + result.put(log.logId, union); + } + return result; + } + @Override public String toString() { From 5f51ed1c5783e0357c1f320d36858079547ab506 Mon Sep 17 00:00:00 2001 From: Aparna Naik Date: Wed, 21 Jan 2026 16:03:18 -0800 Subject: [PATCH 03/46] Create new test to validate inc repair on ALL replicas --- .../MutationTrackingSyncCoordinatorTest.java | 122 ++++++++++++------ 1 file changed, 82 insertions(+), 40 deletions(-) diff --git a/test/distributed/org/apache/cassandra/distributed/test/replication/MutationTrackingSyncCoordinatorTest.java b/test/distributed/org/apache/cassandra/distributed/test/replication/MutationTrackingSyncCoordinatorTest.java index 7c29ccc121d2..4ca689666f26 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/replication/MutationTrackingSyncCoordinatorTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/replication/MutationTrackingSyncCoordinatorTest.java @@ -43,26 +43,45 @@ public class MutationTrackingSyncCoordinatorTest extends TestBaseImpl private static final String KS_NAME = "sync_test_ks"; private static final String TBL_NAME = "sync_test_tbl"; + private void createTrackedKeyspace(Cluster cluster, String keyspaceSuffix) + { + String ksName = KS_NAME + keyspaceSuffix; + cluster.schemaChange("CREATE KEYSPACE " + ksName + " WITH replication = " + + "{'class': 'SimpleStrategy', 'replication_factor': 3} " + + "AND replication_type='tracked'"); + cluster.schemaChange("CREATE TABLE " + ksName + '.' + TBL_NAME + " (k int PRIMARY KEY, v int)"); + } + + private String tableName(String suffix) + { + return KS_NAME + suffix + '.' 
+ TBL_NAME; + } + + private void pauseOffsetBroadcasts(Cluster cluster, boolean pause) + { + for (int i = 1; i <= cluster.size(); i++) + cluster.get(i).runOnInstance(() -> MutationTrackingService.instance.pauseOffsetBroadcast(pause)); + } + + private static Range fullTokenRange() + { + return new Range<>( + new Murmur3Partitioner.LongToken(Long.MIN_VALUE), + new Murmur3Partitioner.LongToken(Long.MAX_VALUE) + ); + } + @Test public void testSyncCoordinatorCompletesWhenNoShards() throws Throwable { try (Cluster cluster = builder().withNodes(3).start()) { - // Create a tracked keyspace - cluster.schemaChange("CREATE KEYSPACE " + KS_NAME + " WITH replication = " + - "{'class': 'SimpleStrategy', 'replication_factor': 3} " + - "AND replication_type='tracked'"); - cluster.schemaChange("CREATE TABLE " + KS_NAME + '.' + TBL_NAME + " (k int PRIMARY KEY, v int)"); + createTrackedKeyspace(cluster, ""); // Create a sync coordinator for a range that has no data // It should complete immediately since there are no offsets to sync Boolean completed = cluster.get(1).callOnInstance(() -> { - Range fullRange = new Range<>( - new Murmur3Partitioner.LongToken(Long.MIN_VALUE), - new Murmur3Partitioner.LongToken(Long.MAX_VALUE) - ); - - MutationTrackingSyncCoordinator coordinator = new MutationTrackingSyncCoordinator(KS_NAME, fullRange); + MutationTrackingSyncCoordinator coordinator = new MutationTrackingSyncCoordinator(KS_NAME, fullTokenRange()); coordinator.start(); try @@ -85,17 +104,13 @@ public void testSyncCoordinatorCompletesAfterDataSync() throws Throwable { try (Cluster cluster = builder().withNodes(6).start()) { - // Create a tracked keyspace - cluster.schemaChange("CREATE KEYSPACE " + KS_NAME + "2 WITH replication = " + - "{'class': 'SimpleStrategy', 'replication_factor': 3} " + - "AND replication_type='tracked'"); - cluster.schemaChange("CREATE TABLE " + KS_NAME + "2.tbl (k int PRIMARY KEY, v int)"); + createTrackedKeyspace(cluster, "2"); // Insert some data to create 
mutations for (int i = 0; i < 10000; i++) { cluster.coordinator(1).execute( - "INSERT INTO " + KS_NAME + "2.tbl (k, v) VALUES (?, ?)", + "INSERT INTO " + tableName("2") + " (k, v) VALUES (?, ?)", ConsistencyLevel.ALL, i, i ); } @@ -104,12 +119,7 @@ public void testSyncCoordinatorCompletesAfterDataSync() throws Throwable // Create a sync coordinator - should complete since all data is synced (CL.ALL) Boolean completed = cluster.get(1).callOnInstance(() -> { - Range fullRange = new Range<>( - new Murmur3Partitioner.LongToken(Long.MIN_VALUE), - new Murmur3Partitioner.LongToken(Long.MAX_VALUE) - ); - - MutationTrackingSyncCoordinator coordinator = new MutationTrackingSyncCoordinator(KS_NAME + '2', fullRange); + MutationTrackingSyncCoordinator coordinator = new MutationTrackingSyncCoordinator(KS_NAME + '2', fullTokenRange()); coordinator.start(); try @@ -128,38 +138,70 @@ public void testSyncCoordinatorCompletesAfterDataSync() throws Throwable } } + @Test + public void testSyncCoordinatorWaitsForAllReplicasMutations() throws Throwable + { + try (Cluster cluster = builder().withNodes(6).start()) + { + createTrackedKeyspace(cluster, "3"); + + // Pause broadcasts so nodes don't share offsets yet + pauseOffsetBroadcasts(cluster, true); + + // Write from different nodes with CL.ONE - each node has different mutations + // Different coordinators create mutations that only their local replica group knows about initially + cluster.coordinator(1).execute("INSERT INTO " + tableName("3") + " (k, v) VALUES (1, 1)", ConsistencyLevel.ONE); + cluster.coordinator(2).execute("INSERT INTO " + tableName("3") + " (k, v) VALUES (2, 2)", ConsistencyLevel.ONE); + cluster.coordinator(3).execute("INSERT INTO " + tableName("3") + " (k, v) VALUES (3, 3)", ConsistencyLevel.ONE); + + // Resume broadcasts so nodes can share their offsets + pauseOffsetBroadcasts(cluster, false); + + // Trigger broadcasts to share offsets between nodes + for (int i = 1; i <= 6; i++) + cluster.get(i).runOnInstance(() 
-> MutationTrackingService.instance.broadcastOffsetsForTesting()); + + Thread.sleep(500); // Wait for broadcasts to propagate + + Boolean completed = cluster.get(4).callOnInstance(() -> { + MutationTrackingSyncCoordinator coordinator = new MutationTrackingSyncCoordinator(KS_NAME + "3", fullTokenRange()); + coordinator.start(); + + try + { + return coordinator.awaitCompletion(30, TimeUnit.SECONDS); + } + catch (InterruptedException e) + { + Thread.currentThread().interrupt(); + return false; + } + }); + + assertTrue("Sync should complete after all mutations from all nodes are reconciled", completed); + } + } + @Test public void testSyncCoordinatorCancel() throws Throwable { try (Cluster cluster = builder().withNodes(3).start()) { - // Create a tracked keyspace with data so there are shards to sync - cluster.schemaChange("CREATE KEYSPACE cancel_test_ks WITH replication = " + - "{'class': 'SimpleStrategy', 'replication_factor': 3} " + - "AND replication_type='tracked'"); - cluster.schemaChange("CREATE TABLE cancel_test_ks.tbl (k int PRIMARY KEY, v int)"); + createTrackedKeyspace(cluster, "4"); // Pause offset broadcasts on all nodes to prevent sync from completing - for (int i = 1; i <= 3; i++) - { - cluster.get(i).runOnInstance(() -> MutationTrackingService.instance.pauseOffsetBroadcast(true)); - } + pauseOffsetBroadcasts(cluster, true); for (int i = 0; i < 100; i++) { cluster.coordinator(1).execute( - "INSERT INTO cancel_test_ks.tbl (k, v) VALUES (?, ?)", + "INSERT INTO " + tableName("4") + " (k, v) VALUES (?, ?)", ConsistencyLevel.ONE, i, i); } // Start coordinator - it will be stuck waiting for offsets Boolean wasCancelled = cluster.get(1).callOnInstance(() -> { - Range fullRange = new Range<>( - new Murmur3Partitioner.LongToken(Long.MIN_VALUE), - new Murmur3Partitioner.LongToken(Long.MAX_VALUE) - ); - - MutationTrackingSyncCoordinator coordinator = new MutationTrackingSyncCoordinator("cancel_test_ks", fullRange); + MutationTrackingSyncCoordinator coordinator = 
new MutationTrackingSyncCoordinator(KS_NAME + "4", fullTokenRange()); coordinator.start(); try @@ -187,7 +229,7 @@ public void testSyncCoordinatorCancel() throws Throwable } catch (RuntimeException e) { - return e.getMessage() != null && e.getMessage().contains("cancelled"); + return e.getMessage() != null && e.getMessage().contains("cancelled"); } }); assertTrue("Sync coordinator should be cancelled", wasCancelled); From 4aced3065feea3f10e4b1835642de9b6b4690290 Mon Sep 17 00:00:00 2001 From: Aparna Naik Date: Mon, 26 Jan 2026 10:56:25 -0800 Subject: [PATCH 04/46] SyncCoordinatorTest file fix --- .../replication/MutationTrackingService.java | 2 +- .../MutationTrackingSyncCoordinator.java | 110 +++++++++++++-- .../MutationTrackingSyncCoordinatorTest.java | 125 ++++++++++-------- 3 files changed, 166 insertions(+), 71 deletions(-) diff --git a/src/java/org/apache/cassandra/replication/MutationTrackingService.java b/src/java/org/apache/cassandra/replication/MutationTrackingService.java index 68cbe1c4cf9c..e25650fb16b8 100644 --- a/src/java/org/apache/cassandra/replication/MutationTrackingService.java +++ b/src/java/org/apache/cassandra/replication/MutationTrackingService.java @@ -411,7 +411,7 @@ public void updateReplicatedOffsets(String keyspace, Range range, List range; private final AsyncPromise completionFuture = new AsyncPromise<>(); + private volatile long startTimeMs; // Per-shard state: tracks what each node has reported for that shard private final Map, ShardSyncState> shardStates = new ConcurrentHashMap<>(); @@ -46,6 +56,9 @@ public class MutationTrackingSyncCoordinator private final AtomicBoolean started = new AtomicBoolean(false); private final AtomicBoolean completed = new AtomicBoolean(false); + private final Set allParticipants = new HashSet<>(); + private final Set reportedParticipants = ConcurrentHashMap.newKeySet(); + public MutationTrackingSyncCoordinator(String keyspace, Range range) { this.keyspace = keyspace; @@ -57,6 +70,8 @@ public void start() 
if (!started.compareAndSet(false, true)) throw new IllegalStateException("Sync coordinator already started"); + startTimeMs = System.currentTimeMillis(); + List overlappingShards; overlappingShards = new ArrayList<>(); @@ -71,25 +86,35 @@ public void start() return; } + InetAddressAndPort localAddress = FBUtilities.getBroadcastAddressAndPort(); + for (Shard shard : overlappingShards) + { + allParticipants.addAll(shard.remoteReplicas()); + allParticipants.add(localAddress); + } + // Register to receive offset updates MutationTrackingService.instance.registerSyncCoordinator(this); - // Initialize state for each shard and capture targets + // Initialize state for each shard for (Shard shard : overlappingShards) { ShardSyncState state = new ShardSyncState(shard); - state.captureTargets(); shardStates.put(shard.range, state); } - if (checkIfComplete()) - { - complete(); - return; - } + // Mark self as reported and capture local targets + reportedParticipants.add(localAddress); + recaptureTargets(); + + logger.info("Sync coordinator started for keyspace {} range {}, tracking {} shards, waiting for {} participants", + keyspace, range, overlappingShards.size(), allParticipants.size()); - logger.info("Sync coordinator started for keyspace {} range {}, tracking {} shards", - keyspace, range, overlappingShards.size()); + // Check if we're the only participant and already complete + checkIfReadyToComplete(); + + // Schedule a delayed check for the empty targets timeout case + scheduler.schedule(this::checkIfReadyToComplete, EMPTY_TARGETS_TIMEOUT_MS + 100, TimeUnit.MILLISECONDS); } private void complete() @@ -110,19 +135,80 @@ private boolean checkIfComplete() return true; } - public void onOffsetsReceived() + private void recaptureTargets() + { + for (ShardSyncState state : shardStates.values()) + { + state.captureTargets(); + } + } + + /** + * Check if we're ready to complete. We can complete when: + * 1. 
All participants have reported their offsets AND all targets are reconciled, OR + * 2. No targets have been discovered after the timeout (no data to sync anywhere) + */ + private void checkIfReadyToComplete() { if (completed.get()) return; - // The underlying CoordinatorLog already updates its reconciled offsets. - // We just need to re-check if we're now complete. + if (hasNoTargets() && (System.currentTimeMillis() - startTimeMs) > EMPTY_TARGETS_TIMEOUT_MS) + { + logger.info("Sync coordinator completed for keyspace {} range {} - no targets discovered after {}ms", + keyspace, range, EMPTY_TARGETS_TIMEOUT_MS); + complete(); + return; + } + + // Wait until all participants have reported + if (!reportedParticipants.containsAll(allParticipants)) + { + logger.trace("Sync coordinator waiting for participants. Reported: {}, All: {}", + reportedParticipants.size(), allParticipants.size()); + return; + } + + // All participants have reported, check if targets are reconciled if (checkIfComplete()) { + logger.info("Sync coordinator completed for keyspace {} range {}", keyspace, range); complete(); } } + private boolean hasNoTargets() + { + for (ShardSyncState state : shardStates.values()) + { + if (!state.targets.isEmpty()) + return false; + } + return true; + } + + /** + * Called when offset updates are received from a participant. + * @param from The participant that sent the offsets + */ + public void onOffsetsReceived(InetAddressAndPort from) + { + if (completed.get()) + return; + + boolean newParticipant = reportedParticipants.add(from); + + if (newParticipant) + { + logger.trace("Sync coordinator received offsets from new participant {}. 
Reported: {}/{}", + from, reportedParticipants.size(), allParticipants.size()); + } + + recaptureTargets(); // Recapture targets to include any new coordinator logs + + checkIfReadyToComplete(); + } + public String getKeyspace() { return keyspace; diff --git a/test/distributed/org/apache/cassandra/distributed/test/replication/MutationTrackingSyncCoordinatorTest.java b/test/distributed/org/apache/cassandra/distributed/test/replication/MutationTrackingSyncCoordinatorTest.java index 4ca689666f26..b2a5cbfe5ae0 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/replication/MutationTrackingSyncCoordinatorTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/replication/MutationTrackingSyncCoordinatorTest.java @@ -17,6 +17,8 @@ */ package org.apache.cassandra.distributed.test.replication; +import java.time.Duration; +import java.util.concurrent.CompletableFuture; import java.util.concurrent.TimeUnit; import org.junit.Test; @@ -29,6 +31,7 @@ import org.apache.cassandra.dht.Token; import org.apache.cassandra.replication.MutationTrackingService; import org.apache.cassandra.replication.MutationTrackingSyncCoordinator; +import org.awaitility.Awaitility; import static org.junit.Assert.*; @@ -100,85 +103,91 @@ public void testSyncCoordinatorCompletesWhenNoShards() throws Throwable } @Test - public void testSyncCoordinatorCompletesAfterDataSync() throws Throwable + public void testSyncCoordinatorWaitsForAllReplicasMutations() throws Throwable { - try (Cluster cluster = builder().withNodes(6).start()) + try (Cluster cluster = builder().withNodes(3).start()) { - createTrackedKeyspace(cluster, "2"); + createTrackedKeyspace(cluster, "3"); - // Insert some data to create mutations - for (int i = 0; i < 10000; i++) - { - cluster.coordinator(1).execute( - "INSERT INTO " + tableName("2") + " (k, v) VALUES (?, ?)", - ConsistencyLevel.ALL, i, i - ); - } + // Block all messages FROM node 1 to prevent write replication + // This ensures that write only 
succeeds locally on node 1 + cluster.filters().allVerbs().from(1).drop(); - Thread.sleep(500); // Wait for offset broadcasts to propagate + cluster.coordinator(1).execute( + "INSERT INTO " + tableName("3") + " (k, v) VALUES (1, 1)", + ConsistencyLevel.ONE + ); - // Create a sync coordinator - should complete since all data is synced (CL.ALL) - Boolean completed = cluster.get(1).callOnInstance(() -> { - MutationTrackingSyncCoordinator coordinator = new MutationTrackingSyncCoordinator(KS_NAME + '2', fullTokenRange()); + // Start MutationTrackingSyncCoordinator on node 2 in a separate thread + // It should wait for offsets to sync since node 1's data hasn't propagated yet + CompletableFuture coordinatorFuture = CompletableFuture.supplyAsync(() -> cluster.get(2).callOnInstance(() -> { + MutationTrackingSyncCoordinator coordinator = new MutationTrackingSyncCoordinator(KS_NAME + '3', fullTokenRange()); coordinator.start(); try { - // Give it enough time for broadcasts to arrive - return coordinator.awaitCompletion(15, TimeUnit.SECONDS); + return coordinator.awaitCompletion(10, TimeUnit.SECONDS); } catch (InterruptedException e) { Thread.currentThread().interrupt(); return false; } - }); - - assertTrue("Sync coordinator should complete after data is fully replicated", completed); - } - } - - @Test - public void testSyncCoordinatorWaitsForAllReplicasMutations() throws Throwable - { - try (Cluster cluster = builder().withNodes(6).start()) - { - createTrackedKeyspace(cluster, "3"); - - // Pause broadcasts so nodes don't share offsets yet - pauseOffsetBroadcasts(cluster, true); + })); + + // Wait until node 1 has the data + Awaitility.await() + .atMost(Duration.ofSeconds(5)) + .pollInterval(Duration.ofMillis(100)) + .untilAsserted(() -> { + Object[][] results = cluster.get(1).executeInternal( + "SELECT k, v FROM " + tableName("3") + " WHERE k = 1"); + assertEquals("Node 1 should have the data", 1, results.length); + }); + + // Verify other nodes shouldn't have the data yet 
since we have blocked messages + for (int i = 2; i <= 3; i++) + { + Object[][] results = cluster.get(i).executeInternal( + "SELECT k, v FROM " + tableName("3") + " WHERE k = 1" + ); + assertEquals("Node " + i + " should not have data yet", 0, results.length); + } - // Write from different nodes with CL.ONE - each node has different mutations - // Different coordinators create mutations that only their local replica group knows about initially - cluster.coordinator(1).execute("INSERT INTO " + tableName("3") + " (k, v) VALUES (1, 1)", ConsistencyLevel.ONE); - cluster.coordinator(2).execute("INSERT INTO " + tableName("3") + " (k, v) VALUES (2, 2)", ConsistencyLevel.ONE); - cluster.coordinator(3).execute("INSERT INTO " + tableName("3") + " (k, v) VALUES (3, 3)", ConsistencyLevel.ONE); + // Verify coordinator stays blocked for at least 2 seconds + Awaitility.await() + .during(Duration.ofSeconds(2)) + .atMost(Duration.ofSeconds(3)) + .until(() -> !coordinatorFuture.isDone()); - // Resume broadcasts so nodes can share their offsets - pauseOffsetBroadcasts(cluster, false); + cluster.filters().reset(); - // Trigger broadcasts to share offsets between nodes - for (int i = 1; i <= 6; i++) + for (int i = 1; i <= 3; i++) cluster.get(i).runOnInstance(() -> MutationTrackingService.instance.broadcastOffsetsForTesting()); - Thread.sleep(500); // Wait for broadcasts to propagate + // Wait for coordinator to complete + Awaitility.await() + .atMost(Duration.ofSeconds(30)) + .pollInterval(Duration.ofMillis(200)) + .until(coordinatorFuture::isDone); - Boolean completed = cluster.get(4).callOnInstance(() -> { - MutationTrackingSyncCoordinator coordinator = new MutationTrackingSyncCoordinator(KS_NAME + "3", fullTokenRange()); - coordinator.start(); - - try - { - return coordinator.awaitCompletion(30, TimeUnit.SECONDS); - } - catch (InterruptedException e) - { - Thread.currentThread().interrupt(); - return false; - } - }); + assertTrue("Coordinator should complete successfully", 
coordinatorFuture.get()); - assertTrue("Sync should complete after all mutations from all nodes are reconciled", completed); + // Verify data propagated to all replicas + for (int i = 1; i <= 3; i++) + { + final int nodeId = i; + Awaitility.await() + .atMost(Duration.ofSeconds(10)) + .pollInterval(Duration.ofMillis(100)) + .untilAsserted(() -> { + Object[][] results = cluster.get(nodeId).executeInternal( + "SELECT k, v FROM " + tableName("3") + " WHERE k = 1"); + assertEquals("Node " + nodeId + " should have the data", 1, results.length); + assertEquals(1, results[0][0]); + assertEquals(1, results[0][1]); + }); + } } } @@ -201,7 +210,7 @@ public void testSyncCoordinatorCancel() throws Throwable // Start coordinator - it will be stuck waiting for offsets Boolean wasCancelled = cluster.get(1).callOnInstance(() -> { - MutationTrackingSyncCoordinator coordinator = new MutationTrackingSyncCoordinator(KS_NAME + "4", fullTokenRange()); + MutationTrackingSyncCoordinator coordinator = new MutationTrackingSyncCoordinator(KS_NAME + '4', fullTokenRange()); coordinator.start(); try From 42b465e0faceab91644ff881da799b7ed205d4d2 Mon Sep 17 00:00:00 2001 From: Aparna Naik Date: Mon, 26 Jan 2026 14:41:37 -0800 Subject: [PATCH 05/46] Change shardStates from CHM -> HM --- .../replication/MutationTrackingSyncCoordinator.java | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java b/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java index 1ab1d459ff92..7d6f59858e94 100644 --- a/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java +++ b/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java @@ -21,6 +21,7 @@ import java.util.ArrayList; import java.util.HashSet; import java.util.List; +import java.util.HashMap; import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; @@ -51,7 +52,7 
@@ public class MutationTrackingSyncCoordinator private volatile long startTimeMs; // Per-shard state: tracks what each node has reported for that shard - private final Map, ShardSyncState> shardStates = new ConcurrentHashMap<>(); + private final Map, ShardSyncState> shardStates = new HashMap<>(); private final AtomicBoolean started = new AtomicBoolean(false); private final AtomicBoolean completed = new AtomicBoolean(false); @@ -93,9 +94,6 @@ public void start() allParticipants.add(localAddress); } - // Register to receive offset updates - MutationTrackingService.instance.registerSyncCoordinator(this); - // Initialize state for each shard for (Shard shard : overlappingShards) { @@ -103,6 +101,9 @@ public void start() shardStates.put(shard.range, state); } + // Register to receive offset updates + MutationTrackingService.instance.registerSyncCoordinator(this); + // Mark self as reported and capture local targets reportedParticipants.add(localAddress); recaptureTargets(); From 30cf06fd7d4eb00563e25acfd178e8301ec590ee Mon Sep 17 00:00:00 2001 From: Aparna Naik Date: Mon, 26 Jan 2026 15:06:08 -0800 Subject: [PATCH 06/46] Fix possible shard staleness --- .../MutationTrackingSyncCoordinator.java | 48 +++++++++++++++++-- 1 file changed, 43 insertions(+), 5 deletions(-) diff --git a/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java b/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java index 7d6f59858e94..818718dcb3ed 100644 --- a/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java +++ b/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java @@ -92,11 +92,7 @@ public void start() { allParticipants.addAll(shard.remoteReplicas()); allParticipants.add(localAddress); - } - // Initialize state for each shard - for (Shard shard : overlappingShards) - { ShardSyncState state = new ShardSyncState(shard); shardStates.put(shard.range, state); } @@ -138,12 +134,54 @@ private 
boolean checkIfComplete() private void recaptureTargets() { + if (checkForTopologyChange()) + return; + for (ShardSyncState state : shardStates.values()) { state.captureTargets(); } } + /** + * Checks if any of the shards we're tracking have changed due to topology updates. + * @return true if topology changed (and repair was failed), false if all shards are still current + */ + private boolean checkForTopologyChange() + { + for (ShardSyncState state : shardStates.values()) + { + Shard currentShard = getCurrentShard(state.shard.range); + if (currentShard != state.shard) + { + failWithTopologyChange(); + return true; + } + } + return false; + } + + private Shard getCurrentShard(Range shardRange) + { + Shard[] result = new Shard[1]; + MutationTrackingService.instance.forEachShardInKeyspace(keyspace, shard -> { + if (shard.range.equals(shardRange)) + result[0] = shard; + }); + return result[0]; + } + + private void failWithTopologyChange() + { + if (completed.compareAndSet(false, true)) + { + logger.warn("Sync coordinator for keyspace {} range {} failed due to topology change", + keyspace, range); + MutationTrackingService.instance.unregisterSyncCoordinator(this); + completionFuture.setFailure(new RuntimeException("Repair failed: topology changed during sync")); + } + } + /** * Check if we're ready to complete. We can complete when: * 1. 
All participants have reported their offsets AND all targets are reconciled, OR @@ -151,7 +189,7 @@ private void recaptureTargets() */ private void checkIfReadyToComplete() { - if (completed.get()) + if (completed.get() || checkForTopologyChange()) return; if (hasNoTargets() && (System.currentTimeMillis() - startTimeMs) > EMPTY_TARGETS_TIMEOUT_MS) From 92ab440f633d223117c187987135edf020f06030 Mon Sep 17 00:00:00 2001 From: Aparna Naik Date: Wed, 28 Jan 2026 15:29:18 -0800 Subject: [PATCH 07/46] Fix for happens-before --- .../MutationTrackingSyncCoordinator.java | 47 +++++++++++++--- .../MutationTrackingSyncCoordinatorTest.java | 54 +++++++++++++++++++ 2 files changed, 95 insertions(+), 6 deletions(-) diff --git a/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java b/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java index 818718dcb3ed..0dd3935aead5 100644 --- a/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java +++ b/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java @@ -44,6 +44,10 @@ public class MutationTrackingSyncCoordinator private static final Logger logger = LoggerFactory.getLogger(MutationTrackingSyncCoordinator.class); private static final long EMPTY_TARGETS_TIMEOUT_MS = 3000; + // Must be >= TRANSIENT_BROADCAST_INTERVAL_MILLIS (200ms) + network buffer + // to ensure we receive at least one fresh broadcast from each participant + private static final long MIN_BROADCAST_WAIT_MS = 300; + private static final long PARTICIPANT_TIMEOUT_MS = 10000; private static final ScheduledExecutorService scheduler = Executors.newSingleThreadScheduledExecutor(); private final String keyspace; @@ -192,7 +196,10 @@ private void checkIfReadyToComplete() if (completed.get() || checkForTopologyChange()) return; - if (hasNoTargets() && (System.currentTimeMillis() - startTimeMs) > EMPTY_TARGETS_TIMEOUT_MS) + long elapsedMs = System.currentTimeMillis() - startTimeMs; + + 
// Handle the empty targets + if (hasNoTargets() && elapsedMs > EMPTY_TARGETS_TIMEOUT_MS) { logger.info("Sync coordinator completed for keyspace {} range {} - no targets discovered after {}ms", keyspace, range, EMPTY_TARGETS_TIMEOUT_MS); @@ -200,20 +207,48 @@ private void checkIfReadyToComplete() return; } - // Wait until all participants have reported - if (!reportedParticipants.containsAll(allParticipants)) + // Ensuring we have waited long enough for fresh broadcasts from all replicas - happens-before situation + if (elapsedMs < MIN_BROADCAST_WAIT_MS) { - logger.trace("Sync coordinator waiting for participants. Reported: {}, All: {}", - reportedParticipants.size(), allParticipants.size()); + long remainingMs = MIN_BROADCAST_WAIT_MS - elapsedMs + 10; + logger.trace("Sync coordinator waiting for broadcast cycle. Elapsed: {}ms, Required: {}ms", + elapsedMs, MIN_BROADCAST_WAIT_MS); + scheduler.schedule(this::checkIfReadyToComplete, remainingMs, TimeUnit.MILLISECONDS); return; } - // All participants have reported, check if targets are reconciled + // Wait until all participants have reported or timeout + if (!reportedParticipants.containsAll(allParticipants)) + { + if (elapsedMs < PARTICIPANT_TIMEOUT_MS) + { + logger.trace("Sync coordinator waiting for participants. Reported: {}, All: {}", + reportedParticipants.size(), allParticipants.size()); + // Schedule a retry to check again after timeout + long remainingMs = PARTICIPANT_TIMEOUT_MS - elapsedMs + 100; + scheduler.schedule(this::checkIfReadyToComplete, remainingMs, TimeUnit.MILLISECONDS); + return; + } + + Set missing = new HashSet<>(allParticipants); + missing.removeAll(reportedParticipants); + logger.warn("Sync coordinator timed out waiting for participants: {}. 
Proceeding with available offsets.", + missing); + } + + // All participants have reported (or timed out), check if targets are reconciled if (checkIfComplete()) { logger.info("Sync coordinator completed for keyspace {} range {}", keyspace, range); complete(); } + else if (elapsedMs >= PARTICIPANT_TIMEOUT_MS) + { + // Participant timeout reached but targets not reconciled - complete anyway + logger.warn("Sync coordinator completing for keyspace {} range {} after timeout, some targets may not be reconciled", + keyspace, range); + complete(); + } } private boolean hasNoTargets() diff --git a/test/distributed/org/apache/cassandra/distributed/test/replication/MutationTrackingSyncCoordinatorTest.java b/test/distributed/org/apache/cassandra/distributed/test/replication/MutationTrackingSyncCoordinatorTest.java index b2a5cbfe5ae0..5b2d5cd55b8d 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/replication/MutationTrackingSyncCoordinatorTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/replication/MutationTrackingSyncCoordinatorTest.java @@ -120,6 +120,7 @@ public void testSyncCoordinatorWaitsForAllReplicasMutations() throws Throwable // Start MutationTrackingSyncCoordinator on node 2 in a separate thread // It should wait for offsets to sync since node 1's data hasn't propagated yet + long syncStartTime = System.currentTimeMillis(); CompletableFuture coordinatorFuture = CompletableFuture.supplyAsync(() -> cluster.get(2).callOnInstance(() -> { MutationTrackingSyncCoordinator coordinator = new MutationTrackingSyncCoordinator(KS_NAME + '3', fullTokenRange()); coordinator.start(); @@ -188,6 +189,11 @@ public void testSyncCoordinatorWaitsForAllReplicasMutations() throws Throwable assertEquals(1, results[0][1]); }); } + + // Verify the sync respected the minimum broadcast wait time (MIN_BROADCAST_WAIT_MS = 300ms) + long syncDuration = System.currentTimeMillis() - syncStartTime; + assertTrue("Sync should wait at least MIN_BROADCAST_WAIT_MS 
(300ms). Actual: " + syncDuration + "ms", + syncDuration >= 300); } } @@ -244,4 +250,52 @@ public void testSyncCoordinatorCancel() throws Throwable assertTrue("Sync coordinator should be cancelled", wasCancelled); } } + + @Test + public void testSyncCoordinatorTimesOutOnUnresponsiveParticipant() throws Throwable + { + try (Cluster cluster = builder().withNodes(3).start()) + { + createTrackedKeyspace(cluster, "5"); + + cluster.coordinator(1).execute( + "INSERT INTO " + tableName("5") + " (k, v) VALUES (1, 1)", + ConsistencyLevel.ALL + ); + + // Broadcast from all nodes first so they're in sync + for (int i = 1; i <= cluster.size(); i++) + cluster.get(i).runOnInstance(() -> MutationTrackingService.instance.broadcastOffsetsForTesting()); + + // Block all messages FROM node 3 permanently - it will never report + cluster.filters().allVerbs().from(3).drop(); + + long syncStartTime = System.currentTimeMillis(); + + // Start sync coordinator on node 1 - it should time out waiting for node 3 + Boolean completed = cluster.get(1).callOnInstance(() -> { + MutationTrackingSyncCoordinator coordinator = new MutationTrackingSyncCoordinator( + KS_NAME + '5', fullTokenRange()); + coordinator.start(); + + try + { + // Wait longer than PARTICIPANT_TIMEOUT_MS (10s) + buffer + return coordinator.awaitCompletion(20, TimeUnit.SECONDS); + } + catch (InterruptedException e) + { + Thread.currentThread().interrupt(); + return false; + } + }); + + long syncDuration = System.currentTimeMillis() - syncStartTime; + + assertTrue("Sync coordinator should complete after timeout", completed); + // Should have taken at least PARTICIPANT_TIMEOUT_MS (10s) + assertTrue("Sync should have timed out waiting for participant. 
Actual: " + syncDuration + "ms", + syncDuration >= 10000); + } + } } From 6d68ef0cd139a27aed7fcf64c9c17ad72c96dea5 Mon Sep 17 00:00:00 2001 From: Aparna Naik Date: Thu, 29 Jan 2026 11:39:50 -0800 Subject: [PATCH 08/46] Fix MutationTrackingIncrementalRepairTask file --- .../cassandra/config/CassandraRelevantProperties.java | 1 + .../repair/MutationTrackingIncrementalRepairTask.java | 11 +++++++++-- .../MutationTrackingSyncCoordinatorTest.java | 9 +++++++-- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java b/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java index 8c0c08bf59cc..dfd7d0744326 100644 --- a/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java +++ b/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java @@ -447,6 +447,7 @@ public enum CassandraRelevantProperties REPAIR_FAIL_TIMEOUT_SECONDS("cassandra.repair_fail_timeout_seconds", convertToString(Ints.checkedCast(TimeUnit.DAYS.toSeconds(1)))), REPAIR_MUTATION_REPAIR_ROWS_PER_BATCH("cassandra.repair.mutation_repair_rows_per_batch", "100"), REPAIR_STATUS_CHECK_TIMEOUT_SECONDS("cassandra.repair_status_check_timeout_seconds", convertToString(Ints.checkedCast(TimeUnit.HOURS.toSeconds(1)))), + REPAIR_SYNC_TIMEOUT_MINUTES("cassandra.repair_sync_timeout_minutes", "30"), /** * When doing a host replacement its possible that the gossip state is "empty" meaning that the endpoint is known * but the current state isn't known. 
If the host replacement is needed to repair this state, this property must diff --git a/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java b/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java index d70746408adf..54ba5bcfacba 100644 --- a/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java +++ b/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java @@ -23,6 +23,7 @@ import java.util.concurrent.TimeUnit; import org.apache.cassandra.concurrent.ExecutorPlus; +import org.apache.cassandra.config.CassandraRelevantProperties; import org.apache.cassandra.dht.Range; import org.apache.cassandra.dht.Token; import org.apache.cassandra.replication.MutationTrackingSyncCoordinator; @@ -36,7 +37,7 @@ /** Incremental repair task for keyspaces using mutation tracking */ public class MutationTrackingIncrementalRepairTask extends AbstractRepairTask { - private static final long SYNC_TIMEOUT_MINUTES = 30; + private static final long SYNC_TIMEOUT_MINUTES = CassandraRelevantProperties.REPAIR_SYNC_TIMEOUT_MINUTES.getLong(); private final TimeUUID parentSession; private final RepairCoordinator.NeighborsAndRanges neighborsAndRanges; @@ -140,7 +141,13 @@ private void waitForSyncCompletion(List syncCoo else { // Pure mutation tracking - create successful result - resultPromise.trySuccess(CoordinatedRepairResult.create(rangeCollections, List.of())); + List results = new ArrayList<>(); + for (int i = 0; i < rangeCollections.size(); i++) + { + Collection> ranges = rangeCollections.get(i); + results.add(new RepairSessionResult(parentSession, keyspace, ranges, List.of(), false)); + } + resultPromise.trySuccess(CoordinatedRepairResult.create(rangeCollections, results)); } } diff --git a/test/distributed/org/apache/cassandra/distributed/test/replication/MutationTrackingSyncCoordinatorTest.java 
b/test/distributed/org/apache/cassandra/distributed/test/replication/MutationTrackingSyncCoordinatorTest.java index 5b2d5cd55b8d..9ab6cb8113ab 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/replication/MutationTrackingSyncCoordinatorTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/replication/MutationTrackingSyncCoordinatorTest.java @@ -263,11 +263,16 @@ public void testSyncCoordinatorTimesOutOnUnresponsiveParticipant() throws Throwa ConsistencyLevel.ALL ); - // Broadcast from all nodes first so they're in sync + // Pause broadcasts on node 3 BEFORE any broadcasts - this ensures node 3 + // never sends any offset broadcasts to the coordinator, simulating an unresponsive node + cluster.get(3).runOnInstance(() -> MutationTrackingService.instance.pauseOffsetBroadcast(true)); + + // Broadcast from all nodes - node 3's broadcast is a no-op because it's paused for (int i = 1; i <= cluster.size(); i++) cluster.get(i).runOnInstance(() -> MutationTrackingService.instance.broadcastOffsetsForTesting()); - // Block all messages FROM node 3 permanently - it will never report + // Also block messages FROM node 3 to ensure even if periodic broadcasts resume, + // they won't reach the coordinator cluster.filters().allVerbs().from(3).drop(); long syncStartTime = System.currentTimeMillis(); From 3ab3b55de78248a640737f2a0865b6cd65f5c6c5 Mon Sep 17 00:00:00 2001 From: Ariel Weisberg Date: Tue, 3 Mar 2026 15:06:42 -0500 Subject: [PATCH 09/46] Fix IR still doing anti-compaction and TCM consulted multiple times non-atomically --- ...MutationTrackingIncrementalRepairTask.java | 17 +++-- .../cassandra/repair/RepairCoordinator.java | 59 +++++++++++---- .../repair/autorepair/AutoRepairState.java | 4 +- .../repair/messages/RepairOption.java | 19 +++++ .../cassandra/service/StorageService.java | 2 +- ...tionTrackingIncrementalRepairTaskTest.java | 71 ++++++++++++++----- 6 files changed, 132 insertions(+), 40 deletions(-) diff --git 
a/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java b/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java index 54ba5bcfacba..b56c7eddbd1e 100644 --- a/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java +++ b/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java @@ -134,7 +134,7 @@ private void waitForSyncCompletion(List syncCoo coordinator.notifyProgress("Mutation tracking sync completed for all ranges"); - if (requiresTraditionalRepair(keyspace)) + if (coordinator.mutationTrackingMigrationInProgress) { runTraditionalRepairForMigration(executor, validationScheduler, allRanges, resultPromise); } @@ -182,10 +182,12 @@ private void runTraditionalRepairForMigration(ExecutorPlus executor, * Returns true if: * - Keyspace uses mutation tracking replication, OR * - Keyspace is currently migrating (either direction) + * + * @param metadata the snapshotted cluster metadata to evaluate against + * @param keyspace the keyspace name to check */ - public static boolean shouldUseMutationTrackingRepair(String keyspace) + public static boolean shouldUseMutationTrackingRepair(ClusterMetadata metadata, String keyspace) { - ClusterMetadata metadata = ClusterMetadata.current(); KeyspaceMetadata ksm = metadata.schema.maybeGetKeyspaceMetadata(keyspace).orElse(null); if (ksm == null) return false; @@ -195,19 +197,22 @@ public static boolean shouldUseMutationTrackingRepair(String keyspace) return true; // Check if keyspace is in migration (either direction) + // TODO (required): What we do depends on direction right? Migration to MT requires incremental repair, migration away requires MT repair KeyspaceMigrationInfo migrationInfo = metadata.mutationTrackingMigrationState.getKeyspaceInfo(keyspace); return migrationInfo != null; } /** - * Determines if we also need to run traditional repair. + * Determines if a mutation tracking migration is in progress for this keyspace. 
* Returns true during migration: * - Migrating TO mutation tracking: need traditional repair to sync pre-migration data * - Migrating FROM mutation tracking: need traditional repair for post-migration consistency + * + * @param metadata the snapshotted cluster metadata to evaluate against + * @param keyspace the keyspace name to check */ - public static boolean requiresTraditionalRepair(String keyspace) + public static boolean isMutationTrackingMigrationInProgress(ClusterMetadata metadata, String keyspace) { - ClusterMetadata metadata = ClusterMetadata.current(); KeyspaceMigrationInfo migrationInfo = metadata.mutationTrackingMigrationState.getKeyspaceInfo(keyspace); return migrationInfo != null; } diff --git a/src/java/org/apache/cassandra/repair/RepairCoordinator.java b/src/java/org/apache/cassandra/repair/RepairCoordinator.java index 55274dd7b996..e4a3ae5a24a6 100644 --- a/src/java/org/apache/cassandra/repair/RepairCoordinator.java +++ b/src/java/org/apache/cassandra/repair/RepairCoordinator.java @@ -113,20 +113,51 @@ public class RepairCoordinator implements Runnable, ProgressEventNotifier, Repai final SharedContext ctx; final Scheduler validationScheduler; + // Mutation tracking decision, snapshotted once at creation time from TCM + final boolean useMutationTracking; + final boolean mutationTrackingMigrationInProgress; + private TraceState traceState; - public RepairCoordinator(StorageService storageService, int cmd, RepairOption options, String keyspace, Epoch minEpoch) + /** + * Creates a RepairCoordinator, snapshotting TCM state to decide whether mutation tracking + * should be used. If mutation tracking is active for the keyspace (and no migration is in progress), + * the incremental flag in RepairOption is flipped to false to prevent anti-compaction. 
+ */ + public static RepairCoordinator create(StorageService storageService, int cmd, RepairOption options, String keyspace, Epoch minEpoch) { - this(SharedContext.Global.instance, - (ks, tables) -> storageService.getValidColumnFamilies(false, false, ks, tables), - storageService::getLocalReplicas, - cmd, options, keyspace, minEpoch); + ClusterMetadata metadata = ClusterMetadata.current(); + boolean useMT = MutationTrackingIncrementalRepairTask.shouldUseMutationTrackingRepair(metadata, keyspace); + boolean mtMigration = MutationTrackingIncrementalRepairTask.isMutationTrackingMigrationInProgress(metadata, keyspace); + + // If using mutation tracking without migration, flip incremental to false + // to prevent anti-compaction since mutation tracking doesn't use repaired/unrepaired distinction + if (useMT && !mtMigration && options.isIncremental()) + { + logger.info("Keyspace {} uses mutation tracking; disabling incremental repair to skip anti-compaction", keyspace); + options = options.withIncremental(false); + } + + return new RepairCoordinator(SharedContext.Global.instance, + (ks, tables) -> storageService.getValidColumnFamilies(false, false, ks, tables), + storageService::getLocalReplicas, + cmd, options, keyspace, minEpoch, + useMT, mtMigration); } RepairCoordinator(SharedContext ctx, BiFunction> validColumnFamilies, Function getLocalReplicas, int cmd, RepairOption options, String keyspace, Epoch minEpoch) + { + this(ctx, validColumnFamilies, getLocalReplicas, cmd, options, keyspace, minEpoch, false, false); + } + + RepairCoordinator(SharedContext ctx, + BiFunction> validColumnFamilies, + Function getLocalReplicas, + int cmd, RepairOption options, String keyspace, Epoch minEpoch, + boolean useMutationTracking, boolean mutationTrackingMigrationInProgress) { this.ctx = ctx; this.minEpoch = minEpoch; @@ -135,6 +166,8 @@ public RepairCoordinator(StorageService storageService, int cmd, RepairOption op this.tag = "repair:" + cmd; this.validColumnFamilies = 
validColumnFamilies; this.getLocalReplicas = getLocalReplicas; + this.useMutationTracking = useMutationTracking; + this.mutationTrackingMigrationInProgress = mutationTrackingMigrationInProgress; ctx.repair().register(state); } @@ -501,17 +534,15 @@ private Future>> repair(String[] { task = new PreviewRepairTask(this, state.id, neighborsAndRanges.filterCommonRanges(state.keyspace, cfnames), neighborsAndRanges.shouldExcludeDeadParticipants, cfnames); } + else if (useMutationTracking) + { + // Mutation tracking repair: incremental was already flipped to false in the factory method + // (unless migration is in progress, in which case incremental stays true for anti-compaction) + task = new MutationTrackingIncrementalRepairTask(this, state.id, neighborsAndRanges, cfnames); + } else if (state.options.isIncremental()) { - // For keyspaces using mutation tracking, use the mutation tracking repair task - if (MutationTrackingIncrementalRepairTask.shouldUseMutationTrackingRepair(state.keyspace)) - { - task = new MutationTrackingIncrementalRepairTask(this, state.id, neighborsAndRanges, cfnames); - } - else - { - task = new IncrementalRepairTask(this, state.id, neighborsAndRanges, cfnames); - } + task = new IncrementalRepairTask(this, state.id, neighborsAndRanges, cfnames); } else { diff --git a/src/java/org/apache/cassandra/repair/autorepair/AutoRepairState.java b/src/java/org/apache/cassandra/repair/autorepair/AutoRepairState.java index a5c4fdc2c16f..e89a3a40d88e 100644 --- a/src/java/org/apache/cassandra/repair/autorepair/AutoRepairState.java +++ b/src/java/org/apache/cassandra/repair/autorepair/AutoRepairState.java @@ -98,8 +98,8 @@ protected AutoRepairState(RepairType repairType) protected RepairCoordinator getRepairRunnable(String keyspace, RepairOption options) { - return new RepairCoordinator(StorageService.instance, StorageService.nextRepairCommand.incrementAndGet(), - options, keyspace, ClusterMetadata.current().epoch); + return 
RepairCoordinator.create(StorageService.instance, StorageService.nextRepairCommand.incrementAndGet(), + options, keyspace, ClusterMetadata.current().epoch); } public long getLastRepairTime() diff --git a/src/java/org/apache/cassandra/repair/messages/RepairOption.java b/src/java/org/apache/cassandra/repair/messages/RepairOption.java index 3c8a260164d1..fa77d3ce7fcb 100644 --- a/src/java/org/apache/cassandra/repair/messages/RepairOption.java +++ b/src/java/org/apache/cassandra/repair/messages/RepairOption.java @@ -494,6 +494,25 @@ public String toString() ')'; } + /** + * Returns a new RepairOption with the incremental flag set to the given value. + * All other options are copied from this instance. + */ + public RepairOption withIncremental(boolean incremental) + { + if (this.incremental == incremental) + return this; + + RepairOption copy = new RepairOption(parallelism, primaryRange, incremental, trace, jobThreads, + ranges, pullRepair, forceRepair, previewKind, optimiseStreams, + ignoreUnreplicatedKeyspaces, repairData, repairPaxos, + dontPurgeTombstones, repairAccord); + copy.getColumnFamilies().addAll(columnFamilies); + copy.getDataCenters().addAll(dataCenters); + copy.getHosts().addAll(hosts); + return copy; + } + public Map asMap() { Map options = new HashMap<>(); diff --git a/src/java/org/apache/cassandra/service/StorageService.java b/src/java/org/apache/cassandra/service/StorageService.java index eea21afe8194..39993ffe7fab 100644 --- a/src/java/org/apache/cassandra/service/StorageService.java +++ b/src/java/org/apache/cassandra/service/StorageService.java @@ -3139,7 +3139,7 @@ private FutureTask createRepairTask(final int cmd, final String keyspace throw new IllegalArgumentException("data center(s) " + datacenters.toString() + " not found"); } - RepairCoordinator task = new RepairCoordinator(this, cmd, options, keyspace, ClusterMetadata.current().epoch); + RepairCoordinator task = RepairCoordinator.create(this, cmd, options, keyspace, 
ClusterMetadata.current().epoch); task.addProgressListener(progressSupport); for (ProgressListener listener : listeners) task.addProgressListener(listener); diff --git a/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingIncrementalRepairTaskTest.java b/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingIncrementalRepairTaskTest.java index 5383fea0d79e..052c1869ca7b 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingIncrementalRepairTaskTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingIncrementalRepairTaskTest.java @@ -28,6 +28,7 @@ import org.apache.cassandra.distributed.api.Feature; import org.apache.cassandra.distributed.test.TestBaseImpl; import org.apache.cassandra.repair.MutationTrackingIncrementalRepairTask; +import org.apache.cassandra.tcm.ClusterMetadata; import static org.junit.Assert.*; @@ -72,7 +73,10 @@ public void testShouldUseMutationTrackingRepairForTrackedKeyspace() throws Throw "{'class': 'SimpleStrategy', 'replication_factor': 3} " + "AND replication_type='tracked'"); - Boolean shouldUse = CLUSTER.get(1).callOnInstance(() -> MutationTrackingIncrementalRepairTask.shouldUseMutationTrackingRepair(ksName)); + Boolean shouldUse = CLUSTER.get(1).callOnInstance(() -> { + ClusterMetadata metadata = ClusterMetadata.current(); + return MutationTrackingIncrementalRepairTask.shouldUseMutationTrackingRepair(metadata, ksName); + }); assertTrue("Tracked keyspace should use mutation tracking repair", shouldUse); } @@ -85,7 +89,10 @@ public void testShouldNotUseMutationTrackingRepairForUntrackedKeyspace() throws "{'class': 'SimpleStrategy', 'replication_factor': 3} " + "AND replication_type='untracked'"); - Boolean shouldUse = CLUSTER.get(1).callOnInstance(() -> MutationTrackingIncrementalRepairTask.shouldUseMutationTrackingRepair(ksName)); + Boolean shouldUse = CLUSTER.get(1).callOnInstance(() -> { + ClusterMetadata metadata 
= ClusterMetadata.current(); + return MutationTrackingIncrementalRepairTask.shouldUseMutationTrackingRepair(metadata, ksName); + }); assertFalse("Untracked keyspace should not use mutation tracking repair", shouldUse); } @@ -98,15 +105,21 @@ public void testRequiresTraditionalRepairReturnsFalseForNonMigratingKeyspace() t "{'class': 'SimpleStrategy', 'replication_factor': 3} " + "AND replication_type='tracked'"); - Boolean requiresTraditional = CLUSTER.get(1).callOnInstance(() -> MutationTrackingIncrementalRepairTask.requiresTraditionalRepair(ksName)); + Boolean migrationInProgress = CLUSTER.get(1).callOnInstance(() -> { + ClusterMetadata metadata = ClusterMetadata.current(); + return MutationTrackingIncrementalRepairTask.isMutationTrackingMigrationInProgress(metadata, ksName); + }); - assertFalse("Non-migrating keyspace should not require traditional repair", requiresTraditional); + assertFalse("Non-migrating keyspace should not have migration in progress", migrationInProgress); } @Test public void testShouldUseMutationTrackingRepairForNonexistentKeyspace() throws Throwable { - Boolean shouldUse = CLUSTER.get(1).callOnInstance(() -> MutationTrackingIncrementalRepairTask.shouldUseMutationTrackingRepair("nonexistent_ks_xyz")); + Boolean shouldUse = CLUSTER.get(1).callOnInstance(() -> { + ClusterMetadata metadata = ClusterMetadata.current(); + return MutationTrackingIncrementalRepairTask.shouldUseMutationTrackingRepair(metadata, "nonexistent_ks_xyz"); + }); assertFalse("Nonexistent keyspace should return false", shouldUse); } @@ -121,11 +134,17 @@ public void testMigrationFromUntrackedToTracked() throws Throwable CLUSTER.schemaChange("CREATE TABLE " + ksName + ".tbl (k int PRIMARY KEY, v int)"); // Verify initial state - Boolean shouldUseBefore = CLUSTER.get(1).callOnInstance(() -> MutationTrackingIncrementalRepairTask.shouldUseMutationTrackingRepair(ksName)); + Boolean shouldUseBefore = CLUSTER.get(1).callOnInstance(() -> { + ClusterMetadata metadata = 
ClusterMetadata.current(); + return MutationTrackingIncrementalRepairTask.shouldUseMutationTrackingRepair(metadata, ksName); + }); assertFalse("Untracked keyspace should not use mutation tracking repair", shouldUseBefore); - Boolean requiresBefore = CLUSTER.get(1).callOnInstance(() -> MutationTrackingIncrementalRepairTask.requiresTraditionalRepair(ksName)); - assertFalse("Non-migrating keyspace should not require traditional repair", requiresBefore); + Boolean migrationBefore = CLUSTER.get(1).callOnInstance(() -> { + ClusterMetadata metadata = ClusterMetadata.current(); + return MutationTrackingIncrementalRepairTask.isMutationTrackingMigrationInProgress(metadata, ksName); + }); + assertFalse("Non-migrating keyspace should not have migration in progress", migrationBefore); // Trigger migration by altering to tracked CLUSTER.schemaChange("ALTER KEYSPACE " + ksName + " WITH replication = " + @@ -133,11 +152,17 @@ public void testMigrationFromUntrackedToTracked() throws Throwable "AND replication_type='tracked'"); // Verify migration state - both methods should now return true - Boolean shouldUseAfter = CLUSTER.get(1).callOnInstance(() -> MutationTrackingIncrementalRepairTask.shouldUseMutationTrackingRepair(ksName)); + Boolean shouldUseAfter = CLUSTER.get(1).callOnInstance(() -> { + ClusterMetadata metadata = ClusterMetadata.current(); + return MutationTrackingIncrementalRepairTask.shouldUseMutationTrackingRepair(metadata, ksName); + }); assertTrue("Migrating keyspace should use mutation tracking repair", shouldUseAfter); - Boolean requiresAfter = CLUSTER.get(1).callOnInstance(() -> MutationTrackingIncrementalRepairTask.requiresTraditionalRepair(ksName)); - assertTrue("Migrating keyspace should require traditional repair", requiresAfter); + Boolean migrationAfter = CLUSTER.get(1).callOnInstance(() -> { + ClusterMetadata metadata = ClusterMetadata.current(); + return MutationTrackingIncrementalRepairTask.isMutationTrackingMigrationInProgress(metadata, ksName); + }); + 
assertTrue("Migrating keyspace should have migration in progress", migrationAfter); } @Test @@ -150,11 +175,17 @@ public void testMigrationFromTrackedToUntracked() throws Throwable CLUSTER.schemaChange("CREATE TABLE " + ksName + ".tbl (k int PRIMARY KEY, v int)"); // Verify initial state - Boolean shouldUseBefore = CLUSTER.get(1).callOnInstance(() -> MutationTrackingIncrementalRepairTask.shouldUseMutationTrackingRepair(ksName)); + Boolean shouldUseBefore = CLUSTER.get(1).callOnInstance(() -> { + ClusterMetadata metadata = ClusterMetadata.current(); + return MutationTrackingIncrementalRepairTask.shouldUseMutationTrackingRepair(metadata, ksName); + }); assertTrue("Tracked keyspace should use mutation tracking repair", shouldUseBefore); - Boolean requiresBefore = CLUSTER.get(1).callOnInstance(() -> MutationTrackingIncrementalRepairTask.requiresTraditionalRepair(ksName)); - assertFalse("Non-migrating tracked keyspace should not require traditional repair", requiresBefore); + Boolean migrationBefore = CLUSTER.get(1).callOnInstance(() -> { + ClusterMetadata metadata = ClusterMetadata.current(); + return MutationTrackingIncrementalRepairTask.isMutationTrackingMigrationInProgress(metadata, ksName); + }); + assertFalse("Non-migrating tracked keyspace should not have migration in progress", migrationBefore); // Migrate back to untracked CLUSTER.schemaChange("ALTER KEYSPACE " + ksName + " WITH replication = " + @@ -162,10 +193,16 @@ public void testMigrationFromTrackedToUntracked() throws Throwable "AND replication_type='untracked'"); // During reverse migration, both should still apply - Boolean shouldUseAfter = CLUSTER.get(1).callOnInstance(() -> MutationTrackingIncrementalRepairTask.shouldUseMutationTrackingRepair(ksName)); + Boolean shouldUseAfter = CLUSTER.get(1).callOnInstance(() -> { + ClusterMetadata metadata = ClusterMetadata.current(); + return MutationTrackingIncrementalRepairTask.shouldUseMutationTrackingRepair(metadata, ksName); + }); assertTrue("Keyspace 
migrating from tracked should still use mutation tracking repair", shouldUseAfter); - Boolean requiresAfter = CLUSTER.get(1).callOnInstance(() -> MutationTrackingIncrementalRepairTask.requiresTraditionalRepair(ksName)); - assertTrue("Keyspace migrating from tracked should require traditional repair", requiresAfter); + Boolean migrationAfter = CLUSTER.get(1).callOnInstance(() -> { + ClusterMetadata metadata = ClusterMetadata.current(); + return MutationTrackingIncrementalRepairTask.isMutationTrackingMigrationInProgress(metadata, ksName); + }); + assertTrue("Keyspace migrating from tracked should have migration in progress", migrationAfter); } } From f0f68c3ede8eacc011f7e22aa73c14238fbb2c3d Mon Sep 17 00:00:00 2001 From: Ariel Weisberg Date: Wed, 4 Mar 2026 14:47:28 -0500 Subject: [PATCH 10/46] Collect offsets via message exchange --- src/java/org/apache/cassandra/net/Verb.java | 4 + ...MutationTrackingIncrementalRepairTask.java | 5 +- .../repair/RepairMessageVerbHandler.java | 41 +++++ .../messages/MutationTrackingSyncRequest.java | 66 ++++++++ .../MutationTrackingSyncResponse.java | 156 ++++++++++++++++++ .../repair/messages/RepairMessage.java | 4 + .../replication/MutationTrackingService.java | 37 +++++ .../MutationTrackingSyncCoordinator.java | 106 +++++++++++- .../MutationTrackingSyncCoordinatorTest.java | 30 +++- 9 files changed, 438 insertions(+), 11 deletions(-) create mode 100644 src/java/org/apache/cassandra/repair/messages/MutationTrackingSyncRequest.java create mode 100644 src/java/org/apache/cassandra/repair/messages/MutationTrackingSyncResponse.java diff --git a/src/java/org/apache/cassandra/net/Verb.java b/src/java/org/apache/cassandra/net/Verb.java index 93c0a68e7390..cb5151795ed1 100644 --- a/src/java/org/apache/cassandra/net/Verb.java +++ b/src/java/org/apache/cassandra/net/Verb.java @@ -64,6 +64,8 @@ import org.apache.cassandra.repair.messages.FinalizeCommit; import org.apache.cassandra.repair.messages.FinalizePromise; import 
org.apache.cassandra.repair.messages.FinalizePropose; +import org.apache.cassandra.repair.messages.MutationTrackingSyncRequest; +import org.apache.cassandra.repair.messages.MutationTrackingSyncResponse; import org.apache.cassandra.repair.messages.PrepareConsistentRequest; import org.apache.cassandra.repair.messages.PrepareConsistentResponse; import org.apache.cassandra.repair.messages.PrepareMessage; @@ -289,6 +291,8 @@ public enum Verb FAILED_SESSION_MSG (113, P1, repairWithBackoffTimeout, ANTI_ENTROPY, () -> FailSession.serializer, () -> RepairMessageVerbHandler.instance(), REPAIR_RSP ), STATUS_RSP (115, P1, repairTimeout, ANTI_ENTROPY, () -> StatusResponse.serializer, () -> RepairMessageVerbHandler.instance(), REPAIR_RSP ), STATUS_REQ (114, P1, repairTimeout, ANTI_ENTROPY, () -> StatusRequest.serializer, () -> RepairMessageVerbHandler.instance(), REPAIR_RSP ), + MT_SYNC_RSP (117, P1, repairWithBackoffTimeout, REQUEST_RESPONSE, () -> MutationTrackingSyncResponse.serializer, RESPONSE_HANDLER), + MT_SYNC_REQ (116, P1, repairWithBackoffTimeout, ANTI_ENTROPY, () -> MutationTrackingSyncRequest.serializer, () -> RepairMessageVerbHandler.instance(), MT_SYNC_RSP ), REPLICATION_DONE_RSP (82, P0, rpcTimeout, MISC, () -> NoPayload.serializer, RESPONSE_HANDLER ), REPLICATION_DONE_REQ (22, P0, rpcTimeout, MISC, () -> NoPayload.serializer, () -> ReplicationDoneVerbHandler.instance, REPLICATION_DONE_RSP), diff --git a/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java b/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java index b56c7eddbd1e..daf2b1a6484a 100644 --- a/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java +++ b/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java @@ -78,7 +78,10 @@ public Future performUnsafe(ExecutorPlus executor, Sche { for (Range range : commonRange.ranges) { - MutationTrackingSyncCoordinator syncCoordinator = new 
MutationTrackingSyncCoordinator(keyspace, range); + RepairJobDesc desc = new RepairJobDesc(parentSession, TimeUUID.Generator.nextTimeUUID(), + keyspace, "", List.of(range)); + MutationTrackingSyncCoordinator syncCoordinator = new MutationTrackingSyncCoordinator( + coordinator.ctx, desc); syncCoordinator.start(); syncCoordinators.add(syncCoordinator); rangeCollections.add(List.of(range)); diff --git a/src/java/org/apache/cassandra/repair/RepairMessageVerbHandler.java b/src/java/org/apache/cassandra/repair/RepairMessageVerbHandler.java index a0ed59d97913..ab9b0ad5515d 100644 --- a/src/java/org/apache/cassandra/repair/RepairMessageVerbHandler.java +++ b/src/java/org/apache/cassandra/repair/RepairMessageVerbHandler.java @@ -19,6 +19,7 @@ import java.util.ArrayList; import java.util.List; +import java.util.Map; import java.util.concurrent.TimeoutException; import java.util.function.BiFunction; import java.util.function.Function; @@ -29,11 +30,19 @@ import org.apache.cassandra.db.ColumnFamilyStore; import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.net.IVerbHandler; import org.apache.cassandra.net.Message; +import org.apache.cassandra.net.Verb; +import org.apache.cassandra.replication.CoordinatorLogId; +import org.apache.cassandra.replication.MutationTrackingService; +import org.apache.cassandra.replication.Offsets; import org.apache.cassandra.repair.messages.CleanupMessage; import org.apache.cassandra.repair.messages.FailSession; +import org.apache.cassandra.repair.messages.MutationTrackingSyncRequest; +import org.apache.cassandra.repair.messages.MutationTrackingSyncResponse; import org.apache.cassandra.repair.messages.PrepareMessage; import org.apache.cassandra.repair.messages.RepairMessage; import org.apache.cassandra.repair.messages.StatusRequest; @@ -51,6 +60,7 @@ import org.apache.cassandra.service.StorageService; 
import org.apache.cassandra.streaming.PreviewKind; import org.apache.cassandra.tcm.ClusterMetadataService; +import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.JVMStabilityInspector; import org.apache.cassandra.utils.TimeUUID; import org.apache.cassandra.utils.concurrent.UncheckedInterruptedException; @@ -381,6 +391,10 @@ public void doVerb(final Message message) ctx.repair().consistent.local.handleStatusResponse(message.from(), (StatusResponse) message.payload); break; + case MT_SYNC_REQ: + handleMutationTrackingSyncRequest(message); + break; + default: ctx.repair().handleMessage(message); break; @@ -489,4 +503,31 @@ private static boolean acceptMessage(final ValidationRequest validationRequest, "validation request", from); } + + @SuppressWarnings("unchecked") + private void handleMutationTrackingSyncRequest(Message message) + { + MutationTrackingSyncRequest request = (MutationTrackingSyncRequest) message.payload; + RepairJobDesc desc = request.desc; + logger.debug("Handling mutation tracking sync request {}", desc); + + try + { + Map, Map> offsets = + MutationTrackingService.instance.collectWitnessedOffsetsForRanges(desc.keyspace, desc.ranges); + + MutationTrackingSyncResponse response = new MutationTrackingSyncResponse( + desc, + FBUtilities.getBroadcastAddressAndPort(), + true, + offsets); + + ctx.messaging().send(message.responseWith(response), message.from()); + } + catch (Exception e) + { + logger.error("Failed to handle mutation tracking sync request {}", desc, e); + sendFailureResponse(message); + } + } } diff --git a/src/java/org/apache/cassandra/repair/messages/MutationTrackingSyncRequest.java b/src/java/org/apache/cassandra/repair/messages/MutationTrackingSyncRequest.java new file mode 100644 index 000000000000..60adf0792162 --- /dev/null +++ b/src/java/org/apache/cassandra/repair/messages/MutationTrackingSyncRequest.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more 
contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.repair.messages; + +import java.io.IOException; + +import org.apache.cassandra.io.IVersionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.repair.RepairJobDesc; + +/** + * Request sent from the mutation tracking repair coordinator to each participant to collect + * their current witnessed offsets. This establishes a happens-before relationship: the + * participant's response contains offsets captured after receiving this request, which is + * sent after the repair starts. 
+ */ +public class MutationTrackingSyncRequest extends RepairMessage +{ + public MutationTrackingSyncRequest(RepairJobDesc desc) + { + super(desc); + } + + @Override + public String toString() + { + return "MutationTrackingSyncRequest{" + + "desc=" + desc + + '}'; + } + + public static final IVersionedSerializer serializer = new IVersionedSerializer<>() + { + public void serialize(MutationTrackingSyncRequest request, DataOutputPlus out, int version) throws IOException + { + RepairJobDesc.serializer.serialize(request.desc, out, version); + } + + public MutationTrackingSyncRequest deserialize(DataInputPlus in, int version) throws IOException + { + RepairJobDesc desc = RepairJobDesc.serializer.deserialize(in, version); + return new MutationTrackingSyncRequest(desc); + } + + public long serializedSize(MutationTrackingSyncRequest request, int version) + { + return RepairJobDesc.serializer.serializedSize(request.desc, version); + } + }; +} diff --git a/src/java/org/apache/cassandra/repair/messages/MutationTrackingSyncResponse.java b/src/java/org/apache/cassandra/repair/messages/MutationTrackingSyncResponse.java new file mode 100644 index 000000000000..a5d216b8a7bf --- /dev/null +++ b/src/java/org/apache/cassandra/repair/messages/MutationTrackingSyncResponse.java @@ -0,0 +1,156 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.repair.messages; + +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; + +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.dht.AbstractBounds; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.io.IVersionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.replication.CoordinatorLogId; +import org.apache.cassandra.replication.Offsets; +import org.apache.cassandra.repair.RepairJobDesc; +import org.apache.cassandra.utils.CollectionSerializers; + +import static org.apache.cassandra.locator.InetAddressAndPort.Serializer.inetAddressAndPortSerializer; + +/** + * Response from a participant to a {@link MutationTrackingSyncRequest}. Contains the + * participant's current witnessed offsets for each shard overlapping the requested ranges. + * These offsets are captured after the request is received, establishing a happens-before + * relationship with the repair start. + */ +public class MutationTrackingSyncResponse extends RepairMessage +{ + public final InetAddressAndPort participant; + public final boolean success; + /** Per-shard witnessed offsets: shard range -> (logId -> offsets) */ + public final Map, Map> offsetsByShard; + + /** + * The inner map is keyed by CoordinatorLogId, but the logId is already embedded in each + * Offsets.Immutable, so we serialize only the values as a list and reconstruct the map + * on deserialization. 
+ */ + private static final IVersionedSerializer> offsetsMapSerializer = new IVersionedSerializer<>() + { + public void serialize(Map map, DataOutputPlus out, int version) throws IOException + { + CollectionSerializers.serializeCollection(map.values(), out, version, Offsets.serializer); + } + + public Map deserialize(DataInputPlus in, int version) throws IOException + { + List offsetsList = CollectionSerializers.deserializeList(in, version, Offsets.serializer); + Map map = new HashMap<>(offsetsList.size()); + for (Offsets.Immutable offsets : offsetsList) + map.put(offsets.logId(), offsets); + return map; + } + + public long serializedSize(Map map, int version) + { + return CollectionSerializers.serializedCollectionSize(map.values(), version, Offsets.serializer); + } + }; + + public MutationTrackingSyncResponse(RepairJobDesc desc, + InetAddressAndPort participant, + boolean success, + Map, Map> offsetsByShard) + { + super(desc); + assert participant != null; + this.participant = participant; + this.success = success; + this.offsetsByShard = offsetsByShard; + } + + @Override + public boolean equals(Object o) + { + if (!(o instanceof MutationTrackingSyncResponse)) + return false; + MutationTrackingSyncResponse other = (MutationTrackingSyncResponse) o; + return Objects.equals(desc, other.desc) + && participant.equals(other.participant) + && success == other.success + && Objects.equals(offsetsByShard, other.offsetsByShard); + } + + @Override + public int hashCode() + { + return Objects.hash(desc, participant, success, offsetsByShard); + } + + @Override + public String toString() + { + return "MutationTrackingSyncResponse{" + + "desc=" + desc + + ", participant=" + participant + + ", success=" + success + + ", shardCount=" + (offsetsByShard != null ? 
offsetsByShard.size() : 0) + + '}'; + } + + @SuppressWarnings("unchecked") + public static final IVersionedSerializer serializer = new IVersionedSerializer<>() + { + public void serialize(MutationTrackingSyncResponse response, DataOutputPlus out, int version) throws IOException + { + RepairJobDesc.serializer.serialize(response.desc, out, version); + inetAddressAndPortSerializer.serialize(response.participant, out, version); + out.writeBoolean(response.success); + CollectionSerializers.serializeMap((Map, Map>) (Map) response.offsetsByShard, + out, version, Range.tokenSerializer, offsetsMapSerializer); + } + + public MutationTrackingSyncResponse deserialize(DataInputPlus in, int version) throws IOException + { + RepairJobDesc desc = RepairJobDesc.serializer.deserialize(in, version); + InetAddressAndPort participant = inetAddressAndPortSerializer.deserialize(in, version); + boolean success = in.readBoolean(); + Map, Map> raw = + CollectionSerializers.deserializeMap(in, version, Range.tokenSerializer, offsetsMapSerializer); + Map, Map> offsetsByShard = + (Map, Map>) (Map) raw; + return new MutationTrackingSyncResponse(desc, participant, success, offsetsByShard); + } + + public long serializedSize(MutationTrackingSyncResponse response, int version) + { + long size = RepairJobDesc.serializer.serializedSize(response.desc, version); + size += inetAddressAndPortSerializer.serializedSize(response.participant, version); + size += TypeSizes.sizeof(response.success); + size += CollectionSerializers.serializedMapSize((Map, Map>) (Map) response.offsetsByShard, + version, Range.tokenSerializer, offsetsMapSerializer); + return size; + } + }; +} diff --git a/src/java/org/apache/cassandra/repair/messages/RepairMessage.java b/src/java/org/apache/cassandra/repair/messages/RepairMessage.java index c615bf9f52f3..59dbb9ba5bd5 100644 --- a/src/java/org/apache/cassandra/repair/messages/RepairMessage.java +++ b/src/java/org/apache/cassandra/repair/messages/RepairMessage.java @@ -93,6 +93,8 
@@ public void onFailure(InetAddressAndPort from, RequestFailure failure) map.put(Verb.FINALIZE_PROMISE_MSG, SUPPORTS_RETRY); map.put(Verb.FINALIZE_COMMIT_MSG, SUPPORTS_RETRY); map.put(Verb.FAILED_SESSION_MSG, SUPPORTS_RETRY); + // Mutation tracking messages + map.put(Verb.MT_SYNC_REQ, SUPPORTS_RETRY); VERB_TIMEOUT_VERSIONS = Collections.unmodifiableMap(map); EnumSet allowsRetry = EnumSet.noneOf(Verb.class); @@ -110,6 +112,8 @@ public void onFailure(InetAddressAndPort from, RequestFailure failure) allowsRetry.add(Verb.FINALIZE_PROMISE_MSG); allowsRetry.add(Verb.FINALIZE_COMMIT_MSG); allowsRetry.add(Verb.FAILED_SESSION_MSG); + // Mutation tracking messages + allowsRetry.add(Verb.MT_SYNC_REQ); ALLOWS_RETRY = Collections.unmodifiableSet(allowsRetry); } diff --git a/src/java/org/apache/cassandra/replication/MutationTrackingService.java b/src/java/org/apache/cassandra/replication/MutationTrackingService.java index e25650fb16b8..1eae2206e71d 100644 --- a/src/java/org/apache/cassandra/replication/MutationTrackingService.java +++ b/src/java/org/apache/cassandra/replication/MutationTrackingService.java @@ -668,6 +668,43 @@ public void forEachShardInKeyspace(String keyspace, Consumer consumer) } } + /** + * Collects the union of witnessed offsets for all shards in the given keyspace that overlap + * with the specified ranges. Used by the mutation tracking repair protocol to establish + * a happens-before relationship. 
+ * + * @param keyspace the keyspace to collect offsets for + * @param ranges the token ranges to find overlapping shards for + * @return a map from shard range to the union of witnessed offsets per coordinator log + */ + public Map, Map> collectWitnessedOffsetsForRanges(String keyspace, Collection> ranges) + { + Map, Map> result = new HashMap<>(); + shardLock.readLock().lock(); + try + { + KeyspaceShards ksShards = keyspaceShards.get(keyspace); + if (ksShards != null) + { + ksShards.forEachShard(shard -> { + for (Range range : ranges) + { + if (shard.range.intersects(range)) + { + result.put(shard.range, shard.collectUnionOfWitnessedOffsetsPerLog()); + break; + } + } + }); + } + } + finally + { + shardLock.readLock().unlock(); + } + return result; + } + public void collectLocallyMissingMutations(MutationSummary remoteSummary, Log2OffsetsMap.Mutable into) { shardLock.readLock().lock(); diff --git a/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java b/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java index 0dd3935aead5..1e97b8fec2e8 100644 --- a/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java +++ b/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java @@ -33,9 +33,20 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.google.common.collect.Iterables; + import org.apache.cassandra.dht.Range; import org.apache.cassandra.dht.Token; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.net.Message; +import org.apache.cassandra.net.RequestCallback; +import org.apache.cassandra.net.Verb; +import org.apache.cassandra.repair.RepairJobDesc; +import org.apache.cassandra.repair.SharedContext; +import org.apache.cassandra.repair.messages.MutationTrackingSyncRequest; +import org.apache.cassandra.repair.messages.MutationTrackingSyncResponse; +import 
org.apache.cassandra.repair.messages.RepairMessage; import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.concurrent.AsyncPromise; @@ -48,8 +59,11 @@ public class MutationTrackingSyncCoordinator // to ensure we receive at least one fresh broadcast from each participant private static final long MIN_BROADCAST_WAIT_MS = 300; private static final long PARTICIPANT_TIMEOUT_MS = 10000; + // TODO (required): This needs to be a proper executor not a forbidden one private static final ScheduledExecutorService scheduler = Executors.newSingleThreadScheduledExecutor(); + private final SharedContext ctx; + private final RepairJobDesc desc; private final String keyspace; private final Range range; private final AsyncPromise completionFuture = new AsyncPromise<>(); @@ -64,10 +78,12 @@ public class MutationTrackingSyncCoordinator private final Set allParticipants = new HashSet<>(); private final Set reportedParticipants = ConcurrentHashMap.newKeySet(); - public MutationTrackingSyncCoordinator(String keyspace, Range range) + public MutationTrackingSyncCoordinator(SharedContext ctx, RepairJobDesc desc) { - this.keyspace = keyspace; - this.range = range; + this.ctx = ctx; + this.desc = desc; + this.keyspace = desc.keyspace; + this.range = Iterables.getOnlyElement(desc.ranges); } public void start() @@ -111,6 +127,9 @@ public void start() logger.info("Sync coordinator started for keyspace {} range {}, tracking {} shards, waiting for {} participants", keyspace, range, overlappingShards.size(), allParticipants.size()); + // Send sync requests to all remote participants + sendSyncRequests(localAddress); + // Check if we're the only participant and already complete checkIfReadyToComplete(); @@ -126,6 +145,45 @@ private void complete() completionFuture.setSuccess(null); } + private void sendSyncRequests(InetAddressAndPort localAddress) + { + MutationTrackingSyncRequest request = new MutationTrackingSyncRequest(desc); + for (InetAddressAndPort participant : 
allParticipants) + { + if (participant.equals(localAddress)) + continue; + + logger.debug("Sending mutation tracking sync request to {} for {}", participant, desc); + + RepairMessage.sendMessageWithRetries(ctx, + RepairMessage.notDone(completionFuture), + request, + Verb.MT_SYNC_REQ, + participant, + new RequestCallback() + { + @Override + public void onResponse(Message msg) + { + onSyncResponse(msg.payload); + } + + @Override + public void onFailure(InetAddressAndPort from, RequestFailure failure) + { + fail(new RuntimeException( + String.format("Mutation tracking sync failed: participant %s returned failure %s", from, failure.reason))); + } + + @Override + public boolean invokeOnFailure() + { + return true; + } + }); + } + } + private boolean checkIfComplete() { for (ShardSyncState state : shardStates.values()) @@ -176,13 +234,18 @@ private Shard getCurrentShard(Range shardRange) } private void failWithTopologyChange() + { + fail(new RuntimeException("Repair failed: topology changed during sync")); + } + + private void fail(Throwable cause) { if (completed.compareAndSet(false, true)) { - logger.warn("Sync coordinator for keyspace {} range {} failed due to topology change", - keyspace, range); + logger.warn("Sync coordinator for keyspace {} range {} failed: {}", + keyspace, range, cause.getMessage()); MutationTrackingService.instance.unregisterSyncCoordinator(this); - completionFuture.setFailure(new RuntimeException("Repair failed: topology changed during sync")); + completionFuture.setFailure(cause); } } @@ -283,6 +346,37 @@ public void onOffsetsReceived(InetAddressAndPort from) checkIfReadyToComplete(); } + /** + * Called when a sync response is received from a participant in response to a + * MutationTrackingSyncRequest. Updates the shard targets with the offsets from the + * response, establishing a happens-before relationship with the repair start. 
+ * + * @param response the sync response from a participant + */ + public void onSyncResponse(MutationTrackingSyncResponse response) + { + if (completed.get()) + return; + + // Update shard targets with the offsets received from the participant + for (Map.Entry, Map> entry : response.offsetsByShard.entrySet()) + { + Range shardRange = entry.getKey(); + ShardSyncState state = shardStates.get(shardRange); + if (state != null) + { + state.targets.putAll(entry.getValue()); + } + } + + reportedParticipants.add(response.participant); + + logger.trace("Sync coordinator received sync response from {}. Reported: {}/{}", + response.participant, reportedParticipants.size(), allParticipants.size()); + + checkIfReadyToComplete(); + } + public String getKeyspace() { return keyspace; diff --git a/test/distributed/org/apache/cassandra/distributed/test/replication/MutationTrackingSyncCoordinatorTest.java b/test/distributed/org/apache/cassandra/distributed/test/replication/MutationTrackingSyncCoordinatorTest.java index 9ab6cb8113ab..5374bc2c9f40 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/replication/MutationTrackingSyncCoordinatorTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/replication/MutationTrackingSyncCoordinatorTest.java @@ -29,8 +29,11 @@ import org.apache.cassandra.dht.Murmur3Partitioner; import org.apache.cassandra.dht.Range; import org.apache.cassandra.dht.Token; +import org.apache.cassandra.repair.RepairJobDesc; +import org.apache.cassandra.repair.SharedContext; import org.apache.cassandra.replication.MutationTrackingService; import org.apache.cassandra.replication.MutationTrackingSyncCoordinator; +import org.apache.cassandra.utils.TimeUUID; import org.awaitility.Awaitility; import static org.junit.Assert.*; @@ -84,7 +87,12 @@ public void testSyncCoordinatorCompletesWhenNoShards() throws Throwable // Create a sync coordinator for a range that has no data // It should complete immediately since there are no offsets to sync 
Boolean completed = cluster.get(1).callOnInstance(() -> { - MutationTrackingSyncCoordinator coordinator = new MutationTrackingSyncCoordinator(KS_NAME, fullTokenRange()); + Range range = fullTokenRange(); + RepairJobDesc desc = new RepairJobDesc(TimeUUID.Generator.nextTimeUUID(), + TimeUUID.Generator.nextTimeUUID(), + KS_NAME, "", java.util.List.of(range)); + MutationTrackingSyncCoordinator coordinator = new MutationTrackingSyncCoordinator( + SharedContext.Global.instance, desc); coordinator.start(); try @@ -122,7 +130,12 @@ public void testSyncCoordinatorWaitsForAllReplicasMutations() throws Throwable // It should wait for offsets to sync since node 1's data hasn't propagated yet long syncStartTime = System.currentTimeMillis(); CompletableFuture coordinatorFuture = CompletableFuture.supplyAsync(() -> cluster.get(2).callOnInstance(() -> { - MutationTrackingSyncCoordinator coordinator = new MutationTrackingSyncCoordinator(KS_NAME + '3', fullTokenRange()); + Range range = fullTokenRange(); + RepairJobDesc desc = new RepairJobDesc(TimeUUID.Generator.nextTimeUUID(), + TimeUUID.Generator.nextTimeUUID(), + KS_NAME + '3', "", java.util.List.of(range)); + MutationTrackingSyncCoordinator coordinator = new MutationTrackingSyncCoordinator( + SharedContext.Global.instance, desc); coordinator.start(); try @@ -216,7 +229,12 @@ public void testSyncCoordinatorCancel() throws Throwable // Start coordinator - it will be stuck waiting for offsets Boolean wasCancelled = cluster.get(1).callOnInstance(() -> { - MutationTrackingSyncCoordinator coordinator = new MutationTrackingSyncCoordinator(KS_NAME + '4', fullTokenRange()); + Range range = fullTokenRange(); + RepairJobDesc desc = new RepairJobDesc(TimeUUID.Generator.nextTimeUUID(), + TimeUUID.Generator.nextTimeUUID(), + KS_NAME + '4', "", java.util.List.of(range)); + MutationTrackingSyncCoordinator coordinator = new MutationTrackingSyncCoordinator( + SharedContext.Global.instance, desc); coordinator.start(); try @@ -279,8 +297,12 @@ 
public void testSyncCoordinatorTimesOutOnUnresponsiveParticipant() throws Throwa // Start sync coordinator on node 1 - it should time out waiting for node 3 Boolean completed = cluster.get(1).callOnInstance(() -> { + Range range = fullTokenRange(); + RepairJobDesc desc = new RepairJobDesc(TimeUUID.Generator.nextTimeUUID(), + TimeUUID.Generator.nextTimeUUID(), + KS_NAME + '5', "", java.util.List.of(range)); MutationTrackingSyncCoordinator coordinator = new MutationTrackingSyncCoordinator( - KS_NAME + '5', fullTokenRange()); + SharedContext.Global.instance, desc); coordinator.start(); try From 44a3bd319b01c8b75f753bb1219c018437ab1876 Mon Sep 17 00:00:00 2001 From: Ariel Weisberg Date: Wed, 4 Mar 2026 15:22:58 -0500 Subject: [PATCH 11/46] Remove extra timeouts, make top level timeout configurable hot prop --- .../org/apache/cassandra/config/Config.java | 2 + .../cassandra/config/DatabaseDescriptor.java | 10 ++ ...MutationTrackingIncrementalRepairTask.java | 6 +- .../replication/MutationTrackingService.java | 2 +- .../MutationTrackingSyncCoordinator.java | 126 +++--------------- .../cassandra/service/StorageService.java | 13 ++ .../service/StorageServiceMBean.java | 3 + .../MutationTrackingSyncCoordinatorTest.java | 63 --------- 8 files changed, 47 insertions(+), 178 deletions(-) diff --git a/src/java/org/apache/cassandra/config/Config.java b/src/java/org/apache/cassandra/config/Config.java index d8da63d03eaa..09aaa16323ac 100644 --- a/src/java/org/apache/cassandra/config/Config.java +++ b/src/java/org/apache/cassandra/config/Config.java @@ -169,6 +169,8 @@ public static Set splitCommaDelimited(String src) @Replaces(oldName = "repair_request_timeout_in_ms", converter = Converters.MILLIS_DURATION_LONG, deprecated = true) public volatile DurationSpec.LongMillisecondsBound repair_request_timeout = new DurationSpec.LongMillisecondsBound("120000ms"); + public volatile DurationSpec.LongMillisecondsBound mutation_tracking_sync_timeout = new 
DurationSpec.LongMillisecondsBound("1800000ms"); + public Integer streaming_connections_per_host = 1; @Replaces(oldName = "streaming_keep_alive_period_in_secs", converter = Converters.SECONDS_DURATION, deprecated = true) public DurationSpec.IntSecondsBound streaming_keep_alive_period = new DurationSpec.IntSecondsBound("300s"); diff --git a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java index 1e06a402e0a5..cc93f4532b56 100644 --- a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java +++ b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java @@ -2592,6 +2592,16 @@ public static void setRepairRpcTimeout(Long timeOutInMillis) conf.repair_request_timeout = new DurationSpec.LongMillisecondsBound(timeOutInMillis); } + public static long getMutationTrackingSyncTimeout(TimeUnit unit) + { + return conf.mutation_tracking_sync_timeout.to(unit); + } + + public static void setMutationTrackingSyncTimeout(long timeoutInMillis) + { + conf.mutation_tracking_sync_timeout = new DurationSpec.LongMillisecondsBound(timeoutInMillis); + } + public static boolean hasCrossNodeTimeout() { return conf.internode_timeout; diff --git a/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java b/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java index daf2b1a6484a..78a7a6275444 100644 --- a/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java +++ b/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java @@ -23,7 +23,7 @@ import java.util.concurrent.TimeUnit; import org.apache.cassandra.concurrent.ExecutorPlus; -import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.dht.Range; import org.apache.cassandra.dht.Token; import org.apache.cassandra.replication.MutationTrackingSyncCoordinator; @@ -37,7 +37,6 @@ /** 
Incremental repair task for keyspaces using mutation tracking */ public class MutationTrackingIncrementalRepairTask extends AbstractRepairTask { - private static final long SYNC_TIMEOUT_MINUTES = CassandraRelevantProperties.REPAIR_SYNC_TIMEOUT_MINUTES.getLong(); private final TimeUUID parentSession; private final RepairCoordinator.NeighborsAndRanges neighborsAndRanges; @@ -119,7 +118,8 @@ private void waitForSyncCompletion(List syncCoo boolean allSucceeded = true; for (MutationTrackingSyncCoordinator syncCoordinator : syncCoordinators) { - boolean completed = syncCoordinator.awaitCompletion(SYNC_TIMEOUT_MINUTES, TimeUnit.MINUTES); + boolean completed = syncCoordinator.awaitCompletion( + DatabaseDescriptor.getMutationTrackingSyncTimeout(TimeUnit.MILLISECONDS), TimeUnit.MILLISECONDS); if (!completed) { logger.warn("Mutation tracking sync timed out for keyspace {} range {}", diff --git a/src/java/org/apache/cassandra/replication/MutationTrackingService.java b/src/java/org/apache/cassandra/replication/MutationTrackingService.java index 1eae2206e71d..5de2a3979ffc 100644 --- a/src/java/org/apache/cassandra/replication/MutationTrackingService.java +++ b/src/java/org/apache/cassandra/replication/MutationTrackingService.java @@ -411,7 +411,7 @@ public void updateReplicatedOffsets(String keyspace, Range range, List= TRANSIENT_BROADCAST_INTERVAL_MILLIS (200ms) + network buffer - // to ensure we receive at least one fresh broadcast from each participant - private static final long MIN_BROADCAST_WAIT_MS = 300; - private static final long PARTICIPANT_TIMEOUT_MS = 10000; - // TODO (required): This needs to be a proper executor not a forbidden one - private static final ScheduledExecutorService scheduler = Executors.newSingleThreadScheduledExecutor(); - private final SharedContext ctx; private final RepairJobDesc desc; private final String keyspace; private final Range range; private final AsyncPromise completionFuture = new AsyncPromise<>(); - private volatile long startTimeMs; 
// Per-shard state: tracks what each node has reported for that shard private final Map, ShardSyncState> shardStates = new HashMap<>(); @@ -75,9 +63,6 @@ public class MutationTrackingSyncCoordinator private final AtomicBoolean started = new AtomicBoolean(false); private final AtomicBoolean completed = new AtomicBoolean(false); - private final Set allParticipants = new HashSet<>(); - private final Set reportedParticipants = ConcurrentHashMap.newKeySet(); - public MutationTrackingSyncCoordinator(SharedContext ctx, RepairJobDesc desc) { this.ctx = ctx; @@ -91,8 +76,6 @@ public void start() if (!started.compareAndSet(false, true)) throw new IllegalStateException("Sync coordinator already started"); - startTimeMs = System.currentTimeMillis(); - List overlappingShards; overlappingShards = new ArrayList<>(); @@ -110,9 +93,6 @@ public void start() InetAddressAndPort localAddress = FBUtilities.getBroadcastAddressAndPort(); for (Shard shard : overlappingShards) { - allParticipants.addAll(shard.remoteReplicas()); - allParticipants.add(localAddress); - ShardSyncState state = new ShardSyncState(shard); shardStates.put(shard.range, state); } @@ -120,21 +100,17 @@ public void start() // Register to receive offset updates MutationTrackingService.instance.registerSyncCoordinator(this); - // Mark self as reported and capture local targets - reportedParticipants.add(localAddress); + // Capture local targets recaptureTargets(); - logger.info("Sync coordinator started for keyspace {} range {}, tracking {} shards, waiting for {} participants", - keyspace, range, overlappingShards.size(), allParticipants.size()); + logger.info("Sync coordinator started for keyspace {} range {}, tracking {} shards", + keyspace, range, overlappingShards.size()); // Send sync requests to all remote participants sendSyncRequests(localAddress); - // Check if we're the only participant and already complete + // Check if already complete (e.g. 
single node, no targets) checkIfReadyToComplete(); - - // Schedule a delayed check for the empty targets timeout case - scheduler.schedule(this::checkIfReadyToComplete, EMPTY_TARGETS_TIMEOUT_MS + 100, TimeUnit.MILLISECONDS); } private void complete() @@ -148,11 +124,12 @@ private void complete() private void sendSyncRequests(InetAddressAndPort localAddress) { MutationTrackingSyncRequest request = new MutationTrackingSyncRequest(desc); - for (InetAddressAndPort participant : allParticipants) - { - if (participant.equals(localAddress)) - continue; + Set remoteParticipants = ConcurrentHashMap.newKeySet(); + for (ShardSyncState state : shardStates.values()) + remoteParticipants.addAll(state.shard.remoteReplicas()); + for (InetAddressAndPort participant : remoteParticipants) + { logger.debug("Sending mutation tracking sync request to {} for {}", participant, desc); RepairMessage.sendMessageWithRetries(ctx, @@ -184,16 +161,6 @@ public boolean invokeOnFailure() } } - private boolean checkIfComplete() - { - for (ShardSyncState state : shardStates.values()) - { - if (!state.isComplete()) - return false; - } - return true; - } - private void recaptureTargets() { if (checkForTopologyChange()) @@ -250,75 +217,25 @@ private void fail(Throwable cause) } /** - * Check if we're ready to complete. We can complete when: - * 1. All participants have reported their offsets AND all targets are reconciled, OR - * 2. No targets have been discovered after the timeout (no data to sync anywhere) + * Check if all targets are reconciled across all shards. 
*/ private void checkIfReadyToComplete() { if (completed.get() || checkForTopologyChange()) return; - long elapsedMs = System.currentTimeMillis() - startTimeMs; - - // Handle the empty targets - if (hasNoTargets() && elapsedMs > EMPTY_TARGETS_TIMEOUT_MS) - { - logger.info("Sync coordinator completed for keyspace {} range {} - no targets discovered after {}ms", - keyspace, range, EMPTY_TARGETS_TIMEOUT_MS); - complete(); - return; - } - - // Ensuring we have waited long enough for fresh broadcasts from all replicas - happens-before situation - if (elapsedMs < MIN_BROADCAST_WAIT_MS) - { - long remainingMs = MIN_BROADCAST_WAIT_MS - elapsedMs + 10; - logger.trace("Sync coordinator waiting for broadcast cycle. Elapsed: {}ms, Required: {}ms", - elapsedMs, MIN_BROADCAST_WAIT_MS); - scheduler.schedule(this::checkIfReadyToComplete, remainingMs, TimeUnit.MILLISECONDS); - return; - } - - // Wait until all participants have reported or timeout - if (!reportedParticipants.containsAll(allParticipants)) - { - if (elapsedMs < PARTICIPANT_TIMEOUT_MS) - { - logger.trace("Sync coordinator waiting for participants. Reported: {}, All: {}", - reportedParticipants.size(), allParticipants.size()); - // Schedule a retry to check again after timeout - long remainingMs = PARTICIPANT_TIMEOUT_MS - elapsedMs + 100; - scheduler.schedule(this::checkIfReadyToComplete, remainingMs, TimeUnit.MILLISECONDS); - return; - } - - Set missing = new HashSet<>(allParticipants); - missing.removeAll(reportedParticipants); - logger.warn("Sync coordinator timed out waiting for participants: {}. 
Proceeding with available offsets.", - missing); - } - - // All participants have reported (or timed out), check if targets are reconciled if (checkIfComplete()) { logger.info("Sync coordinator completed for keyspace {} range {}", keyspace, range); complete(); } - else if (elapsedMs >= PARTICIPANT_TIMEOUT_MS) - { - // Participant timeout reached but targets not reconciled - complete anyway - logger.warn("Sync coordinator completing for keyspace {} range {} after timeout, some targets may not be reconciled", - keyspace, range); - complete(); - } } - private boolean hasNoTargets() + private boolean checkIfComplete() { for (ShardSyncState state : shardStates.values()) { - if (!state.targets.isEmpty()) + if (!state.isComplete()) return false; } return true; @@ -326,23 +243,13 @@ private boolean hasNoTargets() /** * Called when offset updates are received from a participant. - * @param from The participant that sent the offsets */ - public void onOffsetsReceived(InetAddressAndPort from) + public void onOffsetsReceived() { if (completed.get()) return; - boolean newParticipant = reportedParticipants.add(from); - - if (newParticipant) - { - logger.trace("Sync coordinator received offsets from new participant {}. Reported: {}/{}", - from, reportedParticipants.size(), allParticipants.size()); - } - - recaptureTargets(); // Recapture targets to include any new coordinator logs - + recaptureTargets(); checkIfReadyToComplete(); } @@ -369,10 +276,7 @@ public void onSyncResponse(MutationTrackingSyncResponse response) } } - reportedParticipants.add(response.participant); - - logger.trace("Sync coordinator received sync response from {}. 
Reported: {}/{}", - response.participant, reportedParticipants.size(), allParticipants.size()); + logger.trace("Sync coordinator received sync response from {}", response.participant); checkIfReadyToComplete(); } diff --git a/src/java/org/apache/cassandra/service/StorageService.java b/src/java/org/apache/cassandra/service/StorageService.java index 39993ffe7fab..8fb60ff1c2d6 100644 --- a/src/java/org/apache/cassandra/service/StorageService.java +++ b/src/java/org/apache/cassandra/service/StorageService.java @@ -5464,6 +5464,19 @@ public void setRepairRpcTimeout(Long timeoutInMillis) DatabaseDescriptor.setRepairRpcTimeout(timeoutInMillis); logger.info("RepairRpcTimeout set to {}ms via JMX", timeoutInMillis); } + + public long getMutationTrackingSyncTimeout() + { + return DatabaseDescriptor.getMutationTrackingSyncTimeout(MILLISECONDS); + } + + public void setMutationTrackingSyncTimeout(long timeoutInMillis) + { + checkState(timeoutInMillis > 0); + DatabaseDescriptor.setMutationTrackingSyncTimeout(timeoutInMillis); + logger.info("MutationTrackingSyncTimeout set to {}ms via JMX", timeoutInMillis); + } + public void evictHungRepairs() { logger.info("StorageService#clearPaxosRateLimiters called via jmx"); diff --git a/src/java/org/apache/cassandra/service/StorageServiceMBean.java b/src/java/org/apache/cassandra/service/StorageServiceMBean.java index 5d4781c54651..31935467e400 100644 --- a/src/java/org/apache/cassandra/service/StorageServiceMBean.java +++ b/src/java/org/apache/cassandra/service/StorageServiceMBean.java @@ -1334,6 +1334,9 @@ public void enableAuditLog(String loggerName, String includedKeyspaces, String e public Long getRepairRpcTimeout(); public void setRepairRpcTimeout(Long timeoutInMillis); + public long getMutationTrackingSyncTimeout(); + public void setMutationTrackingSyncTimeout(long timeoutInMillis); + public void evictHungRepairs(); public void clearPaxosRepairs(); public void setSkipPaxosRepairCompatibilityCheck(boolean v); diff --git 
a/test/distributed/org/apache/cassandra/distributed/test/replication/MutationTrackingSyncCoordinatorTest.java b/test/distributed/org/apache/cassandra/distributed/test/replication/MutationTrackingSyncCoordinatorTest.java index 5374bc2c9f40..499b24322ddf 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/replication/MutationTrackingSyncCoordinatorTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/replication/MutationTrackingSyncCoordinatorTest.java @@ -128,7 +128,6 @@ public void testSyncCoordinatorWaitsForAllReplicasMutations() throws Throwable // Start MutationTrackingSyncCoordinator on node 2 in a separate thread // It should wait for offsets to sync since node 1's data hasn't propagated yet - long syncStartTime = System.currentTimeMillis(); CompletableFuture coordinatorFuture = CompletableFuture.supplyAsync(() -> cluster.get(2).callOnInstance(() -> { Range range = fullTokenRange(); RepairJobDesc desc = new RepairJobDesc(TimeUUID.Generator.nextTimeUUID(), @@ -202,11 +201,6 @@ public void testSyncCoordinatorWaitsForAllReplicasMutations() throws Throwable assertEquals(1, results[0][1]); }); } - - // Verify the sync respected the minimum broadcast wait time (MIN_BROADCAST_WAIT_MS = 300ms) - long syncDuration = System.currentTimeMillis() - syncStartTime; - assertTrue("Sync should wait at least MIN_BROADCAST_WAIT_MS (300ms). 
Actual: " + syncDuration + "ms", - syncDuration >= 300); } } @@ -268,61 +262,4 @@ public void testSyncCoordinatorCancel() throws Throwable assertTrue("Sync coordinator should be cancelled", wasCancelled); } } - - @Test - public void testSyncCoordinatorTimesOutOnUnresponsiveParticipant() throws Throwable - { - try (Cluster cluster = builder().withNodes(3).start()) - { - createTrackedKeyspace(cluster, "5"); - - cluster.coordinator(1).execute( - "INSERT INTO " + tableName("5") + " (k, v) VALUES (1, 1)", - ConsistencyLevel.ALL - ); - - // Pause broadcasts on node 3 BEFORE any broadcasts - this ensures node 3 - // never sends any offset broadcasts to the coordinator, simulating an unresponsive node - cluster.get(3).runOnInstance(() -> MutationTrackingService.instance.pauseOffsetBroadcast(true)); - - // Broadcast from all nodes - node 3's broadcast is a no-op because it's paused - for (int i = 1; i <= cluster.size(); i++) - cluster.get(i).runOnInstance(() -> MutationTrackingService.instance.broadcastOffsetsForTesting()); - - // Also block messages FROM node 3 to ensure even if periodic broadcasts resume, - // they won't reach the coordinator - cluster.filters().allVerbs().from(3).drop(); - - long syncStartTime = System.currentTimeMillis(); - - // Start sync coordinator on node 1 - it should time out waiting for node 3 - Boolean completed = cluster.get(1).callOnInstance(() -> { - Range range = fullTokenRange(); - RepairJobDesc desc = new RepairJobDesc(TimeUUID.Generator.nextTimeUUID(), - TimeUUID.Generator.nextTimeUUID(), - KS_NAME + '5', "", java.util.List.of(range)); - MutationTrackingSyncCoordinator coordinator = new MutationTrackingSyncCoordinator( - SharedContext.Global.instance, desc); - coordinator.start(); - - try - { - // Wait longer than PARTICIPANT_TIMEOUT_MS (10s) + buffer - return coordinator.awaitCompletion(20, TimeUnit.SECONDS); - } - catch (InterruptedException e) - { - Thread.currentThread().interrupt(); - return false; - } - }); - - long syncDuration = 
System.currentTimeMillis() - syncStartTime; - - assertTrue("Sync coordinator should complete after timeout", completed); - // Should have taken at least PARTICIPANT_TIMEOUT_MS (10s) - assertTrue("Sync should have timed out waiting for participant. Actual: " + syncDuration + "ms", - syncDuration >= 10000); - } - } } From 3c457fc9cc88d6f388d289a89e73862001a93695 Mon Sep 17 00:00:00 2001 From: Ariel Weisberg Date: Thu, 5 Mar 2026 11:49:58 -0500 Subject: [PATCH 12/46] Fix timeout calculation and clean up error handling --- ...MutationTrackingIncrementalRepairTask.java | 31 ++++++++++++++----- .../MutationTrackingSyncCoordinator.java | 23 +++++--------- 2 files changed, 32 insertions(+), 22 deletions(-) diff --git a/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java b/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java index 78a7a6275444..5959b2ae9831 100644 --- a/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java +++ b/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java @@ -30,6 +30,7 @@ import org.apache.cassandra.schema.KeyspaceMetadata; import org.apache.cassandra.service.replication.migration.KeyspaceMigrationInfo; import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.utils.Throwables; import org.apache.cassandra.utils.TimeUUID; import org.apache.cassandra.utils.concurrent.AsyncPromise; import org.apache.cassandra.utils.concurrent.Future; @@ -115,22 +116,38 @@ private void waitForSyncCompletion(List syncCoo List>> rangeCollections, AsyncPromise resultPromise) throws InterruptedException { + long deadlineNanos = coordinator.ctx.clock().nanoTime() + TimeUnit.MILLISECONDS.toNanos( + DatabaseDescriptor.getMutationTrackingSyncTimeout(TimeUnit.MILLISECONDS)); boolean allSucceeded = true; + Throwable error = null; for (MutationTrackingSyncCoordinator syncCoordinator : syncCoordinators) { - boolean completed = 
syncCoordinator.awaitCompletion( - DatabaseDescriptor.getMutationTrackingSyncTimeout(TimeUnit.MILLISECONDS), TimeUnit.MILLISECONDS); - if (!completed) + long remainingNanos = deadlineNanos - coordinator.ctx.clock().nanoTime(); + try + { + boolean completed = syncCoordinator.awaitCompletion(remainingNanos, TimeUnit.NANOSECONDS); + if (!completed) + { + syncCoordinator.cancel(); + allSucceeded = false; + } + } + catch (RuntimeException e) { - logger.warn("Mutation tracking sync timed out for keyspace {} range {}", - keyspace, syncCoordinator.getRange()); - syncCoordinator.cancel(); - allSucceeded = false; + error = Throwables.merge(error, e); } } + if (error != null) + { + logger.warn("Mutation tracking sync failed for keyspace {}", keyspace, error); + resultPromise.tryFailure(error); + return; + } + if (!allSucceeded) { + logger.warn("Mutation tracking sync timed out for keyspace {}", keyspace); resultPromise.tryFailure(new RuntimeException("Mutation tracking sync timed out for some ranges")); return; } diff --git a/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java b/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java index 7f452b434ed2..3e3e54bd7234 100644 --- a/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java +++ b/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java @@ -61,7 +61,6 @@ public class MutationTrackingSyncCoordinator private final Map, ShardSyncState> shardStates = new HashMap<>(); private final AtomicBoolean started = new AtomicBoolean(false); - private final AtomicBoolean completed = new AtomicBoolean(false); public MutationTrackingSyncCoordinator(SharedContext ctx, RepairJobDesc desc) { @@ -115,10 +114,8 @@ public void start() private void complete() { - if (!completed.compareAndSet(false, true)) - return; - MutationTrackingService.instance.unregisterSyncCoordinator(this); - completionFuture.setSuccess(null); + if 
(completionFuture.trySuccess(null)) + MutationTrackingService.instance.unregisterSyncCoordinator(this); } private void sendSyncRequests(InetAddressAndPort localAddress) @@ -207,12 +204,11 @@ private void failWithTopologyChange() private void fail(Throwable cause) { - if (completed.compareAndSet(false, true)) + if (completionFuture.tryFailure(cause)) { logger.warn("Sync coordinator for keyspace {} range {} failed: {}", keyspace, range, cause.getMessage()); MutationTrackingService.instance.unregisterSyncCoordinator(this); - completionFuture.setFailure(cause); } } @@ -221,7 +217,7 @@ private void fail(Throwable cause) */ private void checkIfReadyToComplete() { - if (completed.get() || checkForTopologyChange()) + if (completionFuture.isDone() || checkForTopologyChange()) return; if (checkIfComplete()) @@ -246,7 +242,7 @@ private boolean checkIfComplete() */ public void onOffsetsReceived() { - if (completed.get()) + if (completionFuture.isDone()) return; recaptureTargets(); @@ -262,7 +258,7 @@ public void onOffsetsReceived() */ public void onSyncResponse(MutationTrackingSyncResponse response) { - if (completed.get()) + if (completionFuture.isDone()) return; // Update shard targets with the offsets received from the participant @@ -311,17 +307,14 @@ public boolean awaitCompletion(long timeout, TimeUnit unit) throws InterruptedEx } catch (java.util.concurrent.ExecutionException e) { - throw new RuntimeException(e.getCause()); + throw new RuntimeException(e); } } public void cancel() { - if (completed.compareAndSet(false, true)) - { + if (completionFuture.tryFailure(new RuntimeException("Sync cancelled"))) MutationTrackingService.instance.unregisterSyncCoordinator(this); - completionFuture.setFailure(new RuntimeException("Sync cancelled")); - } } /** From 17095cd951bc66a510a83e60ce411a20512e4a4f Mon Sep 17 00:00:00 2001 From: Ariel Weisberg Date: Thu, 5 Mar 2026 11:54:40 -0500 Subject: [PATCH 13/46] Mutation tracking sync timeout doesn't need to be that long. 
If it didn't work in 2 minutes it won't work. --- src/java/org/apache/cassandra/config/Config.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/java/org/apache/cassandra/config/Config.java b/src/java/org/apache/cassandra/config/Config.java index 09aaa16323ac..49067333aebc 100644 --- a/src/java/org/apache/cassandra/config/Config.java +++ b/src/java/org/apache/cassandra/config/Config.java @@ -169,7 +169,7 @@ public static Set splitCommaDelimited(String src) @Replaces(oldName = "repair_request_timeout_in_ms", converter = Converters.MILLIS_DURATION_LONG, deprecated = true) public volatile DurationSpec.LongMillisecondsBound repair_request_timeout = new DurationSpec.LongMillisecondsBound("120000ms"); - public volatile DurationSpec.LongMillisecondsBound mutation_tracking_sync_timeout = new DurationSpec.LongMillisecondsBound("1800000ms"); + public volatile DurationSpec.LongMillisecondsBound mutation_tracking_sync_timeout = new DurationSpec.LongMillisecondsBound("2m"); public Integer streaming_connections_per_host = 1; @Replaces(oldName = "streaming_keep_alive_period_in_secs", converter = Converters.SECONDS_DURATION, deprecated = true) From 65996e43326b6060317eea8bf894e98c845541de Mon Sep 17 00:00:00 2001 From: Ariel Weisberg Date: Thu, 5 Mar 2026 14:31:44 -0500 Subject: [PATCH 14/46] Using IncrementalRepairTask directly instead of embedding inside MutationTrackingIncrementalRepairTask --- ...MutationTrackingIncrementalRepairTask.java | 53 +++--------------- .../cassandra/repair/RepairCoordinator.java | 55 +++++++++++++------ 2 files changed, 46 insertions(+), 62 deletions(-) diff --git a/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java b/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java index 5959b2ae9831..75b0cf1e7e3f 100644 --- a/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java +++ 
b/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java @@ -35,7 +35,7 @@ import org.apache.cassandra.utils.concurrent.AsyncPromise; import org.apache.cassandra.utils.concurrent.Future; -/** Incremental repair task for keyspaces using mutation tracking */ +/** Repair task that syncs mutation tracking offsets across replicas */ public class MutationTrackingIncrementalRepairTask extends AbstractRepairTask { @@ -57,7 +57,7 @@ protected MutationTrackingIncrementalRepairTask(RepairCoordinator coordinator, @Override public String name() { - return "MutationTrackingIncrementalRepair"; + return "MutationTrackingRepair"; } @Override @@ -97,7 +97,7 @@ public Future performUnsafe(ExecutorPlus executor, Sche executor.execute(() -> { try { - waitForSyncCompletion(syncCoordinators, executor, validationScheduler, allRanges, rangeCollections, resultPromise); + waitForSyncCompletion(syncCoordinators, rangeCollections, resultPromise); } catch (Exception e) { @@ -110,9 +110,6 @@ public Future performUnsafe(ExecutorPlus executor, Sche } private void waitForSyncCompletion(List syncCoordinators, - ExecutorPlus executor, - Scheduler validationScheduler, - List allRanges, List>> rangeCollections, AsyncPromise resultPromise) throws InterruptedException { @@ -154,47 +151,13 @@ private void waitForSyncCompletion(List syncCoo coordinator.notifyProgress("Mutation tracking sync completed for all ranges"); - if (coordinator.mutationTrackingMigrationInProgress) + List results = new ArrayList<>(); + for (int i = 0; i < rangeCollections.size(); i++) { - runTraditionalRepairForMigration(executor, validationScheduler, allRanges, resultPromise); + Collection> ranges = rangeCollections.get(i); + results.add(new RepairSessionResult(parentSession, keyspace, ranges, List.of(), false)); } - else - { - // Pure mutation tracking - create successful result - List results = new ArrayList<>(); - for (int i = 0; i < rangeCollections.size(); i++) - { - Collection> ranges = 
rangeCollections.get(i); - results.add(new RepairSessionResult(parentSession, keyspace, ranges, List.of(), false)); - } - resultPromise.trySuccess(CoordinatedRepairResult.create(rangeCollections, results)); - } - } - - private void runTraditionalRepairForMigration(ExecutorPlus executor, - Scheduler validationScheduler, - List allRanges, - AsyncPromise resultPromise) - { - coordinator.notifyProgress("Running traditional repair for migration"); - - // Use the inherited runRepair method from AbstractRepairTask - Future traditionalRepair = runRepair(parentSession, true, executor, - validationScheduler, allRanges, - neighborsAndRanges.shouldExcludeDeadParticipants, - cfnames); - - traditionalRepair.addListener(f -> { - try - { - CoordinatedRepairResult result = (CoordinatedRepairResult) f.get(); - resultPromise.setSuccess(result); - } - catch (Exception e) - { - resultPromise.setFailure(e); - } - }); + resultPromise.trySuccess(CoordinatedRepairResult.create(rangeCollections, results)); } /** diff --git a/src/java/org/apache/cassandra/repair/RepairCoordinator.java b/src/java/org/apache/cassandra/repair/RepairCoordinator.java index e4a3ae5a24a6..bc6251de2763 100644 --- a/src/java/org/apache/cassandra/repair/RepairCoordinator.java +++ b/src/java/org/apache/cassandra/repair/RepairCoordinator.java @@ -87,6 +87,7 @@ import org.apache.cassandra.utils.TimeUUID; import org.apache.cassandra.utils.WrappedRunnable; import org.apache.cassandra.utils.concurrent.Future; +import org.apache.cassandra.utils.concurrent.ImmediateFuture; import org.apache.cassandra.utils.progress.ProgressEvent; import org.apache.cassandra.utils.progress.ProgressEventNotifier; import org.apache.cassandra.utils.progress.ProgressEventType; @@ -127,12 +128,14 @@ public class RepairCoordinator implements Runnable, ProgressEventNotifier, Repai public static RepairCoordinator create(StorageService storageService, int cmd, RepairOption options, String keyspace, Epoch minEpoch) { ClusterMetadata metadata = 
ClusterMetadata.current(); - boolean useMT = MutationTrackingIncrementalRepairTask.shouldUseMutationTrackingRepair(metadata, keyspace); - boolean mtMigration = MutationTrackingIncrementalRepairTask.isMutationTrackingMigrationInProgress(metadata, keyspace); + boolean useMT = options.isIncremental() + && MutationTrackingIncrementalRepairTask.shouldUseMutationTrackingRepair(metadata, keyspace); + boolean mtMigration = useMT + && MutationTrackingIncrementalRepairTask.isMutationTrackingMigrationInProgress(metadata, keyspace); // If using mutation tracking without migration, flip incremental to false // to prevent anti-compaction since mutation tracking doesn't use repaired/unrepaired distinction - if (useMT && !mtMigration && options.isIncremental()) + if (useMT && !mtMigration) { logger.info("Keyspace {} uses mutation tracking; disabling incremental repair to skip anti-compaction", keyspace); options = options.withIncremental(false); @@ -529,32 +532,50 @@ private Future prepare(List columnFamilies, Set>> repair(String[] cfnames, NeighborsAndRanges neighborsAndRanges) { - RepairTask task; + ExecutorPlus executor = createExecutor(); + state.phase.repairSubmitted(); + if (state.options.isPreview()) { - task = new PreviewRepairTask(this, state.id, neighborsAndRanges.filterCommonRanges(state.keyspace, cfnames), neighborsAndRanges.shouldExcludeDeadParticipants, cfnames); + RepairTask task = new PreviewRepairTask(this, state.id, neighborsAndRanges.filterCommonRanges(state.keyspace, cfnames), neighborsAndRanges.shouldExcludeDeadParticipants, cfnames); + return task.perform(executor, validationScheduler) + .>>map(r -> Pair.create(r, task::successMessage)) + .addCallback((s, f) -> executor.shutdown()); } else if (useMutationTracking) { - // Mutation tracking repair: incremental was already flipped to false in the factory method - // (unless migration is in progress, in which case incremental stays true for anti-compaction) - task = new MutationTrackingIncrementalRepairTask(this, 
state.id, neighborsAndRanges, cfnames); + RepairTask mtTask = new MutationTrackingIncrementalRepairTask(this, state.id, neighborsAndRanges, cfnames); + if (mutationTrackingMigrationInProgress) + { + // During migration, run incremental repair first, then mutation tracking sync + RepairTask incrementalTask = new IncrementalRepairTask(this, state.id, neighborsAndRanges, cfnames); + return incrementalTask.perform(executor, validationScheduler) + .flatMap(irResult -> { + if (irResult.hasFailed()) + return ImmediateFuture.success(Pair.create(irResult, incrementalTask::successMessage)); + return mtTask.perform(executor, validationScheduler) + .>>map(r -> Pair.create(r, mtTask::successMessage)); + }) + .addCallback((s, f) -> executor.shutdown()); + } + return mtTask.perform(executor, validationScheduler) + .>>map(r -> Pair.create(r, mtTask::successMessage)) + .addCallback((s, f) -> executor.shutdown()); } else if (state.options.isIncremental()) { - task = new IncrementalRepairTask(this, state.id, neighborsAndRanges, cfnames); + RepairTask task = new IncrementalRepairTask(this, state.id, neighborsAndRanges, cfnames); + return task.perform(executor, validationScheduler) + .>>map(r -> Pair.create(r, task::successMessage)) + .addCallback((s, f) -> executor.shutdown()); } else { - task = new NormalRepairTask(this, state.id, neighborsAndRanges.filterCommonRanges(state.keyspace, cfnames), neighborsAndRanges.shouldExcludeDeadParticipants, cfnames); + RepairTask task = new NormalRepairTask(this, state.id, neighborsAndRanges.filterCommonRanges(state.keyspace, cfnames), neighborsAndRanges.shouldExcludeDeadParticipants, cfnames); + return task.perform(executor, validationScheduler) + .>>map(r -> Pair.create(r, task::successMessage)) + .addCallback((s, f) -> executor.shutdown()); } - - ExecutorPlus executor = createExecutor(); - state.phase.repairSubmitted(); - return task.perform(executor, validationScheduler) - // after adding the callback java could no longer infer the type... 
- .>>map(r -> Pair.create(r, task::successMessage)) - .addCallback((s, f) -> executor.shutdown()); } private ExecutorPlus createExecutor() From f99c9d5faeec3ec47387f8ed05ae0ac197744fc2 Mon Sep 17 00:00:00 2001 From: Ariel Weisberg Date: Mon, 9 Mar 2026 13:16:46 -0400 Subject: [PATCH 15/46] Add support for force and with hosts --- ...MutationTrackingIncrementalRepairTask.java | 4 +- .../cassandra/repair/RepairCoordinator.java | 9 +- .../repair/RepairMessageVerbHandler.java | 2 +- .../messages/MutationTrackingSyncRequest.java | 21 ++++- .../cassandra/replication/CoordinatorLog.java | 56 ++++++++++- .../replication/MutationTrackingService.java | 4 +- .../MutationTrackingSyncCoordinator.java | 93 +++++++++++++++---- .../apache/cassandra/replication/Shard.java | 42 ++++++++- .../utils/CollectionSerializers.java | 27 ++++++ 9 files changed, 226 insertions(+), 32 deletions(-) diff --git a/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java b/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java index 75b0cf1e7e3f..a43c6b872227 100644 --- a/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java +++ b/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java @@ -42,6 +42,7 @@ public class MutationTrackingIncrementalRepairTask extends AbstractRepairTask private final TimeUUID parentSession; private final RepairCoordinator.NeighborsAndRanges neighborsAndRanges; private final String[] cfnames; + private final ClusterMetadata metadata; protected MutationTrackingIncrementalRepairTask(RepairCoordinator coordinator, TimeUUID parentSession, @@ -52,6 +53,7 @@ protected MutationTrackingIncrementalRepairTask(RepairCoordinator coordinator, this.parentSession = parentSession; this.neighborsAndRanges = neighborsAndRanges; this.cfnames = cfnames; + this.metadata = coordinator.metadata; } @Override @@ -81,7 +83,7 @@ public Future performUnsafe(ExecutorPlus executor, Sche RepairJobDesc desc 
= new RepairJobDesc(parentSession, TimeUUID.Generator.nextTimeUUID(), keyspace, "", List.of(range)); MutationTrackingSyncCoordinator syncCoordinator = new MutationTrackingSyncCoordinator( - coordinator.ctx, desc); + coordinator.ctx, desc, commonRange.endpoints, metadata); syncCoordinator.start(); syncCoordinators.add(syncCoordinator); rangeCollections.add(List.of(range)); diff --git a/src/java/org/apache/cassandra/repair/RepairCoordinator.java b/src/java/org/apache/cassandra/repair/RepairCoordinator.java index bc6251de2763..a51a62eb1e25 100644 --- a/src/java/org/apache/cassandra/repair/RepairCoordinator.java +++ b/src/java/org/apache/cassandra/repair/RepairCoordinator.java @@ -117,6 +117,7 @@ public class RepairCoordinator implements Runnable, ProgressEventNotifier, Repai // Mutation tracking decision, snapshotted once at creation time from TCM final boolean useMutationTracking; final boolean mutationTrackingMigrationInProgress; + final ClusterMetadata metadata; private TraceState traceState; @@ -145,7 +146,7 @@ public static RepairCoordinator create(StorageService storageService, int cmd, R (ks, tables) -> storageService.getValidColumnFamilies(false, false, ks, tables), storageService::getLocalReplicas, cmd, options, keyspace, minEpoch, - useMT, mtMigration); + useMT, mtMigration, metadata); } RepairCoordinator(SharedContext ctx, @@ -153,14 +154,15 @@ public static RepairCoordinator create(StorageService storageService, int cmd, R Function getLocalReplicas, int cmd, RepairOption options, String keyspace, Epoch minEpoch) { - this(ctx, validColumnFamilies, getLocalReplicas, cmd, options, keyspace, minEpoch, false, false); + this(ctx, validColumnFamilies, getLocalReplicas, cmd, options, keyspace, minEpoch, false, false, null); } RepairCoordinator(SharedContext ctx, BiFunction> validColumnFamilies, Function getLocalReplicas, int cmd, RepairOption options, String keyspace, Epoch minEpoch, - boolean useMutationTracking, boolean mutationTrackingMigrationInProgress) + 
boolean useMutationTracking, boolean mutationTrackingMigrationInProgress, + ClusterMetadata metadata) { this.ctx = ctx; this.minEpoch = minEpoch; @@ -171,6 +173,7 @@ public static RepairCoordinator create(StorageService storageService, int cmd, R this.getLocalReplicas = getLocalReplicas; this.useMutationTracking = useMutationTracking; this.mutationTrackingMigrationInProgress = mutationTrackingMigrationInProgress; + this.metadata = metadata; ctx.repair().register(state); } diff --git a/src/java/org/apache/cassandra/repair/RepairMessageVerbHandler.java b/src/java/org/apache/cassandra/repair/RepairMessageVerbHandler.java index ab9b0ad5515d..2060ec1badb0 100644 --- a/src/java/org/apache/cassandra/repair/RepairMessageVerbHandler.java +++ b/src/java/org/apache/cassandra/repair/RepairMessageVerbHandler.java @@ -514,7 +514,7 @@ private void handleMutationTrackingSyncRequest(Message message) try { Map, Map> offsets = - MutationTrackingService.instance.collectWitnessedOffsetsForRanges(desc.keyspace, desc.ranges); + MutationTrackingService.instance().collectWitnessedOffsetsForRanges(desc.keyspace, desc.ranges, request.liveHostIds); MutationTrackingSyncResponse response = new MutationTrackingSyncResponse( desc, diff --git a/src/java/org/apache/cassandra/repair/messages/MutationTrackingSyncRequest.java b/src/java/org/apache/cassandra/repair/messages/MutationTrackingSyncRequest.java index 60adf0792162..dab889580a3f 100644 --- a/src/java/org/apache/cassandra/repair/messages/MutationTrackingSyncRequest.java +++ b/src/java/org/apache/cassandra/repair/messages/MutationTrackingSyncRequest.java @@ -18,23 +18,34 @@ package org.apache.cassandra.repair.messages; import java.io.IOException; +import java.util.Set; import org.apache.cassandra.io.IVersionedSerializer; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; import org.apache.cassandra.repair.RepairJobDesc; +import static 
org.apache.cassandra.utils.CollectionSerializers.nullableIntSetSerializer; + /** * Request sent from the mutation tracking repair coordinator to each participant to collect * their current witnessed offsets. This establishes a happens-before relationship: the * participant's response contains offsets captured after receiving this request, which is * sent after the repair starts. + * + * The liveHostIds set tells the responder which hosts are participating in this repair, + * so that the response only includes offsets witnessed by those hosts. This prevents the + * coordinator from setting sync targets that include offsets only known to down nodes. */ public class MutationTrackingSyncRequest extends RepairMessage { - public MutationTrackingSyncRequest(RepairJobDesc desc) + /** The set of host IDs participating in this repair. Null means all replicas. */ + public final Set liveHostIds; + + public MutationTrackingSyncRequest(RepairJobDesc desc, Set liveHostIds) { super(desc); + this.liveHostIds = liveHostIds; } @Override @@ -42,6 +53,7 @@ public String toString() { return "MutationTrackingSyncRequest{" + "desc=" + desc + + ", liveHostIds=" + liveHostIds + '}'; } @@ -50,17 +62,20 @@ public String toString() public void serialize(MutationTrackingSyncRequest request, DataOutputPlus out, int version) throws IOException { RepairJobDesc.serializer.serialize(request.desc, out, version); + nullableIntSetSerializer.serialize(request.liveHostIds, out); } public MutationTrackingSyncRequest deserialize(DataInputPlus in, int version) throws IOException { RepairJobDesc desc = RepairJobDesc.serializer.deserialize(in, version); - return new MutationTrackingSyncRequest(desc); + Set liveHostIds = nullableIntSetSerializer.deserialize(in); + return new MutationTrackingSyncRequest(desc, liveHostIds); } public long serializedSize(MutationTrackingSyncRequest request, int version) { - return RepairJobDesc.serializer.serializedSize(request.desc, version); + return 
RepairJobDesc.serializer.serializedSize(request.desc, version) + + nullableIntSetSerializer.serializedSize(request.liveHostIds); } }; } diff --git a/src/java/org/apache/cassandra/replication/CoordinatorLog.java b/src/java/org/apache/cassandra/replication/CoordinatorLog.java index 68ad16e42a20..4ca4949620b1 100644 --- a/src/java/org/apache/cassandra/replication/CoordinatorLog.java +++ b/src/java/org/apache/cassandra/replication/CoordinatorLog.java @@ -293,8 +293,60 @@ Offsets.Immutable collectUnionOfWitnessedOffsets() lock.readLock().lock(); try { - Offsets.Mutable union = witnessedOffsets.union(); - return union.isEmpty() ? null : Offsets.Immutable.copy(union); + return Offsets.Immutable.copy(witnessedOffsets.union()); + } + finally + { + lock.readLock().unlock(); + } + } + + /** + * Returns the UNION of witnessed offsets scoped to only the specified host IDs. + */ + Offsets.Immutable collectUnionOfWitnessedOffsets(Set liveHostIds) + { + Offsets.Mutable union = new Offsets.Mutable(logId); + lock.readLock().lock(); + try + { + for (int hostId : liveHostIds) + { + if (!participants.contains(hostId)) + continue; + + Offsets.Mutable nodeOffsets = witnessedOffsets.get(hostId); + union.addAll(nodeOffsets); + } + } + finally + { + lock.readLock().unlock(); + } + return Offsets.Immutable.copy(union); + } + + /** + * Returns the intersection of witnessed offsets scoped to only the specified host IDs. + */ + Offsets.Immutable collectReconciledOffsets(Set liveHostIds) + { + lock.readLock().lock(); + try + { + Offsets.Mutable intersection = null; + for (int hostId : liveHostIds) + { + if (!participants.contains(hostId)) + continue; + + Offsets.Mutable nodeOffsets = witnessedOffsets.get(hostId); + if (intersection == null) + intersection = Offsets.Mutable.copy(nodeOffsets); + else + intersection = Offsets.Mutable.intersection(intersection, nodeOffsets); + } + return intersection == null ? 
new Offsets.Immutable(logId) : Offsets.Immutable.copy(intersection); } finally { diff --git a/src/java/org/apache/cassandra/replication/MutationTrackingService.java b/src/java/org/apache/cassandra/replication/MutationTrackingService.java index 5de2a3979ffc..ea97e0319e75 100644 --- a/src/java/org/apache/cassandra/replication/MutationTrackingService.java +++ b/src/java/org/apache/cassandra/replication/MutationTrackingService.java @@ -677,7 +677,7 @@ public void forEachShardInKeyspace(String keyspace, Consumer consumer) * @param ranges the token ranges to find overlapping shards for * @return a map from shard range to the union of witnessed offsets per coordinator log */ - public Map, Map> collectWitnessedOffsetsForRanges(String keyspace, Collection> ranges) + public Map, Map> collectWitnessedOffsetsForRanges(String keyspace, Collection> ranges, Set liveHostIds) { Map, Map> result = new HashMap<>(); shardLock.readLock().lock(); @@ -691,7 +691,7 @@ public Map, Map> collectWitnes { if (shard.range.intersects(range)) { - result.put(shard.range, shard.collectUnionOfWitnessedOffsetsPerLog()); + result.put(shard.range, shard.collectUnionOfWitnessedOffsetsPerLog(liveHostIds)); break; } } diff --git a/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java b/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java index 3e3e54bd7234..6cb0515e97e2 100644 --- a/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java +++ b/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java @@ -19,6 +19,7 @@ package org.apache.cassandra.replication; import java.util.ArrayList; +import java.util.HashSet; import java.util.List; import java.util.HashMap; import java.util.Map; @@ -44,7 +45,7 @@ import org.apache.cassandra.repair.messages.MutationTrackingSyncRequest; import org.apache.cassandra.repair.messages.MutationTrackingSyncResponse; import org.apache.cassandra.repair.messages.RepairMessage; -import 
org.apache.cassandra.utils.FBUtilities; +import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.utils.concurrent.AsyncPromise; public class MutationTrackingSyncCoordinator @@ -55,19 +56,41 @@ public class MutationTrackingSyncCoordinator private final RepairJobDesc desc; private final String keyspace; private final Range range; + private final Set participants; + private final ClusterMetadata metadata; private final AsyncPromise completionFuture = new AsyncPromise<>(); // Per-shard state: tracks what each node has reported for that shard private final Map, ShardSyncState> shardStates = new HashMap<>(); + // Host IDs of participants for scoped offset collection/completion. + // Null means all shard participants (no filtering). + private Set liveHostIds; + private final AtomicBoolean started = new AtomicBoolean(false); - public MutationTrackingSyncCoordinator(SharedContext ctx, RepairJobDesc desc) + // Remote participants we are waiting for sync responses from. Completion is + // not possible until all responses have been received, since remote nodes may + // report targets that the local node doesn't know about yet. + private final Set pendingSyncResponses = ConcurrentHashMap.newKeySet(); + + /** + * @param ctx shared context + * @param desc repair job descriptor + * @param participants the set of remote endpoints that should participate in this sync, + * as determined by the repair options (force, specific hosts). + * Only these endpoints will receive sync requests. If null, + * all remote replicas for overlapping shards will participate. 
+ * @param metadata the snapshotted cluster metadata used to resolve endpoint-to-host-ID mappings + */ + public MutationTrackingSyncCoordinator(SharedContext ctx, RepairJobDesc desc, Set participants, ClusterMetadata metadata) { this.ctx = ctx; this.desc = desc; this.keyspace = desc.keyspace; this.range = Iterables.getOnlyElement(desc.ranges); + this.participants = participants; + this.metadata = metadata; } public void start() @@ -78,7 +101,7 @@ public void start() List overlappingShards; overlappingShards = new ArrayList<>(); - MutationTrackingService.instance.forEachShardInKeyspace(keyspace, shard -> { + MutationTrackingService.instance().forEachShardInKeyspace(keyspace, shard -> { if (shard.range.intersects(range)) overlappingShards.add(shard); }); @@ -89,15 +112,27 @@ public void start() return; } - InetAddressAndPort localAddress = FBUtilities.getBroadcastAddressAndPort(); + // Convert participant endpoints to host IDs for scoped completion checks. + // If participants is null (no filtering), all shard participants are live. + if (participants != null) + { + liveHostIds = new HashSet<>(); + for (InetAddressAndPort ep : participants) + { + liveHostIds.add(metadata.directory.peerId(ep).id()); + } + // Always include the local node + liveHostIds.add(metadata.directory.peerId(ctx.broadcastAddressAndPort()).id()); + } + for (Shard shard : overlappingShards) { - ShardSyncState state = new ShardSyncState(shard); + ShardSyncState state = new ShardSyncState(shard, liveHostIds); shardStates.put(shard.range, state); } // Register to receive offset updates - MutationTrackingService.instance.registerSyncCoordinator(this); + MutationTrackingService.instance().registerSyncCoordinator(this); // Capture local targets recaptureTargets(); @@ -106,7 +141,7 @@ public void start() keyspace, range, overlappingShards.size()); // Send sync requests to all remote participants - sendSyncRequests(localAddress); + sendSyncRequests(); // Check if already complete (e.g. 
single node, no targets) checkIfReadyToComplete(); @@ -115,16 +150,24 @@ public void start() private void complete() { if (completionFuture.trySuccess(null)) - MutationTrackingService.instance.unregisterSyncCoordinator(this); + MutationTrackingService.instance().unregisterSyncCoordinator(this); } - private void sendSyncRequests(InetAddressAndPort localAddress) + private void sendSyncRequests() { - MutationTrackingSyncRequest request = new MutationTrackingSyncRequest(desc); + MutationTrackingSyncRequest request = new MutationTrackingSyncRequest(desc, liveHostIds); + // Collect remote replicas, filtering to only allowed participants if specified. + // This respects --force (which excludes dead nodes) and --hosts (which + // restricts to specific nodes). Set remoteParticipants = ConcurrentHashMap.newKeySet(); for (ShardSyncState state : shardStates.values()) remoteParticipants.addAll(state.shard.remoteReplicas()); + if (participants != null) + remoteParticipants.retainAll(participants); + + pendingSyncResponses.addAll(remoteParticipants); + for (InetAddressAndPort participant : remoteParticipants) { logger.debug("Sending mutation tracking sync request to {} for {}", participant, desc); @@ -190,7 +233,7 @@ private boolean checkForTopologyChange() private Shard getCurrentShard(Range shardRange) { Shard[] result = new Shard[1]; - MutationTrackingService.instance.forEachShardInKeyspace(keyspace, shard -> { + MutationTrackingService.instance().forEachShardInKeyspace(keyspace, shard -> { if (shard.range.equals(shardRange)) result[0] = shard; }); @@ -208,7 +251,7 @@ private void fail(Throwable cause) { logger.warn("Sync coordinator for keyspace {} range {} failed: {}", keyspace, range, cause.getMessage()); - MutationTrackingService.instance.unregisterSyncCoordinator(this); + MutationTrackingService.instance().unregisterSyncCoordinator(this); } } @@ -229,6 +272,9 @@ private void checkIfReadyToComplete() private boolean checkIfComplete() { + if 
(!pendingSyncResponses.isEmpty()) + return false; + for (ShardSyncState state : shardStates.values()) { if (!state.isComplete()) @@ -261,6 +307,11 @@ public void onSyncResponse(MutationTrackingSyncResponse response) if (completionFuture.isDone()) return; + // Deduplicate: retries of MT_SYNC_REQ can produce multiple responses from the + // same participant. Only process the first one. + if (!pendingSyncResponses.remove(response.participant)) + return; + // Update shard targets with the offsets received from the participant for (Map.Entry, Map> entry : response.offsetsByShard.entrySet()) { @@ -314,33 +365,40 @@ public boolean awaitCompletion(long timeout, TimeUnit unit) throws InterruptedEx public void cancel() { if (completionFuture.tryFailure(new RuntimeException("Sync cancelled"))) - MutationTrackingService.instance.unregisterSyncCoordinator(this); + MutationTrackingService.instance().unregisterSyncCoordinator(this); } /** * Tracks sync state for a single shard. + * Completion is scoped to only the live participant host IDs when provided, + * so that dead/excluded nodes don't block sync completion. */ private static class ShardSyncState { private final Shard shard; - // Target offsets: LogId -> the offsets we're waiting for all nodes to have + // If non-null, only these host IDs are considered for union/intersection. + // If null, all shard participants are used (equivalent to no filtering). 
+ private final Set liveHostIds; + + // Target offsets: LogId -> the offsets we're waiting for live nodes to have private final Map targets = new ConcurrentHashMap<>(); - ShardSyncState(Shard shard) + ShardSyncState(Shard shard, Set liveHostIds) { this.shard = shard; + this.liveHostIds = liveHostIds; } void captureTargets() { - Map unionOffsets = shard.collectUnionOfWitnessedOffsetsPerLog(); + Map unionOffsets = shard.collectUnionOfWitnessedOffsetsPerLog(liveHostIds); targets.putAll(unionOffsets); } boolean isComplete() { - Map currentReconciled = shard.collectReconciledOffsetsPerLog(); + Map currentReconciled = shard.collectReconciledOffsetsPerLog(liveHostIds); for (Map.Entry entry : targets.entrySet()) { @@ -351,7 +409,6 @@ boolean isComplete() if (reconciled == null) return false; - // Check if reconciled contains all offsets in target if (!containsAll(reconciled, target)) return false; } diff --git a/src/java/org/apache/cassandra/replication/Shard.java b/src/java/org/apache/cassandra/replication/Shard.java index f319c27524b5..f6462d9e0232 100644 --- a/src/java/org/apache/cassandra/replication/Shard.java +++ b/src/java/org/apache/cassandra/replication/Shard.java @@ -419,7 +419,7 @@ public Map collectReconciledOffsetsPerLog() for (CoordinatorLog log : logs.values()) { Offsets.Immutable reconciled = log.collectReconciledOffsets(); - if (reconciled != null && !reconciled.isEmpty()) + if (!reconciled.isEmpty()) result.put(log.logId, reconciled); } return result; @@ -435,12 +435,50 @@ public Map collectUnionOfWitnessedOffsetsPe for (CoordinatorLog log : logs.values()) { Offsets.Immutable union = log.collectUnionOfWitnessedOffsets(); - if (union != null && !union.isEmpty()) + if (!union.isEmpty()) result.put(log.logId, union); } return result; } + /** + * Returns the UNION of witnessed offsets scoped to only the specified participant host IDs. + * If liveHostIds is null, behaves the same as {@link #collectUnionOfWitnessedOffsetsPerLog()}. 
+ */ + public Map collectUnionOfWitnessedOffsetsPerLog(Set liveHostIds) + { + if (liveHostIds == null) + return collectUnionOfWitnessedOffsetsPerLog(); + + Map result = new HashMap<>(); + for (CoordinatorLog log : logs.values()) + { + Offsets.Immutable union = log.collectUnionOfWitnessedOffsets(liveHostIds); + if (!union.isEmpty()) + result.put(log.logId, union); + } + return result; + } + + /** + * Returns the intersection of witnessed offsets scoped to only the specified participant host IDs. + * If liveHostIds is null, behaves the same as {@link #collectReconciledOffsetsPerLog()}. + */ + public Map collectReconciledOffsetsPerLog(Set liveHostIds) + { + if (liveHostIds == null) + return collectReconciledOffsetsPerLog(); + + Map result = new HashMap<>(); + for (CoordinatorLog log : logs.values()) + { + Offsets.Immutable reconciled = log.collectReconciledOffsets(liveHostIds); + if (!reconciled.isEmpty()) + result.put(log.logId, reconciled); + } + return result; + } + @Override public String toString() { diff --git a/src/java/org/apache/cassandra/utils/CollectionSerializers.java b/src/java/org/apache/cassandra/utils/CollectionSerializers.java index bac7b2450c08..c3fef945c25b 100644 --- a/src/java/org/apache/cassandra/utils/CollectionSerializers.java +++ b/src/java/org/apache/cassandra/utils/CollectionSerializers.java @@ -77,6 +77,9 @@ public long serializedSize(String str, int version) } }; + public static final UnversionedSerializer> intSetSerializer = newSetSerializer(Int32Serializer.serializer); + public static final UnversionedSerializer> nullableIntSetSerializer = NullableSerializer.wrap(intSetSerializer); + public static void serializeCollection(Collection values, DataOutputPlus out, UnversionedSerializer valueSerializer) throws IOException { out.writeUnsignedVInt32(values.size()); @@ -642,6 +645,30 @@ private static > C deserializeCollection(Data return result; } + public static UnversionedSerializer> newSetSerializer(UnversionedSerializer itemSerializer) + { 
+ return new UnversionedSerializer>() + { + @Override + public void serialize(Set set, DataOutputPlus out) throws IOException + { + serializeCollection(set, out, itemSerializer); + } + + @Override + public Set deserialize(DataInputPlus in) throws IOException + { + return deserializeSet(in, itemSerializer); + } + + @Override + public long serializedSize(Set t) + { + return serializedCollectionSize(t, itemSerializer); + } + }; + } + public static UnversionedSerializer> newListSerializer(UnversionedSerializer itemSerializer) { return new UnversionedSerializer>() From 4774648366548f56c99ea7e80df48fec60c48908 Mon Sep 17 00:00:00 2001 From: Ariel Weisberg Date: Mon, 9 Mar 2026 13:17:01 -0400 Subject: [PATCH 16/46] Fix MutationTrackingSyncCoordinatorTest --- .../MutationTrackingSyncCoordinatorTest.java | 40 ++++++++++++++----- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/test/distributed/org/apache/cassandra/distributed/test/replication/MutationTrackingSyncCoordinatorTest.java b/test/distributed/org/apache/cassandra/distributed/test/replication/MutationTrackingSyncCoordinatorTest.java index 499b24322ddf..d8bb7c46a850 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/replication/MutationTrackingSyncCoordinatorTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/replication/MutationTrackingSyncCoordinatorTest.java @@ -21,6 +21,7 @@ import java.util.concurrent.CompletableFuture; import java.util.concurrent.TimeUnit; +import org.apache.cassandra.tcm.ClusterMetadata; import org.junit.Test; import org.apache.cassandra.distributed.Cluster; @@ -92,7 +93,7 @@ public void testSyncCoordinatorCompletesWhenNoShards() throws Throwable TimeUUID.Generator.nextTimeUUID(), KS_NAME, "", java.util.List.of(range)); MutationTrackingSyncCoordinator coordinator = new MutationTrackingSyncCoordinator( - SharedContext.Global.instance, desc); + SharedContext.Global.instance, desc, null, ClusterMetadata.current()); coordinator.start(); try @@ 
-113,12 +114,22 @@ public void testSyncCoordinatorCompletesWhenNoShards() throws Throwable @Test public void testSyncCoordinatorWaitsForAllReplicasMutations() throws Throwable { - try (Cluster cluster = builder().withNodes(3).start()) + // Enable repair retries with a short request timeout so that the dropped MT_SYNC_RSP + // from node 1 causes a quick timeout and retry rather than a 2-minute wait and failure. + // After the message filter is reset, the retried MT_SYNC_REQ will get a response. + try (Cluster cluster = builder().withNodes(3) + .withConfig(config -> config.set("request_timeout", "1000ms") + .set("repair.retries.max_attempts", 10) + .set("repair.retries.base_sleep_time", "100ms") + .set("repair.retries.max_sleep_time", "500ms")) + .start()) { createTrackedKeyspace(cluster, "3"); // Block all messages FROM node 1 to prevent write replication - // This ensures that write only succeeds locally on node 1 + // and also to drop MT_SYNC_RSP from node 1 back to the coordinator. + // This ensures that write only succeeds locally on node 1 and the + // sync coordinator can't get node 1's sync response. cluster.filters().allVerbs().from(1).drop(); cluster.coordinator(1).execute( @@ -126,20 +137,24 @@ public void testSyncCoordinatorWaitsForAllReplicasMutations() throws Throwable ConsistencyLevel.ONE ); - // Start MutationTrackingSyncCoordinator on node 2 in a separate thread - // It should wait for offsets to sync since node 1's data hasn't propagated yet + // Start MutationTrackingSyncCoordinator on node 2 in a separate thread. + // It should wait for offsets to sync since node 1's sync response is being dropped. + // The coordinator sends MT_SYNC_REQ to nodes 1 and 3. Node 3's response comes back + // but node 1's response is dropped. The coordinator stays blocked because + // pendingSyncResponses still contains node 1. After the filter is reset, the + // retried MT_SYNC_REQ will succeed and the coordinator can proceed. 
CompletableFuture coordinatorFuture = CompletableFuture.supplyAsync(() -> cluster.get(2).callOnInstance(() -> { Range range = fullTokenRange(); RepairJobDesc desc = new RepairJobDesc(TimeUUID.Generator.nextTimeUUID(), TimeUUID.Generator.nextTimeUUID(), KS_NAME + '3', "", java.util.List.of(range)); MutationTrackingSyncCoordinator coordinator = new MutationTrackingSyncCoordinator( - SharedContext.Global.instance, desc); + SharedContext.Global.instance, desc, null, ClusterMetadata.current()); coordinator.start(); try { - return coordinator.awaitCompletion(10, TimeUnit.SECONDS); + return coordinator.awaitCompletion(30, TimeUnit.SECONDS); } catch (InterruptedException e) { @@ -167,14 +182,21 @@ public void testSyncCoordinatorWaitsForAllReplicasMutations() throws Throwable assertEquals("Node " + i + " should not have data yet", 0, results.length); } - // Verify coordinator stays blocked for at least 2 seconds + // Verify coordinator stays blocked for at least 2 seconds while node 1's + // sync response is being dropped. The coordinator can't complete because + // pendingSyncResponses still contains node 1. Awaitility.await() .during(Duration.ofSeconds(2)) .atMost(Duration.ofSeconds(3)) .until(() -> !coordinatorFuture.isDone()); + // Reset filter so that retried MT_SYNC_REQ to node 1 can get a response, + // and offset broadcasts from node 1 can reach other nodes. cluster.filters().reset(); + // Force offset broadcasts on all nodes to drive reconciliation. + // After the sync response from node 1 establishes targets, the coordinator + // needs to see that all replicas have caught up via offset broadcasts. 
for (int i = 1; i <= 3; i++) cluster.get(i).runOnInstance(() -> MutationTrackingService.instance.broadcastOffsetsForTesting()); @@ -228,7 +250,7 @@ public void testSyncCoordinatorCancel() throws Throwable TimeUUID.Generator.nextTimeUUID(), KS_NAME + '4', "", java.util.List.of(range)); MutationTrackingSyncCoordinator coordinator = new MutationTrackingSyncCoordinator( - SharedContext.Global.instance, desc); + SharedContext.Global.instance, desc, null, ClusterMetadata.current()); coordinator.start(); try From d591087c3bc134e950187ae6b60cb08eb0bc949c Mon Sep 17 00:00:00 2001 From: Ariel Weisberg Date: Mon, 9 Mar 2026 14:25:10 -0400 Subject: [PATCH 17/46] Clean up/fix result handling when pairing incremental repair with MT repair --- .../cassandra/repair/RepairCoordinator.java | 24 ++++++++++++------- .../apache/cassandra/repair/RepairResult.java | 4 ++++ 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/src/java/org/apache/cassandra/repair/RepairCoordinator.java b/src/java/org/apache/cassandra/repair/RepairCoordinator.java index a51a62eb1e25..4e208e3397a6 100644 --- a/src/java/org/apache/cassandra/repair/RepairCoordinator.java +++ b/src/java/org/apache/cassandra/repair/RepairCoordinator.java @@ -86,6 +86,7 @@ import org.apache.cassandra.utils.Throwables; import org.apache.cassandra.utils.TimeUUID; import org.apache.cassandra.utils.WrappedRunnable; +import org.apache.cassandra.utils.concurrent.AsyncPromise; import org.apache.cassandra.utils.concurrent.Future; import org.apache.cassandra.utils.concurrent.ImmediateFuture; import org.apache.cassandra.utils.progress.ProgressEvent; @@ -550,16 +551,21 @@ else if (useMutationTracking) RepairTask mtTask = new MutationTrackingIncrementalRepairTask(this, state.id, neighborsAndRanges, cfnames); if (mutationTrackingMigrationInProgress) { - // During migration, run incremental repair first, then mutation tracking sync + // During migration, run incremental repair first, then mutation tracking sync. 
+ // Propagate the IR result on success since it drives migration advancement. RepairTask incrementalTask = new IncrementalRepairTask(this, state.id, neighborsAndRanges, cfnames); - return incrementalTask.perform(executor, validationScheduler) - .flatMap(irResult -> { - if (irResult.hasFailed()) - return ImmediateFuture.success(Pair.create(irResult, incrementalTask::successMessage)); - return mtTask.perform(executor, validationScheduler) - .>>map(r -> Pair.create(r, mtTask::successMessage)); - }) - .addCallback((s, f) -> executor.shutdown()); + AsyncPromise>> result = new AsyncPromise<>(); + incrementalTask.perform(executor, validationScheduler).addCallback( + irResult -> { + Pair> irPair = Pair.create(irResult, incrementalTask::successMessage); + mtTask.perform(executor, validationScheduler) + .addCallback( + mtResult -> result.trySuccess(irPair), + result::tryFailure); + }, + result::tryFailure + ); + return result.addCallback((s, f) -> executor.shutdown()); } return mtTask.perform(executor, validationScheduler) .>>map(r -> Pair.create(r, mtTask::successMessage)) diff --git a/src/java/org/apache/cassandra/repair/RepairResult.java b/src/java/org/apache/cassandra/repair/RepairResult.java index d3a8a71d40f0..ce5964dc1ee7 100644 --- a/src/java/org/apache/cassandra/repair/RepairResult.java +++ b/src/java/org/apache/cassandra/repair/RepairResult.java @@ -22,6 +22,8 @@ import org.apache.cassandra.service.consensus.migration.ConsensusMigrationRepairResult; import org.apache.cassandra.service.replication.migration.MutationTrackingMigrationRepairResult; +import javax.annotation.Nullable; + /** * RepairJob's result */ @@ -29,7 +31,9 @@ public class RepairResult { public final RepairJobDesc desc; public final List stats; + @Nullable public final ConsensusMigrationRepairResult consensusMigrationRepairResult; + @Nullable public final MutationTrackingMigrationRepairResult mutationTrackingMigrationRepairResult; public RepairResult(RepairJobDesc desc, List stats, 
ConsensusMigrationRepairResult consensusMigrationRepairResult, MutationTrackingMigrationRepairResult mutationTrackingMigrationRepairResult) From 42e23c2803278b1339c882ef8a663ebc12834f76 Mon Sep 17 00:00:00 2001 From: Ariel Weisberg Date: Thu, 12 Mar 2026 14:11:04 -0400 Subject: [PATCH 18/46] Fix java.lang.IllegalStateException: Attempted to create a new keyspace shard for keyspace distributed_test_keyspace, but it already exists on bounce --- .../replication/MutationTrackingService.java | 2 +- .../tracking/MutationTrackingBounceTest.java | 93 ++++++++++++------- 2 files changed, 61 insertions(+), 34 deletions(-) diff --git a/src/java/org/apache/cassandra/replication/MutationTrackingService.java b/src/java/org/apache/cassandra/replication/MutationTrackingService.java index ea97e0319e75..c6412a4aaa5e 100644 --- a/src/java/org/apache/cassandra/replication/MutationTrackingService.java +++ b/src/java/org/apache/cassandra/replication/MutationTrackingService.java @@ -1074,7 +1074,7 @@ static UpdateDecision decisionForTopologyChange(String keyspace, ClusterMetadata } if (prevKsm == null) - return nextKsm.useMutationTracking() ? UpdateDecision.CREATE : UpdateDecision.NONE; + return nextKsm.useMutationTracking() ? (hasExisting ? UpdateDecision.REPLICA_GROUP : UpdateDecision.CREATE) : UpdateDecision.NONE; if (nextKsm == null) return prevKsm.useMutationTracking() ? 
UpdateDecision.DROP : UpdateDecision.NONE; diff --git a/test/distributed/org/apache/cassandra/distributed/test/tracking/MutationTrackingBounceTest.java b/test/distributed/org/apache/cassandra/distributed/test/tracking/MutationTrackingBounceTest.java index 935cd3db6563..0a6279a5c4ee 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/tracking/MutationTrackingBounceTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/tracking/MutationTrackingBounceTest.java @@ -45,48 +45,75 @@ public void bounceTest() throws Throwable { try (Cluster cluster = builder().withNodes(1).start()) { - int tables = 10; - int writesPerKey = 2; - int pks = 100; - withRandom(rng -> { - cluster.schemaChange(String.format("CREATE KEYSPACE %s WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1} " + - "AND replication_type='tracked'", - KEYSPACE)); - - List builders = new ArrayList<>(); - for (int i = 0; i < tables; i++) - { - Generator schemaGen = SchemaGenerators.trivialSchema(KEYSPACE, () -> "mutation_tracking_bounce_" + (builders.size() + 1), POPULATION, - SchemaSpec.optionsBuilder()); - - SchemaSpec schema = schemaGen.generate(rng); - cluster.schemaChange(schema.compile()); - builders.add(new ReplayingHistoryBuilder(schema.valueGenerators, - hb -> InJvmDTestVisitExecutor.builder() - .consistencyLevel(ConsistencyLevel.QUORUM) - .build(schema, hb, cluster))); - } + bounceTest(cluster, 1, 1); + } + } + + @Test + public void bounceTestMultiNode() throws Throwable + { + try (Cluster cluster = builder().withNodes(3).start()) + { + bounceTest(cluster, 3, 1); + } + } + + @Test + public void doubleBounceTestMultiNode() throws Throwable + { + try (Cluster cluster = builder().withNodes(3).start()) + { + bounceTest(cluster, 3, 2); + } + } - int counter = 0; - for (int pk = 0; pk < pks; pk++) { - for (HistoryBuilder history : builders) - for (int i = 0; i < writesPerKey; i++) - history.insert(pk); + private void bounceTest(Cluster cluster, int rf, int 
bounces) throws Throwable + { + int tables = 10; + int writesPerKey = 2; + int pks = 100; + withRandom(rng -> { + cluster.schemaChange(String.format("CREATE KEYSPACE %s WITH replication = {'class': 'SimpleStrategy', 'replication_factor': %d} " + + "AND replication_type='tracked'", + KEYSPACE, rf)); + + List builders = new ArrayList<>(); + for (int i = 0; i < tables; i++) + { + Generator schemaGen = SchemaGenerators.trivialSchema(KEYSPACE, () -> "mutation_tracking_bounce_" + (builders.size() + 1), POPULATION, + SchemaSpec.optionsBuilder()); + + SchemaSpec schema = schemaGen.generate(rng); + cluster.schemaChange(schema.compile()); + builders.add(new ReplayingHistoryBuilder(schema.valueGenerators, + hb -> InJvmDTestVisitExecutor.builder() + .consistencyLevel(ConsistencyLevel.QUORUM) + .build(schema, hb, cluster))); + } + + int counter = 0; + for (int pk = 0; pk < pks; pk++) + { + for (HistoryBuilder history : builders) + for (int i = 0; i < writesPerKey; i++) + history.insert(pk); if (++counter % 10 == 0) cluster.get(1).runOnInstance(() -> MutationJournal.instance().closeCurrentSegmentForTestingIfNonEmpty()); } + for (int bounce = 0; bounce < bounces; bounce++) + { ClusterUtils.stopUnchecked(cluster.get(1)); cluster.get(1).startup(); + } - for (int pk = 0; pk < pks; pk++) - for (HistoryBuilder history : builders) - for (int i = 0; i < 10; i++) - history.selectPartition(pk); + for (int pk = 0; pk < pks; pk++) + for (HistoryBuilder history : builders) + for (int i = 0; i < 10; i++) + history.selectPartition(pk); - cluster.get(1).runOnInstance(new MutationTrackingBounce_ValidateRunnable(tables * pks * writesPerKey)); - }); - } + cluster.get(1).runOnInstance(new MutationTrackingBounce_ValidateRunnable(tables * pks * writesPerKey)); + }); } } \ No newline at end of file From 65a366e38808d084c5b737994c5db160f7dc1bd2 Mon Sep 17 00:00:00 2001 From: Ariel Weisberg Date: Thu, 12 Mar 2026 13:44:49 -0400 Subject: [PATCH 19/46] Add MutationTrackingRepairTest --- 
.../repair/MutationTrackingRepairTest.java | 606 ++++++++++++++++++ 1 file changed, 606 insertions(+) create mode 100644 test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingRepairTest.java diff --git a/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingRepairTest.java b/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingRepairTest.java new file mode 100644 index 000000000000..08bca5dca6a4 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingRepairTest.java @@ -0,0 +1,606 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.cassandra.distributed.test.repair; + +import java.io.IOException; +import java.net.UnknownHostException; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; + +import org.junit.After; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; + +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.distributed.api.Feature; +import org.apache.cassandra.distributed.api.IMessageFilters; +import org.apache.cassandra.distributed.api.NodeToolResult; +import org.apache.cassandra.distributed.test.TestBaseImpl; +import org.apache.cassandra.gms.FailureDetector; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.net.Verb; +import org.apache.cassandra.repair.MutationTrackingIncrementalRepairTask; +import org.apache.cassandra.replication.MutationTrackingService; +import org.apache.cassandra.service.replication.migration.KeyspaceMigrationInfo; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.utils.FBUtilities; + +import static org.junit.Assert.*; + +/** + * End-to-end tests for mutation tracking repair. + * + * Tests that invoke repair via nodetool and verify that data is actually repaired, + * migration progresses correctly, and options like --force and --in-hosts work. + * + * Each test creates a unique keyspace. Keyspaces are not dropped between tests because + * dropping a tracked keyspace while background offset broadcasts are in flight causes + * NoSuchElementException in MutationTrackingService.getOrCreateShards (the broadcast + * tries to look up the dropped keyspace's metadata). The keyspaces are cleaned up when + * the cluster is closed at the end of the test class. 
+ */ +public class MutationTrackingRepairTest extends TestBaseImpl +{ + private static Cluster CLUSTER; + private static final AtomicInteger ksCounter = new AtomicInteger(); + + @BeforeClass + public static void setupCluster() throws IOException + { + CLUSTER = Cluster.build() + .withNodes(3) + .withConfig(cfg -> cfg.set("mutation_tracking_enabled", true) + .set("mutation_tracking_sync_timeout", "10s") + .set("request_timeout", "1000ms") + .set("repair.retries.max_attempts", 10) + .set("repair.retries.base_sleep_time", "100ms") + .set("repair.retries.max_sleep_time", "500ms") + .with(Feature.GOSSIP)) + .start(); + } + + @AfterClass + public static void teardownCluster() + { + if (CLUSTER != null) + CLUSTER.close(); + } + + @After + public void tearDown() + { + CLUSTER.filters().reset(); + } + + private static String nextKsName() + { + return "mt_repair_" + ksCounter.incrementAndGet(); + } + + private void createTrackedKeyspace(String ksName) + { + CLUSTER.schemaChange("CREATE KEYSPACE " + ksName + " WITH replication = " + + "{'class': 'SimpleStrategy', 'replication_factor': 3} " + + "AND replication_type='tracked'"); + } + + private void createUntrackedKeyspace(String ksName) + { + CLUSTER.schemaChange("CREATE KEYSPACE " + ksName + " WITH replication = " + + "{'class': 'SimpleStrategy', 'replication_factor': 3} " + + "AND replication_type='untracked'"); + } + + private void createTable(String ksName, String tableName) + { + CLUSTER.schemaChange("CREATE TABLE " + ksName + '.' + tableName + " (k int PRIMARY KEY, v int)"); + } + + private void insertData(String ksName, String tableName, int start, int count) + { + for (int i = start; i < start + count; i++) + { + CLUSTER.coordinator(1).execute( + "INSERT INTO " + ksName + '.' 
+ tableName + " (k, v) VALUES (?, ?)", + ConsistencyLevel.ALL, i, i); + } + } + + private void assertDataOnAllNodes(String ksName, String tableName, int start, int count) + { + for (int node = 1; node <= CLUSTER.size(); node++) + { + if (CLUSTER.get(node).isShutdown()) + continue; + for (int i = start; i < start + count; i++) + { + Object[][] results = CLUSTER.get(node).executeInternal( + "SELECT k, v FROM " + ksName + '.' + tableName + " WHERE k = ?", i); + assertEquals("Node " + node + " missing row k=" + i, 1, results.length); + assertEquals(i, results[0][0]); + assertEquals(i, results[0][1]); + } + } + } + + private void broadcastOffsets() + { + for (int i = 1; i <= CLUSTER.size(); i++) + { + if (!CLUSTER.get(i).isShutdown()) + CLUSTER.get(i).runOnInstance(() -> MutationTrackingService.instance.broadcastOffsetsForTesting()); + } + } + + private boolean isMigrationInProgress(String ksName) + { + return CLUSTER.get(1).callOnInstance(() -> { + ClusterMetadata metadata = ClusterMetadata.current(); + return MutationTrackingIncrementalRepairTask.isMutationTrackingMigrationInProgress(metadata, ksName); + }); + } + + private boolean isMigrationComplete(String ksName) + { + return CLUSTER.get(1).callOnInstance(() -> { + ClusterMetadata metadata = ClusterMetadata.current(); + KeyspaceMigrationInfo info = metadata.mutationTrackingMigrationState.getKeyspaceInfo(ksName); + return info == null; + }); + } + + private NodeToolResult nodetoolRepair(int node, String... 
args) + { + String[] cmd = new String[args.length + 1]; + cmd[0] = "repair"; + System.arraycopy(args, 0, cmd, 1, args.length); + return CLUSTER.get(node).nodetoolResult(cmd); + } + + private String getBroadcastAddress(int node) + { + return CLUSTER.get(node).callOnInstance(() -> FBUtilities.getBroadcastAddressAndPort().getHostAddressAndPort()); + } + + private void alterKeyspaceToTracked(String ksName) + { + CLUSTER.schemaChange("ALTER KEYSPACE " + ksName + " WITH replication = " + + "{'class': 'SimpleStrategy', 'replication_factor': 3} " + + "AND replication_type='tracked'"); + } + + private void alterKeyspaceToUntracked(String ksName) + { + CLUSTER.schemaChange("ALTER KEYSPACE " + ksName + " WITH replication = " + + "{'class': 'SimpleStrategy', 'replication_factor': 3} " + + "AND replication_type='untracked'"); + } + + private void repairFromAllNodes(String ksName) + { + for (int node = 1; node <= CLUSTER.size(); node++) + { + if (CLUSTER.get(node).isShutdown()) + continue; + broadcastOffsets(); + nodetoolRepair(node, ksName).asserts().success(); + } + } + + /** + * Block all messages to/from a node and force the failure detector to mark it dead. + * Uses message filters instead of actually stopping the node to avoid + * MutationTrackingService restart bugs with shard re-creation. 
+ */ + private void isolateNode(int nodeToIsolate, int observerNode) + { + CLUSTER.filters().allVerbs().from(nodeToIsolate).drop(); + CLUSTER.filters().allVerbs().to(nodeToIsolate).drop(); + + String isolatedAddress = CLUSTER.get(nodeToIsolate).callOnInstance( + () -> FBUtilities.getBroadcastAddressAndPort().getHostAddressAndPort()); + CLUSTER.get(observerNode).runOnInstance(() -> { + try + { + InetAddressAndPort neighbor = InetAddressAndPort.getByName(isolatedAddress); + FailureDetector.instance.forceConviction(neighbor); + } + catch (UnknownHostException e) + { + throw new RuntimeException(e); + } + }); + } + + @Test + public void testBasicRepairHappyPath() + { + String ksName = nextKsName(); + createTrackedKeyspace(ksName); + createTable(ksName, "tbl"); + insertData(ksName, "tbl", 0, 100); + broadcastOffsets(); + + nodetoolRepair(1, ksName).asserts().success(); + + assertDataOnAllNodes(ksName, "tbl", 0, 100); + } + + @Test + public void testRepairConvergesInconsistentReplicas() throws Exception + { + String ksName = nextKsName(); + createTrackedKeyspace(ksName); + createTable(ksName, "tbl"); + + // Isolate node 2 completely so it receives no writes or offset broadcasts + CLUSTER.filters().allVerbs().from(2).drop(); + CLUSTER.filters().allVerbs().to(2).drop(); + + for (int i = 0; i < 50; i++) + { + CLUSTER.coordinator(1).execute( + "INSERT INTO " + ksName + ".tbl (k, v) VALUES (?, ?)", + ConsistencyLevel.ONE, i, i); + } + + // Node 2 should have no data + Object[][] results = CLUSTER.get(2).executeInternal( + "SELECT k FROM " + ksName + ".tbl"); + assertEquals("Node 2 should not have data while isolated", 0, results.length); + + broadcastOffsets(); + + // Start repair in background — it will be stuck waiting for node 2 + Future repairFuture = Executors.newSingleThreadExecutor().submit(() -> + nodetoolRepair(1, ksName).asserts().success() + ); + + // Let repair start and get stuck, then remove isolation so it can complete + Thread.sleep(2000); + 
CLUSTER.filters().reset(); + + repairFuture.get(30, TimeUnit.SECONDS); + + assertDataOnAllNodes(ksName, "tbl", 0, 50); + } + + @Test + public void testRepairMultipleTables() + { + String ksName = nextKsName(); + createTrackedKeyspace(ksName); + createTable(ksName, "tbl1"); + createTable(ksName, "tbl2"); + createTable(ksName, "tbl3"); + + insertData(ksName, "tbl1", 0, 30); + insertData(ksName, "tbl2", 100, 30); + insertData(ksName, "tbl3", 200, 30); + broadcastOffsets(); + + nodetoolRepair(1, ksName).asserts().success(); + + assertDataOnAllNodes(ksName, "tbl1", 0, 30); + assertDataOnAllNodes(ksName, "tbl2", 100, 30); + assertDataOnAllNodes(ksName, "tbl3", 200, 30); + } + + @Test + public void testForceRepairWithNodeDown() + { + String ksName = nextKsName(); + createTrackedKeyspace(ksName); + createTable(ksName, "tbl"); + insertData(ksName, "tbl", 0, 50); + broadcastOffsets(); + + // Isolate node 2 via message filters so failure detector marks it dead. + // We don't actually stop the JVM to avoid MutationTrackingService restart bugs. 
+ isolateNode(2, 1); + + // Repair without --force should fail + nodetoolRepair(1, ksName).asserts().failure(); + + // Repair with --force should succeed + nodetoolRepair(1, ksName, "--force").asserts().success(); + + // Verify data on live nodes + for (int node : new int[]{ 1, 3 }) + { + for (int i = 0; i < 50; i++) + { + Object[][] results = CLUSTER.get(node).executeInternal( + "SELECT k, v FROM " + ksName + ".tbl WHERE k = ?", i); + assertEquals("Node " + node + " missing row k=" + i, 1, results.length); + } + } + } + + @Test + public void testForceRepairWithAllNodesUp() + { + String ksName = nextKsName(); + createTrackedKeyspace(ksName); + createTable(ksName, "tbl"); + insertData(ksName, "tbl", 0, 50); + broadcastOffsets(); + + nodetoolRepair(1, ksName, "--force").asserts().success(); + + assertDataOnAllNodes(ksName, "tbl", 0, 50); + } + + @Test + public void testRepairWithSpecificHosts() + { + String ksName = nextKsName(); + createTrackedKeyspace(ksName); + createTable(ksName, "tbl"); + insertData(ksName, "tbl", 0, 50); + broadcastOffsets(); + + String addr1 = getBroadcastAddress(1); + String addr3 = getBroadcastAddress(3); + + nodetoolRepair(1, ksName, "--in-hosts", addr1 + ',' + addr3).asserts().success(); + } + + @Test + public void testMigrationUntrackedToTrackedCompletesViaRepair() + { + String ksName = nextKsName(); + createUntrackedKeyspace(ksName); + createTable(ksName, "tbl"); + insertData(ksName, "tbl", 0, 100); + + alterKeyspaceToTracked(ksName); + assertTrue("Migration should be in progress after ALTER", isMigrationInProgress(ksName)); + + repairFromAllNodes(ksName); + + assertTrue("Migration should complete after repair", isMigrationComplete(ksName)); + assertDataOnAllNodes(ksName, "tbl", 0, 100); + } + + @Test + public void testDataAccessibleDuringMigration() + { + String ksName = nextKsName(); + createUntrackedKeyspace(ksName); + createTable(ksName, "tbl"); + + // Pre-migration data + insertData(ksName, "tbl", 0, 50); + + // Start migration + 
alterKeyspaceToTracked(ksName); + + // Read pre-migration data + Object[][] results = CLUSTER.coordinator(1).execute( + "SELECT k, v FROM " + ksName + ".tbl", ConsistencyLevel.ALL); + assertEquals("Pre-migration data should be readable", 50, results.length); + + // Write new data during migration + insertData(ksName, "tbl", 50, 50); + + // Read all data during migration + results = CLUSTER.coordinator(1).execute( + "SELECT k, v FROM " + ksName + ".tbl", ConsistencyLevel.ALL); + assertEquals("All data should be readable during migration", 100, results.length); + + // Complete migration via repair + repairFromAllNodes(ksName); + + // Verify all data still readable post-migration + results = CLUSTER.coordinator(1).execute( + "SELECT k, v FROM " + ksName + ".tbl", ConsistencyLevel.ALL); + assertEquals("All data should be readable after migration", 100, results.length); + + // Write and read more data post-migration + insertData(ksName, "tbl", 100, 50); + results = CLUSTER.coordinator(1).execute( + "SELECT k, v FROM " + ksName + ".tbl", ConsistencyLevel.ALL); + assertEquals("All data including post-migration should be readable", 150, results.length); + + assertDataOnAllNodes(ksName, "tbl", 0, 150); + } + + @Test + public void testMigrationTrackedToUntrackedCompletesViaRepair() + { + String ksName = nextKsName(); + createTrackedKeyspace(ksName); + createTable(ksName, "tbl"); + insertData(ksName, "tbl", 0, 100); + broadcastOffsets(); + + alterKeyspaceToUntracked(ksName); + assertTrue("Migration should be in progress after ALTER", isMigrationInProgress(ksName)); + + repairFromAllNodes(ksName); + + assertTrue("Migration should complete after repair", isMigrationComplete(ksName)); + assertDataOnAllNodes(ksName, "tbl", 0, 100); + } + + @Test + public void testRepairAdvancesMigrationState() + { + String ksName = nextKsName(); + createUntrackedKeyspace(ksName); + createTable(ksName, "tbl"); + insertData(ksName, "tbl", 0, 100); + + alterKeyspaceToTracked(ksName); + 
assertTrue("Full ring should be pending", isMigrationInProgress(ksName)); + + broadcastOffsets(); + nodetoolRepair(1, ksName).asserts().success(); + + if (!isMigrationComplete(ksName)) + { + for (int node = 2; node <= CLUSTER.size(); node++) + { + broadcastOffsets(); + nodetoolRepair(node, ksName).asserts().success(); + if (isMigrationComplete(ksName)) + break; + } + } + + assertTrue("Migration should complete after full repair", isMigrationComplete(ksName)); + } + + @Test + public void testForceRepairWithDeadNodeDoesNotAdvanceMigration() + { + String ksName = nextKsName(); + createUntrackedKeyspace(ksName); + createTable(ksName, "tbl"); + insertData(ksName, "tbl", 0, 50); + + alterKeyspaceToTracked(ksName); + assertTrue("Migration should be in progress", isMigrationInProgress(ksName)); + + // Isolate node 2 via message filters so failure detector marks it dead + isolateNode(2, 1); + + // Broadcast offsets on live nodes + for (int n : new int[]{ 1, 3 }) + CLUSTER.get(n).runOnInstance(() -> MutationTrackingService.instance.broadcastOffsetsForTesting()); + + // Force repair should succeed + nodetoolRepair(1, ksName, "--force").asserts().success(); + + // Migration should NOT be advanced because deadNodesExcluded=true makes it INELIGIBLE + assertTrue("Migration should not advance with dead nodes excluded", + CLUSTER.get(1).callOnInstance(() -> { + ClusterMetadata metadata = ClusterMetadata.current(); + KeyspaceMigrationInfo info = metadata.mutationTrackingMigrationState.getKeyspaceInfo(ksName); + return info != null; + })); + } + + @Test + public void testPreviewRepairDoesNotAdvanceMigration() + { + String ksName = nextKsName(); + createUntrackedKeyspace(ksName); + createTable(ksName, "tbl"); + insertData(ksName, "tbl", 0, 50); + + alterKeyspaceToTracked(ksName); + assertTrue("Migration should be in progress", isMigrationInProgress(ksName)); + + broadcastOffsets(); + + nodetoolRepair(1, ksName, "--preview").asserts().success(); + + assertTrue("Migration should not 
advance with preview repair", isMigrationInProgress(ksName)); + } + + @Test + public void testSubrangeRepair() + { + String ksName = nextKsName(); + createTrackedKeyspace(ksName); + createTable(ksName, "tbl"); + insertData(ksName, "tbl", 0, 100); + broadcastOffsets(); + + nodetoolRepair(1, ksName, "-st", "0", "-et", "1000000000").asserts().success(); + } + + @Test + public void testSubrangeRepairAdvancesMigrationOnlyForSpecifiedRange() + { + String ksName = nextKsName(); + createUntrackedKeyspace(ksName); + createTable(ksName, "tbl"); + insertData(ksName, "tbl", 0, 100); + + alterKeyspaceToTracked(ksName); + assertTrue("Full ring should be pending", isMigrationInProgress(ksName)); + + broadcastOffsets(); + + nodetoolRepair(1, ksName, "-st", "0", "-et", "1000000000").asserts().success(); + + // Migration should NOT be complete yet (only repaired a subrange) + assertTrue("Migration should not be complete after subrange repair", + isMigrationInProgress(ksName)); + } + + @Test + public void testRepairTimeout() + { + String ksName = nextKsName(); + createTrackedKeyspace(ksName); + createTable(ksName, "tbl"); + insertData(ksName, "tbl", 0, 50); + + // Block MT_SYNC_REQ messages to prevent sync from completing + IMessageFilters.Filter filter = CLUSTER.filters() + .verbs(Verb.MT_SYNC_REQ.id) + .drop(); + try + { + nodetoolRepair(1, ksName).asserts().failure(); + } + finally + { + filter.off(); + } + } + + @Test + public void testWriteDuringMigrationPreservedAfterCompletion() + { + String ksName = nextKsName(); + createUntrackedKeyspace(ksName); + createTable(ksName, "tbl"); + + // Pre-migration data + insertData(ksName, "tbl", 0, 50); + + // Start migration + alterKeyspaceToTracked(ksName); + + // During-migration data + insertData(ksName, "tbl", 50, 50); + + // Complete migration + repairFromAllNodes(ksName); + + // Post-migration data + insertData(ksName, "tbl", 100, 50); + + // All 150 rows should be present on all nodes + assertDataOnAllNodes(ksName, "tbl", 0, 150); + 
+ // Also verify via coordinator read at CL.ALL + Object[][] results = CLUSTER.coordinator(1).execute( + "SELECT k, v FROM " + ksName + ".tbl", ConsistencyLevel.ALL); + assertEquals("All 150 rows should be readable at CL.ALL", 150, results.length); + } +} From 886e4bd8a65300a34191a92340ce04f670daab4b Mon Sep 17 00:00:00 2001 From: Ariel Weisberg Date: Thu, 12 Mar 2026 16:20:06 -0400 Subject: [PATCH 20/46] checkpoint before big mess --- .../repair/MutationTrackingRepairTest.java | 529 +++++++++--------- 1 file changed, 272 insertions(+), 257 deletions(-) diff --git a/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingRepairTest.java b/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingRepairTest.java index 08bca5dca6a4..420de9da7d96 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingRepairTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingRepairTest.java @@ -19,6 +19,9 @@ import java.io.IOException; import java.net.UnknownHostException; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; @@ -26,6 +29,7 @@ import org.junit.After; import org.junit.AfterClass; +import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; @@ -35,23 +39,22 @@ import org.apache.cassandra.distributed.api.IMessageFilters; import org.apache.cassandra.distributed.api.NodeToolResult; import org.apache.cassandra.distributed.test.TestBaseImpl; +import org.apache.cassandra.gms.EndpointState; import org.apache.cassandra.gms.FailureDetector; +import org.apache.cassandra.gms.Gossiper; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.net.Verb; import org.apache.cassandra.repair.MutationTrackingIncrementalRepairTask; -import 
org.apache.cassandra.replication.MutationTrackingService; import org.apache.cassandra.service.replication.migration.KeyspaceMigrationInfo; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.utils.FBUtilities; +import static java.util.function.Predicate.not; import static org.junit.Assert.*; /** * End-to-end tests for mutation tracking repair. * - * Tests that invoke repair via nodetool and verify that data is actually repaired, - * migration progresses correctly, and options like --force and --in-hosts work. - * * Each test creates a unique keyspace. Keyspaces are not dropped between tests because * dropping a tracked keyspace while background offset broadcasts are in flight causes * NoSuchElementException in MutationTrackingService.getOrCreateShards (the broadcast @@ -61,11 +64,15 @@ public class MutationTrackingRepairTest extends TestBaseImpl { private static Cluster CLUSTER; + private static ExecutorService executor; private static final AtomicInteger ksCounter = new AtomicInteger(); + private String ksName; + @BeforeClass public static void setupCluster() throws IOException { + executor = Executors.newCachedThreadPool(); CLUSTER = Cluster.build() .withNodes(3) .withConfig(cfg -> cfg.set("mutation_tracking_enabled", true) @@ -81,41 +88,73 @@ public static void setupCluster() throws IOException @AfterClass public static void teardownCluster() { + executor.shutdownNow(); if (CLUSTER != null) CLUSTER.close(); } + @Before + public void setUp() + { + ksName = "mt_repair_" + ksCounter.incrementAndGet(); + CLUSTER.schemaChange("CREATE KEYSPACE " + ksName + " WITH replication = " + + "{'class': 'SimpleStrategy', 'replication_factor': 3} " + + "AND replication_type='tracked'"); + CLUSTER.schemaChange("CREATE TABLE " + ksName + ".tbl (k int PRIMARY KEY, v int)"); + } + @After public void tearDown() { CLUSTER.filters().reset(); + for (int i = 1; i <= CLUSTER.size(); i++) + { + CLUSTER.get(i).runOnInstance(() -> { + 
Gossiper.runInGossipStageBlocking(() -> { + for (var entry : Gossiper.instance.endpointStateMap.entrySet()) + { + InetAddressAndPort ep = entry.getKey(); + EndpointState state = entry.getValue(); + if (!ep.equals(FBUtilities.getBroadcastAddressAndPort()) && !state.isAlive()) + { + FailureDetector.instance.report(ep); + Gossiper.instance.realMarkAlive(ep, state); + } + } + }); + }); + } } - private static String nextKsName() + private void setupUntracked() { - return "mt_repair_" + ksCounter.incrementAndGet(); + ksName = "mt_repair_" + ksCounter.incrementAndGet(); + CLUSTER.schemaChange("CREATE KEYSPACE " + ksName + " WITH replication = " + + "{'class': 'SimpleStrategy', 'replication_factor': 3} " + + "AND replication_type='untracked'"); + CLUSTER.schemaChange("CREATE TABLE " + ksName + ".tbl (k int PRIMARY KEY, v int)"); } - private void createTrackedKeyspace(String ksName) + private void createTable(String tableName) { - CLUSTER.schemaChange("CREATE KEYSPACE " + ksName + " WITH replication = " + - "{'class': 'SimpleStrategy', 'replication_factor': 3} " + - "AND replication_type='tracked'"); + CLUSTER.schemaChange("CREATE TABLE " + ksName + '.' + tableName + " (k int PRIMARY KEY, v int)"); } - private void createUntrackedKeyspace(String ksName) + private void alterKeyspaceToTracked() { - CLUSTER.schemaChange("CREATE KEYSPACE " + ksName + " WITH replication = " + + CLUSTER.schemaChange("ALTER KEYSPACE " + ksName + " WITH replication = " + "{'class': 'SimpleStrategy', 'replication_factor': 3} " + - "AND replication_type='untracked'"); + "AND replication_type='tracked'"); } - private void createTable(String ksName, String tableName) + private void alterKeyspaceToUntracked() { - CLUSTER.schemaChange("CREATE TABLE " + ksName + '.' 
+ tableName + " (k int PRIMARY KEY, v int)"); + CLUSTER.schemaChange("ALTER KEYSPACE " + ksName + " WITH replication = " + + "{'class': 'SimpleStrategy', 'replication_factor': 3} " + + "AND replication_type='untracked'"); } - private void insertData(String ksName, String tableName, int start, int count) + private void insertData(String tableName, int start, int count) { for (int i = start; i < start + count; i++) { @@ -125,7 +164,7 @@ private void insertData(String ksName, String tableName, int start, int count) } } - private void assertDataOnAllNodes(String ksName, String tableName, int start, int count) + private void assertDataOnAllNodes(String tableName, int start, int count) { for (int node = 1; node <= CLUSTER.size(); node++) { @@ -142,117 +181,162 @@ private void assertDataOnAllNodes(String ksName, String tableName, int start, in } } - private void broadcastOffsets() + private NodeToolResult nodetoolRepair(int node, String... args) + { + String[] cmd = new String[args.length + 1]; + cmd[0] = "repair"; + System.arraycopy(args, 0, cmd, 1, args.length); + return CLUSTER.get(node).nodetoolResult(cmd); + } + + private List getLiveNodes() { + List nodes = new ArrayList<>(); for (int i = 1; i <= CLUSTER.size(); i++) - { if (!CLUSTER.get(i).isShutdown()) - CLUSTER.get(i).runOnInstance(() -> MutationTrackingService.instance.broadcastOffsetsForTesting()); + nodes.add(i); + return nodes; + } + + private List repairConcurrently(List nodes, String... 
args) + { + List> futures = new ArrayList<>(); + for (int node : nodes) + { + int n = node; + futures.add(executor.submit(() -> nodetoolRepair(n, args))); } + List results = new ArrayList<>(); + for (Future f : futures) + { + try + { + results.add(f.get(60, TimeUnit.SECONDS)); + } + catch (Exception e) + { + throw new RuntimeException("Repair future failed", e); + } + } + return results; } - private boolean isMigrationInProgress(String ksName) + private void assertAllSuccess(List results) { - return CLUSTER.get(1).callOnInstance(() -> { - ClusterMetadata metadata = ClusterMetadata.current(); - return MutationTrackingIncrementalRepairTask.isMutationTrackingMigrationInProgress(metadata, ksName); - }); + for (NodeToolResult r : results) + r.asserts().success(); } - private boolean isMigrationComplete(String ksName) + private void assertAllFailure(List results) { - return CLUSTER.get(1).callOnInstance(() -> { - ClusterMetadata metadata = ClusterMetadata.current(); - KeyspaceMigrationInfo info = metadata.mutationTrackingMigrationState.getKeyspaceInfo(ksName); - return info == null; - }); + for (NodeToolResult r : results) + r.asserts().failure(); } - private NodeToolResult nodetoolRepair(int node, String... args) + private String[] withPR(String... args) { - String[] cmd = new String[args.length + 1]; - cmd[0] = "repair"; - System.arraycopy(args, 0, cmd, 1, args.length); - return CLUSTER.get(node).nodetoolResult(cmd); + String[] result = new String[args.length + 1]; + System.arraycopy(args, 0, result, 0, args.length); + result[args.length] = "-pr"; + return result; } - private String getBroadcastAddress(int node) + private void repairAllSuccess(String... 
args) { - return CLUSTER.get(node).callOnInstance(() -> FBUtilities.getBroadcastAddressAndPort().getHostAddressAndPort()); + String[] prArgs = withPR(args); + List liveNodes = getLiveNodes(); + assertAllSuccess(repairConcurrently(liveNodes, prArgs)); + // Run a second time to make sure repair can be run multiple times without failing + assertAllSuccess(repairConcurrently(liveNodes, prArgs)); } - private void alterKeyspaceToTracked(String ksName) + private void repairAllFailure(String... args) { - CLUSTER.schemaChange("ALTER KEYSPACE " + ksName + " WITH replication = " + - "{'class': 'SimpleStrategy', 'replication_factor': 3} " + - "AND replication_type='tracked'"); + assertAllFailure(repairConcurrently(getLiveNodes(), withPR(args))); } - private void alterKeyspaceToUntracked(String ksName) + private void repairFromNodesSuccess(List nodes, String... args) { - CLUSTER.schemaChange("ALTER KEYSPACE " + ksName + " WITH replication = " + - "{'class': 'SimpleStrategy', 'replication_factor': 3} " + - "AND replication_type='untracked'"); + String[] prArgs = withPR(args); + assertAllSuccess(repairConcurrently(nodes, prArgs)); + assertAllSuccess(repairConcurrently(nodes, prArgs)); } - private void repairFromAllNodes(String ksName) + private boolean isMigrationInProgress() { - for (int node = 1; node <= CLUSTER.size(); node++) - { - if (CLUSTER.get(node).isShutdown()) - continue; - broadcastOffsets(); - nodetoolRepair(node, ksName).asserts().success(); - } + String ks = ksName; + return CLUSTER.get(1).callOnInstance(() -> { + ClusterMetadata metadata = ClusterMetadata.current(); + return MutationTrackingIncrementalRepairTask.isMutationTrackingMigrationInProgress(metadata, ks); + }); + } + + private boolean isMigrationComplete() + { + String ks = ksName; + return CLUSTER.get(1).callOnInstance(() -> { + ClusterMetadata metadata = ClusterMetadata.current(); + KeyspaceMigrationInfo info = metadata.mutationTrackingMigrationState.getKeyspaceInfo(ks); + return info == null; + }); } - 
/** - * Block all messages to/from a node and force the failure detector to mark it dead. - * Uses message filters instead of actually stopping the node to avoid - * MutationTrackingService restart bugs with shard re-creation. - */ - private void isolateNode(int nodeToIsolate, int observerNode) + private String getBroadcastAddress(int node) + { + return CLUSTER.get(node).callOnInstance(() -> FBUtilities.getBroadcastAddressAndPort().getHostAddressAndPort()); + } + + private void isolateNode(int nodeToIsolate, int... observerNodes) { CLUSTER.filters().allVerbs().from(nodeToIsolate).drop(); CLUSTER.filters().allVerbs().to(nodeToIsolate).drop(); String isolatedAddress = CLUSTER.get(nodeToIsolate).callOnInstance( () -> FBUtilities.getBroadcastAddressAndPort().getHostAddressAndPort()); - CLUSTER.get(observerNode).runOnInstance(() -> { - try - { - InetAddressAndPort neighbor = InetAddressAndPort.getByName(isolatedAddress); - FailureDetector.instance.forceConviction(neighbor); - } - catch (UnknownHostException e) - { - throw new RuntimeException(e); - } - }); + for (int observer : observerNodes) + { + CLUSTER.get(observer).runOnInstance(() -> { + try + { + InetAddressAndPort neighbor = InetAddressAndPort.getByName(isolatedAddress); + FailureDetector.instance.forceConviction(neighbor); + } + catch (UnknownHostException e) + { + throw new RuntimeException(e); + } + }); + } } @Test public void testBasicRepairHappyPath() { - String ksName = nextKsName(); - createTrackedKeyspace(ksName); - createTable(ksName, "tbl"); - insertData(ksName, "tbl", 0, 100); - broadcastOffsets(); + insertData("tbl", 0, 100); - nodetoolRepair(1, ksName).asserts().success(); + repairAllSuccess(ksName); - assertDataOnAllNodes(ksName, "tbl", 0, 100); + assertDataOnAllNodes("tbl", 0, 100); } @Test - public void testRepairConvergesInconsistentReplicas() throws Exception + public void testRepairSpecificTable() { - String ksName = nextKsName(); - createTrackedKeyspace(ksName); - createTable(ksName, "tbl"); 
+ createTable("tbl1"); + createTable("tbl2"); + + insertData("tbl1", 0, 50); + insertData("tbl2", 100, 50); - // Isolate node 2 completely so it receives no writes or offset broadcasts + repairAllSuccess(ksName, "tbl1"); + + assertDataOnAllNodes("tbl1", 0, 50); + assertDataOnAllNodes("tbl2", 100, 50); + } + + @Test + public void testRepairConvergesInconsistentReplicas() throws Exception + { CLUSTER.filters().allVerbs().from(2).drop(); CLUSTER.filters().allVerbs().to(2).drop(); @@ -260,72 +344,65 @@ public void testRepairConvergesInconsistentReplicas() throws Exception { CLUSTER.coordinator(1).execute( "INSERT INTO " + ksName + ".tbl (k, v) VALUES (?, ?)", - ConsistencyLevel.ONE, i, i); + ConsistencyLevel.QUORUM, i, i); } - // Node 2 should have no data Object[][] results = CLUSTER.get(2).executeInternal( "SELECT k FROM " + ksName + ".tbl"); assertEquals("Node 2 should not have data while isolated", 0, results.length); - broadcastOffsets(); - - // Start repair in background — it will be stuck waiting for node 2 - Future repairFuture = Executors.newSingleThreadExecutor().submit(() -> - nodetoolRepair(1, ksName).asserts().success() - ); + // Start -pr repair from all nodes in background — will get stuck waiting for node 2 + String[] prArgs = withPR(ksName); + List> futures = new ArrayList<>(); + for (int node = 1; node <= CLUSTER.size(); node++) + { + int n = node; + futures.add(executor.submit(() -> nodetoolRepair(n, prArgs))); + } - // Let repair start and get stuck, then remove isolation so it can complete Thread.sleep(2000); + assertTrue(futures.stream().allMatch(not(Future::isDone))); CLUSTER.filters().reset(); - repairFuture.get(30, TimeUnit.SECONDS); + List repairResults = new ArrayList<>(); + for (Future f : futures) + repairResults.add(f.get(30, TimeUnit.SECONDS)); + assertAllSuccess(repairResults); - assertDataOnAllNodes(ksName, "tbl", 0, 50); + assertDataOnAllNodes("tbl", 0, 50); } @Test public void testRepairMultipleTables() { - String ksName = 
nextKsName(); - createTrackedKeyspace(ksName); - createTable(ksName, "tbl1"); - createTable(ksName, "tbl2"); - createTable(ksName, "tbl3"); + createTable("tbl1"); + createTable("tbl2"); + createTable("tbl3"); - insertData(ksName, "tbl1", 0, 30); - insertData(ksName, "tbl2", 100, 30); - insertData(ksName, "tbl3", 200, 30); - broadcastOffsets(); + insertData("tbl1", 0, 30); + insertData("tbl2", 100, 30); + insertData("tbl3", 200, 30); - nodetoolRepair(1, ksName).asserts().success(); + repairAllSuccess(ksName); - assertDataOnAllNodes(ksName, "tbl1", 0, 30); - assertDataOnAllNodes(ksName, "tbl2", 100, 30); - assertDataOnAllNodes(ksName, "tbl3", 200, 30); + assertDataOnAllNodes("tbl1", 0, 30); + assertDataOnAllNodes("tbl2", 100, 30); + assertDataOnAllNodes("tbl3", 200, 30); } @Test public void testForceRepairWithNodeDown() { - String ksName = nextKsName(); - createTrackedKeyspace(ksName); - createTable(ksName, "tbl"); - insertData(ksName, "tbl", 0, 50); - broadcastOffsets(); + insertData("tbl", 0, 50); - // Isolate node 2 via message filters so failure detector marks it dead. - // We don't actually stop the JVM to avoid MutationTrackingService restart bugs. 
- isolateNode(2, 1); + isolateNode(2, 1, 3); - // Repair without --force should fail - nodetoolRepair(1, ksName).asserts().failure(); + List liveNodes = List.of(1, 3); + assertAllFailure(repairConcurrently(liveNodes, withPR(ksName))); - // Repair with --force should succeed - nodetoolRepair(1, ksName, "--force").asserts().success(); + repairFromNodesSuccess(liveNodes, ksName, "--force"); - // Verify data on live nodes - for (int node : new int[]{ 1, 3 }) + for (int node : liveNodes) { for (int i = 0; i < 50; i++) { @@ -339,164 +416,123 @@ public void testForceRepairWithNodeDown() @Test public void testForceRepairWithAllNodesUp() { - String ksName = nextKsName(); - createTrackedKeyspace(ksName); - createTable(ksName, "tbl"); - insertData(ksName, "tbl", 0, 50); - broadcastOffsets(); + insertData("tbl", 0, 50); - nodetoolRepair(1, ksName, "--force").asserts().success(); + repairAllSuccess(ksName, "--force"); - assertDataOnAllNodes(ksName, "tbl", 0, 50); + assertDataOnAllNodes("tbl", 0, 50); } @Test public void testRepairWithSpecificHosts() { - String ksName = nextKsName(); - createTrackedKeyspace(ksName); - createTable(ksName, "tbl"); - insertData(ksName, "tbl", 0, 50); - broadcastOffsets(); + insertData("tbl", 0, 50); String addr1 = getBroadcastAddress(1); String addr3 = getBroadcastAddress(3); - nodetoolRepair(1, ksName, "--in-hosts", addr1 + ',' + addr3).asserts().success(); + // -pr is incompatible with --in-hosts + List nodes = List.of(1, 3); + assertAllSuccess(repairConcurrently(nodes, ksName, "--in-hosts", addr1 + ',' + addr3)); + assertAllSuccess(repairConcurrently(nodes, ksName, "--in-hosts", addr1 + ',' + addr3)); } @Test public void testMigrationUntrackedToTrackedCompletesViaRepair() { - String ksName = nextKsName(); - createUntrackedKeyspace(ksName); - createTable(ksName, "tbl"); - insertData(ksName, "tbl", 0, 100); + setupUntracked(); + insertData("tbl", 0, 100); - alterKeyspaceToTracked(ksName); - assertTrue("Migration should be in progress after ALTER", 
isMigrationInProgress(ksName)); + alterKeyspaceToTracked(); + assertTrue("Migration should be in progress after ALTER", isMigrationInProgress()); - repairFromAllNodes(ksName); + repairAllSuccess(ksName); + assertTrue("Migration should complete after repair", isMigrationComplete()); - assertTrue("Migration should complete after repair", isMigrationComplete(ksName)); - assertDataOnAllNodes(ksName, "tbl", 0, 100); + assertDataOnAllNodes("tbl", 0, 100); } @Test public void testDataAccessibleDuringMigration() { - String ksName = nextKsName(); - createUntrackedKeyspace(ksName); - createTable(ksName, "tbl"); + setupUntracked(); - // Pre-migration data - insertData(ksName, "tbl", 0, 50); + insertData("tbl", 0, 50); - // Start migration - alterKeyspaceToTracked(ksName); + alterKeyspaceToTracked(); - // Read pre-migration data Object[][] results = CLUSTER.coordinator(1).execute( "SELECT k, v FROM " + ksName + ".tbl", ConsistencyLevel.ALL); assertEquals("Pre-migration data should be readable", 50, results.length); - // Write new data during migration - insertData(ksName, "tbl", 50, 50); + insertData("tbl", 50, 50); - // Read all data during migration results = CLUSTER.coordinator(1).execute( "SELECT k, v FROM " + ksName + ".tbl", ConsistencyLevel.ALL); assertEquals("All data should be readable during migration", 100, results.length); - // Complete migration via repair - repairFromAllNodes(ksName); + repairAllSuccess(ksName); + assertTrue("Migration should complete after repair", isMigrationComplete()); - // Verify all data still readable post-migration results = CLUSTER.coordinator(1).execute( "SELECT k, v FROM " + ksName + ".tbl", ConsistencyLevel.ALL); assertEquals("All data should be readable after migration", 100, results.length); - // Write and read more data post-migration - insertData(ksName, "tbl", 100, 50); + insertData("tbl", 100, 50); results = CLUSTER.coordinator(1).execute( "SELECT k, v FROM " + ksName + ".tbl", ConsistencyLevel.ALL); assertEquals("All data 
including post-migration should be readable", 150, results.length); - assertDataOnAllNodes(ksName, "tbl", 0, 150); + assertDataOnAllNodes("tbl", 0, 150); } @Test public void testMigrationTrackedToUntrackedCompletesViaRepair() { - String ksName = nextKsName(); - createTrackedKeyspace(ksName); - createTable(ksName, "tbl"); - insertData(ksName, "tbl", 0, 100); - broadcastOffsets(); + insertData("tbl", 0, 100); - alterKeyspaceToUntracked(ksName); - assertTrue("Migration should be in progress after ALTER", isMigrationInProgress(ksName)); + alterKeyspaceToUntracked(); + assertTrue("Migration should be in progress after ALTER", isMigrationInProgress()); - repairFromAllNodes(ksName); + repairAllSuccess(ksName); + assertTrue("Migration should complete after repair", isMigrationComplete()); - assertTrue("Migration should complete after repair", isMigrationComplete(ksName)); - assertDataOnAllNodes(ksName, "tbl", 0, 100); + assertDataOnAllNodes("tbl", 0, 100); } @Test public void testRepairAdvancesMigrationState() { - String ksName = nextKsName(); - createUntrackedKeyspace(ksName); - createTable(ksName, "tbl"); - insertData(ksName, "tbl", 0, 100); - - alterKeyspaceToTracked(ksName); - assertTrue("Full ring should be pending", isMigrationInProgress(ksName)); - - broadcastOffsets(); - nodetoolRepair(1, ksName).asserts().success(); + setupUntracked(); + insertData("tbl", 0, 100); - if (!isMigrationComplete(ksName)) - { - for (int node = 2; node <= CLUSTER.size(); node++) - { - broadcastOffsets(); - nodetoolRepair(node, ksName).asserts().success(); - if (isMigrationComplete(ksName)) - break; - } - } + alterKeyspaceToTracked(); + assertTrue("Full ring should be pending", isMigrationInProgress()); - assertTrue("Migration should complete after full repair", isMigrationComplete(ksName)); + repairAllSuccess(ksName); + assertTrue("Migration should complete after repair", isMigrationComplete()); } @Test public void testForceRepairWithDeadNodeDoesNotAdvanceMigration() { - String ksName = 
nextKsName(); - createUntrackedKeyspace(ksName); - createTable(ksName, "tbl"); - insertData(ksName, "tbl", 0, 50); + setupUntracked(); + insertData("tbl", 0, 50); - alterKeyspaceToTracked(ksName); - assertTrue("Migration should be in progress", isMigrationInProgress(ksName)); + alterKeyspaceToTracked(); + assertTrue("Migration should be in progress", isMigrationInProgress()); - // Isolate node 2 via message filters so failure detector marks it dead - isolateNode(2, 1); + isolateNode(2, 1, 3); - // Broadcast offsets on live nodes - for (int n : new int[]{ 1, 3 }) - CLUSTER.get(n).runOnInstance(() -> MutationTrackingService.instance.broadcastOffsetsForTesting()); + List liveNodes = List.of(1, 3); + repairFromNodesSuccess(liveNodes, ksName, "--force"); - // Force repair should succeed - nodetoolRepair(1, ksName, "--force").asserts().success(); - - // Migration should NOT be advanced because deadNodesExcluded=true makes it INELIGIBLE + // deadNodesExcluded=true makes the result INELIGIBLE to advance migration + String ks = ksName; assertTrue("Migration should not advance with dead nodes excluded", CLUSTER.get(1).callOnInstance(() -> { ClusterMetadata metadata = ClusterMetadata.current(); - KeyspaceMigrationInfo info = metadata.mutationTrackingMigrationState.getKeyspaceInfo(ksName); + KeyspaceMigrationInfo info = metadata.mutationTrackingMigrationState.getKeyspaceInfo(ks); return info != null; })); } @@ -504,68 +540,55 @@ public void testForceRepairWithDeadNodeDoesNotAdvanceMigration() @Test public void testPreviewRepairDoesNotAdvanceMigration() { - String ksName = nextKsName(); - createUntrackedKeyspace(ksName); - createTable(ksName, "tbl"); - insertData(ksName, "tbl", 0, 50); - - alterKeyspaceToTracked(ksName); - assertTrue("Migration should be in progress", isMigrationInProgress(ksName)); + setupUntracked(); + insertData("tbl", 0, 50); - broadcastOffsets(); + alterKeyspaceToTracked(); + assertTrue("Migration should be in progress", isMigrationInProgress()); - 
nodetoolRepair(1, ksName, "--preview").asserts().success(); + repairAllSuccess(ksName, "--preview"); - assertTrue("Migration should not advance with preview repair", isMigrationInProgress(ksName)); + assertTrue("Migration should not advance with preview repair", isMigrationInProgress()); } @Test public void testSubrangeRepair() { - String ksName = nextKsName(); - createTrackedKeyspace(ksName); - createTable(ksName, "tbl"); - insertData(ksName, "tbl", 0, 100); - broadcastOffsets(); + insertData("tbl", 0, 100); - nodetoolRepair(1, ksName, "-st", "0", "-et", "1000000000").asserts().success(); + List liveNodes = getLiveNodes(); + assertAllSuccess(repairConcurrently(liveNodes, ksName, "-st", "0", "-et", "1000000000")); + assertAllSuccess(repairConcurrently(liveNodes, ksName, "-st", "0", "-et", "1000000000")); } @Test public void testSubrangeRepairAdvancesMigrationOnlyForSpecifiedRange() { - String ksName = nextKsName(); - createUntrackedKeyspace(ksName); - createTable(ksName, "tbl"); - insertData(ksName, "tbl", 0, 100); - - alterKeyspaceToTracked(ksName); - assertTrue("Full ring should be pending", isMigrationInProgress(ksName)); + setupUntracked(); + insertData("tbl", 0, 100); - broadcastOffsets(); + alterKeyspaceToTracked(); + assertTrue("Full ring should be pending", isMigrationInProgress()); - nodetoolRepair(1, ksName, "-st", "0", "-et", "1000000000").asserts().success(); + List liveNodes = getLiveNodes(); + assertAllSuccess(repairConcurrently(liveNodes, ksName, "-st", "0", "-et", "1000000000")); + assertAllSuccess(repairConcurrently(liveNodes, ksName, "-st", "0", "-et", "1000000000")); - // Migration should NOT be complete yet (only repaired a subrange) assertTrue("Migration should not be complete after subrange repair", - isMigrationInProgress(ksName)); + isMigrationInProgress()); } @Test public void testRepairTimeout() { - String ksName = nextKsName(); - createTrackedKeyspace(ksName); - createTable(ksName, "tbl"); - insertData(ksName, "tbl", 0, 50); + 
insertData("tbl", 0, 50); - // Block MT_SYNC_REQ messages to prevent sync from completing IMessageFilters.Filter filter = CLUSTER.filters() .verbs(Verb.MT_SYNC_REQ.id) .drop(); try { - nodetoolRepair(1, ksName).asserts().failure(); + repairAllFailure(ksName); } finally { @@ -576,31 +599,23 @@ public void testRepairTimeout() @Test public void testWriteDuringMigrationPreservedAfterCompletion() { - String ksName = nextKsName(); - createUntrackedKeyspace(ksName); - createTable(ksName, "tbl"); + setupUntracked(); - // Pre-migration data - insertData(ksName, "tbl", 0, 50); + insertData("tbl", 0, 50); - // Start migration - alterKeyspaceToTracked(ksName); + alterKeyspaceToTracked(); - // During-migration data - insertData(ksName, "tbl", 50, 50); + insertData("tbl", 50, 50); - // Complete migration - repairFromAllNodes(ksName); + repairAllSuccess(ksName); + assertTrue("Migration should complete after repair", isMigrationComplete()); - // Post-migration data - insertData(ksName, "tbl", 100, 50); + insertData("tbl", 100, 50); - // All 150 rows should be present on all nodes - assertDataOnAllNodes(ksName, "tbl", 0, 150); + assertDataOnAllNodes("tbl", 0, 150); - // Also verify via coordinator read at CL.ALL Object[][] results = CLUSTER.coordinator(1).execute( "SELECT k, v FROM " + ksName + ".tbl", ConsistencyLevel.ALL); assertEquals("All 150 rows should be readable at CL.ALL", 150, results.length); } -} +} \ No newline at end of file From 2e27603bd5e32ead3c21d5e35abf28fabede4bc8 Mon Sep 17 00:00:00 2001 From: Ariel Weisberg Date: Mon, 16 Mar 2026 17:06:45 -0400 Subject: [PATCH 21/46] During migration incremental repair might legitimately need to add sstables --- src/java/org/apache/cassandra/db/lifecycle/Tracker.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/java/org/apache/cassandra/db/lifecycle/Tracker.java b/src/java/org/apache/cassandra/db/lifecycle/Tracker.java index a5eedea941da..3f299c029a1e 100644 --- 
a/src/java/org/apache/cassandra/db/lifecycle/Tracker.java +++ b/src/java/org/apache/cassandra/db/lifecycle/Tracker.java @@ -272,7 +272,10 @@ public void updateInitialSSTableSize(Iterable sstables) public void addSSTables(Collection sstables) { - Preconditions.checkState(!cfstore.metadata().replicationType().isTracked()); + // TODO (REVIEW): Is there another reason why it's not ok for IR to do this during migration? @Blake Eggleston + // Note: tracked tables may legitimately use this path during migration from untracked to tracked, + // when incremental repair streams SSTables that were written before tracking was enabled. + // Preconditions.checkState(!cfstore.metadata().replicationType().isTracked()); addSSTablesInternal(sstables, false, true, true); } From d42314ef1fb08bf38b0417a5284a5437b533321d Mon Sep 17 00:00:00 2001 From: Ariel Weisberg Date: Mon, 16 Mar 2026 17:35:34 -0400 Subject: [PATCH 22/46] During migration SSTableWriter.finalizeMetadata() needs to accept IR marking sstables repaired to effect the migration --- .../io/sstable/format/SSTableWriter.java | 29 ++++++++++++++----- .../MutationTrackingSyncCoordinatorTest.java | 4 +-- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java index 4c64b0f31c7a..9706cc8b8754 100644 --- a/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java +++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java @@ -38,6 +38,11 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.cassandra.replication.ImmutableCoordinatorLogOffsets; +import org.apache.cassandra.replication.MutationTrackingService; +import org.apache.cassandra.service.ActiveRepairService; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.utils.Clock; import org.apache.cassandra.db.DecoratedKey; import 
org.apache.cassandra.db.SerializationHeader; import org.apache.cassandra.db.compaction.OperationType; @@ -59,10 +64,6 @@ import org.apache.cassandra.io.sstable.metadata.MetadataType; import org.apache.cassandra.io.sstable.metadata.StatsMetadata; import org.apache.cassandra.io.util.MmappedRegionsCache; -import org.apache.cassandra.replication.ImmutableCoordinatorLogOffsets; -import org.apache.cassandra.replication.MutationTrackingService; -import org.apache.cassandra.service.ActiveRepairService; -import org.apache.cassandra.utils.Clock; import org.apache.cassandra.utils.Throwables; import org.apache.cassandra.utils.TimeUUID; import org.apache.cassandra.utils.concurrent.Transactional; @@ -337,13 +338,27 @@ public final void abort() protected Map finalizeMetadata() { - // Reconciliation should not occur before activation for coordinated transfer streams for tracked keyspaces. + // Reconciliation should not occur before activation for coordinated transfer streams for tracked keyspaces. boolean reconcile = txn.opType() != OperationType.STREAM; + // Migration from incremental repair to mutation tracking will be supported, but support for mixing + // incremental repair and mutation tracking is not planned. if (metadata().replicationType().isTracked() && repairedAt == ActiveRepairService.UNREPAIRED_SSTABLE && reconcile) { - Preconditions.checkState(Objects.equals(pendingRepair, ActiveRepairService.NO_PENDING_REPAIR)); - if (MutationTrackingService.instance().isDurablyReconciled(coordinatorLogOffsets)) + // During migration, incremental repair may write SSTables with pendingRepair set to a tracked table. + // Outside of migration, pendingRepair should never be set on a tracked table. 
+ if (!Objects.equals(pendingRepair, ActiveRepairService.NO_PENDING_REPAIR)) + { + Preconditions.checkState(ClusterMetadata.current().mutationTrackingMigrationState + .getKeyspaceInfo(metadata().keyspace) != null, + "pendingRepair set on tracked table %s.%s outside of migration", + metadata().keyspace, metadata().name); + } + // Only attempt mutation tracking reconciliation when there is no pending repair. + // SSTables with empty coordinator log offsets were written before mutation tracking was enabled + // (e.g. during migration from untracked to tracked). They should not be marked as reconciled + // since they were never tracked and need to go through incremental repair first. + else if (!coordinatorLogOffsets.isEmpty() && MutationTrackingService.instance().isDurablyReconciled(coordinatorLogOffsets)) { repairedAt = Clock.Global.currentTimeMillis(); logger.debug("Marking SSTable {} as reconciled with repairedAt {}", descriptor, repairedAt); diff --git a/test/distributed/org/apache/cassandra/distributed/test/replication/MutationTrackingSyncCoordinatorTest.java b/test/distributed/org/apache/cassandra/distributed/test/replication/MutationTrackingSyncCoordinatorTest.java index d8bb7c46a850..1b9a2b206574 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/replication/MutationTrackingSyncCoordinatorTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/replication/MutationTrackingSyncCoordinatorTest.java @@ -67,7 +67,7 @@ private String tableName(String suffix) private void pauseOffsetBroadcasts(Cluster cluster, boolean pause) { for (int i = 1; i <= cluster.size(); i++) - cluster.get(i).runOnInstance(() -> MutationTrackingService.instance.pauseOffsetBroadcast(pause)); + cluster.get(i).runOnInstance(() -> MutationTrackingService.instance().pauseOffsetBroadcast(pause)); } private static Range fullTokenRange() @@ -198,7 +198,7 @@ public void testSyncCoordinatorWaitsForAllReplicasMutations() throws Throwable // After the sync response from 
node 1 establishes targets, the coordinator // needs to see that all replicas have caught up via offset broadcasts. for (int i = 1; i <= 3; i++) - cluster.get(i).runOnInstance(() -> MutationTrackingService.instance.broadcastOffsetsForTesting()); + cluster.get(i).runOnInstance(() -> MutationTrackingService.instance().broadcastOffsetsForTesting()); // Wait for coordinator to complete Awaitility.await() From 94cf54b994d228ec205a9222f48fd9efbe9d835e Mon Sep 17 00:00:00 2001 From: Ariel Weisberg Date: Mon, 16 Mar 2026 17:41:46 -0400 Subject: [PATCH 23/46] Can't run MT sync during migration away from mutation tracking because it will hang, so just use IR instead --- .../repair/MutationTrackingIncrementalRepairTask.java | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java b/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java index a43c6b872227..515ae3ece941 100644 --- a/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java +++ b/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java @@ -181,10 +181,13 @@ public static boolean shouldUseMutationTrackingRepair(ClusterMetadata metadata, if (ksm.useMutationTracking()) return true; - // Check if keyspace is in migration (either direction) - // TODO (required): What we do depends on direction right? Migration to MT requires incremental repair, migration away requires MT repair - KeyspaceMigrationInfo migrationInfo = metadata.mutationTrackingMigrationState.getKeyspaceInfo(keyspace); - return migrationInfo != null; + // For tracked→untracked migration (keyspace is currently untracked but migration is in progress), + // use regular incremental repair instead of MT repair. The MT sync step can't complete for this + // direction because streaming doesn't update mutation tracking offsets, and the keyspace is moving + // away from tracking. 
Regular incremental repair will sync the data and the RepairJob callback + // handler will still advance the migration state. + // TODO (desired): This is an over simplification in that depending on which ranges are migrated we might be able to just run MT sync but running IR should also be fine + return false; } /** From 9cf77b918b7c4fee06cb0ff97a3754fb95653f9a Mon Sep 17 00:00:00 2001 From: Ariel Weisberg Date: Mon, 16 Mar 2026 17:49:45 -0400 Subject: [PATCH 24/46] RepairCoordinator add logging --- .../org/apache/cassandra/repair/RepairCoordinator.java | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/java/org/apache/cassandra/repair/RepairCoordinator.java b/src/java/org/apache/cassandra/repair/RepairCoordinator.java index 4e208e3397a6..237a95410b16 100644 --- a/src/java/org/apache/cassandra/repair/RepairCoordinator.java +++ b/src/java/org/apache/cassandra/repair/RepairCoordinator.java @@ -88,7 +88,6 @@ import org.apache.cassandra.utils.WrappedRunnable; import org.apache.cassandra.utils.concurrent.AsyncPromise; import org.apache.cassandra.utils.concurrent.Future; -import org.apache.cassandra.utils.concurrent.ImmediateFuture; import org.apache.cassandra.utils.progress.ProgressEvent; import org.apache.cassandra.utils.progress.ProgressEventNotifier; import org.apache.cassandra.utils.progress.ProgressEventType; @@ -555,15 +554,20 @@ else if (useMutationTracking) // Propagate the IR result on success since it drives migration advancement. 
RepairTask incrementalTask = new IncrementalRepairTask(this, state.id, neighborsAndRanges, cfnames); AsyncPromise>> result = new AsyncPromise<>(); + logger.info("Migration to mutation tracking in progress for {}; running incremental repair before MT sync", state.keyspace); incrementalTask.perform(executor, validationScheduler).addCallback( irResult -> { + logger.info("Incremental repair completed for migration keyspace {}: hasFailed={}", state.keyspace, irResult.hasFailed()); Pair> irPair = Pair.create(irResult, incrementalTask::successMessage); mtTask.perform(executor, validationScheduler) .addCallback( mtResult -> result.trySuccess(irPair), result::tryFailure); }, - result::tryFailure + failure -> { + logger.warn("Incremental repair FAILED for migration keyspace {}", state.keyspace, failure); + result.tryFailure(failure); + } ); return result.addCallback((s, f) -> executor.shutdown()); } From adf3638b96e395ef684177197038a8e4bb3ca502 Mon Sep 17 00:00:00 2001 From: Ariel Weisberg Date: Tue, 17 Mar 2026 14:56:45 -0400 Subject: [PATCH 25/46] During mutation tracking migration restrict repairs to being either entirely inside migrated ranges or entirely outside, but not both --- .../cassandra/db/ReadRepairVerbHandler.java | 28 ++++++- .../cassandra/repair/RepairCoordinator.java | 17 +++++ .../migration/KeyspaceMigrationInfo.java | 74 +++++++++++++++++++ .../cassandra/streaming/StreamPlan.java | 51 ++++++++++++- 4 files changed, 164 insertions(+), 6 deletions(-) diff --git a/src/java/org/apache/cassandra/db/ReadRepairVerbHandler.java b/src/java/org/apache/cassandra/db/ReadRepairVerbHandler.java index d40359c14472..593b8a73f31c 100644 --- a/src/java/org/apache/cassandra/db/ReadRepairVerbHandler.java +++ b/src/java/org/apache/cassandra/db/ReadRepairVerbHandler.java @@ -17,9 +17,14 @@ */ package org.apache.cassandra.db; +import org.apache.cassandra.exceptions.RequestFailureReason; import org.apache.cassandra.locator.InetAddressAndPort; import 
org.apache.cassandra.net.Message; import org.apache.cassandra.net.MessagingService; +import org.apache.cassandra.replication.TrackedWriteRequest; +import org.apache.cassandra.service.replication.migration.MigrationRouter; +import org.apache.cassandra.transport.Dispatcher; +import org.apache.cassandra.utils.concurrent.AsyncPromise; public class ReadRepairVerbHandler extends AbstractMutationVerbHandler { @@ -32,7 +37,26 @@ public void applyMutation(Mutation mutation) void applyMutation(Message message, InetAddressAndPort respondToAddress) { - applyMutation(message.payload); - MessagingService.instance().send(message.emptyResponse(), respondToAddress); + Mutation mutation = message.payload; + if (MigrationRouter.isFullyTracked(mutation)) + { + // During migration, read repair mutations go through the tracked write coordinator path + // so they get an ID, are journaled, and are sent to all replicas with failure retries. + // Send the response asynchronously when local application completes to avoid blocking + // the mutation stage (which would deadlock since local apply is scheduled on the same stage). 
+ AsyncPromise localApplication = new AsyncPromise<>(); + TrackedWriteRequest.perform(mutation, ConsistencyLevel.ANY, Dispatcher.RequestTime.forImmediateExecution(), localApplication); + localApplication.addCallback((success, failure) -> { + if (failure == null) + MessagingService.instance().send(message.emptyResponse(), respondToAddress); + else + MessagingService.instance().send(message.failureResponse(RequestFailureReason.UNKNOWN, failure), respondToAddress); + }); + } + else + { + mutation.apply(); + MessagingService.instance().send(message.emptyResponse(), respondToAddress); + } } } diff --git a/src/java/org/apache/cassandra/repair/RepairCoordinator.java b/src/java/org/apache/cassandra/repair/RepairCoordinator.java index 237a95410b16..86192374bc4c 100644 --- a/src/java/org/apache/cassandra/repair/RepairCoordinator.java +++ b/src/java/org/apache/cassandra/repair/RepairCoordinator.java @@ -53,6 +53,7 @@ import org.apache.cassandra.db.ColumnFamilyStore; import org.apache.cassandra.db.ConsistencyLevel; import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.dht.NormalizedRanges; import org.apache.cassandra.dht.Range; import org.apache.cassandra.dht.Token; import org.apache.cassandra.exceptions.RepairException; @@ -67,12 +68,14 @@ import org.apache.cassandra.repair.messages.RepairOption; import org.apache.cassandra.repair.state.CoordinatorState; import org.apache.cassandra.repair.state.ParticipateState; +import org.apache.cassandra.schema.KeyspaceMetadata; import org.apache.cassandra.schema.SchemaConstants; import org.apache.cassandra.schema.SystemDistributedKeyspace; import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.service.ActiveRepairService; import org.apache.cassandra.service.ActiveRepairService.ParentRepairStatus; import org.apache.cassandra.service.ClientState; +import org.apache.cassandra.service.replication.migration.KeyspaceMigrationInfo; import org.apache.cassandra.service.StorageService; import 
org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.Epoch; @@ -142,6 +145,20 @@ public static RepairCoordinator create(StorageService storageService, int cmd, R options = options.withIncremental(false); } + // During migration, validate that repair ranges don't partially overlap with pending migration ranges. + // Ranges must be entirely inside or entirely outside the pending set so that a compatible repair + // behavior (MT vs normal IR) can be selected. + if (mtMigration) + { + KeyspaceMigrationInfo migrationInfo = metadata.mutationTrackingMigrationState.getKeyspaceInfo(keyspace); + if (migrationInfo != null) + { + NormalizedRanges repairRanges = NormalizedRanges.normalizedRanges(options.getRanges()); + KeyspaceMetadata ksm = metadata.schema.getKeyspaceMetadata(keyspace); + migrationInfo.areRangesPendingMigration(repairRanges, ksm, options.getColumnFamilies()); + } + } + return new RepairCoordinator(SharedContext.Global.instance, (ks, tables) -> storageService.getValidColumnFamilies(false, false, ks, tables), storageService::getLocalReplicas, diff --git a/src/java/org/apache/cassandra/service/replication/migration/KeyspaceMigrationInfo.java b/src/java/org/apache/cassandra/service/replication/migration/KeyspaceMigrationInfo.java index ed193d61c293..9c03617accca 100644 --- a/src/java/org/apache/cassandra/service/replication/migration/KeyspaceMigrationInfo.java +++ b/src/java/org/apache/cassandra/service/replication/migration/KeyspaceMigrationInfo.java @@ -19,6 +19,7 @@ package org.apache.cassandra.service.replication.migration; import java.io.IOException; +import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.List; @@ -36,7 +37,9 @@ import org.apache.cassandra.dht.Token; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.schema.KeyspaceMetadata; import org.apache.cassandra.schema.TableId; +import 
org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.tcm.serialization.MetadataSerializer; import org.apache.cassandra.tcm.serialization.Version; @@ -246,6 +249,77 @@ public boolean shouldUseTrackedForWrites(boolean isTracked, TableId tableId, Tok return isTracked || isTokenInPendingRange(tableId, token); } + /** + * Check whether any of the specified tables have the given ranges in their migration pending set. + * During migration, reads for pending ranges continue to use the untracked read path with blocking + * read repair, so it is safe to exclude mutation tracking streaming for those ranges. However, + * ranges must be either entirely inside or entirely outside the pending set for each table. + * + * @param ranges the normalized ranges to check + * @param tables the tables to check against + * @return true if any of the specified tables have these ranges pending migration + * @throws IllegalStateException if ranges partially overlap with pending ranges for any table + */ + public boolean areRangesPendingMigration(@Nonnull NormalizedRanges ranges, + @Nonnull Iterable tables) + { + boolean anyPending = false; + for (TableMetadata table : tables) + { + NormalizedRanges pendingRanges = getPendingRangesForTable(table.id); + if (pendingRanges.isEmpty()) + continue; + + NormalizedRanges overlap = pendingRanges.intersection(ranges); + if (overlap.isEmpty()) + continue; + + // Some ranges overlap with pending — verify ALL ranges are pending for this table + NormalizedRanges outside = ranges.subtract(pendingRanges); + if (!outside.isEmpty()) + throw new IllegalStateException(String.format( + "Ranges for keyspace %s partially overlap with migration pending ranges for table %s. 
" + + "Ranges must be entirely inside or entirely outside the pending set.", + keyspace, table.name)); + + anyPending = true; + } + return anyPending; + } + + /** + * Convenience overload that resolves column family names to table metadata before checking. + * If columnFamilies is empty, all tables in the keyspace are checked. + * + * @param ranges the normalized ranges to check + * @param ksm the keyspace metadata for resolving table names + * @param columnFamilies specific table names to check, or empty for all tables + * @return true if any of the specified tables have these ranges pending migration + * @throws IllegalStateException if ranges partially overlap with pending ranges for any table + */ + public boolean areRangesPendingMigration(@Nonnull NormalizedRanges ranges, + @Nonnull KeyspaceMetadata ksm, + @Nonnull Collection columnFamilies) + { + Iterable tables; + if (!columnFamilies.isEmpty()) + { + List tableList = new ArrayList<>(columnFamilies.size()); + for (String cf : columnFamilies) + { + TableMetadata table = ksm.tables.getNullable(cf); + if (table != null) + tableList.add(table); + } + tables = tableList; + } + else + { + tables = ksm.tables; + } + return areRangesPendingMigration(ranges, tables); + } + @Override public String toString() { diff --git a/src/java/org/apache/cassandra/streaming/StreamPlan.java b/src/java/org/apache/cassandra/streaming/StreamPlan.java index 8fa5a9a2314a..c426681e1e97 100644 --- a/src/java/org/apache/cassandra/streaming/StreamPlan.java +++ b/src/java/org/apache/cassandra/streaming/StreamPlan.java @@ -21,12 +21,16 @@ import com.google.common.annotations.VisibleForTesting; +import org.apache.cassandra.dht.NormalizedRanges; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.locator.RangesAtEndpoint; import org.apache.cassandra.locator.Replica; import org.apache.cassandra.replication.ReconciledKeyspaceOffsets; 
import org.apache.cassandra.schema.KeyspaceMetadata; import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.replication.migration.KeyspaceMigrationInfo; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.utils.TimeUUID; @@ -116,7 +120,13 @@ public StreamPlan requestRanges(InetAddressAndPort from, String keyspace, Ranges // TODO: add flag for fully reconciled data only if this is for a tracked keyspace session.addStreamRequest(keyspace, fullRanges, transientRanges, Arrays.asList(columnFamilies)); +<<<<<<< HEAD if (includeMutationLogs(keyspace, session)) +======= + // Automatically include mutation logs for tracked keyspaces + if (shouldIncludeMutationTracking(keyspace, columnFamilies, fullRanges, transientRanges)) + { +>>>>>>> d5846f54d0 (During mutation tracking migration restrict repairs to being either entirely inside migrated ranges or entirely outside, but not both) session.addMutationLogRequest(keyspace, fullRanges, transientRanges); return this; @@ -134,7 +144,13 @@ public StreamPlan requestRanges(InetAddressAndPort from, String keyspace, Ranges public StreamPlan transferRanges(InetAddressAndPort to, String keyspace, RangesAtEndpoint replicas, String... columnFamilies) { StreamSession session = coordinator.getOrCreateOutboundSession(to); +<<<<<<< HEAD ReconciledKeyspaceOffsets reconciledKeyspaceOffsets = includeMutationLogs(keyspace, session) +======= + + // Automatically include mutation logs for tracked keyspaces + ReconciledKeyspaceOffsets reconciledKeyspaceOffsets = shouldIncludeMutationTracking(keyspace, columnFamilies, replicas) +>>>>>>> d5846f54d0 (During mutation tracking migration restrict repairs to being either entirely inside migrated ranges or entirely outside, but not both) ? 
session.addMutationLogTransfer(keyspace, replicas) : null; @@ -272,13 +288,40 @@ public static boolean hasAccordTables(KeyspaceMetadata ksm) } /** - * Check if the given keyspace uses tracked replication, which requires mutation log streaming. + * Whether mutation tracking streaming (mutation log transfers) should be included for the given + * keyspace and ranges. Returns true only when the keyspace uses tracked replication AND the + * ranges have completed migration (are not in the pending set). During migration, ranges still + * in the pending set use normal streaming without mutation tracking. + * + * This is safe because during migration (both to and from mutation tracking), reads for pending + * ranges continue to use the untracked read path with blocking read repair, which ensures + * consistency without requiring mutation log streaming for those ranges. * * @param keyspace the keyspace name - * @return true if the keyspace uses tracked replication + * @param columnFamilies the specific tables being streamed; empty means all tables + * @param allRanges the ranges being streamed + * @throws IllegalStateException if ranges partially overlap with pending migration ranges for any involved table */ - private boolean isTrackedReplicationEnabled(String keyspace) + private boolean shouldIncludeMutationTracking(String keyspace, String[] columnFamilies, RangesAtEndpoint... 
allRanges) { - return ClusterMetadata.current().schema.getKeyspaceMetadata(keyspace).useMutationTracking(); + ClusterMetadata metadata = ClusterMetadata.current(); + KeyspaceMetadata ksm = metadata.schema.getKeyspaceMetadata(keyspace); + if (!ksm.useMutationTracking()) + return false; + + KeyspaceMigrationInfo migrationInfo = metadata.mutationTrackingMigrationState.getKeyspaceInfo(keyspace); + if (migrationInfo == null) + return true; + + List> rangeList = new ArrayList<>(); + for (RangesAtEndpoint ranges : allRanges) + for (Replica r : ranges) + rangeList.add(r.range()); + + if (rangeList.isEmpty()) + return false; + + NormalizedRanges streamRanges = NormalizedRanges.normalizedRanges(rangeList); + return !migrationInfo.areRangesPendingMigration(streamRanges, ksm, Arrays.asList(columnFamilies)); } } From b4b06b3af11e9cb8750e75d1583750ca1b090684 Mon Sep 17 00:00:00 2001 From: Ariel Weisberg Date: Tue, 17 Mar 2026 14:58:48 -0400 Subject: [PATCH 26/46] Spurious error trying to set failure on promise when it has already been completed, use tryFailure instead --- .../org/apache/cassandra/service/ActiveRepairService.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/java/org/apache/cassandra/service/ActiveRepairService.java b/src/java/org/apache/cassandra/service/ActiveRepairService.java index 47687551cfd7..7872d207da88 100644 --- a/src/java/org/apache/cassandra/service/ActiveRepairService.java +++ b/src/java/org/apache/cassandra/service/ActiveRepairService.java @@ -780,7 +780,7 @@ public void onFailure(InetAddressAndPort from, RequestFailure failure) if (failure.reason == RequestFailureReason.TIMEOUT) { pending.set(-1); - promise.setFailure(failRepairException(parentRepairSession, "Did not get replies from all endpoints.")); + promise.tryFailure(failRepairException(parentRepairSession, "Did not get replies from all endpoints.")); } else { @@ -798,7 +798,7 @@ private void ack() } else { - 
promise.setFailure(failRepairException(parentRepairSession, "Got negative replies from endpoints " + failedNodes)); + promise.tryFailure(failRepairException(parentRepairSession, "Got negative replies from endpoints " + failedNodes)); } } } From 3f6f640021eca18068650b0c4b335bfcbbb456c4 Mon Sep 17 00:00:00 2001 From: Ariel Weisberg Date: Tue, 17 Mar 2026 16:05:48 -0400 Subject: [PATCH 27/46] Allow read repair mutations to be applied during migration --- .../db/AbstractMutationVerbHandler.java | 7 +++++ .../db/CassandraKeyspaceWriteHandler.java | 3 +- .../org/apache/cassandra/db/Keyspace.java | 4 +-- .../org/apache/cassandra/db/Mutation.java | 23 +++++++++++++-- .../cassandra/db/ReadRepairVerbHandler.java | 29 +++---------------- .../repair/RepairMessageVerbHandler.java | 2 +- 6 files changed, 37 insertions(+), 31 deletions(-) diff --git a/src/java/org/apache/cassandra/db/AbstractMutationVerbHandler.java b/src/java/org/apache/cassandra/db/AbstractMutationVerbHandler.java index cb7fe9d8c358..e35b4c4dde96 100644 --- a/src/java/org/apache/cassandra/db/AbstractMutationVerbHandler.java +++ b/src/java/org/apache/cassandra/db/AbstractMutationVerbHandler.java @@ -202,6 +202,13 @@ else if (message.epoch().isBefore(metadata.schema.lastModified())) */ protected ClusterMetadata checkReplicationMigration(ClusterMetadata metadata, Message message, InetAddressAndPort respondTo) { + // Read repair mutations always bypass mutation tracking and use the untracked + // write path, so skip the replication migration routing check. The isReadRepair + // flag on the mutation hasn't been set yet at this point — it's set later in + // applyMutation() — so we check the handler type instead. + if (this instanceof ReadRepairVerbHandler) + return metadata; + IMutation mutation = message.payload; MutationRouting expected = mutation.id().isNone() ? 
MutationRouting.UNTRACKED : MutationRouting.TRACKED; if (expected == MigrationRouter.getMutationRouting(metadata, mutation)) diff --git a/src/java/org/apache/cassandra/db/CassandraKeyspaceWriteHandler.java b/src/java/org/apache/cassandra/db/CassandraKeyspaceWriteHandler.java index 9b68acc708e1..0648233ba34a 100644 --- a/src/java/org/apache/cassandra/db/CassandraKeyspaceWriteHandler.java +++ b/src/java/org/apache/cassandra/db/CassandraKeyspaceWriteHandler.java @@ -47,7 +47,8 @@ public WriteContext beginWrite(Mutation mutation, boolean makeDurable) throws Re { group = Keyspace.writeOrder.start(); - MigrationRouter.validateUntrackedMutation(mutation); + if (!mutation.isReadRepair()) + MigrationRouter.validateUntrackedMutation(mutation); // write the mutation to the commitlog and memtables CommitLogPosition position = null; if (makeDurable) diff --git a/src/java/org/apache/cassandra/db/Keyspace.java b/src/java/org/apache/cassandra/db/Keyspace.java index 5e5f412a9b46..92e633bb1bca 100644 --- a/src/java/org/apache/cassandra/db/Keyspace.java +++ b/src/java/org/apache/cassandra/db/Keyspace.java @@ -437,7 +437,7 @@ public void apply(final Mutation mutation, boolean updateIndexes, boolean isDroppable) { - if (MigrationRouter.isFullyTracked(mutation)) + if (MigrationRouter.isFullyTracked(mutation) && !mutation.isReadRepair()) applyInternalTracked(mutation, null); else applyInternal(mutation, makeDurable, updateIndexes, isDroppable, false, null); @@ -460,7 +460,7 @@ private Future applyInternal(final Mutation mutation, boolean isDeferrable, Promise future) { - Preconditions.checkState(!getMetadata().useMutationTracking() && mutation.id().isNone()); + Preconditions.checkState((!getMetadata().useMutationTracking() || mutation.isReadRepair()) && mutation.id().isNone()); if (TEST_FAIL_WRITES && getMetadata().name.equals(TEST_FAIL_WRITES_KS)) throw new RuntimeException("Testing write failures"); diff --git a/src/java/org/apache/cassandra/db/Mutation.java 
b/src/java/org/apache/cassandra/db/Mutation.java index 1b2850bd58c1..4aa39c19a91d 100644 --- a/src/java/org/apache/cassandra/db/Mutation.java +++ b/src/java/org/apache/cassandra/db/Mutation.java @@ -108,6 +108,21 @@ public class Mutation implements IMutation, Supplier, Commitable // because it is being applied by one or in a context where transaction conflicts don't occur private PotentialTxnConflicts potentialTxnConflicts; + // Transient: not serialized on the wire. Set by ReadRepairVerbHandler on the + // receiving side so downstream code (Keyspace.apply, write handlers) can route + // read repair mutations through the untracked write path during migration. + private transient boolean isReadRepair; + + public void setReadRepair(boolean readRepair) + { + this.isReadRepair = readRepair; + } + + public boolean isReadRepair() + { + return isReadRepair; + } + public Mutation(MutationId id, PartitionUpdate update) { this(id, update.metadata().keyspace, update.partitionKey(), ImmutableMap.of(update.metadata().id, update), approxTime.now(), update.metadata().params.cdc, PotentialTxnConflicts.DISALLOW); @@ -148,7 +163,9 @@ public MutationId id() @Override public Mutation withMutationId(MutationId mutationId) { - return new Mutation(mutationId, keyspaceName, key, modifications, approxCreatedAtNanos, cdcEnabled, potentialTxnConflicts); + Mutation m = new Mutation(mutationId, keyspaceName, key, modifications, approxCreatedAtNanos, cdcEnabled, potentialTxnConflicts); + m.isReadRepair = this.isReadRepair; + return m; } private static boolean cdcEnabled(Iterable modifications) @@ -182,7 +199,9 @@ private static boolean cdcEnabled(Iterable modifications) Map updates = builder.build(); checkState(!updates.isEmpty(), "Updates should not be empty"); - return new Mutation(id, keyspaceName, key, builder.build(), approxCreatedAtNanos, potentialTxnConflicts); + Mutation result = new Mutation(id, keyspaceName, key, builder.build(), approxCreatedAtNanos, potentialTxnConflicts); + 
result.isReadRepair = this.isReadRepair; + return result; } public @Nullable Mutation without(TableId tableId) diff --git a/src/java/org/apache/cassandra/db/ReadRepairVerbHandler.java b/src/java/org/apache/cassandra/db/ReadRepairVerbHandler.java index 593b8a73f31c..fb9bff3bb1eb 100644 --- a/src/java/org/apache/cassandra/db/ReadRepairVerbHandler.java +++ b/src/java/org/apache/cassandra/db/ReadRepairVerbHandler.java @@ -17,14 +17,9 @@ */ package org.apache.cassandra.db; -import org.apache.cassandra.exceptions.RequestFailureReason; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.net.Message; import org.apache.cassandra.net.MessagingService; -import org.apache.cassandra.replication.TrackedWriteRequest; -import org.apache.cassandra.service.replication.migration.MigrationRouter; -import org.apache.cassandra.transport.Dispatcher; -import org.apache.cassandra.utils.concurrent.AsyncPromise; public class ReadRepairVerbHandler extends AbstractMutationVerbHandler { @@ -32,31 +27,15 @@ public class ReadRepairVerbHandler extends AbstractMutationVerbHandler public void applyMutation(Mutation mutation) { + mutation.setReadRepair(true); mutation.apply(); } void applyMutation(Message message, InetAddressAndPort respondToAddress) { Mutation mutation = message.payload; - if (MigrationRouter.isFullyTracked(mutation)) - { - // During migration, read repair mutations go through the tracked write coordinator path - // so they get an ID, are journaled, and are sent to all replicas with failure retries. - // Send the response asynchronously when local application completes to avoid blocking - // the mutation stage (which would deadlock since local apply is scheduled on the same stage). 
- AsyncPromise localApplication = new AsyncPromise<>(); - TrackedWriteRequest.perform(mutation, ConsistencyLevel.ANY, Dispatcher.RequestTime.forImmediateExecution(), localApplication); - localApplication.addCallback((success, failure) -> { - if (failure == null) - MessagingService.instance().send(message.emptyResponse(), respondToAddress); - else - MessagingService.instance().send(message.failureResponse(RequestFailureReason.UNKNOWN, failure), respondToAddress); - }); - } - else - { - mutation.apply(); - MessagingService.instance().send(message.emptyResponse(), respondToAddress); - } + mutation.setReadRepair(true); + mutation.apply(); + MessagingService.instance().send(message.emptyResponse(), respondToAddress); } } diff --git a/src/java/org/apache/cassandra/repair/RepairMessageVerbHandler.java b/src/java/org/apache/cassandra/repair/RepairMessageVerbHandler.java index 2060ec1badb0..32e6cae668a9 100644 --- a/src/java/org/apache/cassandra/repair/RepairMessageVerbHandler.java +++ b/src/java/org/apache/cassandra/repair/RepairMessageVerbHandler.java @@ -35,7 +35,7 @@ import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.net.IVerbHandler; import org.apache.cassandra.net.Message; -import org.apache.cassandra.net.Verb; + import org.apache.cassandra.replication.CoordinatorLogId; import org.apache.cassandra.replication.MutationTrackingService; import org.apache.cassandra.replication.Offsets; From 20c464ce847b4b705fce923e0cf6b11d8536a11a Mon Sep 17 00:00:00 2001 From: Ariel Weisberg Date: Wed, 18 Mar 2026 16:19:26 -0400 Subject: [PATCH 28/46] Finish MutationTrackingRepairTest --- .../repair/MutationTrackingRepairTest.java | 411 +++++++++++------- 1 file changed, 256 insertions(+), 155 deletions(-) diff --git a/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingRepairTest.java b/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingRepairTest.java index 420de9da7d96..77ddf5672427 100644 --- 
a/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingRepairTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingRepairTest.java @@ -19,6 +19,7 @@ import java.io.IOException; import java.net.UnknownHostException; +import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.List; import java.util.concurrent.ExecutorService; @@ -36,17 +37,23 @@ import org.apache.cassandra.distributed.Cluster; import org.apache.cassandra.distributed.api.ConsistencyLevel; import org.apache.cassandra.distributed.api.Feature; -import org.apache.cassandra.distributed.api.IMessageFilters; + import org.apache.cassandra.distributed.api.NodeToolResult; import org.apache.cassandra.distributed.test.TestBaseImpl; import org.apache.cassandra.gms.EndpointState; import org.apache.cassandra.gms.FailureDetector; import org.apache.cassandra.gms.Gossiper; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.net.Verb; import org.apache.cassandra.repair.MutationTrackingIncrementalRepairTask; +import org.apache.cassandra.service.StorageService; import org.apache.cassandra.service.replication.migration.KeyspaceMigrationInfo; import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.utils.ByteBufferUtil; import org.apache.cassandra.utils.FBUtilities; import static java.util.function.Predicate.not; @@ -63,6 +70,9 @@ */ public class MutationTrackingRepairTest extends TestBaseImpl { + private static final int NUM_NODES = 3; + private static final List ALL_NODES = List.of(1, 2, 3); + private static Cluster CLUSTER; private static ExecutorService executor; private static final AtomicInteger ksCounter = new AtomicInteger(); @@ -74,14 +84,14 @@ public static void setupCluster() throws IOException { 
executor = Executors.newCachedThreadPool(); CLUSTER = Cluster.build() - .withNodes(3) + .withNodes(NUM_NODES) .withConfig(cfg -> cfg.set("mutation_tracking_enabled", true) .set("mutation_tracking_sync_timeout", "10s") .set("request_timeout", "1000ms") .set("repair.retries.max_attempts", 10) .set("repair.retries.base_sleep_time", "100ms") .set("repair.retries.max_sleep_time", "500ms") - .with(Feature.GOSSIP)) + .with(Feature.GOSSIP, Feature.NETWORK)) .start(); } @@ -164,23 +174,57 @@ private void insertData(String tableName, int start, int count) } } - private void assertDataOnAllNodes(String tableName, int start, int count) + private void insertDataWithInconsistency(String tableName, int start, int count) + { + insertDataWithInconsistency(2, tableName, start, count); + } + + private void insertDataWithInconsistency(int isolatedNode, String tableName, int start, int count) + { + // Isolate a node so background reconciliation has some work to do + CLUSTER.filters().allVerbs().to(isolatedNode).drop(); + CLUSTER.filters().allVerbs().from(isolatedNode).drop(); + + for (int i = start; i < start + count; i++) + { + CLUSTER.coordinator(1).execute( + "INSERT INTO " + ksName + '.' + tableName + " (k, v) VALUES (?, ?)", + ConsistencyLevel.QUORUM, i, i); + } + + CLUSTER.filters().reset(); + + // Verify the isolated node is actually missing the data we just wrote + Object[][] results = CLUSTER.get(isolatedNode).executeInternal( + "SELECT k FROM " + ksName + '.' + tableName + " WHERE k >= ? AND k < ? 
ALLOW FILTERING", + start, start + count); + assertEquals("Node " + isolatedNode + " should not have data written while isolated", + 0, results.length); + } + + private void assertDataOnAllNodes(String tableName, List keys) { for (int node = 1; node <= CLUSTER.size(); node++) { - if (CLUSTER.get(node).isShutdown()) - continue; - for (int i = start; i < start + count; i++) + for (int key : keys) { Object[][] results = CLUSTER.get(node).executeInternal( - "SELECT k, v FROM " + ksName + '.' + tableName + " WHERE k = ?", i); - assertEquals("Node " + node + " missing row k=" + i, 1, results.length); - assertEquals(i, results[0][0]); - assertEquals(i, results[0][1]); + "SELECT k, v FROM " + ksName + '.' + tableName + " WHERE k = ?", key); + assertEquals("Node " + node + " missing row k=" + key, 1, results.length); + assertEquals(key, results[0][0]); + assertEquals(key, results[0][1]); } } } + private void assertDataOnAllNodes(String tableName, int start, int count) + { + List keys = new ArrayList<>(count); + for (int i = start; i < start + count; i++) + keys.add(i); + assertDataOnAllNodes(tableName, keys); + } + private NodeToolResult nodetoolRepair(int node, String... args) { String[] cmd = new String[args.length + 1]; @@ -189,15 +233,6 @@ private NodeToolResult nodetoolRepair(int node, String... args) return CLUSTER.get(node).nodetoolResult(cmd); } - private List getLiveNodes() - { - List nodes = new ArrayList<>(); - for (int i = 1; i <= CLUSTER.size(); i++) - if (!CLUSTER.get(i).isShutdown()) - nodes.add(i); - return nodes; - } - private List repairConcurrently(List nodes, String... args) { List> futures = new ArrayList<>(); @@ -241,18 +276,37 @@ private String[] withPR(String... args) return result; } - private void repairAllSuccess(String... args) + private void repairResolvingInconsistency(String... 
args) throws Exception { - String[] prArgs = withPR(args); - List liveNodes = getLiveNodes(); - assertAllSuccess(repairConcurrently(liveNodes, prArgs)); - // Run a second time to make sure repair can be run multiple times without failing - assertAllSuccess(repairConcurrently(liveNodes, prArgs)); + repairResolvingInconsistency(2, ALL_NODES, withPR(args)); } - private void repairAllFailure(String... args) + private void repairResolvingInconsistency(int isolatedNode, List nodes, String... args) throws Exception { - assertAllFailure(repairConcurrently(getLiveNodes(), withPR(args))); + // Dropping messages is to check that repair retries messages if needed + CLUSTER.filters().allVerbs().to(isolatedNode).drop(); + CLUSTER.filters().allVerbs().from(isolatedNode).drop(); + + List> futures = new ArrayList<>(); + for (int node : nodes) + { + int n = node; + futures.add(executor.submit(() -> nodetoolRepair(n, args))); + } + + Thread.sleep(2000); + assertTrue("Repair should be blocked while node " + isolatedNode + " is isolated", + futures.stream().allMatch(not(Future::isDone))); + + CLUSTER.filters().reset(); + + List results = new ArrayList<>(); + for (Future f : futures) + results.add(f.get(30, TimeUnit.SECONDS)); + assertAllSuccess(results); + + // Run a second time to make sure repair can be run multiple times without failing + assertAllSuccess(repairConcurrently(nodes, args)); } private void repairFromNodesSuccess(List nodes, String... args) @@ -281,6 +335,41 @@ private boolean isMigrationComplete() }); } + /** + * Get the primary token range for a node as [start, end] token values. + * With SimpleStrategy RF=3 and 3 nodes, each node has exactly one primary range. 
+ */ + private long[] getPrimaryRangeTokens(int node) + { + String ks = ksName; + return CLUSTER.get(node).callOnInstance(() -> { + var ranges = StorageService.instance.getPrimaryRanges(ks); + assertEquals(1, ranges.size()); + Range range = ranges.iterator().next(); + return new long[]{ + ((Murmur3Partitioner.LongToken) range.left).token, + ((Murmur3Partitioner.LongToken) range.right).token + }; + }); + } + + /** + * Compute which integer keys from [start, start+count) hash into the given token range. + */ + private List keysInTokenRange(int start, int count, long rangeStart, long rangeEnd) + { + Range range = new Range<>(new Murmur3Partitioner.LongToken(rangeStart), + new Murmur3Partitioner.LongToken(rangeEnd)); + List keys = new ArrayList<>(); + for (int i = start; i < start + count; i++) + { + Token token = Murmur3Partitioner.instance.getToken(ByteBufferUtil.bytes(i)); + if (range.contains(token)) + keys.add(i); + } + return keys; + } + private String getBroadcastAddress(int node) { return CLUSTER.get(node).callOnInstance(() -> FBUtilities.getBroadcastAddressAndPort().getHostAddressAndPort()); @@ -310,80 +399,51 @@ private void isolateNode(int nodeToIsolate, int... 
observerNodes) } @Test - public void testBasicRepairHappyPath() + public void testBasicRepairHappyPath() throws Exception { - insertData("tbl", 0, 100); + insertDataWithInconsistency("tbl", 0, 100); - repairAllSuccess(ksName); + repairResolvingInconsistency(ksName); assertDataOnAllNodes("tbl", 0, 100); } @Test - public void testRepairSpecificTable() + public void testRepairSpecificTable() throws Exception { createTable("tbl1"); createTable("tbl2"); - insertData("tbl1", 0, 50); - insertData("tbl2", 100, 50); - - repairAllSuccess(ksName, "tbl1"); - + // Repair only tbl1 + insertDataWithInconsistency("tbl1", 0, 50); + repairResolvingInconsistency(ksName, "tbl1"); assertDataOnAllNodes("tbl1", 0, 50); - assertDataOnAllNodes("tbl2", 100, 50); - } - - @Test - public void testRepairConvergesInconsistentReplicas() throws Exception - { - CLUSTER.filters().allVerbs().from(2).drop(); - CLUSTER.filters().allVerbs().to(2).drop(); - - for (int i = 0; i < 50; i++) - { - CLUSTER.coordinator(1).execute( - "INSERT INTO " + ksName + ".tbl (k, v) VALUES (?, ?)", - ConsistencyLevel.QUORUM, i, i); - } - - Object[][] results = CLUSTER.get(2).executeInternal( - "SELECT k FROM " + ksName + ".tbl"); - assertEquals("Node 2 should not have data while isolated", 0, results.length); - - // Start -pr repair from all nodes in background — will get stuck waiting for node 2 - String[] prArgs = withPR(ksName); - List> futures = new ArrayList<>(); - for (int node = 1; node <= CLUSTER.size(); node++) - { - int n = node; - futures.add(executor.submit(() -> nodetoolRepair(n, prArgs))); - } - - Thread.sleep(2000); - assertTrue(futures.stream().allMatch(not(Future::isDone))); - CLUSTER.filters().reset(); - List repairResults = new ArrayList<>(); - for (Future f : futures) - repairResults.add(f.get(30, TimeUnit.SECONDS)); - assertAllSuccess(repairResults); + // Repair only tbl2 while tbl1 already has repaired data + insertDataWithInconsistency("tbl2", 0, 50); + repairResolvingInconsistency(ksName, "tbl2"); 
+ assertDataOnAllNodes("tbl2", 0, 50); - assertDataOnAllNodes("tbl", 0, 50); + // Repair both tables together + insertDataWithInconsistency("tbl1", 50, 50); + insertDataWithInconsistency("tbl2", 50, 50); + repairResolvingInconsistency(ksName, "tbl1", "tbl2"); + assertDataOnAllNodes("tbl1", 0, 100); + assertDataOnAllNodes("tbl2", 0, 100); } @Test - public void testRepairMultipleTables() + public void testRepairAllTables() throws Exception { createTable("tbl1"); createTable("tbl2"); createTable("tbl3"); - insertData("tbl1", 0, 30); - insertData("tbl2", 100, 30); - insertData("tbl3", 200, 30); + insertDataWithInconsistency("tbl1", 0, 30); + insertDataWithInconsistency("tbl2", 100, 30); + insertDataWithInconsistency("tbl3", 200, 30); - repairAllSuccess(ksName); + repairResolvingInconsistency(ksName); assertDataOnAllNodes("tbl1", 0, 30); assertDataOnAllNodes("tbl2", 100, 30); @@ -393,7 +453,7 @@ public void testRepairMultipleTables() @Test public void testForceRepairWithNodeDown() { - insertData("tbl", 0, 50); + insertDataWithInconsistency(3, "tbl", 0, 50); isolateNode(2, 1, 3); @@ -414,11 +474,11 @@ public void testForceRepairWithNodeDown() } @Test - public void testForceRepairWithAllNodesUp() + public void testForceRepairWithAllNodesUp() throws Exception { - insertData("tbl", 0, 50); + insertDataWithInconsistency("tbl", 0, 50); - repairAllSuccess(ksName, "--force"); + repairResolvingInconsistency(ksName, "--force"); assertDataOnAllNodes("tbl", 0, 50); } @@ -426,40 +486,67 @@ public void testForceRepairWithAllNodesUp() @Test public void testRepairWithSpecificHosts() { - insertData("tbl", 0, 50); - String addr1 = getBroadcastAddress(1); String addr3 = getBroadcastAddress(3); - // -pr is incompatible with --in-hosts - List nodes = List.of(1, 3); - assertAllSuccess(repairConcurrently(nodes, ksName, "--in-hosts", addr1 + ',' + addr3)); - assertAllSuccess(repairConcurrently(nodes, ksName, "--in-hosts", addr1 + ',' + addr3)); + insertDataWithInconsistency(3, "tbl", 0, 50); + 
+ // Node 2 is down, so normal repair should fail + isolateNode(2, 1, 3); + + List liveNodes = List.of(1, 3); + assertAllFailure(repairConcurrently(liveNodes, withPR(ksName))); + + // Repair with --in-hosts scoped to only the live nodes should succeed + // Note: --in-hosts cannot be combined with -pr + String[] args = new String[]{ ksName, "--in-hosts", addr1 + ',' + addr3 }; + assertAllSuccess(repairConcurrently(liveNodes, args)); + assertAllSuccess(repairConcurrently(liveNodes, args)); + + for (int node : liveNodes) + { + for (int i = 0; i < 50; i++) + { + Object[][] results = CLUSTER.get(node).executeInternal( + "SELECT k, v FROM " + ksName + ".tbl WHERE k = ?", i); + assertEquals("Node " + node + " missing row k=" + i, 1, results.length); + } + } } @Test - public void testMigrationUntrackedToTrackedCompletesViaRepair() + public void testMigrationUntrackedToTrackedCompletesViaRepair() throws Exception { setupUntracked(); - insertData("tbl", 0, 100); + insertDataWithInconsistency("tbl", 0, 100); alterKeyspaceToTracked(); assertTrue("Migration should be in progress after ALTER", isMigrationInProgress()); - repairAllSuccess(ksName); + repairResolvingInconsistency(ksName); assertTrue("Migration should complete after repair", isMigrationComplete()); assertDataOnAllNodes("tbl", 0, 100); } @Test - public void testDataAccessibleDuringMigration() + public void testDataAccessibleDuringMigrationToTracked() throws Exception { setupUntracked(); + dataAccessibleDuringMigration(() -> alterKeyspaceToTracked()); + } - insertData("tbl", 0, 50); + @Test + public void testDataAccessibleDuringMigrationToUntracked() throws Exception + { + dataAccessibleDuringMigration(() -> alterKeyspaceToUntracked()); + } - alterKeyspaceToTracked(); + private void dataAccessibleDuringMigration(Runnable alterKeyspace) throws Exception + { + insertDataWithInconsistency("tbl", 0, 50); + + alterKeyspace.run(); Object[][] results = CLUSTER.coordinator(1).execute( "SELECT k, v FROM " + ksName + ".tbl", 
ConsistencyLevel.ALL); @@ -471,7 +558,7 @@ public void testDataAccessibleDuringMigration() "SELECT k, v FROM " + ksName + ".tbl", ConsistencyLevel.ALL); assertEquals("All data should be readable during migration", 100, results.length); - repairAllSuccess(ksName); + repairResolvingInconsistency(ksName); assertTrue("Migration should complete after repair", isMigrationComplete()); results = CLUSTER.coordinator(1).execute( @@ -487,37 +574,24 @@ public void testDataAccessibleDuringMigration() } @Test - public void testMigrationTrackedToUntrackedCompletesViaRepair() + public void testMigrationTrackedToUntrackedCompletesViaRepair() throws Exception { - insertData("tbl", 0, 100); + insertDataWithInconsistency("tbl", 0, 100); alterKeyspaceToUntracked(); assertTrue("Migration should be in progress after ALTER", isMigrationInProgress()); - repairAllSuccess(ksName); + repairResolvingInconsistency(ksName); assertTrue("Migration should complete after repair", isMigrationComplete()); assertDataOnAllNodes("tbl", 0, 100); } - @Test - public void testRepairAdvancesMigrationState() - { - setupUntracked(); - insertData("tbl", 0, 100); - - alterKeyspaceToTracked(); - assertTrue("Full ring should be pending", isMigrationInProgress()); - - repairAllSuccess(ksName); - assertTrue("Migration should complete after repair", isMigrationComplete()); - } - @Test public void testForceRepairWithDeadNodeDoesNotAdvanceMigration() { setupUntracked(); - insertData("tbl", 0, 50); + insertDataWithInconsistency(3, "tbl", 0, 50); alterKeyspaceToTracked(); assertTrue("Migration should be in progress", isMigrationInProgress()); @@ -527,7 +601,6 @@ public void testForceRepairWithDeadNodeDoesNotAdvanceMigration() List liveNodes = List.of(1, 3); repairFromNodesSuccess(liveNodes, ksName, "--force"); - // deadNodesExcluded=true makes the result INELIGIBLE to advance migration String ks = ksName; assertTrue("Migration should not advance with dead nodes excluded", CLUSTER.get(1).callOnInstance(() -> { @@ -538,44 
+611,82 @@ public void testForceRepairWithDeadNodeDoesNotAdvanceMigration() } @Test - public void testPreviewRepairDoesNotAdvanceMigration() + public void testPreviewRepairDoesNotAdvanceMigration() throws Exception { setupUntracked(); - insertData("tbl", 0, 50); + insertDataWithInconsistency("tbl", 0, 50); alterKeyspaceToTracked(); assertTrue("Migration should be in progress", isMigrationInProgress()); - repairAllSuccess(ksName, "--preview"); + repairResolvingInconsistency(ksName, "--preview"); assertTrue("Migration should not advance with preview repair", isMigrationInProgress()); } @Test - public void testSubrangeRepair() + public void testSubrangeRepair() throws Exception { - insertData("tbl", 0, 100); + long[] primaryRange = getPrimaryRangeTokens(1); + String st = Long.toString(primaryRange[0]); + String et = Long.toString(primaryRange[1]); + + insertDataWithInconsistency("tbl", 0, 100); + + repairResolvingInconsistency(2, ALL_NODES, ksName, "-st", st, "-et", et); - List liveNodes = getLiveNodes(); - assertAllSuccess(repairConcurrently(liveNodes, ksName, "-st", "0", "-et", "1000000000")); - assertAllSuccess(repairConcurrently(liveNodes, ksName, "-st", "0", "-et", "1000000000")); + List keysInRange = keysInTokenRange(0, 100, primaryRange[0], primaryRange[1]); + assertFalse("Should have keys hashing into node 1's primary range", keysInRange.isEmpty()); + + assertDataOnAllNodes("tbl", keysInRange); } @Test - public void testSubrangeRepairAdvancesMigrationOnlyForSpecifiedRange() + public void testSubrangeRepairAdvancesMigrationOnlyForSpecifiedRange() throws Exception { setupUntracked(); - insertData("tbl", 0, 100); + long[] primaryRange = getPrimaryRangeTokens(1); + String st = Long.toString(primaryRange[0]); + String et = Long.toString(primaryRange[1]); + + insertDataWithInconsistency("tbl", 0, 100); alterKeyspaceToTracked(); assertTrue("Full ring should be pending", isMigrationInProgress()); - List liveNodes = getLiveNodes(); - 
assertAllSuccess(repairConcurrently(liveNodes, ksName, "-st", "0", "-et", "1000000000")); - assertAllSuccess(repairConcurrently(liveNodes, ksName, "-st", "0", "-et", "1000000000")); + // During migration, subrange repair uses incremental repair. Running from all nodes + // on the same subrange causes anti-compaction conflicts, so repair from a single node. + repairResolvingInconsistency(2, List.of(1), ksName, "-st", st, "-et", et); assertTrue("Migration should not be complete after subrange repair", isMigrationInProgress()); + + // Verify the repaired range is no longer pending but other ranges still are + String ks = ksName; + long rangeStart = primaryRange[0]; + long rangeEnd = primaryRange[1]; + CLUSTER.get(1).runOnInstance(() -> { + ClusterMetadata metadata = ClusterMetadata.current(); + KeyspaceMigrationInfo info = metadata.mutationTrackingMigrationState.getKeyspaceInfo(ks); + assertNotNull("Migration info should still exist", info); + + Range repairedRange = new Range<>(new Murmur3Partitioner.LongToken(rangeStart), + new Murmur3Partitioner.LongToken(rangeEnd)); + for (var entry : info.pendingRangesPerTable.entrySet()) + { + for (Range pending : entry.getValue()) + { + assertFalse("Repaired range should not overlap with pending ranges for table " + entry.getKey(), + repairedRange.intersects(pending)); + } + } + }); + + // Verify all keys in the repaired range are present on all nodes + List keysInRange = keysInTokenRange(0, 100, primaryRange[0], primaryRange[1]); + assertFalse("Should have keys hashing into node 1's primary range", keysInRange.isEmpty()); + + assertDataOnAllNodes("tbl", keysInRange); } @Test @@ -583,39 +694,29 @@ public void testRepairTimeout() { insertData("tbl", 0, 50); - IMessageFilters.Filter filter = CLUSTER.filters() - .verbs(Verb.MT_SYNC_REQ.id) - .drop(); - try - { - repairAllFailure(ksName); - } - finally - { - filter.off(); - } + CLUSTER.filters().allVerbs().to(2).drop(); + CLUSTER.filters().allVerbs().from(2).drop(); + + List 
results = repairConcurrently(ALL_NODES, withPR(ksName)); + assertAllFailure(results); + for (NodeToolResult r : results) + assertTrue("Expected timeout error but got: " + r.getStderr(), + r.getStderr().contains("Did not get replies from all endpoints")); } @Test - public void testWriteDuringMigrationPreservedAfterCompletion() + public void testRepairSyncTimeout() { - setupUntracked(); - - insertData("tbl", 0, 50); - - alterKeyspaceToTracked(); + insertDataWithInconsistency("tbl", 0, 50); - insertData("tbl", 50, 50); - - repairAllSuccess(ksName); - assertTrue("Migration should complete after repair", isMigrationComplete()); - - insertData("tbl", 100, 50); + // Drop only offset broadcasts so MT_SYNC_REQ/RSP can succeed but + // reconciliation never completes, triggering mutation_tracking_sync_timeout + CLUSTER.filters().verbs(Verb.BROADCAST_LOG_OFFSETS.id).drop(); - assertDataOnAllNodes("tbl", 0, 150); - - Object[][] results = CLUSTER.coordinator(1).execute( - "SELECT k, v FROM " + ksName + ".tbl", ConsistencyLevel.ALL); - assertEquals("All 150 rows should be readable at CL.ALL", 150, results.length); + List results = repairConcurrently(ALL_NODES, withPR(ksName)); + assertAllFailure(results); + for (NodeToolResult r : results) + assertTrue("Expected sync timeout error but got: " + r.getStderr(), + r.getStderr().contains("Mutation tracking sync timed out")); } } \ No newline at end of file From 04c49bad7f6177c51c393890b753425fc32cc430 Mon Sep 17 00:00:00 2001 From: Ariel Weisberg Date: Thu, 19 Mar 2026 13:18:10 -0400 Subject: [PATCH 29/46] Final test and bug fixes after rebase --- .../io/sstable/format/SSTableWriter.java | 10 +- .../cassandra/streaming/StreamPlan.java | 51 +-- ...tionTrackingIncrementalRepairTaskTest.java | 5 +- .../repair/MutationTrackingRepairTest.java | 363 +++++++++--------- ...ckingMutationVerbHandlerMigrationTest.java | 1 - 5 files changed, 189 insertions(+), 241 deletions(-) diff --git 
a/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java index 9706cc8b8754..cab0c54a29a6 100644 --- a/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java +++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java @@ -38,11 +38,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.cassandra.replication.ImmutableCoordinatorLogOffsets; -import org.apache.cassandra.replication.MutationTrackingService; -import org.apache.cassandra.service.ActiveRepairService; -import org.apache.cassandra.tcm.ClusterMetadata; -import org.apache.cassandra.utils.Clock; import org.apache.cassandra.db.DecoratedKey; import org.apache.cassandra.db.SerializationHeader; import org.apache.cassandra.db.compaction.OperationType; @@ -64,6 +59,11 @@ import org.apache.cassandra.io.sstable.metadata.MetadataType; import org.apache.cassandra.io.sstable.metadata.StatsMetadata; import org.apache.cassandra.io.util.MmappedRegionsCache; +import org.apache.cassandra.replication.ImmutableCoordinatorLogOffsets; +import org.apache.cassandra.replication.MutationTrackingService; +import org.apache.cassandra.service.ActiveRepairService; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.utils.Clock; import org.apache.cassandra.utils.Throwables; import org.apache.cassandra.utils.TimeUUID; import org.apache.cassandra.utils.concurrent.Transactional; diff --git a/src/java/org/apache/cassandra/streaming/StreamPlan.java b/src/java/org/apache/cassandra/streaming/StreamPlan.java index c426681e1e97..8fa5a9a2314a 100644 --- a/src/java/org/apache/cassandra/streaming/StreamPlan.java +++ b/src/java/org/apache/cassandra/streaming/StreamPlan.java @@ -21,16 +21,12 @@ import com.google.common.annotations.VisibleForTesting; -import org.apache.cassandra.dht.NormalizedRanges; -import org.apache.cassandra.dht.Range; -import org.apache.cassandra.dht.Token; import 
org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.locator.RangesAtEndpoint; import org.apache.cassandra.locator.Replica; import org.apache.cassandra.replication.ReconciledKeyspaceOffsets; import org.apache.cassandra.schema.KeyspaceMetadata; import org.apache.cassandra.schema.TableMetadata; -import org.apache.cassandra.service.replication.migration.KeyspaceMigrationInfo; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.utils.TimeUUID; @@ -120,13 +116,7 @@ public StreamPlan requestRanges(InetAddressAndPort from, String keyspace, Ranges // TODO: add flag for fully reconciled data only if this is for a tracked keyspace session.addStreamRequest(keyspace, fullRanges, transientRanges, Arrays.asList(columnFamilies)); -<<<<<<< HEAD if (includeMutationLogs(keyspace, session)) -======= - // Automatically include mutation logs for tracked keyspaces - if (shouldIncludeMutationTracking(keyspace, columnFamilies, fullRanges, transientRanges)) - { ->>>>>>> d5846f54d0 (During mutation tracking migration restrict repairs to being either entirely inside migrated ranges or entirely outside, but not both) session.addMutationLogRequest(keyspace, fullRanges, transientRanges); return this; @@ -144,13 +134,7 @@ >>>>>>> d5846f54d0 (During mutation tracking migration restrict repairs to being public StreamPlan transferRanges(InetAddressAndPort to, String keyspace, RangesAtEndpoint replicas, String... columnFamilies) { StreamSession session = coordinator.getOrCreateOutboundSession(to); -<<<<<<< HEAD ReconciledKeyspaceOffsets reconciledKeyspaceOffsets = includeMutationLogs(keyspace, session) -======= - - // Automatically include mutation logs for tracked keyspaces - ReconciledKeyspaceOffsets reconciledKeyspaceOffsets = shouldIncludeMutationTracking(keyspace, columnFamilies, replicas) ->>>>>>> d5846f54d0 (During mutation tracking migration restrict repairs to being either entirely inside migrated ranges or entirely outside, but not both) ? 
session.addMutationLogTransfer(keyspace, replicas) : null; @@ -288,40 +272,13 @@ public static boolean hasAccordTables(KeyspaceMetadata ksm) } /** - * Whether mutation tracking streaming (mutation log transfers) should be included for the given - * keyspace and ranges. Returns true only when the keyspace uses tracked replication AND the - * ranges have completed migration (are not in the pending set). During migration, ranges still - * in the pending set use normal streaming without mutation tracking. - * - * This is safe because during migration (both to and from mutation tracking), reads for pending - * ranges continue to use the untracked read path with blocking read repair, which ensures - * consistency without requiring mutation log streaming for those ranges. + * Check if the given keyspace uses tracked replication, which requires mutation log streaming. * * @param keyspace the keyspace name - * @param columnFamilies the specific tables being streamed; empty means all tables - * @param allRanges the ranges being streamed - * @throws IllegalStateException if ranges partially overlap with pending migration ranges for any involved table + * @return true if the keyspace uses tracked replication */ - private boolean shouldIncludeMutationTracking(String keyspace, String[] columnFamilies, RangesAtEndpoint... 
allRanges) + private boolean isTrackedReplicationEnabled(String keyspace) { - ClusterMetadata metadata = ClusterMetadata.current(); - KeyspaceMetadata ksm = metadata.schema.getKeyspaceMetadata(keyspace); - if (!ksm.useMutationTracking()) - return false; - - KeyspaceMigrationInfo migrationInfo = metadata.mutationTrackingMigrationState.getKeyspaceInfo(keyspace); - if (migrationInfo == null) - return true; - - List> rangeList = new ArrayList<>(); - for (RangesAtEndpoint ranges : allRanges) - for (Replica r : ranges) - rangeList.add(r.range()); - - if (rangeList.isEmpty()) - return false; - - NormalizedRanges streamRanges = NormalizedRanges.normalizedRanges(rangeList); - return !migrationInfo.areRangesPendingMigration(streamRanges, ksm, Arrays.asList(columnFamilies)); + return ClusterMetadata.current().schema.getKeyspaceMetadata(keyspace).useMutationTracking(); } } diff --git a/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingIncrementalRepairTaskTest.java b/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingIncrementalRepairTaskTest.java index 052c1869ca7b..b93481f04a75 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingIncrementalRepairTaskTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingIncrementalRepairTaskTest.java @@ -48,8 +48,7 @@ public static void setupCluster() throws IOException { CLUSTER = Cluster.build() .withNodes(3) - .withConfig(cfg -> cfg.with(Feature.NETWORK, Feature.GOSSIP) - .set("mutation_tracking_enabled", true)) + .withConfig(cfg -> cfg.with(Feature.NETWORK, Feature.GOSSIP)) .start(); } @@ -197,7 +196,7 @@ public void testMigrationFromTrackedToUntracked() throws Throwable ClusterMetadata metadata = ClusterMetadata.current(); return MutationTrackingIncrementalRepairTask.shouldUseMutationTrackingRepair(metadata, ksName); }); - assertTrue("Keyspace migrating from tracked should still use mutation tracking 
repair", shouldUseAfter); + assertFalse("Keyspace migrating from tracked should not use mutation tracking repair", shouldUseAfter); Boolean migrationAfter = CLUSTER.get(1).callOnInstance(() -> { ClusterMetadata metadata = ClusterMetadata.current(); diff --git a/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingRepairTest.java b/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingRepairTest.java index 77ddf5672427..9920c0b73193 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingRepairTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingRepairTest.java @@ -19,7 +19,6 @@ import java.io.IOException; import java.net.UnknownHostException; -import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.List; import java.util.concurrent.ExecutorService; @@ -43,7 +42,6 @@ import org.apache.cassandra.gms.EndpointState; import org.apache.cassandra.gms.FailureDetector; import org.apache.cassandra.gms.Gossiper; -import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.dht.Murmur3Partitioner; import org.apache.cassandra.dht.Range; import org.apache.cassandra.dht.Token; @@ -68,8 +66,7 @@ * tries to look up the dropped keyspace's metadata). The keyspaces are cleaned up when * the cluster is closed at the end of the test class. 
*/ -public class MutationTrackingRepairTest extends TestBaseImpl -{ +public class MutationTrackingRepairTest extends TestBaseImpl { private static final int NUM_NODES = 3; private static final List ALL_NODES = List.of(1, 2, 3); @@ -80,53 +77,45 @@ public class MutationTrackingRepairTest extends TestBaseImpl private String ksName; @BeforeClass - public static void setupCluster() throws IOException - { + public static void setupCluster() throws IOException { executor = Executors.newCachedThreadPool(); CLUSTER = Cluster.build() - .withNodes(NUM_NODES) - .withConfig(cfg -> cfg.set("mutation_tracking_enabled", true) - .set("mutation_tracking_sync_timeout", "10s") - .set("request_timeout", "1000ms") - .set("repair.retries.max_attempts", 10) - .set("repair.retries.base_sleep_time", "100ms") - .set("repair.retries.max_sleep_time", "500ms") - .with(Feature.GOSSIP, Feature.NETWORK)) - .start(); + .withNodes(NUM_NODES) + .withConfig(cfg -> cfg.set("mutation_tracking_sync_timeout", "10s") + .set("request_timeout", "1000ms") + .set("repair.retries.max_attempts", 10) + .set("repair.retries.base_sleep_time", "100ms") + .set("repair.retries.max_sleep_time", "500ms") + .with(Feature.GOSSIP, Feature.NETWORK)) + .start(); } @AfterClass - public static void teardownCluster() - { + public static void teardownCluster() { executor.shutdownNow(); if (CLUSTER != null) CLUSTER.close(); } @Before - public void setUp() - { + public void setUp() { ksName = "mt_repair_" + ksCounter.incrementAndGet(); CLUSTER.schemaChange("CREATE KEYSPACE " + ksName + " WITH replication = " + - "{'class': 'SimpleStrategy', 'replication_factor': 3} " + - "AND replication_type='tracked'"); + "{'class': 'SimpleStrategy', 'replication_factor': 3} " + + "AND replication_type='tracked'"); CLUSTER.schemaChange("CREATE TABLE " + ksName + ".tbl (k int PRIMARY KEY, v int)"); } @After - public void tearDown() - { + public void tearDown() { CLUSTER.filters().reset(); - for (int i = 1; i <= CLUSTER.size(); i++) - { + for 
(int i = 1; i <= CLUSTER.size(); i++) { CLUSTER.get(i).runOnInstance(() -> { Gossiper.runInGossipStageBlocking(() -> { - for (var entry : Gossiper.instance.endpointStateMap.entrySet()) - { + for (var entry : Gossiper.instance.endpointStateMap.entrySet()) { InetAddressAndPort ep = entry.getKey(); EndpointState state = entry.getValue(); - if (!ep.equals(FBUtilities.getBroadcastAddressAndPort()) && !state.isAlive()) - { + if (!ep.equals(FBUtilities.getBroadcastAddressAndPort()) && !state.isAlive()) { FailureDetector.instance.report(ep); Gossiper.instance.realMarkAlive(ep, state); } @@ -136,80 +125,68 @@ public void tearDown() } } - private void setupUntracked() - { + private void setupUntracked() { ksName = "mt_repair_" + ksCounter.incrementAndGet(); CLUSTER.schemaChange("CREATE KEYSPACE " + ksName + " WITH replication = " + - "{'class': 'SimpleStrategy', 'replication_factor': 3} " + - "AND replication_type='untracked'"); + "{'class': 'SimpleStrategy', 'replication_factor': 3} " + + "AND replication_type='untracked'"); CLUSTER.schemaChange("CREATE TABLE " + ksName + ".tbl (k int PRIMARY KEY, v int)"); } - private void createTable(String tableName) - { + private void createTable(String tableName) { CLUSTER.schemaChange("CREATE TABLE " + ksName + '.' 
+ tableName + " (k int PRIMARY KEY, v int)"); } - private void alterKeyspaceToTracked() - { + private void alterKeyspaceToTracked() { CLUSTER.schemaChange("ALTER KEYSPACE " + ksName + " WITH replication = " + - "{'class': 'SimpleStrategy', 'replication_factor': 3} " + - "AND replication_type='tracked'"); + "{'class': 'SimpleStrategy', 'replication_factor': 3} " + + "AND replication_type='tracked'"); } - private void alterKeyspaceToUntracked() - { + private void alterKeyspaceToUntracked() { CLUSTER.schemaChange("ALTER KEYSPACE " + ksName + " WITH replication = " + - "{'class': 'SimpleStrategy', 'replication_factor': 3} " + - "AND replication_type='untracked'"); + "{'class': 'SimpleStrategy', 'replication_factor': 3} " + + "AND replication_type='untracked'"); } - private void insertData(String tableName, int start, int count) - { - for (int i = start; i < start + count; i++) - { + private void insertData(String tableName, int start, int count) { + for (int i = start; i < start + count; i++) { CLUSTER.coordinator(1).execute( - "INSERT INTO " + ksName + '.' + tableName + " (k, v) VALUES (?, ?)", - ConsistencyLevel.ALL, i, i); + "INSERT INTO " + ksName + '.' 
+ tableName + " (k, v) VALUES (?, ?)", + ConsistencyLevel.ALL, i, i); } } - private void insertDataWithInconsistency(String tableName, int start, int count) - { + private void insertDataWithInconsistency(String tableName, int start, int count) { insertDataWithInconsistency(2, tableName, start, count); } - private void insertDataWithInconsistency(int isolatedNode, String tableName, int start, int count) - { + private void insertDataWithInconsistency(int isolatedNode, String tableName, int start, int count) { // Isolate a node so background reconcilation has some work to do CLUSTER.filters().allVerbs().to(isolatedNode).drop(); CLUSTER.filters().allVerbs().from(isolatedNode).drop(); - for (int i = start; i < start + count; i++) - { + for (int i = start; i < start + count; i++) { CLUSTER.coordinator(1).execute( - "INSERT INTO " + ksName + '.' + tableName + " (k, v) VALUES (?, ?)", - ConsistencyLevel.QUORUM, i, i); + "INSERT INTO " + ksName + '.' + tableName + " (k, v) VALUES (?, ?)", + ConsistencyLevel.QUORUM, i, i); } CLUSTER.filters().reset(); // Verify the isolated node is actually missing the data we just wrote Object[][] results = CLUSTER.get(isolatedNode).executeInternal( - "SELECT k FROM " + ksName + '.' + tableName + " WHERE k >= ? AND k < ? ALLOW FILTERING", - start, start + count); + "SELECT k FROM " + ksName + '.' + tableName + " WHERE k >= ? AND k < ? ALLOW FILTERING", + start, start + count); assertEquals("Node " + isolatedNode + " should not have data written while isolated", - 0, results.length); + 0, results.length); } - private void assertDataOnAllNodes(String tableName, List keys) - { - for (int node = 1; node <= CLUSTER.size(); node++) - { - for (int key : keys) - { + private void assertDataOnAllNodes(String tableName, List keys) { + for (int node = 1; node <= CLUSTER.size(); node++) { + for (int key : keys) { Object[][] results = CLUSTER.get(node).executeInternal( - "SELECT k, v FROM " + ksName + '.' 
+ tableName + " WHERE k = ?", key); + "SELECT k, v FROM " + ksName + '.' + tableName + " WHERE k = ?", key); assertEquals("Node " + node + " missing row k=" + key, 1, results.length); assertEquals(key, results[0][0]); assertEquals(key, results[0][1]); @@ -217,86 +194,72 @@ private void assertDataOnAllNodes(String tableName, List keys) } } - private void assertDataOnAllNodes(String tableName, int start, int count) - { + private void assertDataOnAllNodes(String tableName, int start, int count) { List keys = new ArrayList<>(count); for (int i = start; i < start + count; i++) keys.add(i); assertDataOnAllNodes(tableName, keys); } - private NodeToolResult nodetoolRepair(int node, String... args) - { + private NodeToolResult nodetoolRepair(int node, String... args) { String[] cmd = new String[args.length + 1]; cmd[0] = "repair"; System.arraycopy(args, 0, cmd, 1, args.length); return CLUSTER.get(node).nodetoolResult(cmd); } - private List repairConcurrently(List nodes, String... args) - { + private List repairConcurrently(List nodes, String... args) { List> futures = new ArrayList<>(); - for (int node : nodes) - { + for (int node : nodes) { int n = node; futures.add(executor.submit(() -> nodetoolRepair(n, args))); } List results = new ArrayList<>(); - for (Future f : futures) - { - try - { + for (Future f : futures) { + try { results.add(f.get(60, TimeUnit.SECONDS)); - } - catch (Exception e) - { + } catch (Exception e) { throw new RuntimeException("Repair future failed", e); } } return results; } - private void assertAllSuccess(List results) - { + private void assertAllSuccess(List results) { for (NodeToolResult r : results) r.asserts().success(); } - private void assertAllFailure(List results) - { + private void assertAllFailure(List results) { for (NodeToolResult r : results) r.asserts().failure(); } - private String[] withPR(String... args) - { + private String[] withPR(String... 
args) { String[] result = new String[args.length + 1]; System.arraycopy(args, 0, result, 0, args.length); result[args.length] = "-pr"; return result; } - private void repairResolvingInconsistency(String... args) throws Exception - { + private void repairResolvingInconsistency(String... args) throws Exception { repairResolvingInconsistency(2, ALL_NODES, withPR(args)); } - private void repairResolvingInconsistency(int isolatedNode, List nodes, String... args) throws Exception - { + private void repairResolvingInconsistency(int isolatedNode, List nodes, String... args) throws Exception { // Dropping messages is to check that repair retries messages if needed CLUSTER.filters().allVerbs().to(isolatedNode).drop(); CLUSTER.filters().allVerbs().from(isolatedNode).drop(); List> futures = new ArrayList<>(); - for (int node : nodes) - { + for (int node : nodes) { int n = node; futures.add(executor.submit(() -> nodetoolRepair(n, args))); } Thread.sleep(2000); assertTrue("Repair should be blocked while node " + isolatedNode + " is isolated", - futures.stream().allMatch(not(Future::isDone))); + futures.stream().allMatch(not(Future::isDone))); CLUSTER.filters().reset(); @@ -309,15 +272,13 @@ private void repairResolvingInconsistency(int isolatedNode, List nodes, assertAllSuccess(repairConcurrently(nodes, args)); } - private void repairFromNodesSuccess(List nodes, String... args) - { + private void repairFromNodesSuccess(List nodes, String... 
args) { String[] prArgs = withPR(args); assertAllSuccess(repairConcurrently(nodes, prArgs)); assertAllSuccess(repairConcurrently(nodes, prArgs)); } - private boolean isMigrationInProgress() - { + private boolean isMigrationInProgress() { String ks = ksName; return CLUSTER.get(1).callOnInstance(() -> { ClusterMetadata metadata = ClusterMetadata.current(); @@ -325,8 +286,7 @@ private boolean isMigrationInProgress() }); } - private boolean isMigrationComplete() - { + private boolean isMigrationComplete() { String ks = ksName; return CLUSTER.get(1).callOnInstance(() -> { ClusterMetadata metadata = ClusterMetadata.current(); @@ -339,16 +299,15 @@ private boolean isMigrationComplete() * Get the primary token range for a node as [start, end] token values. * With SimpleStrategy RF=3 and 3 nodes, each node has exactly one primary range. */ - private long[] getPrimaryRangeTokens(int node) - { + private long[] getPrimaryRangeTokens(int node) { String ks = ksName; return CLUSTER.get(node).callOnInstance(() -> { var ranges = StorageService.instance.getPrimaryRanges(ks); assertEquals(1, ranges.size()); Range range = ranges.iterator().next(); return new long[]{ - ((Murmur3Partitioner.LongToken) range.left).token, - ((Murmur3Partitioner.LongToken) range.right).token + ((Murmur3Partitioner.LongToken) range.left).token, + ((Murmur3Partitioner.LongToken) range.right).token }; }); } @@ -356,13 +315,11 @@ private long[] getPrimaryRangeTokens(int node) /** * Compute which integer keys from [start, start+count) hash into the given token range. 
*/ - private List keysInTokenRange(int start, int count, long rangeStart, long rangeEnd) - { + private List keysInTokenRange(int start, int count, long rangeStart, long rangeEnd) { Range range = new Range<>(new Murmur3Partitioner.LongToken(rangeStart), - new Murmur3Partitioner.LongToken(rangeEnd)); + new Murmur3Partitioner.LongToken(rangeEnd)); List keys = new ArrayList<>(); - for (int i = start; i < start + count; i++) - { + for (int i = start; i < start + count; i++) { Token token = Murmur3Partitioner.instance.getToken(ByteBufferUtil.bytes(i)); if (range.contains(token)) keys.add(i); @@ -370,28 +327,22 @@ private List keysInTokenRange(int start, int count, long rangeStart, lo return keys; } - private String getBroadcastAddress(int node) - { + private String getBroadcastAddress(int node) { return CLUSTER.get(node).callOnInstance(() -> FBUtilities.getBroadcastAddressAndPort().getHostAddressAndPort()); } - private void isolateNode(int nodeToIsolate, int... observerNodes) - { + private void isolateNode(int nodeToIsolate, int... observerNodes) { CLUSTER.filters().allVerbs().from(nodeToIsolate).drop(); CLUSTER.filters().allVerbs().to(nodeToIsolate).drop(); String isolatedAddress = CLUSTER.get(nodeToIsolate).callOnInstance( - () -> FBUtilities.getBroadcastAddressAndPort().getHostAddressAndPort()); - for (int observer : observerNodes) - { + () -> FBUtilities.getBroadcastAddressAndPort().getHostAddressAndPort()); + for (int observer : observerNodes) { CLUSTER.get(observer).runOnInstance(() -> { - try - { + try { InetAddressAndPort neighbor = InetAddressAndPort.getByName(isolatedAddress); FailureDetector.instance.forceConviction(neighbor); - } - catch (UnknownHostException e) - { + } catch (UnknownHostException e) { throw new RuntimeException(e); } }); @@ -399,8 +350,7 @@ private void isolateNode(int nodeToIsolate, int... 
observerNodes) } @Test - public void testBasicRepairHappyPath() throws Exception - { + public void testBasicRepairHappyPath() throws Exception { insertDataWithInconsistency("tbl", 0, 100); repairResolvingInconsistency(ksName); @@ -409,8 +359,7 @@ public void testBasicRepairHappyPath() throws Exception } @Test - public void testRepairSpecificTable() throws Exception - { + public void testRepairSpecificTable() throws Exception { createTable("tbl1"); createTable("tbl2"); @@ -433,8 +382,7 @@ public void testRepairSpecificTable() throws Exception } @Test - public void testRepairAllTables() throws Exception - { + public void testRepairAllTables() throws Exception { createTable("tbl1"); createTable("tbl2"); createTable("tbl3"); @@ -451,8 +399,7 @@ public void testRepairAllTables() throws Exception } @Test - public void testForceRepairWithNodeDown() - { + public void testForceRepairWithNodeDown() { insertDataWithInconsistency(3, "tbl", 0, 50); isolateNode(2, 1, 3); @@ -462,20 +409,17 @@ public void testForceRepairWithNodeDown() repairFromNodesSuccess(liveNodes, ksName, "--force"); - for (int node : liveNodes) - { - for (int i = 0; i < 50; i++) - { + for (int node : liveNodes) { + for (int i = 0; i < 50; i++) { Object[][] results = CLUSTER.get(node).executeInternal( - "SELECT k, v FROM " + ksName + ".tbl WHERE k = ?", i); + "SELECT k, v FROM " + ksName + ".tbl WHERE k = ?", i); assertEquals("Node " + node + " missing row k=" + i, 1, results.length); } } } @Test - public void testForceRepairWithAllNodesUp() throws Exception - { + public void testForceRepairWithAllNodesUp() throws Exception { insertDataWithInconsistency("tbl", 0, 50); repairResolvingInconsistency(ksName, "--force"); @@ -484,8 +428,7 @@ public void testForceRepairWithAllNodesUp() throws Exception } @Test - public void testRepairWithSpecificHosts() - { + public void testRepairWithSpecificHosts() { String addr1 = getBroadcastAddress(1); String addr3 = getBroadcastAddress(3); @@ -499,24 +442,21 @@ public void 
testRepairWithSpecificHosts() // Repair with --in-hosts scoped to only the live nodes should succeed // Note: --in-hosts cannot be combined with -pr - String[] args = new String[]{ ksName, "--in-hosts", addr1 + ',' + addr3 }; + String[] args = new String[]{ksName, "--in-hosts", addr1 + ',' + addr3}; assertAllSuccess(repairConcurrently(liveNodes, args)); assertAllSuccess(repairConcurrently(liveNodes, args)); - for (int node : liveNodes) - { - for (int i = 0; i < 50; i++) - { + for (int node : liveNodes) { + for (int i = 0; i < 50; i++) { Object[][] results = CLUSTER.get(node).executeInternal( - "SELECT k, v FROM " + ksName + ".tbl WHERE k = ?", i); + "SELECT k, v FROM " + ksName + ".tbl WHERE k = ?", i); assertEquals("Node " + node + " missing row k=" + i, 1, results.length); } } } @Test - public void testMigrationUntrackedToTrackedCompletesViaRepair() throws Exception - { + public void testMigrationUntrackedToTrackedCompletesViaRepair() throws Exception { setupUntracked(); insertDataWithInconsistency("tbl", 0, 100); @@ -530,52 +470,48 @@ public void testMigrationUntrackedToTrackedCompletesViaRepair() throws Exception } @Test - public void testDataAccessibleDuringMigrationToTracked() throws Exception - { + public void testDataAccessibleDuringMigrationToTracked() throws Exception { setupUntracked(); dataAccessibleDuringMigration(() -> alterKeyspaceToTracked()); } @Test - public void testDataAccessibleDuringMigrationToUntracked() throws Exception - { + public void testDataAccessibleDuringMigrationToUntracked() throws Exception { dataAccessibleDuringMigration(() -> alterKeyspaceToUntracked()); } - private void dataAccessibleDuringMigration(Runnable alterKeyspace) throws Exception - { + private void dataAccessibleDuringMigration(Runnable alterKeyspace) throws Exception { insertDataWithInconsistency("tbl", 0, 50); alterKeyspace.run(); Object[][] results = CLUSTER.coordinator(1).execute( - "SELECT k, v FROM " + ksName + ".tbl", ConsistencyLevel.ALL); + "SELECT k, v FROM " 
+ ksName + ".tbl", ConsistencyLevel.ALL); assertEquals("Pre-migration data should be readable", 50, results.length); insertData("tbl", 50, 50); results = CLUSTER.coordinator(1).execute( - "SELECT k, v FROM " + ksName + ".tbl", ConsistencyLevel.ALL); + "SELECT k, v FROM " + ksName + ".tbl", ConsistencyLevel.ALL); assertEquals("All data should be readable during migration", 100, results.length); repairResolvingInconsistency(ksName); assertTrue("Migration should complete after repair", isMigrationComplete()); results = CLUSTER.coordinator(1).execute( - "SELECT k, v FROM " + ksName + ".tbl", ConsistencyLevel.ALL); + "SELECT k, v FROM " + ksName + ".tbl", ConsistencyLevel.ALL); assertEquals("All data should be readable after migration", 100, results.length); insertData("tbl", 100, 50); results = CLUSTER.coordinator(1).execute( - "SELECT k, v FROM " + ksName + ".tbl", ConsistencyLevel.ALL); + "SELECT k, v FROM " + ksName + ".tbl", ConsistencyLevel.ALL); assertEquals("All data including post-migration should be readable", 150, results.length); assertDataOnAllNodes("tbl", 0, 150); } @Test - public void testMigrationTrackedToUntrackedCompletesViaRepair() throws Exception - { + public void testMigrationTrackedToUntrackedCompletesViaRepair() throws Exception { insertDataWithInconsistency("tbl", 0, 100); alterKeyspaceToUntracked(); @@ -588,8 +524,7 @@ public void testMigrationTrackedToUntrackedCompletesViaRepair() throws Exception } @Test - public void testForceRepairWithDeadNodeDoesNotAdvanceMigration() - { + public void testForceRepairWithDeadNodeDoesNotAdvanceMigration() { setupUntracked(); insertDataWithInconsistency(3, "tbl", 0, 50); @@ -603,16 +538,15 @@ public void testForceRepairWithDeadNodeDoesNotAdvanceMigration() String ks = ksName; assertTrue("Migration should not advance with dead nodes excluded", - CLUSTER.get(1).callOnInstance(() -> { - ClusterMetadata metadata = ClusterMetadata.current(); - KeyspaceMigrationInfo info = 
metadata.mutationTrackingMigrationState.getKeyspaceInfo(ks); - return info != null; - })); + CLUSTER.get(1).callOnInstance(() -> { + ClusterMetadata metadata = ClusterMetadata.current(); + KeyspaceMigrationInfo info = metadata.mutationTrackingMigrationState.getKeyspaceInfo(ks); + return info != null; + })); } @Test - public void testPreviewRepairDoesNotAdvanceMigration() throws Exception - { + public void testPreviewRepairDoesNotAdvanceMigration() throws Exception { setupUntracked(); insertDataWithInconsistency("tbl", 0, 50); @@ -625,8 +559,7 @@ public void testPreviewRepairDoesNotAdvanceMigration() throws Exception } @Test - public void testSubrangeRepair() throws Exception - { + public void testSubrangeRepair() throws Exception { long[] primaryRange = getPrimaryRangeTokens(1); String st = Long.toString(primaryRange[0]); String et = Long.toString(primaryRange[1]); @@ -642,8 +575,7 @@ public void testSubrangeRepair() throws Exception } @Test - public void testSubrangeRepairAdvancesMigrationOnlyForSpecifiedRange() throws Exception - { + public void testSubrangeRepairAdvancesMigrationOnlyForSpecifiedRange() throws Exception { setupUntracked(); long[] primaryRange = getPrimaryRangeTokens(1); String st = Long.toString(primaryRange[0]); @@ -659,7 +591,7 @@ public void testSubrangeRepairAdvancesMigrationOnlyForSpecifiedRange() throws Ex repairResolvingInconsistency(2, List.of(1), ksName, "-st", st, "-et", et); assertTrue("Migration should not be complete after subrange repair", - isMigrationInProgress()); + isMigrationInProgress()); // Verify the repaired range is no longer pending but other ranges still are String ks = ksName; @@ -671,13 +603,11 @@ public void testSubrangeRepairAdvancesMigrationOnlyForSpecifiedRange() throws Ex assertNotNull("Migration info should still exist", info); Range repairedRange = new Range<>(new Murmur3Partitioner.LongToken(rangeStart), - new Murmur3Partitioner.LongToken(rangeEnd)); - for (var entry : info.pendingRangesPerTable.entrySet()) - { 
- for (Range pending : entry.getValue()) - { + new Murmur3Partitioner.LongToken(rangeEnd)); + for (var entry : info.pendingRangesPerTable.entrySet()) { + for (Range pending : entry.getValue()) { assertFalse("Repaired range should not overlap with pending ranges for table " + entry.getKey(), - repairedRange.intersects(pending)); + repairedRange.intersects(pending)); } } }); @@ -690,8 +620,34 @@ public void testSubrangeRepairAdvancesMigrationOnlyForSpecifiedRange() throws Ex } @Test - public void testRepairTimeout() - { + public void testRepairRejectsMixedMigratedAndPendingRanges() { + setupUntracked(); + insertData("tbl", 0, 50); + + alterKeyspaceToTracked(); + assertTrue("Migration should be in progress after ALTER", isMigrationInProgress()); + + long[] primaryRange = getPrimaryRangeTokens(1); + String st = Long.toString(primaryRange[0]); + String et = Long.toString(primaryRange[1]); + + // Repair node 1's primary range to advance migration for that subrange only. + // Run from single node to avoid anti-compaction conflicts during migration IR. + nodetoolRepair(1, ksName, "-st", st, "-et", et).asserts().success(); + + // Now attempt a repair with a range that straddles the migrated/pending boundary. + // Node 1's primary range has been repaired (no longer pending), but the range + // immediately after is still pending. A range spanning both should be rejected. 
+ String straddleSt = Long.toString(primaryRange[1] - 1); + String straddleEt = Long.toString(primaryRange[1] + 1000); + NodeToolResult result = nodetoolRepair(1, ksName, "-st", straddleSt, "-et", straddleEt); + result.asserts().failure(); + assertTrue("Expected partial overlap error but got: " + result.getStderr(), + result.getStderr().contains("partially overlap with migration pending ranges")); + } + + @Test + public void testRepairTimeout() { insertData("tbl", 0, 50); CLUSTER.filters().allVerbs().to(2).drop(); @@ -701,12 +657,49 @@ public void testRepairTimeout() assertAllFailure(results); for (NodeToolResult r : results) assertTrue("Expected timeout error but got: " + r.getStderr(), - r.getStderr().contains("Did not get replies from all endpoints")); + r.getStderr().contains("Did not get replies from all endpoints")); + } + + /** + * During migration from untracked to tracked, incremental repair runs anti-compaction + * on SSTables that were written before tracking was enabled. When an SSTable partially + * overlaps the repair range, anti-compaction must split it by rewriting through + * SSTableWriter. The "inside repair range" writer gets pendingRepair set to the session ID. + * + * SSTableWriter.finalizeMetadata() must tolerate pendingRepair being set on a tracked + * table during migration. This test uses a narrow subrange to force anti-compaction to + * split SSTables (rather than just mutating fully-contained ones in place). + */ + @Test + public void testMigrationSubrangeRepairAntiCompactionSplitsSSTables() throws Exception { + setupUntracked(); + + // Write data and flush so SSTables span the full token ring on each node. + insertData("tbl", 0, 500); + for (int i = 1; i <= NUM_NODES; i++) + CLUSTER.get(i).flush(ksName); + + alterKeyspaceToTracked(); + assertTrue("Migration should be in progress", isMigrationInProgress()); + + // Use a subrange that's well within one local range but wide enough to contain + // data. 
With 3 nodes, node 2's primary range is approximately + // (-3074457345618258603, 3074457345618258602]. A range of (0, 3000000000000000000] + // is fully contained in that range and covers ~16% of the ring, so ~80 of our 500 + // rows should hash into it. SSTables from the flush span the entire ring, so they + // will NOT be fully contained in this narrow range. Anti-compaction must split them + // via SSTableWriter, exercising the pendingRepair code path in finalizeMetadata(). + String st = "0"; + String et = "3000000000000000000"; + + // Run repair from a single node to avoid anti-compaction conflicts. + // This should succeed: anti-compaction splits SSTables and the repair completes. + NodeToolResult result = nodetoolRepair(1, ksName, "-st", st, "-et", et); + result.asserts().success(); } @Test - public void testRepairSyncTimeout() - { + public void testRepairSyncTimeout() { insertDataWithInconsistency("tbl", 0, 50); // Drop only offset broadcasts so MT_SYNC_REQ/RSP can succeed but @@ -717,6 +710,6 @@ public void testRepairSyncTimeout() assertAllFailure(results); for (NodeToolResult r : results) assertTrue("Expected sync timeout error but got: " + r.getStderr(), - r.getStderr().contains("Mutation tracking sync timed out")); + r.getStderr().contains("Mutation tracking sync timed out")); } } \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/db/MutationTrackingMutationVerbHandlerMigrationTest.java b/test/unit/org/apache/cassandra/db/MutationTrackingMutationVerbHandlerMigrationTest.java index 47d9e81fb49d..05b19292ea1c 100644 --- a/test/unit/org/apache/cassandra/db/MutationTrackingMutationVerbHandlerMigrationTest.java +++ b/test/unit/org/apache/cassandra/db/MutationTrackingMutationVerbHandlerMigrationTest.java @@ -24,7 +24,6 @@ import org.junit.Test; import org.apache.cassandra.ServerTestUtils; -import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.ReadCommand.PotentialTxnConflicts; import 
org.apache.cassandra.db.marshal.Int32Type; import org.apache.cassandra.db.partitions.PartitionUpdate; From 15a98da77b6c1ceb95355c3809fbde5aad242430 Mon Sep 17 00:00:00 2001 From: Ariel Weisberg Date: Mon, 23 Mar 2026 16:57:30 -0400 Subject: [PATCH 30/46] A bunch of self review changes/improvements --- .../config/CassandraRelevantProperties.java | 1 - .../cassandra/db/ReadRepairVerbHandler.java | 4 +- .../cassandra/db/lifecycle/Tracker.java | 8 ++- .../db/streaming/CassandraStreamReceiver.java | 18 +++++- src/java/org/apache/cassandra/net/Verb.java | 8 +-- ...MutationTrackingIncrementalRepairTask.java | 14 +++-- .../cassandra/repair/RepairCoordinator.java | 4 +- .../apache/cassandra/repair/RepairJob.java | 24 ++++++- .../repair/RepairMessageVerbHandler.java | 3 - .../MutationTrackingSyncResponse.java | 60 +++--------------- .../replication/BroadcastLogOffsets.java | 6 -- .../MutationTrackingSyncCoordinator.java | 13 ++-- .../replication/Node2OffsetsMap.java | 3 + .../apache/cassandra/replication/Shard.java | 38 +++++------ .../cassandra/service/StorageService.java | 2 +- .../migration/KeyspaceMigrationInfo.java | 63 ++++++++++++++----- .../utils/CollectionSerializers.java | 24 +++++++ .../repair/MutationTrackingRepairTest.java | 30 ++++++++- 18 files changed, 193 insertions(+), 130 deletions(-) diff --git a/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java b/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java index dfd7d0744326..8c0c08bf59cc 100644 --- a/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java +++ b/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java @@ -447,7 +447,6 @@ public enum CassandraRelevantProperties REPAIR_FAIL_TIMEOUT_SECONDS("cassandra.repair_fail_timeout_seconds", convertToString(Ints.checkedCast(TimeUnit.DAYS.toSeconds(1)))), REPAIR_MUTATION_REPAIR_ROWS_PER_BATCH("cassandra.repair.mutation_repair_rows_per_batch", "100"), 
REPAIR_STATUS_CHECK_TIMEOUT_SECONDS("cassandra.repair_status_check_timeout_seconds", convertToString(Ints.checkedCast(TimeUnit.HOURS.toSeconds(1)))), - REPAIR_SYNC_TIMEOUT_MINUTES("cassandra.repair_sync_timeout_minutes", "30"), /** * When doing a host replacement its possible that the gossip state is "empty" meaning that the endpoint is known * but the current state isn't known. If the host replacement is needed to repair this state, this property must diff --git a/src/java/org/apache/cassandra/db/ReadRepairVerbHandler.java b/src/java/org/apache/cassandra/db/ReadRepairVerbHandler.java index fb9bff3bb1eb..fc2287a96731 100644 --- a/src/java/org/apache/cassandra/db/ReadRepairVerbHandler.java +++ b/src/java/org/apache/cassandra/db/ReadRepairVerbHandler.java @@ -33,9 +33,7 @@ public void applyMutation(Mutation mutation) void applyMutation(Message message, InetAddressAndPort respondToAddress) { - Mutation mutation = message.payload; - mutation.setReadRepair(true); - mutation.apply(); + applyMutation(message.payload); MessagingService.instance().send(message.emptyResponse(), respondToAddress); } } diff --git a/src/java/org/apache/cassandra/db/lifecycle/Tracker.java b/src/java/org/apache/cassandra/db/lifecycle/Tracker.java index 3f299c029a1e..36b08c1877f8 100644 --- a/src/java/org/apache/cassandra/db/lifecycle/Tracker.java +++ b/src/java/org/apache/cassandra/db/lifecycle/Tracker.java @@ -63,6 +63,7 @@ import org.apache.cassandra.notifications.TablePreScrubNotification; import org.apache.cassandra.notifications.TruncationNotification; import org.apache.cassandra.replication.ImmutableCoordinatorLogOffsets; +import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.utils.Pair; import org.apache.cassandra.utils.Throwables; import org.apache.cassandra.utils.TimeUUID; @@ -272,10 +273,11 @@ public void updateInitialSSTableSize(Iterable sstables) public void addSSTables(Collection sstables) { - // TODO (REVIEW): Is there another reason why it's not ok for IR to 
do this during migration? @Blake Eggleston - // Note: tracked tables may legitimately use this path during migration from untracked to tracked, + // Tracked tables may legitimately use this path during migration from untracked to tracked, // when incremental repair streams SSTables that were written before tracking was enabled. - // Preconditions.checkState(!cfstore.metadata().replicationType().isTracked()); + Preconditions.checkState(!cfstore.metadata().replicationType().isTracked() + || ClusterMetadata.current().mutationTrackingMigrationState + .getKeyspaceInfo(cfstore.metadata().keyspace) != null); addSSTablesInternal(sstables, false, true, true); } diff --git a/src/java/org/apache/cassandra/db/streaming/CassandraStreamReceiver.java b/src/java/org/apache/cassandra/db/streaming/CassandraStreamReceiver.java index 362bf385c639..f0e43054a5fb 100644 --- a/src/java/org/apache/cassandra/db/streaming/CassandraStreamReceiver.java +++ b/src/java/org/apache/cassandra/db/streaming/CassandraStreamReceiver.java @@ -53,6 +53,7 @@ import org.apache.cassandra.service.accord.IAccordService; import org.apache.cassandra.service.accord.TimeOnlyRequestBookkeeping.LatencyRequestBookkeeping; import org.apache.cassandra.replication.MutationTrackingService; +import org.apache.cassandra.service.replication.migration.KeyspaceMigrationInfo; import org.apache.cassandra.replication.PendingLocalTransfer; import org.apache.cassandra.streaming.IncomingStream; import org.apache.cassandra.streaming.StreamReceiver; @@ -104,6 +105,19 @@ public CassandraStreamReceiver(ColumnFamilyStore cfs, StreamSession session, Lis this.requiresWritePath = requiresWritePath(cfs); } + /** + * Whether this stream should use the tracked transfer path (pending until activation). + * Returns false during mutation tracking migration for ranges that are still pending, + * since migration repair uses the untracked streaming path for those ranges. 
+ */ + private boolean useTrackedTransferPath() + { + if (!cfs.metadata().replicationType().isTracked() || !session.streamOperation().isTrackable()) + return false; + + return KeyspaceMigrationInfo.shouldUseTrackedTransfers(ClusterMetadata.current(), cfs.getKeyspaceName(), cfs.metadata().id, ranges); + } + public static CassandraStreamReceiver fromReceiver(StreamReceiver receiver) { Preconditions.checkArgument(receiver instanceof CassandraStreamReceiver); @@ -135,7 +149,7 @@ public synchronized void received(IncomingStream stream) sstables.addAll(finished); receivedEntireSSTable = file.isEntireSSTable(); - if (cfs.metadata().replicationType().isTracked() && session.streamOperation().isTrackable()) + if (useTrackedTransferPath()) { PendingLocalTransfer transfer = new PendingLocalTransfer(cfs.metadata().id, session.planId(), sstables); MutationTrackingService.instance().received(transfer); @@ -266,7 +280,7 @@ public void finished() logger.debug("[Stream #{}] Received {} sstables from {} ({})", session.planId(), readers.size(), session.peer, readers); // SSTables involved in a coordinated transfer become live when the transfer is activated - if (cfs.metadata().replicationType().isTracked() && session.streamOperation().isTrackable()) + if (useTrackedTransferPath()) return; if (session.streamOperation() == StreamOperation.BOOTSTRAP) diff --git a/src/java/org/apache/cassandra/net/Verb.java b/src/java/org/apache/cassandra/net/Verb.java index cb5151795ed1..11b1c2fd7ece 100644 --- a/src/java/org/apache/cassandra/net/Verb.java +++ b/src/java/org/apache/cassandra/net/Verb.java @@ -289,10 +289,10 @@ public enum Verb FINALIZE_PROMISE_MSG (111, P1, repairWithBackoffTimeout, ANTI_ENTROPY, () -> FinalizePromise.serializer, () -> RepairMessageVerbHandler.instance(), REPAIR_RSP ), FINALIZE_COMMIT_MSG (112, P1, repairWithBackoffTimeout, ANTI_ENTROPY, () -> FinalizeCommit.serializer, () -> RepairMessageVerbHandler.instance(), REPAIR_RSP ), FAILED_SESSION_MSG (113, P1, 
repairWithBackoffTimeout, ANTI_ENTROPY, () -> FailSession.serializer, () -> RepairMessageVerbHandler.instance(), REPAIR_RSP ), - STATUS_RSP (115, P1, repairTimeout, ANTI_ENTROPY, () -> StatusResponse.serializer, () -> RepairMessageVerbHandler.instance(), REPAIR_RSP ), - STATUS_REQ (114, P1, repairTimeout, ANTI_ENTROPY, () -> StatusRequest.serializer, () -> RepairMessageVerbHandler.instance(), REPAIR_RSP ), - MT_SYNC_RSP (117, P1, repairWithBackoffTimeout, REQUEST_RESPONSE, () -> MutationTrackingSyncResponse.serializer, RESPONSE_HANDLER), - MT_SYNC_REQ (116, P1, repairWithBackoffTimeout, ANTI_ENTROPY, () -> MutationTrackingSyncRequest.serializer, () -> RepairMessageVerbHandler.instance(), MT_SYNC_RSP ), + STATUS_RSP (115, P1, repairTimeout, ANTI_ENTROPY, () -> StatusResponse.serializer, () -> RepairMessageVerbHandler.instance(), REPAIR_RSP ), + STATUS_REQ (114, P1, repairTimeout, ANTI_ENTROPY, () -> StatusRequest.serializer, () -> RepairMessageVerbHandler.instance(), REPAIR_RSP ), + MT_SYNC_RSP (117, P1, repairWithBackoffTimeout, REQUEST_RESPONSE, () -> MutationTrackingSyncResponse.serializer, RESPONSE_HANDLER ), + MT_SYNC_REQ (116, P1, repairWithBackoffTimeout, ANTI_ENTROPY, () -> MutationTrackingSyncRequest.serializer, () -> RepairMessageVerbHandler.instance(), MT_SYNC_RSP ), REPLICATION_DONE_RSP (82, P0, rpcTimeout, MISC, () -> NoPayload.serializer, RESPONSE_HANDLER ), REPLICATION_DONE_REQ (22, P0, rpcTimeout, MISC, () -> NoPayload.serializer, () -> ReplicationDoneVerbHandler.instance, REPLICATION_DONE_RSP), diff --git a/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java b/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java index 515ae3ece941..727439f8cccb 100644 --- a/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java +++ b/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java @@ -81,7 +81,7 @@ public Future performUnsafe(ExecutorPlus executor, Sche 
for (Range range : commonRange.ranges) { RepairJobDesc desc = new RepairJobDesc(parentSession, TimeUUID.Generator.nextTimeUUID(), - keyspace, "", List.of(range)); + keyspace, "Mutation Tracking Sync", List.of(range)); MutationTrackingSyncCoordinator syncCoordinator = new MutationTrackingSyncCoordinator( coordinator.ctx, desc, commonRange.endpoints, metadata); syncCoordinator.start(); @@ -124,17 +124,19 @@ private void waitForSyncCompletion(List syncCoo long remainingNanos = deadlineNanos - coordinator.ctx.clock().nanoTime(); try { - boolean completed = syncCoordinator.awaitCompletion(remainingNanos, TimeUnit.NANOSECONDS); - if (!completed) - { - syncCoordinator.cancel(); + if (!syncCoordinator.awaitCompletion(remainingNanos, TimeUnit.NANOSECONDS)) allSucceeded = false; - } } catch (RuntimeException e) { + allSucceeded = false; error = Throwables.merge(error, e); } + finally + { + if (!allSucceeded) + syncCoordinator.cancel(); + } } if (error != null) diff --git a/src/java/org/apache/cassandra/repair/RepairCoordinator.java b/src/java/org/apache/cassandra/repair/RepairCoordinator.java index 86192374bc4c..7a434553d9ee 100644 --- a/src/java/org/apache/cassandra/repair/RepairCoordinator.java +++ b/src/java/org/apache/cassandra/repair/RepairCoordinator.java @@ -138,7 +138,7 @@ public static RepairCoordinator create(StorageService storageService, int cmd, R && MutationTrackingIncrementalRepairTask.isMutationTrackingMigrationInProgress(metadata, keyspace); // If using mutation tracking without migration, flip incremental to false - // to prevent anti-compaction since mutation tracking doesn't use repaired/unrepaired distinction + // to prevent anti-compaction since mutation tracking manages marking tables repaired itself if (useMT && !mtMigration) { logger.info("Keyspace {} uses mutation tracking; disabling incremental repair to skip anti-compaction", keyspace); @@ -155,7 +155,7 @@ public static RepairCoordinator create(StorageService storageService, int cmd, R { 
NormalizedRanges repairRanges = NormalizedRanges.normalizedRanges(options.getRanges()); KeyspaceMetadata ksm = metadata.schema.getKeyspaceMetadata(keyspace); - migrationInfo.areRangesPendingMigration(repairRanges, ksm, options.getColumnFamilies()); + migrationInfo.assertRangesNotMixedMigration(repairRanges, ksm, options.getColumnFamilies()); } } diff --git a/src/java/org/apache/cassandra/repair/RepairJob.java b/src/java/org/apache/cassandra/repair/RepairJob.java index 0a0411a9824c..96ed70442efe 100644 --- a/src/java/org/apache/cassandra/repair/RepairJob.java +++ b/src/java/org/apache/cassandra/repair/RepairJob.java @@ -57,6 +57,7 @@ import org.apache.cassandra.service.accord.repair.AccordRepair; import org.apache.cassandra.service.accord.repair.AccordRepair.AccordRepairResult; import org.apache.cassandra.service.consensus.migration.ConsensusMigrationRepairResult; +import org.apache.cassandra.service.replication.migration.KeyspaceMigrationInfo; import org.apache.cassandra.service.replication.migration.MutationTrackingMigrationRepairResult; import org.apache.cassandra.service.paxos.cleanup.PaxosCleanup; import org.apache.cassandra.service.paxos.cleanup.PaxosUpdateLowBallot; @@ -261,7 +262,7 @@ protected void runRepair() .flatMap(this::executeTasks, taskExecutor); // For tracked keyspaces, we need to ensure sync'd data is present in the log - boolean isTracked = cfs.metadata().replicationType().isTracked(); + boolean isTracked = useTrackedTransfers(); if (isTracked) syncResults = TransferTrackingService.instance().onRepairSyncCompletion(this, syncResults, taskExecutor); } @@ -347,7 +348,7 @@ private Future createSyncTasks(Future accordRepai else syncTasks = createStandardSyncTasks(trees); - return ks.getMetadata().params.replicationType.isTracked() + return useTrackedTransfers() ? 
SyncTasks.tracked(ks, syncTasks) : SyncTasks.untracked(syncTasks); }, taskExecutor); @@ -369,6 +370,23 @@ private boolean isMetadataKeyspace() return desc.keyspace.equals(METADATA_KEYSPACE_NAME); } + /** + * Whether tracked repair transfers should be used for this repair job. + * Returns true only when the keyspace uses tracked replication AND the repair ranges + * for this table have completed migration (or no migration is in progress). + * During migration, ranges still in the pending set use the traditional untracked + * streaming path because: + * - The data being streamed is pre-migration data without mutation tracking offsets + * - TrackedRepairTransfer does not support --force (dead node exclusion) + */ + private boolean useTrackedTransfers() + { + if (!cfs.metadata().replicationType().isTracked()) + return false; + + return KeyspaceMigrationInfo.shouldUseTrackedTransfers(ClusterMetadata.current(), desc.keyspace, cfs.metadata().id, desc.ranges); + } + private boolean isTransient(InetAddressAndPort ep) { return session.state.commonRange.transEndpoints.contains(ep); @@ -465,7 +483,7 @@ Future> executeTasks(SyncTasks tasks) if (!tasks.isEmpty()) state.phase.streamSubmitted(); - if (cfs.metadata().replicationType().isTracked()) + if (useTrackedTransfers()) TransferTrackingService.instance().onRepairSyncExecution(tasks); for (SyncTask task : tasks) diff --git a/src/java/org/apache/cassandra/repair/RepairMessageVerbHandler.java b/src/java/org/apache/cassandra/repair/RepairMessageVerbHandler.java index 32e6cae668a9..effdaf4655ef 100644 --- a/src/java/org/apache/cassandra/repair/RepairMessageVerbHandler.java +++ b/src/java/org/apache/cassandra/repair/RepairMessageVerbHandler.java @@ -60,7 +60,6 @@ import org.apache.cassandra.service.StorageService; import org.apache.cassandra.streaming.PreviewKind; import org.apache.cassandra.tcm.ClusterMetadataService; -import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.JVMStabilityInspector; import 
org.apache.cassandra.utils.TimeUUID; import org.apache.cassandra.utils.concurrent.UncheckedInterruptedException; @@ -518,8 +517,6 @@ private void handleMutationTrackingSyncRequest(Message message) MutationTrackingSyncResponse response = new MutationTrackingSyncResponse( desc, - FBUtilities.getBroadcastAddressAndPort(), - true, offsets); ctx.messaging().send(message.responseWith(response), message.from()); diff --git a/src/java/org/apache/cassandra/repair/messages/MutationTrackingSyncResponse.java b/src/java/org/apache/cassandra/repair/messages/MutationTrackingSyncResponse.java index a5d216b8a7bf..d0cc9b277e85 100644 --- a/src/java/org/apache/cassandra/repair/messages/MutationTrackingSyncResponse.java +++ b/src/java/org/apache/cassandra/repair/messages/MutationTrackingSyncResponse.java @@ -18,26 +18,20 @@ package org.apache.cassandra.repair.messages; import java.io.IOException; -import java.util.HashMap; -import java.util.List; import java.util.Map; import java.util.Objects; -import org.apache.cassandra.db.TypeSizes; import org.apache.cassandra.dht.AbstractBounds; import org.apache.cassandra.dht.Range; import org.apache.cassandra.dht.Token; import org.apache.cassandra.io.IVersionedSerializer; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; -import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.replication.CoordinatorLogId; import org.apache.cassandra.replication.Offsets; import org.apache.cassandra.repair.RepairJobDesc; import org.apache.cassandra.utils.CollectionSerializers; -import static org.apache.cassandra.locator.InetAddressAndPort.Serializer.inetAddressAndPortSerializer; - /** * Response from a participant to a {@link MutationTrackingSyncRequest}. Contains the * participant's current witnessed offsets for each shard overlapping the requested ranges. 
@@ -46,47 +40,14 @@ */ public class MutationTrackingSyncResponse extends RepairMessage { - public final InetAddressAndPort participant; - public final boolean success; /** Per-shard witnessed offsets: shard range -> (logId -> offsets) */ public final Map, Map> offsetsByShard; - /** - * The inner map is keyed by CoordinatorLogId, but the logId is already embedded in each - * Offsets.Immutable, so we serialize only the values as a list and reconstruct the map - * on deserialization. - */ - private static final IVersionedSerializer> offsetsMapSerializer = new IVersionedSerializer<>() - { - public void serialize(Map map, DataOutputPlus out, int version) throws IOException - { - CollectionSerializers.serializeCollection(map.values(), out, version, Offsets.serializer); - } - - public Map deserialize(DataInputPlus in, int version) throws IOException - { - List offsetsList = CollectionSerializers.deserializeList(in, version, Offsets.serializer); - Map map = new HashMap<>(offsetsList.size()); - for (Offsets.Immutable offsets : offsetsList) - map.put(offsets.logId(), offsets); - return map; - } - - public long serializedSize(Map map, int version) - { - return CollectionSerializers.serializedCollectionSize(map.values(), version, Offsets.serializer); - } - }; - public MutationTrackingSyncResponse(RepairJobDesc desc, - InetAddressAndPort participant, - boolean success, Map, Map> offsetsByShard) { super(desc); - assert participant != null; - this.participant = participant; - this.success = success; + Objects.requireNonNull(offsetsByShard); this.offsetsByShard = offsetsByShard; } @@ -97,15 +58,13 @@ public boolean equals(Object o) return false; MutationTrackingSyncResponse other = (MutationTrackingSyncResponse) o; return Objects.equals(desc, other.desc) - && participant.equals(other.participant) - && success == other.success && Objects.equals(offsetsByShard, other.offsetsByShard); } @Override public int hashCode() { - return Objects.hash(desc, participant, success, 
offsetsByShard); + return Objects.hash(desc, offsetsByShard); } @Override @@ -113,20 +72,19 @@ public String toString() { return "MutationTrackingSyncResponse{" + "desc=" + desc + - ", participant=" + participant + - ", success=" + success + - ", shardCount=" + (offsetsByShard != null ? offsetsByShard.size() : 0) + + ", shardCount=" + offsetsByShard.size() + '}'; } + private static final IVersionedSerializer> offsetsMapSerializer = + CollectionSerializers.newMapSerializer(CoordinatorLogId.serializer, Offsets.serializer); + @SuppressWarnings("unchecked") public static final IVersionedSerializer serializer = new IVersionedSerializer<>() { public void serialize(MutationTrackingSyncResponse response, DataOutputPlus out, int version) throws IOException { RepairJobDesc.serializer.serialize(response.desc, out, version); - inetAddressAndPortSerializer.serialize(response.participant, out, version); - out.writeBoolean(response.success); CollectionSerializers.serializeMap((Map, Map>) (Map) response.offsetsByShard, out, version, Range.tokenSerializer, offsetsMapSerializer); } @@ -134,20 +92,16 @@ public void serialize(MutationTrackingSyncResponse response, DataOutputPlus out, public MutationTrackingSyncResponse deserialize(DataInputPlus in, int version) throws IOException { RepairJobDesc desc = RepairJobDesc.serializer.deserialize(in, version); - InetAddressAndPort participant = inetAddressAndPortSerializer.deserialize(in, version); - boolean success = in.readBoolean(); Map, Map> raw = CollectionSerializers.deserializeMap(in, version, Range.tokenSerializer, offsetsMapSerializer); Map, Map> offsetsByShard = (Map, Map>) (Map) raw; - return new MutationTrackingSyncResponse(desc, participant, success, offsetsByShard); + return new MutationTrackingSyncResponse(desc, offsetsByShard); } public long serializedSize(MutationTrackingSyncResponse response, int version) { long size = RepairJobDesc.serializer.serializedSize(response.desc, version); - size += 
inetAddressAndPortSerializer.serializedSize(response.participant, version); - size += TypeSizes.sizeof(response.success); size += CollectionSerializers.serializedMapSize((Map, Map>) (Map) response.offsetsByShard, version, Range.tokenSerializer, offsetsMapSerializer); return size; diff --git a/src/java/org/apache/cassandra/replication/BroadcastLogOffsets.java b/src/java/org/apache/cassandra/replication/BroadcastLogOffsets.java index 7e2b7b485e57..a9832f90c65d 100644 --- a/src/java/org/apache/cassandra/replication/BroadcastLogOffsets.java +++ b/src/java/org/apache/cassandra/replication/BroadcastLogOffsets.java @@ -19,7 +19,6 @@ import java.io.IOException; import java.util.ArrayList; -import java.util.Collections; import java.util.List; import org.apache.cassandra.db.TypeSizes; @@ -52,11 +51,6 @@ boolean isEmpty() return replicatedOffsets.isEmpty(); } - public List getOffsets() - { - return Collections.unmodifiableList(replicatedOffsets); - } - @Override public String toString() { diff --git a/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java b/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java index 6cb0515e97e2..febec73b4b47 100644 --- a/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java +++ b/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java @@ -98,9 +98,7 @@ public void start() if (!started.compareAndSet(false, true)) throw new IllegalStateException("Sync coordinator already started"); - List overlappingShards; - - overlappingShards = new ArrayList<>(); + List overlappingShards = new ArrayList<>(); MutationTrackingService.instance().forEachShardInKeyspace(keyspace, shard -> { if (shard.range.intersects(range)) overlappingShards.add(shard); @@ -182,7 +180,7 @@ private void sendSyncRequests() @Override public void onResponse(Message msg) { - onSyncResponse(msg.payload); + onSyncResponse(msg.from(), msg.payload); } @Override @@ -300,16 +298,17 @@ public 
void onOffsetsReceived() * MutationTrackingSyncRequest. Updates the shard targets with the offsets from the * response, establishing a happens-before relationship with the repair start. * + * @param from the participant that sent the response * @param response the sync response from a participant */ - public void onSyncResponse(MutationTrackingSyncResponse response) + public void onSyncResponse(InetAddressAndPort from, MutationTrackingSyncResponse response) { if (completionFuture.isDone()) return; // Deduplicate: retries of MT_SYNC_REQ can produce multiple responses from the // same participant. Only process the first one. - if (!pendingSyncResponses.remove(response.participant)) + if (!pendingSyncResponses.remove(from)) return; // Update shard targets with the offsets received from the participant @@ -323,7 +322,7 @@ public void onSyncResponse(MutationTrackingSyncResponse response) } } - logger.trace("Sync coordinator received sync response from {}", response.participant); + logger.trace("Sync coordinator received sync response from {}", from); checkIfReadyToComplete(); } diff --git a/src/java/org/apache/cassandra/replication/Node2OffsetsMap.java b/src/java/org/apache/cassandra/replication/Node2OffsetsMap.java index 8d943feb1050..1f396a513b7d 100644 --- a/src/java/org/apache/cassandra/replication/Node2OffsetsMap.java +++ b/src/java/org/apache/cassandra/replication/Node2OffsetsMap.java @@ -75,6 +75,9 @@ Offsets.Mutable intersection() Offsets.Mutable union() { + if (offsetsMap.isEmpty()) + throw new IllegalStateException("Cannot compute union of empty offsets map"); + Iterator iter = offsetsMap.values().iterator(); if (offsetsMap.size() == 1) return Offsets.Mutable.copy(iter.next()); diff --git a/src/java/org/apache/cassandra/replication/Shard.java b/src/java/org/apache/cassandra/replication/Shard.java index f6462d9e0232..ca133160f522 100644 --- a/src/java/org/apache/cassandra/replication/Shard.java +++ b/src/java/org/apache/cassandra/replication/Shard.java @@ 
-425,6 +425,25 @@ public Map collectReconciledOffsetsPerLog() return result; } + /** + * Returns the intersection of witnessed offsets scoped to only the specified participant host IDs. + * If liveHostIds is null, behaves the same as {@link #collectReconciledOffsetsPerLog()}. + */ + public Map collectReconciledOffsetsPerLog(Set liveHostIds) + { + if (liveHostIds == null) + return collectReconciledOffsetsPerLog(); + + Map result = new HashMap<>(); + for (CoordinatorLog log : logs.values()) + { + Offsets.Immutable reconciled = log.collectReconciledOffsets(liveHostIds); + if (!reconciled.isEmpty()) + result.put(log.logId, reconciled); + } + return result; + } + /** * Returns the UNION of witnessed offsets from all participants for each coordinator log. * Union = all offsets that ANY replica has witnessed. @@ -460,25 +479,6 @@ public Map collectUnionOfWitnessedOffsetsPe return result; } - /** - * Returns the intersection of witnessed offsets scoped to only the specified participant host IDs. - * If liveHostIds is null, behaves the same as {@link #collectReconciledOffsetsPerLog()}. 
- */ - public Map collectReconciledOffsetsPerLog(Set liveHostIds) - { - if (liveHostIds == null) - return collectReconciledOffsetsPerLog(); - - Map result = new HashMap<>(); - for (CoordinatorLog log : logs.values()) - { - Offsets.Immutable reconciled = log.collectReconciledOffsets(liveHostIds); - if (!reconciled.isEmpty()) - result.put(log.logId, reconciled); - } - return result; - } - @Override public String toString() { diff --git a/src/java/org/apache/cassandra/service/StorageService.java b/src/java/org/apache/cassandra/service/StorageService.java index 8fb60ff1c2d6..2fe8578aca05 100644 --- a/src/java/org/apache/cassandra/service/StorageService.java +++ b/src/java/org/apache/cassandra/service/StorageService.java @@ -5472,7 +5472,7 @@ public long getMutationTrackingSyncTimeout() public void setMutationTrackingSyncTimeout(long timeoutInMillis) { - checkState(timeoutInMillis > 0); + checkArgument(timeoutInMillis > 0); DatabaseDescriptor.setMutationTrackingSyncTimeout(timeoutInMillis); logger.info("MutationTrackingSyncTimeout set to {}ms via JMX", timeoutInMillis); } diff --git a/src/java/org/apache/cassandra/service/replication/migration/KeyspaceMigrationInfo.java b/src/java/org/apache/cassandra/service/replication/migration/KeyspaceMigrationInfo.java index 9c03617accca..f15a83518c49 100644 --- a/src/java/org/apache/cassandra/service/replication/migration/KeyspaceMigrationInfo.java +++ b/src/java/org/apache/cassandra/service/replication/migration/KeyspaceMigrationInfo.java @@ -40,6 +40,7 @@ import org.apache.cassandra.schema.KeyspaceMetadata; import org.apache.cassandra.schema.TableId; import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.tcm.serialization.MetadataSerializer; import org.apache.cassandra.tcm.serialization.Version; @@ -250,20 +251,19 @@ public boolean shouldUseTrackedForWrites(boolean isTracked, TableId tableId, Tok } /** - * Check 
whether any of the specified tables have the given ranges in their migration pending set. - * During migration, reads for pending ranges continue to use the untracked read path with blocking - * read repair, so it is safe to exclude mutation tracking streaming for those ranges. However, - * ranges must be either entirely inside or entirely outside the pending set for each table. + * Asserts that the given ranges are either entirely inside or entirely outside the pending + * migration set for each specified table. During migration, reads for pending ranges continue + * to use the untracked read path with blocking read repair, so it is safe to exclude mutation + * tracking streaming for those ranges. However, partial overlap is not supported — repair must + * operate on ranges that are fully migrated or fully pending. * * @param ranges the normalized ranges to check * @param tables the tables to check against - * @return true if any of the specified tables have these ranges pending migration * @throws IllegalStateException if ranges partially overlap with pending ranges for any table */ - public boolean areRangesPendingMigration(@Nonnull NormalizedRanges ranges, - @Nonnull Iterable tables) + public void assertRangesNotMixedMigration(@Nonnull NormalizedRanges ranges, + @Nonnull Iterable tables) { - boolean anyPending = false; for (TableMetadata table : tables) { NormalizedRanges pendingRanges = getPendingRangesForTable(table.id); @@ -281,10 +281,7 @@ public boolean areRangesPendingMigration(@Nonnull NormalizedRanges ranges "Ranges for keyspace %s partially overlap with migration pending ranges for table %s. 
" + "Ranges must be entirely inside or entirely outside the pending set.", keyspace, table.name)); - - anyPending = true; } - return anyPending; } /** @@ -294,12 +291,11 @@ public boolean areRangesPendingMigration(@Nonnull NormalizedRanges ranges * @param ranges the normalized ranges to check * @param ksm the keyspace metadata for resolving table names * @param columnFamilies specific table names to check, or empty for all tables - * @return true if any of the specified tables have these ranges pending migration * @throws IllegalStateException if ranges partially overlap with pending ranges for any table */ - public boolean areRangesPendingMigration(@Nonnull NormalizedRanges ranges, - @Nonnull KeyspaceMetadata ksm, - @Nonnull Collection columnFamilies) + public void assertRangesNotMixedMigration(@Nonnull NormalizedRanges ranges, + @Nonnull KeyspaceMetadata ksm, + @Nonnull Collection columnFamilies) { Iterable tables; if (!columnFamilies.isEmpty()) @@ -317,7 +313,42 @@ public boolean areRangesPendingMigration(@Nonnull NormalizedRanges ranges { tables = ksm.tables; } - return areRangesPendingMigration(ranges, tables); + assertRangesNotMixedMigration(ranges, tables); + } + + /** + * Determines whether the given ranges for a table should use the tracked transfer path + * (coordinated activation via TrackedRepairTransfer) or the untracked streaming path. + *

+ * Returns true (use tracked) when: + * - No migration is in progress for this keyspace, OR + * - The ranges don't overlap with pending migration ranges for this table + *

+ * Returns false (use untracked) when: + * - A migration is in progress AND the ranges overlap with pending ranges + * + * @param metadata cluster metadata snapshot + * @param keyspace keyspace name + * @param tableId table to check + * @param ranges the ranges being repaired or streamed + * @return true if tracked transfers should be used + */ + public static boolean shouldUseTrackedTransfers(@Nonnull ClusterMetadata metadata, + @Nonnull String keyspace, + @Nonnull TableId tableId, + @Nonnull Collection> ranges) + { + KeyspaceMigrationInfo migrationInfo = metadata.mutationTrackingMigrationState.getKeyspaceInfo(keyspace); + if (migrationInfo == null) + return true; + + NormalizedRanges pendingRanges = migrationInfo.getPendingRangesForTable(tableId); + if (pendingRanges.isEmpty()) + return true; + + NormalizedRanges normalizedRanges = NormalizedRanges.normalizedRanges(ranges); + NormalizedRanges overlap = pendingRanges.intersection(normalizedRanges); + return overlap.isEmpty(); } @Override diff --git a/src/java/org/apache/cassandra/utils/CollectionSerializers.java b/src/java/org/apache/cassandra/utils/CollectionSerializers.java index c3fef945c25b..42b48d289770 100644 --- a/src/java/org/apache/cassandra/utils/CollectionSerializers.java +++ b/src/java/org/apache/cassandra/utils/CollectionSerializers.java @@ -693,6 +693,30 @@ public long serializedSize(List t) }; } + public static IVersionedSerializer> newMapSerializer(IVersionedSerializer keySerializer, IVersionedSerializer valueSerializer) + { + return new IVersionedSerializer>() + { + @Override + public void serialize(Map map, DataOutputPlus out, int version) throws IOException + { + serializeMap(map, out, version, keySerializer, valueSerializer); + } + + @Override + public Map deserialize(DataInputPlus in, int version) throws IOException + { + return deserializeMap(in, version, keySerializer, valueSerializer); + } + + @Override + public long serializedSize(Map map, int version) + { + return serializedMapSize(map, 
version, keySerializer, valueSerializer); + } + }; + } + public static IVersionedSerializer> newListSerializer(IVersionedSerializer itemSerializer) { return new IVersionedSerializer>() diff --git a/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingRepairTest.java b/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingRepairTest.java index 9920c0b73193..33b5265c99ab 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingRepairTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingRepairTest.java @@ -525,6 +525,17 @@ public void testMigrationTrackedToUntrackedCompletesViaRepair() throws Exception @Test public void testForceRepairWithDeadNodeDoesNotAdvanceMigration() { + repairWithDeadNodeDoesNotAdvanceMigration(withPR(ksName, "--force")); + } + + @Test + public void testInHostsRepairWithDeadNodeDoesNotAdvanceMigration() { + String addr1 = getBroadcastAddress(1); + String addr3 = getBroadcastAddress(3); + repairWithDeadNodeDoesNotAdvanceMigration(ksName, "--in-hosts", addr1 + ',' + addr3); + } + + private void repairWithDeadNodeDoesNotAdvanceMigration(String... 
repairArgs) { setupUntracked(); insertDataWithInconsistency(3, "tbl", 0, 50); @@ -534,7 +545,7 @@ public void testForceRepairWithDeadNodeDoesNotAdvanceMigration() { isolateNode(2, 1, 3); List liveNodes = List.of(1, 3); - repairFromNodesSuccess(liveNodes, ksName, "--force"); + assertAllSuccess(repairConcurrently(liveNodes, repairArgs)); String ks = ksName; assertTrue("Migration should not advance with dead nodes excluded", @@ -545,6 +556,23 @@ public void testForceRepairWithDeadNodeDoesNotAdvanceMigration() { })); } + @Test + public void testInHostsRepairSucceedsWhenSpecifiedHostIsNetworkBlocked() { + String addr1 = getBroadcastAddress(1); + String addr3 = getBroadcastAddress(3); + + insertDataWithInconsistency(3, "tbl", 0, 50); + + // Block network to node 2 but do NOT mark it down in gossip + CLUSTER.filters().allVerbs().from(2).drop(); + CLUSTER.filters().allVerbs().to(2).drop(); + + // Repair specifying only live hosts should succeed despite node 2 being blocked + List liveNodes = List.of(1, 3); + String[] args = new String[]{ksName, "--in-hosts", addr1 + ',' + addr3}; + assertAllSuccess(repairConcurrently(liveNodes, args)); + } + @Test public void testPreviewRepairDoesNotAdvanceMigration() throws Exception { setupUntracked(); From b4b931da7df1b79f34a849bc70534a35706fe425 Mon Sep 17 00:00:00 2001 From: Ariel Weisberg Date: Tue, 24 Mar 2026 15:23:24 -0400 Subject: [PATCH 31/46] SSTableWriter.finalizeMetadata just bypass any attempt and check for mutation tracking based repair marking if the range is pending migration. 
At that point it's not being managed by mutation tracking --- .../cassandra/dht/NormalizedRanges.java | 13 +++++++ .../io/sstable/format/SSTableWriter.java | 35 ++++++++++--------- .../migration/KeyspaceMigrationInfo.java | 20 +++++++++++ 3 files changed, 52 insertions(+), 16 deletions(-) diff --git a/src/java/org/apache/cassandra/dht/NormalizedRanges.java b/src/java/org/apache/cassandra/dht/NormalizedRanges.java index c9a040a6585b..dc91466ba7e8 100644 --- a/src/java/org/apache/cassandra/dht/NormalizedRanges.java +++ b/src/java/org/apache/cassandra/dht/NormalizedRanges.java @@ -134,6 +134,19 @@ public boolean intersects(T token) return isIn; } + /** + * Check if any of these ranges intersect with the given range. + */ + public boolean intersects(Range range) + { + for (Range r : this) + { + if (r.intersects(range)) + return true; + } + return false; + } + public NormalizedRanges subtract(NormalizedRanges b) { if (b.isEmpty()) diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java index cab0c54a29a6..3246bb8d1223 100644 --- a/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java +++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java @@ -62,6 +62,7 @@ import org.apache.cassandra.replication.ImmutableCoordinatorLogOffsets; import org.apache.cassandra.replication.MutationTrackingService; import org.apache.cassandra.service.ActiveRepairService; +import org.apache.cassandra.service.replication.migration.KeyspaceMigrationInfo; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.utils.Clock; import org.apache.cassandra.utils.Throwables; @@ -341,27 +342,29 @@ protected Map finalizeMetadata() // Reconciliation should not occur before activation for coordinated transfer streams for tracked keyspaces. 
boolean reconcile = txn.opType() != OperationType.STREAM; - // Migration from incremental repair to mutation tracking will be supported, but support for mixing - // incremental repair and mutation tracking is not planned. + // During migration, incremental repair handles repair status for ranges still pending migration. + // Only apply mutation tracking reconciliation for ranges NOT in the migration pending set. + // For SSTables whose range falls within pending migration ranges, IR sets pendingRepair/repairedAt. if (metadata().replicationType().isTracked() && repairedAt == ActiveRepairService.UNREPAIRED_SSTABLE && reconcile) { - // During migration, incremental repair may write SSTables with pendingRepair set to a tracked table. - // Outside of migration, pendingRepair should never be set on a tracked table. - if (!Objects.equals(pendingRepair, ActiveRepairService.NO_PENDING_REPAIR)) + boolean inMigrationPendingRange = false; + KeyspaceMigrationInfo migrationInfo = ClusterMetadata.current().mutationTrackingMigrationState + .getKeyspaceInfo(metadata().keyspace); + if (migrationInfo != null) { - Preconditions.checkState(ClusterMetadata.current().mutationTrackingMigrationState - .getKeyspaceInfo(metadata().keyspace) != null, - "pendingRepair set on tracked table %s.%s outside of migration", - metadata().keyspace, metadata().name); + inMigrationPendingRange = migrationInfo.isRangeInPendingMigration(metadata().id, + first.getToken(), + last.getToken()); } - // Only attempt mutation tracking reconciliation when there is no pending repair. - // SSTables with empty coordinator log offsets were written before mutation tracking was enabled - // (e.g. during migration from untracked to tracked). They should not be marked as reconciled - // since they were never tracked and need to go through incremental repair first. 
- else if (!coordinatorLogOffsets.isEmpty() && MutationTrackingService.instance().isDurablyReconciled(coordinatorLogOffsets)) + + if (!inMigrationPendingRange) { - repairedAt = Clock.Global.currentTimeMillis(); - logger.debug("Marking SSTable {} as reconciled with repairedAt {}", descriptor, repairedAt); + Preconditions.checkState(Objects.equals(pendingRepair, ActiveRepairService.NO_PENDING_REPAIR)); + if (MutationTrackingService.instance().isDurablyReconciled(coordinatorLogOffsets)) + { + repairedAt = Clock.Global.currentTimeMillis(); + logger.debug("Marking SSTable {} as reconciled with repairedAt {}", descriptor, repairedAt); + } } } diff --git a/src/java/org/apache/cassandra/service/replication/migration/KeyspaceMigrationInfo.java b/src/java/org/apache/cassandra/service/replication/migration/KeyspaceMigrationInfo.java index f15a83518c49..1653711881f6 100644 --- a/src/java/org/apache/cassandra/service/replication/migration/KeyspaceMigrationInfo.java +++ b/src/java/org/apache/cassandra/service/replication/migration/KeyspaceMigrationInfo.java @@ -230,6 +230,26 @@ public boolean isTokenInPendingRange(TableId tableId, Token token) return tableRanges.intersects(token); } + /** + * Check if a range intersects with any pending migration range for the given table. + */ + public boolean isRangeInPendingMigration(TableId tableId, Range range) + { + NormalizedRanges tableRanges = pendingRangesPerTable.get(tableId); + if (tableRanges == null) + return false; + return tableRanges.intersects(range); + } + + /** + * Check if the range defined by startToken and endToken intersects with any pending migration range + * for the given table. + */ + public boolean isRangeInPendingMigration(TableId tableId, Token startToken, Token endToken) + { + return isRangeInPendingMigration(tableId, new Range<>(startToken, endToken)); + } + /** * Determine if read operations on a token should use tracked replication during migration. 
* From 8e9a6a77bfea46520f3d193de4ce479e2aea70c0 Mon Sep 17 00:00:00 2001 From: Ariel Weisberg Date: Wed, 25 Mar 2026 15:54:56 -0400 Subject: [PATCH 32/46] Ignore double bounce test https://issues.apache.org/jira/browse/CASSANDRA-21256 --- .../distributed/test/tracking/MutationTrackingBounceTest.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/distributed/org/apache/cassandra/distributed/test/tracking/MutationTrackingBounceTest.java b/test/distributed/org/apache/cassandra/distributed/test/tracking/MutationTrackingBounceTest.java index 0a6279a5c4ee..8d270d51c2cf 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/tracking/MutationTrackingBounceTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/tracking/MutationTrackingBounceTest.java @@ -31,6 +31,7 @@ import org.apache.cassandra.harry.gen.Generator; import org.apache.cassandra.harry.gen.SchemaGenerators; import org.apache.cassandra.replication.MutationJournal; +import org.junit.Ignore; import org.junit.Test; @@ -58,6 +59,7 @@ public void bounceTestMultiNode() throws Throwable } } + @Ignore("https://issues.apache.org/jira/browse/CASSANDRA-21256") @Test public void doubleBounceTestMultiNode() throws Throwable { From c2c5f2beb1db1246362d3b163b9de4a6e17cb335 Mon Sep 17 00:00:00 2001 From: Ariel Weisberg Date: Thu, 26 Mar 2026 16:28:08 -0400 Subject: [PATCH 33/46] No background reconciliation breaks PartialUpdateHandlingTest when it attempts IR and reconciliation never occurs --- .../distributed/test/sai/PartialUpdateHandlingTest.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test/distributed/org/apache/cassandra/distributed/test/sai/PartialUpdateHandlingTest.java b/test/distributed/org/apache/cassandra/distributed/test/sai/PartialUpdateHandlingTest.java index 2b38ba8e26bc..fa932943b07a 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/sai/PartialUpdateHandlingTest.java +++ 
b/test/distributed/org/apache/cassandra/distributed/test/sai/PartialUpdateHandlingTest.java @@ -236,7 +236,11 @@ public void writeRepairedRows() CLUSTER.coordinator(1).execute(insert.toString(), ConsistencyLevel.ALL); } - CLUSTER.get(1).nodetoolResult("repair", specification.keyspaceName()).asserts().success(); + // Background reconciliation doesn't exist/work so incremental repair just hangs waiting for reconciliation that never occurs + if (specification.replicationType.isTracked()) + CLUSTER.get(1).nodetoolResult("repair", "-full", specification.keyspaceName()).asserts().success(); + else + CLUSTER.get(1).nodetoolResult("repair", specification.keyspaceName()).asserts().success(); } public void writeUnrepairedRows() From 76a6aac7576d8a519c5bb5eb75f18424932ec0ff Mon Sep 17 00:00:00 2001 From: Ariel Weisberg Date: Fri, 27 Mar 2026 14:48:14 -0400 Subject: [PATCH 34/46] Fix flaky testFailedMutationRedelivery --- .../test/tracking/MutationTrackingTest.java | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/test/distributed/org/apache/cassandra/distributed/test/tracking/MutationTrackingTest.java b/test/distributed/org/apache/cassandra/distributed/test/tracking/MutationTrackingTest.java index 32fdd2505562..c2b99267bdee 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/tracking/MutationTrackingTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/tracking/MutationTrackingTest.java @@ -418,19 +418,22 @@ public void testFailedMutationRedelivery() throws Throwable Assert.assertEquals(0, summary.get(logId).reconciled.offsetCount()); }); - // resume the reconciler + // resume the reconciler and spin until reconciliation completes. + // The reconciler retries with PUSH_MUTATION_REQ whose response inherits the + // request's expiry (write_request_timeout). Under load the response can arrive + // after that expiry and be silently dropped by InboundMessageHandler, requiring + // a retry cycle. 
Spinning accommodates multiple retry rounds. cluster.get(1).runOnInstance(() -> MutationTrackingService.instance().resumeActiveReconciler()); - Thread.sleep(1000); // wait for reconiciler to do its job - cluster.get(1).runOnInstance(() -> - { - TableMetadata table = Schema.instance.getTableMetadata(keyspaceName, "tbl"); - DecoratedKey dk = Murmur3Partitioner.instance.decorateKey(ByteBufferUtil.bytes(1)); - MutationSummary summary = MutationTrackingService.instance().createSummaryForKey(dk, table.id, false); - CoordinatorLogId logId = getOnlyLogId(summary); - Assert.assertEquals(0, summary.get(logId).unreconciled.offsetCount()); - Assert.assertEquals(1, summary.get(logId).reconciled.offsetCount()); - }); + Util.spinUntilTrue(() -> + cluster.get(1).callOnInstance(() -> { + TableMetadata table = Schema.instance.getTableMetadata(keyspaceName, "tbl"); + DecoratedKey dk = Murmur3Partitioner.instance.decorateKey(ByteBufferUtil.bytes(1)); + MutationSummary summary = MutationTrackingService.instance().createSummaryForKey(dk, table.id, false); + CoordinatorLogId logId = getOnlyLogId(summary); + return summary.get(logId).unreconciled.offsetCount() == 0 + && summary.get(logId).reconciled.offsetCount() == 1; + }), 10); } } } From 5ac9e2f258c0dc5f31648bff688e75e97d0ad3d0 Mon Sep 17 00:00:00 2001 From: Ariel Weisberg Date: Fri, 27 Mar 2026 16:08:12 -0400 Subject: [PATCH 35/46] Reset filters only after checking data is inconsistent --- .../distributed/test/repair/MutationTrackingRepairTest.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingRepairTest.java b/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingRepairTest.java index 33b5265c99ab..bd1c53e2fb07 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingRepairTest.java +++ 
b/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingRepairTest.java @@ -172,14 +172,13 @@ private void insertDataWithInconsistency(int isolatedNode, String tableName, int ConsistencyLevel.QUORUM, i, i); } - CLUSTER.filters().reset(); - // Verify the isolated node is actually missing the data we just wrote Object[][] results = CLUSTER.get(isolatedNode).executeInternal( "SELECT k FROM " + ksName + '.' + tableName + " WHERE k >= ? AND k < ? ALLOW FILTERING", start, start + count); assertEquals("Node " + isolatedNode + " should not have data written while isolated", 0, results.length); + CLUSTER.filters().reset(); } private void assertDataOnAllNodes(String tableName, List keys) { From 4ab0f2bbe7c2fcd68f8b8aecbe4fb6ec36658992 Mon Sep 17 00:00:00 2001 From: Ariel Weisberg Date: Wed, 1 Apr 2026 15:19:03 -0400 Subject: [PATCH 36/46] Fix MutationTrackingRepairTest style --- .../repair/MutationTrackingRepairTest.java | 220 ++++++++++++------ 1 file changed, 147 insertions(+), 73 deletions(-) diff --git a/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingRepairTest.java b/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingRepairTest.java index bd1c53e2fb07..55c7aa9220fe 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingRepairTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingRepairTest.java @@ -66,7 +66,8 @@ * tries to look up the dropped keyspace's metadata). The keyspaces are cleaned up when * the cluster is closed at the end of the test class. 
*/ -public class MutationTrackingRepairTest extends TestBaseImpl { +public class MutationTrackingRepairTest extends TestBaseImpl +{ private static final int NUM_NODES = 3; private static final List ALL_NODES = List.of(1, 2, 3); @@ -77,7 +78,8 @@ public class MutationTrackingRepairTest extends TestBaseImpl { private String ksName; @BeforeClass - public static void setupCluster() throws IOException { + public static void setupCluster() throws IOException + { executor = Executors.newCachedThreadPool(); CLUSTER = Cluster.build() .withNodes(NUM_NODES) @@ -91,14 +93,16 @@ public static void setupCluster() throws IOException { } @AfterClass - public static void teardownCluster() { + public static void teardownCluster() + { executor.shutdownNow(); if (CLUSTER != null) CLUSTER.close(); } @Before - public void setUp() { + public void setUp() + { ksName = "mt_repair_" + ksCounter.incrementAndGet(); CLUSTER.schemaChange("CREATE KEYSPACE " + ksName + " WITH replication = " + "{'class': 'SimpleStrategy', 'replication_factor': 3} " + @@ -107,15 +111,19 @@ public void setUp() { } @After - public void tearDown() { + public void tearDown() + { CLUSTER.filters().reset(); - for (int i = 1; i <= CLUSTER.size(); i++) { + for (int i = 1; i <= CLUSTER.size(); i++) + { CLUSTER.get(i).runOnInstance(() -> { Gossiper.runInGossipStageBlocking(() -> { - for (var entry : Gossiper.instance.endpointStateMap.entrySet()) { + for (var entry : Gossiper.instance.endpointStateMap.entrySet()) + { InetAddressAndPort ep = entry.getKey(); EndpointState state = entry.getValue(); - if (!ep.equals(FBUtilities.getBroadcastAddressAndPort()) && !state.isAlive()) { + if (!ep.equals(FBUtilities.getBroadcastAddressAndPort()) && !state.isAlive()) + { FailureDetector.instance.report(ep); Gossiper.instance.realMarkAlive(ep, state); } @@ -125,7 +133,8 @@ public void tearDown() { } } - private void setupUntracked() { + private void setupUntracked() + { ksName = "mt_repair_" + ksCounter.incrementAndGet(); 
CLUSTER.schemaChange("CREATE KEYSPACE " + ksName + " WITH replication = " + "{'class': 'SimpleStrategy', 'replication_factor': 3} " + @@ -133,40 +142,48 @@ private void setupUntracked() { CLUSTER.schemaChange("CREATE TABLE " + ksName + ".tbl (k int PRIMARY KEY, v int)"); } - private void createTable(String tableName) { + private void createTable(String tableName) + { CLUSTER.schemaChange("CREATE TABLE " + ksName + '.' + tableName + " (k int PRIMARY KEY, v int)"); } - private void alterKeyspaceToTracked() { + private void alterKeyspaceToTracked() + { CLUSTER.schemaChange("ALTER KEYSPACE " + ksName + " WITH replication = " + "{'class': 'SimpleStrategy', 'replication_factor': 3} " + "AND replication_type='tracked'"); } - private void alterKeyspaceToUntracked() { + private void alterKeyspaceToUntracked() + { CLUSTER.schemaChange("ALTER KEYSPACE " + ksName + " WITH replication = " + "{'class': 'SimpleStrategy', 'replication_factor': 3} " + "AND replication_type='untracked'"); } - private void insertData(String tableName, int start, int count) { - for (int i = start; i < start + count; i++) { + private void insertData(String tableName, int start, int count) + { + for (int i = start; i < start + count; i++) + { CLUSTER.coordinator(1).execute( "INSERT INTO " + ksName + '.' 
+ tableName + " (k, v) VALUES (?, ?)", ConsistencyLevel.ALL, i, i); } } - private void insertDataWithInconsistency(String tableName, int start, int count) { + private void insertDataWithInconsistency(String tableName, int start, int count) + { insertDataWithInconsistency(2, tableName, start, count); } - private void insertDataWithInconsistency(int isolatedNode, String tableName, int start, int count) { + private void insertDataWithInconsistency(int isolatedNode, String tableName, int start, int count) + { // Isolate a node so background reconcilation has some work to do CLUSTER.filters().allVerbs().to(isolatedNode).drop(); CLUSTER.filters().allVerbs().from(isolatedNode).drop(); - for (int i = start; i < start + count; i++) { + for (int i = start; i < start + count; i++) + { CLUSTER.coordinator(1).execute( "INSERT INTO " + ksName + '.' + tableName + " (k, v) VALUES (?, ?)", ConsistencyLevel.QUORUM, i, i); @@ -181,9 +198,12 @@ private void insertDataWithInconsistency(int isolatedNode, String tableName, int CLUSTER.filters().reset(); } - private void assertDataOnAllNodes(String tableName, List keys) { - for (int node = 1; node <= CLUSTER.size(); node++) { - for (int key : keys) { + private void assertDataOnAllNodes(String tableName, List keys) + { + for (int node = 1; node <= CLUSTER.size(); node++) + { + for (int key : keys) + { Object[][] results = CLUSTER.get(node).executeInternal( "SELECT k, v FROM " + ksName + '.' + tableName + " WHERE k = ?", key); assertEquals("Node " + node + " missing row k=" + key, 1, results.length); @@ -193,65 +213,79 @@ private void assertDataOnAllNodes(String tableName, List keys) { } } - private void assertDataOnAllNodes(String tableName, int start, int count) { + private void assertDataOnAllNodes(String tableName, int start, int count) + { List keys = new ArrayList<>(count); for (int i = start; i < start + count; i++) keys.add(i); assertDataOnAllNodes(tableName, keys); } - private NodeToolResult nodetoolRepair(int node, String... 
args) { + private NodeToolResult nodetoolRepair(int node, String... args) + { String[] cmd = new String[args.length + 1]; cmd[0] = "repair"; System.arraycopy(args, 0, cmd, 1, args.length); return CLUSTER.get(node).nodetoolResult(cmd); } - private List repairConcurrently(List nodes, String... args) { + private List repairConcurrently(List nodes, String... args) + { List> futures = new ArrayList<>(); - for (int node : nodes) { + for (int node : nodes) + { int n = node; futures.add(executor.submit(() -> nodetoolRepair(n, args))); } List results = new ArrayList<>(); - for (Future f : futures) { - try { + for (Future f : futures) + { + try + { results.add(f.get(60, TimeUnit.SECONDS)); - } catch (Exception e) { + } + catch (Exception e) + { throw new RuntimeException("Repair future failed", e); } } return results; } - private void assertAllSuccess(List results) { + private void assertAllSuccess(List results) + { for (NodeToolResult r : results) r.asserts().success(); } - private void assertAllFailure(List results) { + private void assertAllFailure(List results) + { for (NodeToolResult r : results) r.asserts().failure(); } - private String[] withPR(String... args) { + private String[] withPR(String... args) + { String[] result = new String[args.length + 1]; System.arraycopy(args, 0, result, 0, args.length); result[args.length] = "-pr"; return result; } - private void repairResolvingInconsistency(String... args) throws Exception { + private void repairResolvingInconsistency(String... args) throws Exception + { repairResolvingInconsistency(2, ALL_NODES, withPR(args)); } - private void repairResolvingInconsistency(int isolatedNode, List nodes, String... args) throws Exception { + private void repairResolvingInconsistency(int isolatedNode, List nodes, String... 
args) throws Exception + { // Dropping messages is to check that repair retries messages if needed CLUSTER.filters().allVerbs().to(isolatedNode).drop(); CLUSTER.filters().allVerbs().from(isolatedNode).drop(); List> futures = new ArrayList<>(); - for (int node : nodes) { + for (int node : nodes) + { int n = node; futures.add(executor.submit(() -> nodetoolRepair(n, args))); } @@ -271,13 +305,15 @@ private void repairResolvingInconsistency(int isolatedNode, List nodes, assertAllSuccess(repairConcurrently(nodes, args)); } - private void repairFromNodesSuccess(List nodes, String... args) { + private void repairFromNodesSuccess(List nodes, String... args) + { String[] prArgs = withPR(args); assertAllSuccess(repairConcurrently(nodes, prArgs)); assertAllSuccess(repairConcurrently(nodes, prArgs)); } - private boolean isMigrationInProgress() { + private boolean isMigrationInProgress() + { String ks = ksName; return CLUSTER.get(1).callOnInstance(() -> { ClusterMetadata metadata = ClusterMetadata.current(); @@ -285,7 +321,8 @@ private boolean isMigrationInProgress() { }); } - private boolean isMigrationComplete() { + private boolean isMigrationComplete() + { String ks = ksName; return CLUSTER.get(1).callOnInstance(() -> { ClusterMetadata metadata = ClusterMetadata.current(); @@ -298,7 +335,8 @@ private boolean isMigrationComplete() { * Get the primary token range for a node as [start, end] token values. * With SimpleStrategy RF=3 and 3 nodes, each node has exactly one primary range. */ - private long[] getPrimaryRangeTokens(int node) { + private long[] getPrimaryRangeTokens(int node) + { String ks = ksName; return CLUSTER.get(node).callOnInstance(() -> { var ranges = StorageService.instance.getPrimaryRanges(ks); @@ -314,11 +352,13 @@ private long[] getPrimaryRangeTokens(int node) { /** * Compute which integer keys from [start, start+count) hash into the given token range. 
*/ - private List keysInTokenRange(int start, int count, long rangeStart, long rangeEnd) { + private List keysInTokenRange(int start, int count, long rangeStart, long rangeEnd) + { Range range = new Range<>(new Murmur3Partitioner.LongToken(rangeStart), new Murmur3Partitioner.LongToken(rangeEnd)); List keys = new ArrayList<>(); - for (int i = start; i < start + count; i++) { + for (int i = start; i < start + count; i++) + { Token token = Murmur3Partitioner.instance.getToken(ByteBufferUtil.bytes(i)); if (range.contains(token)) keys.add(i); @@ -326,22 +366,28 @@ private List keysInTokenRange(int start, int count, long rangeStart, lo return keys; } - private String getBroadcastAddress(int node) { + private String getBroadcastAddress(int node) + { return CLUSTER.get(node).callOnInstance(() -> FBUtilities.getBroadcastAddressAndPort().getHostAddressAndPort()); } - private void isolateNode(int nodeToIsolate, int... observerNodes) { + private void isolateNode(int nodeToIsolate, int... observerNodes) + { CLUSTER.filters().allVerbs().from(nodeToIsolate).drop(); CLUSTER.filters().allVerbs().to(nodeToIsolate).drop(); String isolatedAddress = CLUSTER.get(nodeToIsolate).callOnInstance( () -> FBUtilities.getBroadcastAddressAndPort().getHostAddressAndPort()); - for (int observer : observerNodes) { + for (int observer : observerNodes) + { CLUSTER.get(observer).runOnInstance(() -> { - try { + try + { InetAddressAndPort neighbor = InetAddressAndPort.getByName(isolatedAddress); FailureDetector.instance.forceConviction(neighbor); - } catch (UnknownHostException e) { + } + catch (UnknownHostException e) + { throw new RuntimeException(e); } }); @@ -349,7 +395,8 @@ private void isolateNode(int nodeToIsolate, int... 
observerNodes) { } @Test - public void testBasicRepairHappyPath() throws Exception { + public void testBasicRepairHappyPath() throws Exception + { insertDataWithInconsistency("tbl", 0, 100); repairResolvingInconsistency(ksName); @@ -358,7 +405,8 @@ public void testBasicRepairHappyPath() throws Exception { } @Test - public void testRepairSpecificTable() throws Exception { + public void testRepairSpecificTable() throws Exception + { createTable("tbl1"); createTable("tbl2"); @@ -381,7 +429,8 @@ public void testRepairSpecificTable() throws Exception { } @Test - public void testRepairAllTables() throws Exception { + public void testRepairAllTables() throws Exception + { createTable("tbl1"); createTable("tbl2"); createTable("tbl3"); @@ -398,7 +447,8 @@ public void testRepairAllTables() throws Exception { } @Test - public void testForceRepairWithNodeDown() { + public void testForceRepairWithNodeDown() + { insertDataWithInconsistency(3, "tbl", 0, 50); isolateNode(2, 1, 3); @@ -408,8 +458,10 @@ public void testForceRepairWithNodeDown() { repairFromNodesSuccess(liveNodes, ksName, "--force"); - for (int node : liveNodes) { - for (int i = 0; i < 50; i++) { + for (int node : liveNodes) + { + for (int i = 0; i < 50; i++) + { Object[][] results = CLUSTER.get(node).executeInternal( "SELECT k, v FROM " + ksName + ".tbl WHERE k = ?", i); assertEquals("Node " + node + " missing row k=" + i, 1, results.length); @@ -418,7 +470,8 @@ public void testForceRepairWithNodeDown() { } @Test - public void testForceRepairWithAllNodesUp() throws Exception { + public void testForceRepairWithAllNodesUp() throws Exception + { insertDataWithInconsistency("tbl", 0, 50); repairResolvingInconsistency(ksName, "--force"); @@ -427,7 +480,8 @@ public void testForceRepairWithAllNodesUp() throws Exception { } @Test - public void testRepairWithSpecificHosts() { + public void testRepairWithSpecificHosts() + { String addr1 = getBroadcastAddress(1); String addr3 = getBroadcastAddress(3); @@ -445,8 +499,10 @@ 
public void testRepairWithSpecificHosts() { assertAllSuccess(repairConcurrently(liveNodes, args)); assertAllSuccess(repairConcurrently(liveNodes, args)); - for (int node : liveNodes) { - for (int i = 0; i < 50; i++) { + for (int node : liveNodes) + { + for (int i = 0; i < 50; i++) + { Object[][] results = CLUSTER.get(node).executeInternal( "SELECT k, v FROM " + ksName + ".tbl WHERE k = ?", i); assertEquals("Node " + node + " missing row k=" + i, 1, results.length); @@ -455,7 +511,8 @@ public void testRepairWithSpecificHosts() { } @Test - public void testMigrationUntrackedToTrackedCompletesViaRepair() throws Exception { + public void testMigrationUntrackedToTrackedCompletesViaRepair() throws Exception + { setupUntracked(); insertDataWithInconsistency("tbl", 0, 100); @@ -469,17 +526,20 @@ public void testMigrationUntrackedToTrackedCompletesViaRepair() throws Exception } @Test - public void testDataAccessibleDuringMigrationToTracked() throws Exception { + public void testDataAccessibleDuringMigrationToTracked() throws Exception + { setupUntracked(); dataAccessibleDuringMigration(() -> alterKeyspaceToTracked()); } @Test - public void testDataAccessibleDuringMigrationToUntracked() throws Exception { + public void testDataAccessibleDuringMigrationToUntracked() throws Exception + { dataAccessibleDuringMigration(() -> alterKeyspaceToUntracked()); } - private void dataAccessibleDuringMigration(Runnable alterKeyspace) throws Exception { + private void dataAccessibleDuringMigration(Runnable alterKeyspace) throws Exception + { insertDataWithInconsistency("tbl", 0, 50); alterKeyspace.run(); @@ -510,7 +570,8 @@ private void dataAccessibleDuringMigration(Runnable alterKeyspace) throws Except } @Test - public void testMigrationTrackedToUntrackedCompletesViaRepair() throws Exception { + public void testMigrationTrackedToUntrackedCompletesViaRepair() throws Exception + { insertDataWithInconsistency("tbl", 0, 100); alterKeyspaceToUntracked(); @@ -523,18 +584,21 @@ public void 
testMigrationTrackedToUntrackedCompletesViaRepair() throws Exception } @Test - public void testForceRepairWithDeadNodeDoesNotAdvanceMigration() { + public void testForceRepairWithDeadNodeDoesNotAdvanceMigration() + { repairWithDeadNodeDoesNotAdvanceMigration(withPR(ksName, "--force")); } @Test - public void testInHostsRepairWithDeadNodeDoesNotAdvanceMigration() { + public void testInHostsRepairWithDeadNodeDoesNotAdvanceMigration() + { String addr1 = getBroadcastAddress(1); String addr3 = getBroadcastAddress(3); repairWithDeadNodeDoesNotAdvanceMigration(ksName, "--in-hosts", addr1 + ',' + addr3); } - private void repairWithDeadNodeDoesNotAdvanceMigration(String... repairArgs) { + private void repairWithDeadNodeDoesNotAdvanceMigration(String... repairArgs) + { setupUntracked(); insertDataWithInconsistency(3, "tbl", 0, 50); @@ -556,7 +620,8 @@ private void repairWithDeadNodeDoesNotAdvanceMigration(String... repairArgs) { } @Test - public void testInHostsRepairSucceedsWhenSpecifiedHostIsNetworkBlocked() { + public void testInHostsRepairSucceedsWhenSpecifiedHostIsNetworkBlocked() + { String addr1 = getBroadcastAddress(1); String addr3 = getBroadcastAddress(3); @@ -573,7 +638,8 @@ public void testInHostsRepairSucceedsWhenSpecifiedHostIsNetworkBlocked() { } @Test - public void testPreviewRepairDoesNotAdvanceMigration() throws Exception { + public void testPreviewRepairDoesNotAdvanceMigration() throws Exception + { setupUntracked(); insertDataWithInconsistency("tbl", 0, 50); @@ -586,7 +652,8 @@ public void testPreviewRepairDoesNotAdvanceMigration() throws Exception { } @Test - public void testSubrangeRepair() throws Exception { + public void testSubrangeRepair() throws Exception + { long[] primaryRange = getPrimaryRangeTokens(1); String st = Long.toString(primaryRange[0]); String et = Long.toString(primaryRange[1]); @@ -602,7 +669,8 @@ public void testSubrangeRepair() throws Exception { } @Test - public void testSubrangeRepairAdvancesMigrationOnlyForSpecifiedRange() throws 
Exception { + public void testSubrangeRepairAdvancesMigrationOnlyForSpecifiedRange() throws Exception + { setupUntracked(); long[] primaryRange = getPrimaryRangeTokens(1); String st = Long.toString(primaryRange[0]); @@ -631,8 +699,10 @@ public void testSubrangeRepairAdvancesMigrationOnlyForSpecifiedRange() throws Ex Range repairedRange = new Range<>(new Murmur3Partitioner.LongToken(rangeStart), new Murmur3Partitioner.LongToken(rangeEnd)); - for (var entry : info.pendingRangesPerTable.entrySet()) { - for (Range pending : entry.getValue()) { + for (var entry : info.pendingRangesPerTable.entrySet()) + { + for (Range pending : entry.getValue()) + { assertFalse("Repaired range should not overlap with pending ranges for table " + entry.getKey(), repairedRange.intersects(pending)); } @@ -647,7 +717,8 @@ public void testSubrangeRepairAdvancesMigrationOnlyForSpecifiedRange() throws Ex } @Test - public void testRepairRejectsMixedMigratedAndPendingRanges() { + public void testRepairRejectsMixedMigratedAndPendingRanges() + { setupUntracked(); insertData("tbl", 0, 50); @@ -674,7 +745,8 @@ public void testRepairRejectsMixedMigratedAndPendingRanges() { } @Test - public void testRepairTimeout() { + public void testRepairTimeout() + { insertData("tbl", 0, 50); CLUSTER.filters().allVerbs().to(2).drop(); @@ -698,7 +770,8 @@ public void testRepairTimeout() { * split SSTables (rather than just mutating fully-contained ones in place). */ @Test - public void testMigrationSubrangeRepairAntiCompactionSplitsSSTables() throws Exception { + public void testMigrationSubrangeRepairAntiCompactionSplitsSSTables() throws Exception + { setupUntracked(); // Write data and flush so SSTables span the full token ring on each node. 
@@ -726,7 +799,8 @@ public void testMigrationSubrangeRepairAntiCompactionSplitsSSTables() throws Exc } @Test - public void testRepairSyncTimeout() { + public void testRepairSyncTimeout() + { insertDataWithInconsistency("tbl", 0, 50); // Drop only offset broadcasts so MT_SYNC_REQ/RSP can succeed but @@ -739,4 +813,4 @@ public void testRepairSyncTimeout() { assertTrue("Expected sync timeout error but got: " + r.getStderr(), r.getStderr().contains("Mutation tracking sync timed out")); } -} \ No newline at end of file +} From c78704457262c80a1058d4718cd076a233c171fc Mon Sep 17 00:00:00 2001 From: Ariel Weisberg Date: Thu, 2 Apr 2026 16:06:16 -0400 Subject: [PATCH 37/46] Add MutationTrackingMigrationState.isMigrating --- .../cassandra/db/lifecycle/Tracker.java | 3 +-- ...MutationTrackingIncrementalRepairTask.java | 4 +--- .../replication/MutationTrackingService.java | 2 +- .../MutationTrackingMigrationState.java | 5 +++++ .../test/MutationTrackingMigrationTest.java | 16 +++++++------- .../repair/MutationTrackingRepairTest.java | 6 ++---- .../MutationTrackingMigrationStateTest.java | 6 +++--- .../AdvanceMutationTrackingMigrationTest.java | 6 +++--- .../AlterSchemaMutationTrackingTest.java | 21 +++++++++---------- 9 files changed, 33 insertions(+), 36 deletions(-) diff --git a/src/java/org/apache/cassandra/db/lifecycle/Tracker.java b/src/java/org/apache/cassandra/db/lifecycle/Tracker.java index 36b08c1877f8..3ac129d7659f 100644 --- a/src/java/org/apache/cassandra/db/lifecycle/Tracker.java +++ b/src/java/org/apache/cassandra/db/lifecycle/Tracker.java @@ -276,8 +276,7 @@ public void addSSTables(Collection sstables) // Tracked tables may legitimately use this path during migration from untracked to tracked, // when incremental repair streams SSTables that were written before tracking was enabled. 
Preconditions.checkState(!cfstore.metadata().replicationType().isTracked() - || ClusterMetadata.current().mutationTrackingMigrationState - .getKeyspaceInfo(cfstore.metadata().keyspace) != null); + || ClusterMetadata.current().mutationTrackingMigrationState.isMigrating(cfstore.metadata().keyspace)); addSSTablesInternal(sstables, false, true, true); } diff --git a/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java b/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java index 727439f8cccb..4ad9ead3f16f 100644 --- a/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java +++ b/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java @@ -28,7 +28,6 @@ import org.apache.cassandra.dht.Token; import org.apache.cassandra.replication.MutationTrackingSyncCoordinator; import org.apache.cassandra.schema.KeyspaceMetadata; -import org.apache.cassandra.service.replication.migration.KeyspaceMigrationInfo; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.utils.Throwables; import org.apache.cassandra.utils.TimeUUID; @@ -203,7 +202,6 @@ public static boolean shouldUseMutationTrackingRepair(ClusterMetadata metadata, */ public static boolean isMutationTrackingMigrationInProgress(ClusterMetadata metadata, String keyspace) { - KeyspaceMigrationInfo migrationInfo = metadata.mutationTrackingMigrationState.getKeyspaceInfo(keyspace); - return migrationInfo != null; + return metadata.mutationTrackingMigrationState.isMigrating(keyspace); } } diff --git a/src/java/org/apache/cassandra/replication/MutationTrackingService.java b/src/java/org/apache/cassandra/replication/MutationTrackingService.java index c6412a4aaa5e..372a09806a64 100644 --- a/src/java/org/apache/cassandra/replication/MutationTrackingService.java +++ b/src/java/org/apache/cassandra/replication/MutationTrackingService.java @@ -1149,7 +1149,7 @@ private static Set> splitRange(Range range) static 
KeyspaceShards make(KeyspaceMetadata keyspace, ClusterMetadata cluster, LongSupplier logIdProvider, BiConsumer onNewLog) { - Preconditions.checkArgument(keyspace.params.replicationType.isTracked() || cluster.mutationTrackingMigrationState.getKeyspaceInfo(keyspace.name) != null); + Preconditions.checkArgument(keyspace.params.replicationType.isTracked() || cluster.mutationTrackingMigrationState.isMigrating(keyspace.name)); Map, Shard> shards = new HashMap<>(); Map, VersionedEndpoints.ForRange> groups = new HashMap<>(); diff --git a/src/java/org/apache/cassandra/service/replication/migration/MutationTrackingMigrationState.java b/src/java/org/apache/cassandra/service/replication/migration/MutationTrackingMigrationState.java index 6b7850dad9a9..15fa8a6b39b1 100644 --- a/src/java/org/apache/cassandra/service/replication/migration/MutationTrackingMigrationState.java +++ b/src/java/org/apache/cassandra/service/replication/migration/MutationTrackingMigrationState.java @@ -256,6 +256,11 @@ public KeyspaceMigrationInfo getKeyspaceInfo(String keyspace) return keyspaceInfo.get(keyspace); } + public boolean isMigrating(String keyspace) + { + return keyspaceInfo.containsKey(keyspace); + } + public boolean hasMigratingKeyspaces() { return !keyspaceInfo.isEmpty(); diff --git a/test/distributed/org/apache/cassandra/distributed/test/MutationTrackingMigrationTest.java b/test/distributed/org/apache/cassandra/distributed/test/MutationTrackingMigrationTest.java index 51344e893d57..9b1b6f3c0226 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/MutationTrackingMigrationTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/MutationTrackingMigrationTest.java @@ -32,7 +32,6 @@ import org.apache.cassandra.distributed.api.ICoordinator; import org.apache.cassandra.replication.MutationJournal; import org.apache.cassandra.schema.KeyspaceMetadata; -import org.apache.cassandra.service.replication.migration.KeyspaceMigrationInfo; import 
org.apache.cassandra.service.replication.migration.MutationTrackingMigrationState; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.ClusterMetadataService; @@ -41,8 +40,7 @@ import static java.lang.String.format; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; /** @@ -136,31 +134,31 @@ private void verifyKeyspaceState(String keyspace, ExpectedKeyspaceState expected ClusterMetadata metadata = ClusterMetadata.current(); KeyspaceMetadata ksm = expectedState != ExpectedKeyspaceState.DROPPED ? metadata.schema.getKeyspaceMetadata(keyspace) : null; MutationTrackingMigrationState migrationState = metadata.mutationTrackingMigrationState; - KeyspaceMigrationInfo migrationInfo = migrationState.getKeyspaceInfo(keyspace); + boolean migrating = migrationState.isMigrating(keyspace); switch (expectedState) { case UNTRACKED: assertTrue(!ksm.params.replicationType.isTracked()); - assertNull(migrationInfo); + assertFalse(migrating); break; case MIGRATING_TO_TRACKED: assertTrue(ksm.params.replicationType.isTracked()); - assertNotNull(migrationInfo); + assertTrue(migrating); break; case MIGRATING_TO_UNTRACKED: assertTrue(!ksm.params.replicationType.isTracked()); - assertNotNull(migrationInfo); + assertTrue(migrating); break; case TRACKED: assertTrue(ksm.params.replicationType.isTracked()); - assertNull(migrationInfo); + assertFalse(migrating); break; case DROPPED: - assertNull(migrationInfo); + assertFalse(migrating); break; default: throw new AssertionError("Unexpected state: " + expectedState); diff --git a/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingRepairTest.java b/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingRepairTest.java index 55c7aa9220fe..9600b244cfcc 100644 --- 
a/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingRepairTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingRepairTest.java @@ -326,8 +326,7 @@ private boolean isMigrationComplete() String ks = ksName; return CLUSTER.get(1).callOnInstance(() -> { ClusterMetadata metadata = ClusterMetadata.current(); - KeyspaceMigrationInfo info = metadata.mutationTrackingMigrationState.getKeyspaceInfo(ks); - return info == null; + return !metadata.mutationTrackingMigrationState.isMigrating(ks); }); } @@ -614,8 +613,7 @@ private void repairWithDeadNodeDoesNotAdvanceMigration(String... repairArgs) assertTrue("Migration should not advance with dead nodes excluded", CLUSTER.get(1).callOnInstance(() -> { ClusterMetadata metadata = ClusterMetadata.current(); - KeyspaceMigrationInfo info = metadata.mutationTrackingMigrationState.getKeyspaceInfo(ks); - return info != null; + return metadata.mutationTrackingMigrationState.isMigrating(ks); })); } diff --git a/test/unit/org/apache/cassandra/service/replication/migration/MutationTrackingMigrationStateTest.java b/test/unit/org/apache/cassandra/service/replication/migration/MutationTrackingMigrationStateTest.java index dc27a3792c6d..36d77d46ac89 100644 --- a/test/unit/org/apache/cassandra/service/replication/migration/MutationTrackingMigrationStateTest.java +++ b/test/unit/org/apache/cassandra/service/replication/migration/MutationTrackingMigrationStateTest.java @@ -153,7 +153,7 @@ public void testWithMigrationsCompleted() assertNotSame(state, completed); assertFalse(completed.hasMigratingKeyspaces()); - assertNull(completed.getKeyspaceInfo("test_ks")); + assertFalse(completed.isMigrating("test_ks")); } @Test @@ -169,7 +169,7 @@ public void testWithMigrationsRemoved() MutationTrackingMigrationState removed = state.dropKeyspaces(epoch, Collections.singleton("test_ks")); assertFalse(removed.hasMigratingKeyspaces()); - assertNull(removed.getKeyspaceInfo("test_ks")); + 
assertFalse(removed.isMigrating("test_ks")); } @Test @@ -253,7 +253,7 @@ public void testMultipleKeyspaces() state = state.withRangesRepairedForTable("ks1", testTableId, Collections.singleton(fullRing), epoch); assertEquals(1, state.keyspaceInfo.size()); - assertNull(state.getKeyspaceInfo("ks1")); + assertFalse(state.isMigrating("ks1")); // ks2 should still have full ring pending KeyspaceMigrationInfo expectedKs2AfterKs1Complete = createExpectedKeyspaceMigrationInfo( diff --git a/test/unit/org/apache/cassandra/tcm/transformations/AdvanceMutationTrackingMigrationTest.java b/test/unit/org/apache/cassandra/tcm/transformations/AdvanceMutationTrackingMigrationTest.java index c0289e8fc27d..bd6a1db4c2cf 100644 --- a/test/unit/org/apache/cassandra/tcm/transformations/AdvanceMutationTrackingMigrationTest.java +++ b/test/unit/org/apache/cassandra/tcm/transformations/AdvanceMutationTrackingMigrationTest.java @@ -139,7 +139,7 @@ public void testAdvanceRangesCompleteMigration() // Verify migration was auto-completed (keyspace removed from state) assertFalse(updated.mutationTrackingMigrationState.hasMigratingKeyspaces()); - assertNull(updated.mutationTrackingMigrationState.getKeyspaceInfo("test_ks")); + assertFalse(updated.mutationTrackingMigrationState.isMigrating("test_ks")); } @Test @@ -240,7 +240,7 @@ public void testAdvancePartialThenComplete() assertTrue(result2.isSuccess()); ClusterMetadata afterComplete = result2.success().metadata; - assertNull(afterComplete.mutationTrackingMigrationState.getKeyspaceInfo("test_ks")); + assertFalse(afterComplete.mutationTrackingMigrationState.isMigrating("test_ks")); assertFalse(afterComplete.mutationTrackingMigrationState.hasMigratingKeyspaces()); } @@ -285,7 +285,7 @@ public void testAdvanceMultipleTables() assertTrue(result2.isSuccess()); ClusterMetadata afterTable2 = result2.success().metadata; - assertNull(afterTable2.mutationTrackingMigrationState.getKeyspaceInfo("test_ks")); + 
assertFalse(afterTable2.mutationTrackingMigrationState.isMigrating("test_ks")); assertFalse(afterTable2.mutationTrackingMigrationState.hasMigratingKeyspaces()); } diff --git a/test/unit/org/apache/cassandra/tcm/transformations/AlterSchemaMutationTrackingTest.java b/test/unit/org/apache/cassandra/tcm/transformations/AlterSchemaMutationTrackingTest.java index a82b516d7714..95e8b5371906 100644 --- a/test/unit/org/apache/cassandra/tcm/transformations/AlterSchemaMutationTrackingTest.java +++ b/test/unit/org/apache/cassandra/tcm/transformations/AlterSchemaMutationTrackingTest.java @@ -42,8 +42,7 @@ import static org.apache.cassandra.cql3.CQLTester.schemaChange; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; /** * Tests for AlterSchema auto-starting mutation tracking migration when replication type changes. @@ -80,7 +79,7 @@ public void testAutoStartToTrackedMigration() throws Throwable schemaChange(String.format("CREATE TABLE %s.tbl (pk int PRIMARY KEY, val int)", ksName)); ClusterMetadata metadata = ClusterMetadata.current(); - assertNull(metadata.mutationTrackingMigrationState.getKeyspaceInfo(ksName)); + assertFalse(metadata.mutationTrackingMigrationState.isMigrating(ksName)); // Alter tracked replication schemaChange(String.format("ALTER KEYSPACE %s WITH replication_type = 'tracked'", ksName)); @@ -116,7 +115,7 @@ public void testAutoStartToUntrackedMigration() throws Throwable schemaChange(String.format("CREATE TABLE %s.tbl (pk int PRIMARY KEY, val int)", ksName)); ClusterMetadata metadata = ClusterMetadata.current(); - assertNull(metadata.mutationTrackingMigrationState.getKeyspaceInfo(ksName)); + assertFalse(metadata.mutationTrackingMigrationState.isMigrating(ksName)); // Alter keyspace to untracked schemaChange(String.format("ALTER KEYSPACE %s WITH replication_type = 'untracked'", ksName)); 
@@ -149,7 +148,7 @@ public void testNoMigrationWhenReplicationTypeUnchanged() throws Throwable "AND replication_type = 'untracked'" ); schemaChange(String.format("CREATE TABLE %s.tbl (pk int PRIMARY KEY, val int)", ksName)); - assertNull(ClusterMetadata.current().mutationTrackingMigrationState.getKeyspaceInfo(ksName)); + assertFalse(ClusterMetadata.current().mutationTrackingMigrationState.isMigrating(ksName)); // Alter keyspace without changing replication type schemaChange(String.format( @@ -158,7 +157,7 @@ public void testNoMigrationWhenReplicationTypeUnchanged() throws Throwable )); // confirm no migrations were started - assertNull(ClusterMetadata.current().mutationTrackingMigrationState.getKeyspaceInfo(ksName)); + assertFalse(ClusterMetadata.current().mutationTrackingMigrationState.isMigrating(ksName)); } @Test @@ -221,8 +220,8 @@ public void testReverseMigrationDirection() throws Throwable schemaChange(String.format("CREATE TABLE %s.tbl (pk int PRIMARY KEY, val int)", ksName)); ClusterMetadata metadata = ClusterMetadata.current(); - assertNull("Should have no migration before first alter for " + ksName, - metadata.mutationTrackingMigrationState.getKeyspaceInfo(ksName)); + assertFalse("Should have no migration before first alter for " + ksName, + metadata.mutationTrackingMigrationState.isMigrating(ksName)); // Alter to tracked (untracked → tracked) schemaChange(String.format("ALTER KEYSPACE %s WITH replication_type = 'tracked'", ksName)); @@ -249,7 +248,7 @@ public void testReverseMigrationDirection() throws Throwable // this should auto-complete the migration, since none of the ranges from the initial alter completed migration metadata = ClusterMetadata.current(); - assertNull(metadata.mutationTrackingMigrationState.getKeyspaceInfo(ksName)); + assertFalse(metadata.mutationTrackingMigrationState.isMigrating(ksName)); // Alter back to tracked again schemaChange(String.format("ALTER KEYSPACE %s WITH replication_type = 'tracked'", ksName)); @@ -283,13 +282,13 @@ 
public void testDropKeyspaceDuringMigration() throws Throwable schemaChange(String.format("ALTER KEYSPACE %s WITH replication_type = 'tracked'", ksName)); ClusterMetadata metadata = ClusterMetadata.current(); - assertNotNull(metadata.mutationTrackingMigrationState.getKeyspaceInfo(ksName)); + assertTrue(metadata.mutationTrackingMigrationState.isMigrating(ksName)); // Drop the keyspace & confirm migration is also removed schemaChange(String.format("DROP KEYSPACE %s", ksName)); ClusterMetadata afterDrop = ClusterMetadata.current(); - assertNull(afterDrop.mutationTrackingMigrationState.getKeyspaceInfo(ksName)); + assertFalse(afterDrop.mutationTrackingMigrationState.isMigrating(ksName)); } @Test From 3331b668a5ed467d837c3d1ee30cdbf32bac17bc Mon Sep 17 00:00:00 2001 From: Ariel Weisberg Date: Thu, 2 Apr 2026 16:08:59 -0400 Subject: [PATCH 38/46] SSTableWriter nit to simplify --- .../cassandra/io/sstable/format/SSTableWriter.java | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java index 3246bb8d1223..dd833965796a 100644 --- a/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java +++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java @@ -347,15 +347,11 @@ protected Map finalizeMetadata() // For SSTables whose range falls within pending migration ranges, IR sets pendingRepair/repairedAt. 
if (metadata().replicationType().isTracked() && repairedAt == ActiveRepairService.UNREPAIRED_SSTABLE && reconcile) { - boolean inMigrationPendingRange = false; - KeyspaceMigrationInfo migrationInfo = ClusterMetadata.current().mutationTrackingMigrationState - .getKeyspaceInfo(metadata().keyspace); - if (migrationInfo != null) - { - inMigrationPendingRange = migrationInfo.isRangeInPendingMigration(metadata().id, - first.getToken(), - last.getToken()); - } + KeyspaceMigrationInfo migrationInfo = ClusterMetadata.current().mutationTrackingMigrationState.getKeyspaceInfo(metadata().keyspace); + boolean inMigrationPendingRange = migrationInfo != null + && migrationInfo.isRangeInPendingMigration(metadata().id, + first.getToken(), + last.getToken()); if (!inMigrationPendingRange) { From 0d18d4b7b5f4785a3bc68d8f1ce684ff6edc52af Mon Sep 17 00:00:00 2001 From: Ariel Weisberg Date: Thu, 2 Apr 2026 16:13:25 -0400 Subject: [PATCH 39/46] MutationTrackingSyncRequest javadoc formatting --- .../cassandra/repair/messages/MutationTrackingSyncRequest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/java/org/apache/cassandra/repair/messages/MutationTrackingSyncRequest.java b/src/java/org/apache/cassandra/repair/messages/MutationTrackingSyncRequest.java index dab889580a3f..b6c2bca86bcb 100644 --- a/src/java/org/apache/cassandra/repair/messages/MutationTrackingSyncRequest.java +++ b/src/java/org/apache/cassandra/repair/messages/MutationTrackingSyncRequest.java @@ -32,7 +32,7 @@ * their current witnessed offsets. This establishes a happens-before relationship: the * participant's response contains offsets captured after receiving this request, which is * sent after the repair starts. - * + *

* The liveHostIds set tells the responder which hosts are participating in this repair, * so that the response only includes offsets witnessed by those hosts. This prevents the * coordinator from setting sync targets that include offsets only known to down nodes. From 2ebab7e8f86c1827b973480d0b35131c9ab7fb91 Mon Sep 17 00:00:00 2001 From: Ariel Weisberg Date: Thu, 2 Apr 2026 16:31:34 -0400 Subject: [PATCH 40/46] MutationTrackingIncrementalRepairTask Convert empty ranges handling to checkState since it shouldn't actually happen --- .../repair/MutationTrackingIncrementalRepairTask.java | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java b/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java index 4ad9ead3f16f..b0b8c2cad6e8 100644 --- a/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java +++ b/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java @@ -22,6 +22,8 @@ import java.util.List; import java.util.concurrent.TimeUnit; +import com.google.common.base.Preconditions; + import org.apache.cassandra.concurrent.ExecutorPlus; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.dht.Range; @@ -34,6 +36,8 @@ import org.apache.cassandra.utils.concurrent.AsyncPromise; import org.apache.cassandra.utils.concurrent.Future; +import static com.google.common.base.Preconditions.checkState; + /** Repair task that syncs mutation tracking offsets across replicas */ public class MutationTrackingIncrementalRepairTask extends AbstractRepairTask { @@ -65,12 +69,7 @@ public String name() public Future performUnsafe(ExecutorPlus executor, Scheduler validationScheduler) { List allRanges = neighborsAndRanges.filterCommonRanges(keyspace, cfnames); - - if (allRanges.isEmpty()) - { - logger.info("No common ranges to repair for keyspace {}", keyspace); - return new 
AsyncPromise().setSuccess(CoordinatedRepairResult.create(List.of(), List.of())); - } + checkState(!allRanges.isEmpty(), "No ranges to repair"); List syncCoordinators = new ArrayList<>(); List>> rangeCollections = new ArrayList<>(); From 294c0247f050ccaccb37f05d891858fd0e7e166d Mon Sep 17 00:00:00 2001 From: Ariel Weisberg Date: Mon, 6 Apr 2026 13:36:28 -0400 Subject: [PATCH 41/46] Clean up error handling in MutationTrackingSyncCoordinator/MutationTrackingIncrementalRepairTask and improve test coverage --- ...MutationTrackingIncrementalRepairTask.java | 57 +++++++++++-------- .../MutationTrackingSyncCoordinator.java | 16 +----- .../repair/MutationTrackingRepairTest.java | 42 +++++++++++++- .../MutationTrackingSyncCoordinatorTest.java | 22 +++---- 4 files changed, 83 insertions(+), 54 deletions(-) diff --git a/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java b/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java index b0b8c2cad6e8..8e75ca1f2575 100644 --- a/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java +++ b/src/java/org/apache/cassandra/repair/MutationTrackingIncrementalRepairTask.java @@ -21,8 +21,7 @@ import java.util.Collection; import java.util.List; import java.util.concurrent.TimeUnit; - -import com.google.common.base.Preconditions; +import java.util.concurrent.TimeoutException; import org.apache.cassandra.concurrent.ExecutorPlus; import org.apache.cassandra.config.DatabaseDescriptor; @@ -80,8 +79,8 @@ public Future performUnsafe(ExecutorPlus executor, Sche { RepairJobDesc desc = new RepairJobDesc(parentSession, TimeUUID.Generator.nextTimeUUID(), keyspace, "Mutation Tracking Sync", List.of(range)); - MutationTrackingSyncCoordinator syncCoordinator = new MutationTrackingSyncCoordinator( - coordinator.ctx, desc, commonRange.endpoints, metadata); + MutationTrackingSyncCoordinator syncCoordinator = + new MutationTrackingSyncCoordinator(coordinator.ctx, desc, 
commonRange.endpoints, metadata); syncCoordinator.start(); syncCoordinators.add(syncCoordinator); rangeCollections.add(List.of(range)); @@ -99,6 +98,17 @@ public Future performUnsafe(ExecutorPlus executor, Sche { waitForSyncCompletion(syncCoordinators, rangeCollections, resultPromise); } + catch (InterruptedException e) + { + try + { + resultPromise.tryFailure(new RuntimeException("Interrupted waiting for Mutation Tracking sync coordinators to finish", e)); + } + finally + { + Thread.currentThread().interrupt(); + } + } catch (Exception e) { logger.error("Error during mutation tracking repair", e); @@ -111,44 +121,43 @@ public Future performUnsafe(ExecutorPlus executor, Sche private void waitForSyncCompletion(List syncCoordinators, List>> rangeCollections, - AsyncPromise resultPromise) throws InterruptedException + AsyncPromise resultPromise) throws Exception { long deadlineNanos = coordinator.ctx.clock().nanoTime() + TimeUnit.MILLISECONDS.toNanos( DatabaseDescriptor.getMutationTrackingSyncTimeout(TimeUnit.MILLISECONDS)); - boolean allSucceeded = true; - Throwable error = null; + Exception error = null; for (MutationTrackingSyncCoordinator syncCoordinator : syncCoordinators) { long remainingNanos = deadlineNanos - coordinator.ctx.clock().nanoTime(); try { - if (!syncCoordinator.awaitCompletion(remainingNanos, TimeUnit.NANOSECONDS)) - allSucceeded = false; + syncCoordinator.awaitCompletion(remainingNanos, TimeUnit.NANOSECONDS); } - catch (RuntimeException e) + catch (InterruptedException e) { - allSucceeded = false; - error = Throwables.merge(error, e); + try + { + syncCoordinators.forEach(MutationTrackingSyncCoordinator::cancel); + } + finally + { + throw e; + } } - finally + catch (TimeoutException e) { - if (!allSucceeded) - syncCoordinator.cancel(); + error = Throwables.merge(error, new RuntimeException("Mutation tracking sync timed out", e)); + } + catch (Exception e) + { + error = Throwables.merge(error, e); } } if (error != null) { logger.warn("Mutation 
tracking sync failed for keyspace {}", keyspace, error); - resultPromise.tryFailure(error); - return; - } - - if (!allSucceeded) - { - logger.warn("Mutation tracking sync timed out for keyspace {}", keyspace); - resultPromise.tryFailure(new RuntimeException("Mutation tracking sync timed out for some ranges")); - return; + throw error; } coordinator.notifyProgress("Mutation tracking sync completed for all ranges"); diff --git a/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java b/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java index febec73b4b47..52ac17031739 100644 --- a/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java +++ b/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java @@ -344,21 +344,9 @@ public Range getRange() * @param unit Time unit * @return true if completed, false if timed out */ - public boolean awaitCompletion(long timeout, TimeUnit unit) throws InterruptedException + public void awaitCompletion(long timeout, TimeUnit unit) throws Exception { - try - { - completionFuture.get(timeout, unit); - return true; - } - catch (java.util.concurrent.TimeoutException e) - { - return false; - } - catch (java.util.concurrent.ExecutionException e) - { - throw new RuntimeException(e); - } + completionFuture.get(timeout, unit); } public void cancel() diff --git a/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingRepairTest.java b/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingRepairTest.java index 9600b244cfcc..7377920e1e67 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingRepairTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingRepairTest.java @@ -26,6 +26,7 @@ import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; +import 
java.util.function.BiPredicate; import org.junit.After; import org.junit.AfterClass; @@ -37,6 +38,7 @@ import org.apache.cassandra.distributed.api.ConsistencyLevel; import org.apache.cassandra.distributed.api.Feature; +import org.apache.cassandra.distributed.api.IMessageFilters; import org.apache.cassandra.distributed.api.NodeToolResult; import org.apache.cassandra.distributed.test.TestBaseImpl; import org.apache.cassandra.gms.EndpointState; @@ -55,6 +57,7 @@ import org.apache.cassandra.utils.FBUtilities; import static java.util.function.Predicate.not; +import static org.apache.cassandra.distributed.api.IMessageFilters.Matcher.of; import static org.junit.Assert.*; /** @@ -747,14 +750,47 @@ public void testRepairTimeout() { insertData("tbl", 0, 50); - CLUSTER.filters().allVerbs().to(2).drop(); - CLUSTER.filters().allVerbs().from(2).drop(); + CLUSTER.filters().verbs(Verb.MT_SYNC_REQ.id).to(2).drop(); + CLUSTER.filters().verbs(Verb.MT_SYNC_REQ.id).from(2).drop(); List results = repairConcurrently(ALL_NODES, withPR(ksName)); assertAllFailure(results); for (NodeToolResult r : results) assertTrue("Expected timeout error but got: " + r.getStderr(), - r.getStderr().contains("Did not get replies from all endpoints")); + r.getStderr().contains("Mutation tracking sync timed out")); + } + + /** + * Exercises the onFailure callback in MutationTrackingSyncCoordinator.sendSyncRequests(). + * Unlike testRepairTimeout (which drops MT_SYNC_REQ entirely so the request times out), + * this test makes the remote handler throw an exception, which sends a FAILURE_RSP back + * to the coordinator, triggering the onFailure -> fail() path. + */ + @Test + public void testSyncFailureResponse() + { + insertData("tbl", 0, 50); + + // The matcher throwing causes uncaught exceptions on the receiving nodes' stage threads. + // These are expected, so filter them out to avoid failing at cluster close. 
+ CLUSTER.setUncaughtExceptionsFilter((nodeNum, throwable) -> + throwable.getMessage() != null && throwable.getMessage().contains("sync failure injected")); + try + { + CLUSTER.verbs(Verb.MT_SYNC_REQ).messagesMatching(of(m -> { + throw new RuntimeException("sync failure injected"); + })).drop(); + + List results = repairConcurrently(ALL_NODES, withPR(ksName)); + assertAllFailure(results); + for (NodeToolResult r : results) + assertTrue("Expected sync failure error but got: " + r.getStderr(), + r.getStderr().contains("Mutation tracking sync failed")); + } + finally + { + CLUSTER.setUncaughtExceptionsFilter((BiPredicate) null); + } } /** diff --git a/test/distributed/org/apache/cassandra/distributed/test/replication/MutationTrackingSyncCoordinatorTest.java b/test/distributed/org/apache/cassandra/distributed/test/replication/MutationTrackingSyncCoordinatorTest.java index 1b9a2b206574..ef2581d300f2 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/replication/MutationTrackingSyncCoordinatorTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/replication/MutationTrackingSyncCoordinatorTest.java @@ -98,11 +98,11 @@ public void testSyncCoordinatorCompletesWhenNoShards() throws Throwable try { - return coordinator.awaitCompletion(5, TimeUnit.SECONDS); + coordinator.awaitCompletion(5, TimeUnit.SECONDS); + return true; } - catch (InterruptedException e) + catch (Exception e) { - Thread.currentThread().interrupt(); return false; } }); @@ -154,11 +154,11 @@ public void testSyncCoordinatorWaitsForAllReplicasMutations() throws Throwable try { - return coordinator.awaitCompletion(30, TimeUnit.SECONDS); + coordinator.awaitCompletion(30, TimeUnit.SECONDS); + return true; } - catch (InterruptedException e) + catch (Exception e) { - Thread.currentThread().interrupt(); return false; } })); @@ -271,14 +271,10 @@ public void testSyncCoordinatorCancel() throws Throwable coordinator.awaitCompletion(1, TimeUnit.SECONDS); return false; // Should have 
thrown } - catch (InterruptedException e) - { - Thread.currentThread().interrupt(); - return false; - } - catch (RuntimeException e) + catch (Exception e) { - return e.getMessage() != null && e.getMessage().contains("cancelled"); + Throwable cause = e.getCause() != null ? e.getCause() : e; + return cause.getMessage() != null && cause.getMessage().contains("cancelled"); } }); assertTrue("Sync coordinator should be cancelled", wasCancelled); From aa78ecf702d90be53e195fc7c20921a1f687f612 Mon Sep 17 00:00:00 2001 From: Ariel Weisberg Date: Mon, 6 Apr 2026 13:51:16 -0400 Subject: [PATCH 42/46] Review feedback reduce duplication --- .../cassandra/repair/RepairCoordinator.java | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/src/java/org/apache/cassandra/repair/RepairCoordinator.java b/src/java/org/apache/cassandra/repair/RepairCoordinator.java index 7a434553d9ee..25b1096c2cf0 100644 --- a/src/java/org/apache/cassandra/repair/RepairCoordinator.java +++ b/src/java/org/apache/cassandra/repair/RepairCoordinator.java @@ -558,9 +558,7 @@ private Future>> repair(String[] if (state.options.isPreview()) { RepairTask task = new PreviewRepairTask(this, state.id, neighborsAndRanges.filterCommonRanges(state.keyspace, cfnames), neighborsAndRanges.shouldExcludeDeadParticipants, cfnames); - return task.perform(executor, validationScheduler) - .>>map(r -> Pair.create(r, task::successMessage)) - .addCallback((s, f) -> executor.shutdown()); + return submitRepairTask(task, executor); } else if (useMutationTracking) { @@ -588,26 +586,27 @@ else if (useMutationTracking) ); return result.addCallback((s, f) -> executor.shutdown()); } - return mtTask.perform(executor, validationScheduler) - .>>map(r -> Pair.create(r, mtTask::successMessage)) - .addCallback((s, f) -> executor.shutdown()); + return submitRepairTask(mtTask, executor); } else if (state.options.isIncremental()) { RepairTask task = new IncrementalRepairTask(this, state.id, neighborsAndRanges, 
cfnames); - return task.perform(executor, validationScheduler) - .>>map(r -> Pair.create(r, task::successMessage)) - .addCallback((s, f) -> executor.shutdown()); + return submitRepairTask(task, executor); } else { RepairTask task = new NormalRepairTask(this, state.id, neighborsAndRanges.filterCommonRanges(state.keyspace, cfnames), neighborsAndRanges.shouldExcludeDeadParticipants, cfnames); - return task.perform(executor, validationScheduler) - .>>map(r -> Pair.create(r, task::successMessage)) - .addCallback((s, f) -> executor.shutdown()); + return submitRepairTask(task, executor); } } + private Future>> submitRepairTask(RepairTask task, ExecutorPlus executor) + { + return task.perform(executor, validationScheduler) + .>>map(r -> Pair.create(r, task::successMessage)) + .addCallback((s, f) -> executor.shutdown()); + } + private ExecutorPlus createExecutor() { return ctx.executorFactory() From 46ab1519bdd0c98d3277ba3d65927703e18fe744 Mon Sep 17 00:00:00 2001 From: Ariel Weisberg Date: Mon, 6 Apr 2026 14:24:44 -0400 Subject: [PATCH 43/46] Most of MutationTrackingSyncCoordinatorFeedback --- .../MutationTrackingSyncCoordinator.java | 45 ++++++++++--------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java b/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java index 52ac17031739..3f768bd9e94f 100644 --- a/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java +++ b/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java @@ -19,7 +19,6 @@ package org.apache.cassandra.replication; import java.util.ArrayList; -import java.util.HashSet; import java.util.List; import java.util.HashMap; import java.util.Map; @@ -31,6 +30,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.google.common.collect.ImmutableSet; import com.google.common.collect.Iterables; import org.apache.cassandra.dht.Range; @@ 
-48,6 +48,8 @@ import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.utils.concurrent.AsyncPromise; +import static com.google.common.base.Preconditions.checkState; + public class MutationTrackingSyncCoordinator { private static final Logger logger = LoggerFactory.getLogger(MutationTrackingSyncCoordinator.class); @@ -57,7 +59,6 @@ public class MutationTrackingSyncCoordinator private final String keyspace; private final Range range; private final Set participants; - private final ClusterMetadata metadata; private final AsyncPromise completionFuture = new AsyncPromise<>(); // Per-shard state: tracks what each node has reported for that shard @@ -65,7 +66,7 @@ public class MutationTrackingSyncCoordinator // Host IDs of participants for scoped offset collection/completion. // Null means all shard participants (no filtering). - private Set liveHostIds; + private final Set liveHostIds; private final AtomicBoolean started = new AtomicBoolean(false); @@ -90,7 +91,24 @@ public MutationTrackingSyncCoordinator(SharedContext ctx, RepairJobDesc desc, Se this.keyspace = desc.keyspace; this.range = Iterables.getOnlyElement(desc.ranges); this.participants = participants; - this.metadata = metadata; + + // Convert participant endpoints to host IDs for scoped completion checks. + // If participants is null (no filtering), all shard participants are live. 
+ if (participants != null) + { + ImmutableSet.Builder builder = ImmutableSet.builder(); + for (InetAddressAndPort ep : participants) + { + builder.add(metadata.directory.peerId(ep).id()); + } + // Always include the local node + builder.add(metadata.directory.peerId(ctx.broadcastAddressAndPort()).id()); + liveHostIds = builder.build(); + } + else + { + liveHostIds = null; + } } public void start() @@ -104,24 +122,7 @@ public void start() overlappingShards.add(shard); }); - if (overlappingShards.isEmpty()) - { - completionFuture.setSuccess(null); - return; - } - - // Convert participant endpoints to host IDs for scoped completion checks. - // If participants is null (no filtering), all shard participants are live. - if (participants != null) - { - liveHostIds = new HashSet<>(); - for (InetAddressAndPort ep : participants) - { - liveHostIds.add(metadata.directory.peerId(ep).id()); - } - // Always include the local node - liveHostIds.add(metadata.directory.peerId(ctx.broadcastAddressAndPort()).id()); - } + checkState(!overlappingShards.isEmpty(), "No intersecting shards found for keyspace {} range {}", keyspace, range); for (Shard shard : overlappingShards) { From 04d75e73f00e66180a9b636430f38c1b455f42f2 Mon Sep 17 00:00:00 2001 From: Ariel Weisberg Date: Mon, 6 Apr 2026 14:37:47 -0400 Subject: [PATCH 44/46] Add test case for topology changes causing failure --- .../repair/MutationTrackingRepairTest.java | 67 +++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingRepairTest.java b/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingRepairTest.java index 7377920e1e67..4fbce7c16a06 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingRepairTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/repair/MutationTrackingRepairTest.java @@ -21,6 +21,7 @@ import java.net.UnknownHostException; import 
java.util.ArrayList; import java.util.List; +import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; @@ -847,4 +848,70 @@ public void testRepairSyncTimeout() assertTrue("Expected sync timeout error but got: " + r.getStderr(), r.getStderr().contains("Mutation tracking sync timed out")); } + + /** + * Verifies that a topology change during an active mutation tracking sync causes + * the repair to fail with "topology changed during sync". + * + * The strategy: + * 1. Insert data so sync has work to do + * 2. Drop BROADCAST_LOG_OFFSETS so the sync coordinator stays alive waiting for + * offset reconciliation + * 3. Start repair in a background thread + * 4. Wait until the sync request has been sent (confirming sync is active) + * 5. ALTER KEYSPACE to change RF (3 -> 2), which triggers REPLICA_GROUP -> + * withUpdatedMetadata -> new Shard instances + * 6. Turn off the BROADCAST_LOG_OFFSETS filter so offset broadcasts resume, + * triggering onOffsetsReceived -> recaptureTargets -> checkForTopologyChange + * which detects the identity mismatch and fails the repair + * 7. 
Assert the repair failed with the expected topology change message + */ + @Test + public void testRepairFailsOnTopologyChange() throws Exception + { + insertData("tbl", 0, 50); + + // Block offset broadcasts so the sync coordinator stays alive waiting + IMessageFilters.Filter offsetFilter = CLUSTER.filters().verbs(Verb.BROADCAST_LOG_OFFSETS.id).drop(); + + // Use a latch to detect when the sync request has been sent, meaning + // the sync coordinator is active and tracking shard references + CountDownLatch syncStarted = new CountDownLatch(1); + IMessageFilters.Filter syncObserver = CLUSTER.verbs(Verb.MT_SYNC_REQ).messagesMatching( + (from, to, msg) -> { + syncStarted.countDown(); + return false; // don't drop the message + }).drop(); + + // Start repair in background + Future repairFuture = executor.submit(() -> nodetoolRepair(1, withPR(ksName))); + + // Wait until sync is active. The latch fires when MT_SYNC_REQ is sent, which + // happens after shardStates is fully populated in start(), so no additional + // delay is needed. + assertTrue("Timed out waiting for sync to start", + syncStarted.await(30, TimeUnit.SECONDS)); + + // ALTER KEYSPACE to change RF from 3 to 2 — this changes the participants for + // every range, triggering REPLICA_GROUP -> withUpdatedMetadata -> new Shard instances. + // The sync coordinator's shardStates still holds references to the old Shard objects. + CLUSTER.schemaChange("ALTER KEYSPACE " + ksName + " WITH replication = " + + "{'class': 'SimpleStrategy', 'replication_factor': 2} " + + "AND replication_type='tracked'"); + + // Remove the sync observer since it's no longer needed + syncObserver.off(); + + // Turn off the offset broadcast filter so broadcasts resume. 
When an offset + // broadcast arrives, it calls onOffsetsReceived -> recaptureTargets -> + // checkForTopologyChange, which will detect that the current Shard instances + // (new objects from withUpdatedMetadata) differ from the ones stored in + // shardStates (reference equality check), and fail the repair. + offsetFilter.off(); + + NodeToolResult result = repairFuture.get(30, TimeUnit.SECONDS); + result.asserts().failure(); + assertTrue("Expected topology change error but got: " + result.getStderr(), + result.getStderr().contains("topology changed during sync")); + } } From 069a449068123b1b6aab1bcc738c06f1cf52ae02 Mon Sep 17 00:00:00 2001 From: Ariel Weisberg Date: Mon, 6 Apr 2026 15:02:44 -0400 Subject: [PATCH 45/46] More feedback/cleanup for MutationSyncCoordinator --- .../MutationTrackingSyncCoordinator.java | 29 +++++++++---------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java b/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java index 3f768bd9e94f..94bf30da0d6b 100644 --- a/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java +++ b/src/java/org/apache/cassandra/replication/MutationTrackingSyncCoordinator.java @@ -134,7 +134,7 @@ public void start() MutationTrackingService.instance().registerSyncCoordinator(this); // Capture local targets - recaptureTargets(); + captureTargets(); logger.info("Sync coordinator started for keyspace {} range {}, tracking {} shards", keyspace, range, overlappingShards.size()); @@ -200,10 +200,10 @@ public boolean invokeOnFailure() } } - private void recaptureTargets() + private void captureTargets() { - if (checkForTopologyChange()) - return; + checkState(!completionFuture.isDone()); + checkForTopologyChange(); for (ShardSyncState state : shardStates.values()) { @@ -213,20 +213,19 @@ private void recaptureTargets() /** * Checks if any of the shards we're tracking have changed due to 
topology updates. - * @return true if topology changed (and repair was failed), false if all shards are still current + * If a change is detected, fails the repair via {@link #fail(Throwable)}. */ - private boolean checkForTopologyChange() + private void checkForTopologyChange() { for (ShardSyncState state : shardStates.values()) { Shard currentShard = getCurrentShard(state.shard.range); if (currentShard != state.shard) { - failWithTopologyChange(); - return true; + fail(new RuntimeException("Repair failed: topology changed during sync")); + return; } } - return false; } private Shard getCurrentShard(Range shardRange) @@ -239,11 +238,6 @@ private Shard getCurrentShard(Range shardRange) return result[0]; } - private void failWithTopologyChange() - { - fail(new RuntimeException("Repair failed: topology changed during sync")); - } - private void fail(Throwable cause) { if (completionFuture.tryFailure(cause)) @@ -259,8 +253,9 @@ private void fail(Throwable cause) */ private void checkIfReadyToComplete() { - if (completionFuture.isDone() || checkForTopologyChange()) + if (completionFuture.isDone()) return; + checkForTopologyChange(); if (checkIfComplete()) { @@ -271,6 +266,9 @@ private void checkIfReadyToComplete() private boolean checkIfComplete() { + if (completionFuture.isDone()) + return true; + if (!pendingSyncResponses.isEmpty()) return false; @@ -290,7 +288,6 @@ public void onOffsetsReceived() if (completionFuture.isDone()) return; - recaptureTargets(); checkIfReadyToComplete(); } From 0739dce92b8d02263fb0e6edce4eaa0c4c26320c Mon Sep 17 00:00:00 2001 From: Ariel Weisberg Date: Mon, 6 Apr 2026 15:48:22 -0400 Subject: [PATCH 46/46] Nit assertRangesNotMixedMigration handling of empty columnFamilies --- .../org/apache/cassandra/repair/RepairCoordinator.java | 3 ++- .../replication/migration/KeyspaceMigrationInfo.java | 10 ++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/java/org/apache/cassandra/repair/RepairCoordinator.java 
b/src/java/org/apache/cassandra/repair/RepairCoordinator.java index 25b1096c2cf0..3035d2a913f1 100644 --- a/src/java/org/apache/cassandra/repair/RepairCoordinator.java +++ b/src/java/org/apache/cassandra/repair/RepairCoordinator.java @@ -155,7 +155,8 @@ public static RepairCoordinator create(StorageService storageService, int cmd, R { NormalizedRanges repairRanges = NormalizedRanges.normalizedRanges(options.getRanges()); KeyspaceMetadata ksm = metadata.schema.getKeyspaceMetadata(keyspace); - migrationInfo.assertRangesNotMixedMigration(repairRanges, ksm, options.getColumnFamilies()); + Collection cfs = options.getColumnFamilies(); + migrationInfo.assertRangesNotMixedMigration(repairRanges, ksm, cfs.isEmpty() ? null : cfs); } } diff --git a/src/java/org/apache/cassandra/service/replication/migration/KeyspaceMigrationInfo.java b/src/java/org/apache/cassandra/service/replication/migration/KeyspaceMigrationInfo.java index 1653711881f6..38f0f00e75eb 100644 --- a/src/java/org/apache/cassandra/service/replication/migration/KeyspaceMigrationInfo.java +++ b/src/java/org/apache/cassandra/service/replication/migration/KeyspaceMigrationInfo.java @@ -27,6 +27,7 @@ import java.util.Objects; import java.util.Set; import javax.annotation.Nonnull; +import javax.annotation.Nullable; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Maps; @@ -45,6 +46,7 @@ import org.apache.cassandra.tcm.serialization.MetadataSerializer; import org.apache.cassandra.tcm.serialization.Version; +import static com.google.common.base.Preconditions.checkArgument; import static org.apache.cassandra.db.TypeSizes.sizeof; import static org.apache.cassandra.utils.CollectionSerializers.deserializeList; import static org.apache.cassandra.utils.CollectionSerializers.deserializeMap; @@ -306,19 +308,19 @@ public void assertRangesNotMixedMigration(@Nonnull NormalizedRanges range /** * Convenience overload that resolves column family names to table metadata before checking. 
- * If columnFamilies is empty, all tables in the keyspace are checked. * * @param ranges the normalized ranges to check * @param ksm the keyspace metadata for resolving table names - * @param columnFamilies specific table names to check, or empty for all tables + * @param columnFamilies specific table names to check, or null for all tables * @throws IllegalStateException if ranges partially overlap with pending ranges for any table */ public void assertRangesNotMixedMigration(@Nonnull NormalizedRanges ranges, @Nonnull KeyspaceMetadata ksm, - @Nonnull Collection columnFamilies) + @Nullable Collection columnFamilies) { + checkArgument(columnFamilies == null || !columnFamilies.isEmpty(), "columnFamilies must not be empty"); Iterable tables; - if (!columnFamilies.isEmpty()) + if (columnFamilies != null) { List tableList = new ArrayList<>(columnFamilies.size()); for (String cf : columnFamilies)