From cb9970edfb4ce8be8b4e35d4317d79278b56dacc Mon Sep 17 00:00:00 2001 From: Josh McKenzie Date: Wed, 1 Apr 2026 15:14:51 -0400 Subject: [PATCH] CASSANALYTICS-146: Remove shutdownhooks in testing to prevent race on transaction log deletion Patch by Josh McKenzie; reviewed by TBD for CASSANALYTICS-146 --- .../SharedClusterIntegrationTestBase.java | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/cassandra-analytics-integration-framework/src/main/java/org/apache/cassandra/sidecar/testing/SharedClusterIntegrationTestBase.java b/cassandra-analytics-integration-framework/src/main/java/org/apache/cassandra/sidecar/testing/SharedClusterIntegrationTestBase.java index a81b7df9..0a4d6ac0 100644 --- a/cassandra-analytics-integration-framework/src/main/java/org/apache/cassandra/sidecar/testing/SharedClusterIntegrationTestBase.java +++ b/cassandra-analytics-integration-framework/src/main/java/org/apache/cassandra/sidecar/testing/SharedClusterIntegrationTestBase.java @@ -64,6 +64,7 @@ import org.apache.cassandra.distributed.api.Feature; import org.apache.cassandra.distributed.api.ICluster; import org.apache.cassandra.distributed.api.IInstance; +import org.apache.cassandra.distributed.api.IIsolatedExecutor; import org.apache.cassandra.distributed.api.IInstanceConfig; import org.apache.cassandra.distributed.shared.JMXUtil; import org.apache.cassandra.sidecar.cluster.CassandraAdapterDelegate; @@ -188,6 +189,12 @@ protected void setup() throws Exception beforeClusterProvisioning(); cluster = provisionClusterWithRetries(this.testVersion); assertThat(cluster).isNotNull(); + // If we have a test timeout, we'll often get a wall of FSWriteErrors inside tmp files as memtable flushing + // and transaction log I/O races w/shutdowns from our tests here. Rather than get extra insult to injury if a + // test times out, we instead unregister the StorageService shutdown hooks; we don't much care about memtable + // content from a node getting flushed with unit tests that run ephemerally. + // If in the future we start to rely on stopping and starting C* nodes and the StorageService shutdown hooks + removeShutdownHooks(); afterClusterProvisioned(); initializeSchemaForTest(); mtlsTestHelper = new MtlsTestHelper(secretsPath); @@ -316,6 +323,53 @@ protected void afterClusterShutdown() { } + /** + * Removes the StorageService drain shutdown hook from each instance to prevent a race condition + * between the JVM shutdown hook (which runs drain and flushes memtables) and Instance.shutdown() + * (which tears down executors and cleans up data directories). Without this, a SIGTERM or JVM + * exit during teardown can trigger FSWriteError as drain flushes write to directories that no + * longer exist. + * + * In theory this is brittle, but the code we're relying on here is 12 years old so we're probably fine. If it fails + * in the future due to us reflecting in for this, it should be pretty clear why. + * + *

This is safe because Instance.shutdown() already performs its own orderly teardown of flush + * writers and executors - the drain hook is redundant in the test context. + */ + private void removeShutdownHooks() + { + if (cluster == null) + { + return; + } + for (int i = 1; i <= cluster.size(); i++) + { + try + { + IInstance instance = cluster.get(i); + if (!instance.isShutdown()) + { + instance.sync((IIsolatedExecutor.SerializableRunnable) () -> { + try + { + Class ssClass = Class.forName("org.apache.cassandra.service.StorageService"); + Object ssInstance = ssClass.getField("instance").get(null); + ssClass.getMethod("removeShutdownHook").invoke(ssInstance); + } + catch (Exception e) + { + throw new RuntimeException("Failed to remove StorageService shutdown hook", e); + } + }).run(); + } + } + catch (Throwable t) + { + logger.debug("Failed to remove shutdown hook for instance {}", i, t); + } + } + } + protected void createTestKeyspace(QualifiedName name, Map rf) { createTestKeyspace(name.maybeQuotedKeyspace(), rf);