apache · jmckenzie-dev · Apr 1, 2026
diff --git a/.../src/main/java/org/apache/cassandra/sidecar/testing/SharedClusterIntegrationTestBase.java b/.../src/main/java/org/apache/cassandra/sidecar/testing/SharedClusterIntegrationTestBase.java
@@ -64,6 +64,7 @@
 import org.apache.cassandra.distributed.api.Feature;
 import org.apache.cassandra.distributed.api.ICluster;
 import org.apache.cassandra.distributed.api.IInstance;
+import org.apache.cassandra.distributed.api.IIsolatedExecutor;
 import org.apache.cassandra.distributed.api.IInstanceConfig;
 import org.apache.cassandra.distributed.shared.JMXUtil;
 import org.apache.cassandra.sidecar.cluster.CassandraAdapterDelegate;
@@ -188,6 +189,12 @@ protected void setup() throws Exception
         beforeClusterProvisioning();
         cluster = provisionClusterWithRetries(this.testVersion);
         assertThat(cluster).isNotNull();
+        // If we have a test timeout, we'll often get a wall of FSWriteErrors inside tmp files as memtable flushing
+        // and transaction log I/O races w/shutdowns from our tests here. Rather than get extra insult to injury if a
+        // test times out, we instead unregister the StorageService shutdown hooks; we don't much care about memtable
+        // content from a node getting flushed with unit tests that run ephemerally.
+        // If in the future we start to rely on stopping and starting C* nodes and the StorageService shutdown hooks
+        removeShutdownHooks();
         afterClusterProvisioned();
         initializeSchemaForTest();
         mtlsTestHelper = new MtlsTestHelper(secretsPath);
@@ -316,6 +323,53 @@ protected void afterClusterShutdown()
     {
     }
 
+    /**
+     * Removes the StorageService drain shutdown hook from each instance to prevent a race condition
+     * between the JVM shutdown hook (which runs drain and flushes memtables) and Instance.shutdown()
+     * (which tears down executors and cleans up data directories). Without this, a SIGTERM or JVM
+     * exit during teardown can trigger FSWriteError as drain flushes write to directories that no
+     * longer exist.
+     *
+     * In theory this is brittle, but the code we're relying on here is 12 years old so we're probably fine. If it fails
+     * in the future due to us reflecting in for this, it should be pretty clear why.
+     *
+     * <p>This is safe because Instance.shutdown() already performs its own orderly teardown of flush
+     * writers and executors - the drain hook is redundant in the test context.
+     */
+    private void removeShutdownHooks()
+    {
+        if (cluster == null)
+        {
+            return;
+        }
+        for (int i = 1; i <= cluster.size(); i++)
+        {
+            try
+            {
+                IInstance instance = cluster.get(i);
+                if (!instance.isShutdown())
+                {
+                    instance.sync((IIsolatedExecutor.SerializableRunnable) () -> {
+                        try
+                        {
+                            Class<?> ssClass = Class.forName("org.apache.cassandra.service.StorageService");
+                            Object ssInstance = ssClass.getField("instance").get(null);
+                            ssClass.getMethod("removeShutdownHook").invoke(ssInstance);
+                        }
+                        catch (Exception e)
+                        {
+                            throw new RuntimeException("Failed to remove StorageService shutdown hook", e);
+                        }
+                    }).run();
+                }
+            }
+            catch (Throwable t)
+            {
+                logger.debug("Failed to remove shutdown hook for instance {}", i, t);
+            }
+        }
+    }
+
     protected void createTestKeyspace(QualifiedName name, Map<String, Integer> rf)
     {
         createTestKeyspace(name.maybeQuotedKeyspace(), rf);