diff --git a/cassandra-analytics-integration-framework/src/main/java/org/apache/cassandra/sidecar/testing/SharedClusterIntegrationTestBase.java b/cassandra-analytics-integration-framework/src/main/java/org/apache/cassandra/sidecar/testing/SharedClusterIntegrationTestBase.java index a81b7df9..0a4d6ac0 100644 --- a/cassandra-analytics-integration-framework/src/main/java/org/apache/cassandra/sidecar/testing/SharedClusterIntegrationTestBase.java +++ b/cassandra-analytics-integration-framework/src/main/java/org/apache/cassandra/sidecar/testing/SharedClusterIntegrationTestBase.java @@ -64,6 +64,7 @@ import org.apache.cassandra.distributed.api.Feature; import org.apache.cassandra.distributed.api.ICluster; import org.apache.cassandra.distributed.api.IInstance; +import org.apache.cassandra.distributed.api.IIsolatedExecutor; import org.apache.cassandra.distributed.api.IInstanceConfig; import org.apache.cassandra.distributed.shared.JMXUtil; import org.apache.cassandra.sidecar.cluster.CassandraAdapterDelegate; @@ -188,6 +189,12 @@ protected void setup() throws Exception beforeClusterProvisioning(); cluster = provisionClusterWithRetries(this.testVersion); assertThat(cluster).isNotNull(); + // If we have a test timeout, we'll often get a wall of FSWriteErrors inside tmp files as memtable flushing + // and transaction log I/O races w/shutdowns from our tests here. Rather than get extra insult to injury if a + // test times out, we instead unregister the StorageService shutdown hooks; we don't much care about memtable + // content from a node getting flushed with unit tests that run ephemerally. + // If in the future we start to rely on stopping and starting C* nodes and the StorageService shutdown hooks + removeShutdownHooks(); afterClusterProvisioned(); initializeSchemaForTest(); mtlsTestHelper = new MtlsTestHelper(secretsPath); @@ -316,6 +323,53 @@ protected void afterClusterShutdown() { } + /** + * Removes the StorageService drain shutdown hook from each instance to prevent a race condition + * between the JVM shutdown hook (which runs drain and flushes memtables) and Instance.shutdown() + * (which tears down executors and cleans up data directories). Without this, a SIGTERM or JVM + * exit during teardown can trigger FSWriteError as drain flushes write to directories that no + * longer exist. + * + * In theory this is brittle, but the code we're relying on here is 12 years old so we're probably fine. If it fails + * in the future due to us reflecting in for this, it should be pretty clear why. + * + *
This is safe because Instance.shutdown() already performs its own orderly teardown of flush
+ * writers and executors - the drain hook is redundant in the test context.
+ */
+ private void removeShutdownHooks()
+ {
+ if (cluster == null)
+ {
+ return;
+ }
+ for (int i = 1; i <= cluster.size(); i++)
+ {
+ try
+ {
+ IInstance instance = cluster.get(i);
+ if (!instance.isShutdown())
+ {
+ instance.sync((IIsolatedExecutor.SerializableRunnable) () -> {
+ try
+ {
+ Class> ssClass = Class.forName("org.apache.cassandra.service.StorageService");
+ Object ssInstance = ssClass.getField("instance").get(null);
+ ssClass.getMethod("removeShutdownHook").invoke(ssInstance);
+ }
+ catch (Exception e)
+ {
+ throw new RuntimeException("Failed to remove StorageService shutdown hook", e);
+ }
+ }).run();
+ }
+ }
+ catch (Throwable t)
+ {
+ logger.debug("Failed to remove shutdown hook for instance {}", i, t);
+ }
+ }
+ }
+
protected void createTestKeyspace(QualifiedName name, Map