Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
import org.apache.cassandra.distributed.api.Feature;
import org.apache.cassandra.distributed.api.ICluster;
import org.apache.cassandra.distributed.api.IInstance;
import org.apache.cassandra.distributed.api.IIsolatedExecutor;
import org.apache.cassandra.distributed.api.IInstanceConfig;
import org.apache.cassandra.distributed.shared.JMXUtil;
import org.apache.cassandra.sidecar.cluster.CassandraAdapterDelegate;
Expand Down Expand Up @@ -188,6 +189,12 @@ protected void setup() throws Exception
beforeClusterProvisioning();
cluster = provisionClusterWithRetries(this.testVersion);
assertThat(cluster).isNotNull();
// When a test times out, we often get a wall of FSWriteErrors inside tmp files as memtable flushing
// and transaction log I/O race with the shutdowns triggered by our tests here. Rather than add insult to
// injury when a test times out, we unregister the StorageService shutdown hooks; we don't much care about
// memtable content from a node getting flushed in unit tests that run ephemerally.
// If in the future we start to rely on stopping and starting C* nodes and on the StorageService shutdown
// hooks, this will need to be revisited.
removeShutdownHooks();
afterClusterProvisioned();
initializeSchemaForTest();
mtlsTestHelper = new MtlsTestHelper(secretsPath);
Expand Down Expand Up @@ -316,6 +323,53 @@ protected void afterClusterShutdown()
{
}

/**
 * Removes the StorageService drain shutdown hook from each running instance of the cluster.
 *
 * <p>This prevents a race condition between the JVM shutdown hook (which runs drain and flushes
 * memtables) and Instance.shutdown() (which tears down executors and cleans up data directories).
 * Without this, a SIGTERM or JVM exit during teardown can trigger FSWriteError as drain flushes
 * write to directories that no longer exist.
 *
 * <p>The hook is removed by reflectively calling {@code StorageService.instance.removeShutdownHook()}
 * inside each instance. In theory this is brittle, but the code we're relying on here is 12 years old
 * so we're probably fine. Should the reflective call ever stop working, the failure is logged at WARN
 * level (with the cause) so that it is clear why the shutdown noise has returned.
 *
 * <p>This is safe because Instance.shutdown() already performs its own orderly teardown of flush
 * writers and executors - the drain hook is redundant in the test context.
 *
 * <p>The operation is best effort: it does nothing when no cluster has been provisioned, skips
 * instances that are already shut down, and never propagates a failure to the caller.
 */
private void removeShutdownHooks()
{
    if (cluster == null)
    {
        return;
    }
    // Instances are looked up with 1-based indexes.
    for (int i = 1; i <= cluster.size(); i++)
    {
        try
        {
            IInstance instance = cluster.get(i);
            if (instance.isShutdown())
            {
                continue;
            }
            // The runnable is handed to the instance for execution; StorageService is resolved by name
            // at that point rather than referenced statically from this test class.
            // NOTE(review): presumably this is so the class is loaded by the instance's own classloader - confirm.
            instance.sync((IIsolatedExecutor.SerializableRunnable) () -> {
                try
                {
                    Class<?> ssClass = Class.forName("org.apache.cassandra.service.StorageService");
                    Object ssInstance = ssClass.getField("instance").get(null);
                    ssClass.getMethod("removeShutdownHook").invoke(ssInstance);
                }
                catch (Exception e)
                {
                    throw new RuntimeException("Failed to remove StorageService shutdown hook", e);
                }
            }).run();
        }
        catch (Throwable t)
        {
            // Deliberately best effort: a failure here must not fail test setup. It is logged at WARN
            // (not DEBUG) so that a broken reflective lookup is visible in normal test output.
            logger.warn("Failed to remove shutdown hook for instance {}", i, t);
        }
    }
}

protected void createTestKeyspace(QualifiedName name, Map<String, Integer> rf)
{
createTestKeyspace(name.maybeQuotedKeyspace(), rf);
Expand Down
Loading