diff --git a/src/java/org/apache/cassandra/gms/Gossiper.java b/src/java/org/apache/cassandra/gms/Gossiper.java index e35ba861af6b..bb54bc463a4c 100644 --- a/src/java/org/apache/cassandra/gms/Gossiper.java +++ b/src/java/org/apache/cassandra/gms/Gossiper.java @@ -268,7 +268,13 @@ public void run() taskLock.lock(); /* Update the local heartbeat counter. */ - endpointStateMap.get(getBroadcastAddressAndPort()).updateHeartBeat(); + EndpointState epstate = endpointStateMap.get(getBroadcastAddressAndPort()); + if (epstate == null) + { + logger.warn("Node {} is not in gossip, not running GossipTask", getBroadcastAddressAndPort()); + return; + } + epstate.updateHeartBeat(); if (logger.isTraceEnabled()) logger.trace("My heartbeat is now {}", endpointStateMap.get(FBUtilities.getBroadcastAddressAndPort()).getHeartBeatState().getHeartBeatVersion()); final List gDigests = new ArrayList<>(); diff --git a/src/java/org/apache/cassandra/service/StorageService.java b/src/java/org/apache/cassandra/service/StorageService.java index 424df75b97d3..2757cf91c21c 100644 --- a/src/java/org/apache/cassandra/service/StorageService.java +++ b/src/java/org/apache/cassandra/service/StorageService.java @@ -1661,15 +1661,12 @@ public boolean resumeBootstrap() public void abortBootstrap(String nodeStr, String endpointStr) { - logger.debug("Aborting bootstrap for {}/{}", nodeStr, endpointStr); + logger.info("Aborting bootstrap for {}", StringUtils.isEmpty(nodeStr) ? endpointStr : nodeStr); ClusterMetadata metadata = ClusterMetadata.current(); - NodeId nodeId; - if (!StringUtils.isEmpty(nodeStr)) - nodeId = NodeId.fromString(nodeStr); - else - nodeId = metadata.directory.peerId(InetAddressAndPort.getByNameUnchecked(endpointStr)); - + NodeId nodeId = parseNodeIdOrEndpoint(metadata, nodeStr, endpointStr); InetAddressAndPort endpoint = metadata.directory.endpoint(nodeId); + if (endpoint == null) + throw new IllegalArgumentException("Can't abort bootstrap for " + nodeId + " - it does not exist in cluster metadata"); if (Gossiper.instance.isKnownEndpoint(endpoint) && FailureDetector.instance.isAlive(endpoint)) throw new RuntimeException("Can't abort bootstrap for " + nodeId + " - it is alive"); NodeState nodeState = metadata.directory.peerState(nodeId); @@ -1692,6 +1689,47 @@ public void abortBootstrap(String nodeStr, String endpointStr) } } + private static NodeId parseNodeIdOrEndpoint(ClusterMetadata metadata, String nodeStr, String endpointStr) + { + NodeId nodeId; + if (!StringUtils.isEmpty(nodeStr)) + { + try + { + nodeId = NodeId.fromString(nodeStr); + } + catch (IllegalArgumentException | UnsupportedOperationException e) + { + String msg = "Unable to parse node id string " + nodeStr; + logger.warn("{}", msg, e); + throw new IllegalArgumentException(msg, e); + } + } + else + { + InetAddressAndPort endpoint; + try + { + endpoint = InetAddressAndPort.getByName(endpointStr); + } + catch (UnknownHostException e) + { + String msg = "Unable to look up endpoint " + endpointStr; + logger.warn("{}", msg, e); + throw new IllegalArgumentException(msg, e); + } + + nodeId = metadata.directory.peerId(endpoint); + if (nodeId == null) + { + String msg = "Unknown endpoint: " + endpoint; + logger.warn(msg); + throw new IllegalArgumentException(msg); + } + } + return nodeId; + } + @Override public void migrateConsensusProtocol(@Nonnull List keyspaceNames, @Nullable List maybeTableNames, @@ -2008,7 +2046,8 @@ private Map getTokenToEndpointMap(boolean withPort) public String getLocalHostId() { - return getLocalHostUUID().toString(); + UUID localHostId = getLocalHostUUID(); + return localHostId != null ? localHostId.toString() : "UNKNOWN"; } public UUID getLocalHostUUID() diff --git a/src/java/org/apache/cassandra/tcm/transformations/Register.java b/src/java/org/apache/cassandra/tcm/transformations/Register.java index 1e31ff3a8597..a7551c70691c 100644 --- a/src/java/org/apache/cassandra/tcm/transformations/Register.java +++ b/src/java/org/apache/cassandra/tcm/transformations/Register.java @@ -176,6 +176,9 @@ else if (NodeId.isValidNodeId(localHostId) && directory.peerIds().contains(NodeI else { NodeId nodeId = directory.peerId(FBUtilities.getBroadcastAddressAndPort()); + if (nodeId == null) + throw new IllegalStateException("Node has host id "+localHostId+" in system.local, but is not present in cluster metadata - not allowing this node to register. " + + "If a bootstrap of this node failed and was aborted with `nodetool abortbootstrap` it should also have its data removed before trying to rebootstrap."); NodeVersion dirVersion = directory.version(nodeId); // If this is a node in the process of upgrading, update the host id in the system.local table diff --git a/test/distributed/org/apache/cassandra/distributed/test/ring/BootstrapTest.java b/test/distributed/org/apache/cassandra/distributed/test/ring/BootstrapTest.java index bc7abb85effc..8012399afdee 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/ring/BootstrapTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/ring/BootstrapTest.java @@ -18,6 +18,8 @@ package org.apache.cassandra.distributed.test.ring; +import java.io.Closeable; +import java.io.IOException; import java.util.Map; import java.util.Objects; import java.util.concurrent.Callable; @@ -48,6 +50,7 @@ import org.apache.cassandra.distributed.api.ICluster; import org.apache.cassandra.distributed.api.IInstanceConfig; import org.apache.cassandra.distributed.api.IInvokableInstance; +import org.apache.cassandra.distributed.api.NodeToolResult; import org.apache.cassandra.distributed.api.TokenSupplier; import org.apache.cassandra.distributed.shared.JMXUtil; import org.apache.cassandra.distributed.shared.NetworkTopology; @@ -55,7 +58,6 @@ import org.apache.cassandra.metrics.DefaultNameFactory; import org.apache.cassandra.service.StorageService; import org.apache.cassandra.service.StorageServiceMBean; -import org.apache.cassandra.utils.Closeable; import org.apache.cassandra.utils.concurrent.CountDownLatch; import static net.bytebuddy.matcher.ElementMatchers.named; @@ -228,6 +230,26 @@ public static Object getMetricAttribute(IInvokableInstance instance, String metr } } + @Test + public void testAbortBootstrapBadIp() throws IOException + { + try (Cluster cluster = builder().withNodes(1).start()) + { + NodeToolResult res = cluster.get(1).nodetoolResult("abortbootstrap", "--ip", "127.0.0.55"); + res.asserts().failure(); + assertTrue(res.getStdout().contains("Unknown endpoint")); + res = cluster.get(1).nodetoolResult("abortbootstrap", "--ip", "127.0.0.999"); + res.asserts().failure(); + assertTrue(res.getStdout().contains("Unable to look up endpoint")); + res = cluster.get(1).nodetoolResult("abortbootstrap", "--node", "999"); + res.asserts().failure(); + assertTrue(res.getStdout().contains("does not exist in cluster metadata")); + res = cluster.get(1).nodetoolResult("abortbootstrap", "--node", "hello"); + res.asserts().failure(); + assertTrue(res.getStdout().contains("Unable to parse node id")); + } + } + public static void populate(ICluster cluster, int from, int to) { populate(cluster, from, to, 1, 3, ConsistencyLevel.QUORUM); diff --git a/test/distributed/org/apache/cassandra/distributed/test/tcm/ClusterMetadataAbortedBootstrapRejoinTest.java b/test/distributed/org/apache/cassandra/distributed/test/tcm/ClusterMetadataAbortedBootstrapRejoinTest.java new file mode 100644 index 000000000000..5994fab07c3a --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/tcm/ClusterMetadataAbortedBootstrapRejoinTest.java @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.tcm; + +import java.io.IOException; +import java.time.Duration; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import java.util.concurrent.atomic.AtomicBoolean; + +import com.google.common.util.concurrent.Uninterruptibles; + +import net.bytebuddy.ByteBuddy; +import net.bytebuddy.dynamic.loading.ClassLoadingStrategy; +import net.bytebuddy.implementation.MethodDelegation; + +import org.assertj.core.api.Assertions; +import org.junit.Test; + +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.Feature; +import org.apache.cassandra.distributed.api.IInstanceConfig; +import org.apache.cassandra.distributed.api.IInvokableInstance; +import org.apache.cassandra.distributed.api.TokenSupplier; +import org.apache.cassandra.distributed.shared.NetworkTopology; +import org.apache.cassandra.distributed.test.TestBaseImpl; +import org.apache.cassandra.gms.FailureDetector; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.service.StorageService; + +import static net.bytebuddy.matcher.ElementMatchers.named; +import static net.bytebuddy.matcher.ElementMatchers.takesArguments; + + +public class ClusterMetadataAbortedBootstrapRejoinTest extends TestBaseImpl +{ + @Test + public void testFailedBootstrapNotAllowedToJoin() throws IOException, TimeoutException, ExecutionException, InterruptedException + { + TokenSupplier even = TokenSupplier.evenlyDistributedTokens(3); + try (Cluster cluster = init(Cluster.build(2) + .withInstanceInitializer(BBHelper::install) + .withConfig(c -> c.with(Feature.GOSSIP, Feature.NETWORK)) + .withNodeIdTopology(NetworkTopology.singleDcNetworkTopology(3, "dc0", "rack0")) + .withTokenSupplier(even::token) + .start())) + { + IInstanceConfig config = cluster.newInstanceConfig() + .set("auto_bootstrap", true); + IInvokableInstance toBootstrap = cluster.bootstrap(config); + toBootstrap.startup(cluster); + toBootstrap.logs().watchFor(Duration.ofSeconds(60), BBHelper.FAILMESSAGE); + toBootstrap.shutdown().get(); + cluster.get(1).runOnInstance(() -> { + int i = 0; + while (FailureDetector.instance.isAlive(InetAddressAndPort.getByNameUnchecked("127.0.0.3")) && i++ < 30) + Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS); + }); + cluster.get(1).nodetoolResult("abortbootstrap", "--ip", "127.0.0.3").asserts().success(); + Assertions.assertThatThrownBy(toBootstrap::startup) + .isInstanceOf(IllegalStateException.class) + .hasMessageContaining("but is not present in cluster metadata"); + } + } + + public static class BBHelper + { + public static String FAILMESSAGE = "ARTIFICIALLY FAILING BOOTSTRAP"; + public static AtomicBoolean enabled = new AtomicBoolean(true); + public static void install(ClassLoader cl, int i) + { + if (i == 3) + { + new ByteBuddy().rebase(StorageService.class) + .method(named("repairPaxosForTopologyChange").and(takesArguments(1))) + .intercept(MethodDelegation.to(BBHelper.class)) + .make() + .load(cl, ClassLoadingStrategy.Default.INJECTION); + } + } + + public static void repairPaxosForTopologyChange(String reason) + { + if (enabled.get()) + { + enabled.set(false); + throw new RuntimeException(FAILMESSAGE); + } + } + + } +}