aboutsummaryrefslogtreecommitdiffstats
path: root/clustercontroller-core/src/main/java/com/yahoo
diff options
context:
space:
mode:
Diffstat (limited to 'clustercontroller-core/src/main/java/com/yahoo')
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterInfo.java11
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStateVersionSpecificRequest.java2
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java52
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/HierarchicalGroupVisiting.java32
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/HierarchicalGroupVisitingAdapter.java37
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeInfo.java17
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java153
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateGatherer.java38
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/StateChangeHandler.java173
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Response.java2
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java38
11 files changed, 331 insertions, 224 deletions
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterInfo.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterInfo.java
index 2cfaf64fe83..0f119d8de50 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterInfo.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterInfo.java
@@ -15,7 +15,7 @@ import java.util.Set;
import java.util.TreeMap;
/**
- * Detail information about the current state of all the distributor and storage nodes of the cluster.
+ * Detailed information about the current state of all the distributor and storage nodes of the cluster.
*
* @author hakonhall
* @author bratseth
@@ -127,11 +127,10 @@ public class ClusterInfo {
/** Returns the node info object for a given node identifier */
private NodeInfo getInfo(Node node) {
- switch (node.getType()) {
- case DISTRIBUTOR : return getDistributorNodeInfo(node.getIndex());
- case STORAGE : return getStorageNodeInfo(node.getIndex());
- default : throw new IllegalArgumentException("No node type " + node.getType().toString());
- }
+ return switch (node.getType()) {
+ case DISTRIBUTOR -> getDistributorNodeInfo(node.getIndex());
+ case STORAGE -> getStorageNodeInfo(node.getIndex());
+ };
}
}
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStateVersionSpecificRequest.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStateVersionSpecificRequest.java
index 70fbbb60e26..6855859c96f 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStateVersionSpecificRequest.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStateVersionSpecificRequest.java
@@ -2,7 +2,7 @@
package com.yahoo.vespa.clustercontroller.core;
/**
- * Base class for distributor/content node node RPC requests that are bound
+ * Base class for distributor/content node RPC requests that are bound
* to a particular cluster state version.
*/
public abstract class ClusterStateVersionSpecificRequest {
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java
index 9538167c6de..2535589395d 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java
@@ -20,32 +20,29 @@ import static com.yahoo.vdslib.state.NodeState.ORCHESTRATOR_RESERVED_DESCRIPTION
public class ContentCluster {
- private final String clusterName;
+ private static final int pollingFrequency = 5000;
+ private final String clusterName;
private final ClusterInfo clusterInfo = new ClusterInfo();
-
private final Map<Node, Long> nodeStartTimestamps = new TreeMap<>();
private int slobrokGenerationCount = 0;
-
- private int pollingFrequency = 5000;
-
private Distribution distribution;
private final int maxNumberOfGroupsAllowedToBeDown;
public ContentCluster(String clusterName, Collection<ConfiguredNode> configuredNodes, Distribution distribution) {
- this(clusterName, configuredNodes, distribution, 1);
+ this(clusterName, configuredNodes, distribution, -1);
}
public ContentCluster(FleetControllerOptions options) {
this(options.clusterName(), options.nodes(), options.storageDistribution(), options.maxNumberOfGroupsAllowedToBeDown());
}
- private ContentCluster(String clusterName,
- Collection<ConfiguredNode> configuredNodes,
- Distribution distribution,
- int maxNumberOfGroupsAllowedToBeDown) {
+ ContentCluster(String clusterName,
+ Collection<ConfiguredNode> configuredNodes,
+ Distribution distribution,
+ int maxNumberOfGroupsAllowedToBeDown) {
if (configuredNodes == null) throw new IllegalArgumentException("Nodes must be set");
this.clusterName = clusterName;
this.distribution = distribution;
@@ -91,7 +88,6 @@ public class ContentCluster {
}
public int getPollingFrequency() { return pollingFrequency; }
- public void setPollingFrequency(int millisecs) { pollingFrequency = millisecs; }
/** Returns the configured nodes of this as a read-only map indexed on node index (distribution key) */
public Map<Integer, ConfiguredNode> getConfiguredNodes() {
@@ -131,7 +127,7 @@ public class ContentCluster {
* @param clusterState the current cluster state version
* @param condition the upgrade condition
* @param oldState the old/current wanted state
- * @param newState state wanted to be set @return NodeUpgradePrechecker.Response
+ * @param newState state wanted to be set
* @param inMoratorium whether the CC is in moratorium
*/
public NodeStateChangeChecker.Result calculateEffectOfNewState(
@@ -144,22 +140,22 @@ public class ContentCluster {
/** Returns the indices of the nodes that have been safely set to the given state by the Orchestrator (best guess). */
public List<Integer> nodesSafelySetTo(State state) {
- switch (state) {
- case MAINTENANCE: // Orchestrator's ALLOWED_TO_BE_DOWN
- case DOWN: // Orchestrator's PERMANENTLY_DOWN
- return clusterInfo.getStorageNodeInfos().stream()
- .filter(storageNodeInfo -> {
- NodeState userWantedState = storageNodeInfo.getUserWantedState();
- return userWantedState.getState() == state &&
- Objects.equals(userWantedState.getDescription(), ORCHESTRATOR_RESERVED_DESCRIPTION);
- })
- .map(NodeInfo::getNodeIndex)
- .toList();
- default:
- // Note: There is no trace left if the Orchestrator set the state to UP, so that's handled
- // like any other state:
- return List.of();
- }
+ return switch (state) {
+ // Orchestrator's ALLOWED_TO_BE_DOWN or PERMANENTLY_DOWN, respectively
+ case MAINTENANCE, DOWN ->
+ clusterInfo.getStorageNodeInfos().stream()
+ .filter(storageNodeInfo -> {
+ NodeState userWantedState = storageNodeInfo.getUserWantedState();
+ return userWantedState.getState() == state &&
+ Objects.equals(userWantedState.getDescription(), ORCHESTRATOR_RESERVED_DESCRIPTION);
+ })
+ .map(NodeInfo::getNodeIndex)
+ .toList();
+ default ->
+ // Note: There is no trace left if the Orchestrator sets the state to UP, so that's handled
+ // like any other state:
+ List.of();
+ };
}
public boolean hasConfiguredNode(int index) {
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/HierarchicalGroupVisiting.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/HierarchicalGroupVisiting.java
index 19ff51f4cc4..09f1824824c 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/HierarchicalGroupVisiting.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/HierarchicalGroupVisiting.java
@@ -1,16 +1,38 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-//
package com.yahoo.vespa.clustercontroller.core;
+import com.yahoo.vdslib.distribution.Distribution;
import com.yahoo.vdslib.distribution.GroupVisitor;
-public interface HierarchicalGroupVisiting {
- /** Returns true if the group contains more than one (leaf) group. */
- boolean isHierarchical();
+class HierarchicalGroupVisiting {
+
+ private final Distribution distribution;
+
+ public HierarchicalGroupVisiting(Distribution distribution) {
+ this.distribution = distribution;
+ }
+
+ /**
+ * Returns true if the group contains more than one (leaf) group.
+ */
+ public boolean isHierarchical() {
+ return !distribution.getRootGroup().isLeafGroup();
+ }
/**
* Invoke the visitor for each leaf group of an implied group. If the group is non-hierarchical
* (flat), the visitor will not be invoked.
*/
- void visit(GroupVisitor visitor);
+ public void visit(GroupVisitor visitor) {
+ if (isHierarchical()) {
+ distribution.visitGroups(group -> {
+ if (group.isLeafGroup()) {
+ return visitor.visitGroup(group);
+ }
+
+ return true;
+ });
+ }
+ }
+
}
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/HierarchicalGroupVisitingAdapter.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/HierarchicalGroupVisitingAdapter.java
deleted file mode 100644
index 4bc487bfa7f..00000000000
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/HierarchicalGroupVisitingAdapter.java
+++ /dev/null
@@ -1,37 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-//
-package com.yahoo.vespa.clustercontroller.core;
-
-import com.yahoo.vdslib.distribution.Distribution;
-import com.yahoo.vdslib.distribution.GroupVisitor;
-
-/**
- * Exposes {@link Distribution} as a {@link HierarchicalGroupVisiting}.
- *
- * @author hakon
- */
-public class HierarchicalGroupVisitingAdapter implements HierarchicalGroupVisiting {
- private final Distribution distribution;
-
- public HierarchicalGroupVisitingAdapter(Distribution distribution) {
- this.distribution = distribution;
- }
-
- @Override
- public boolean isHierarchical() {
- return !distribution.getRootGroup().isLeafGroup();
- }
-
- @Override
- public void visit(GroupVisitor visitor) {
- if (isHierarchical()) {
- distribution.visitGroups(group -> {
- if (group.isLeafGroup()) {
- return visitor.visitGroup(group);
- }
-
- return true;
- });
- }
- }
-}
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeInfo.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeInfo.java
index 3f1a7ab5d7b..d7aac1c26fa 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeInfo.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeInfo.java
@@ -55,7 +55,10 @@ abstract public class NodeInfo implements Comparable<NodeInfo> {
private long nextAttemptTime;
/** Cached connection to this node. */
private Target connection;
- /** We cache last connection we did request info on, as we want to report appropriate error for node regardless of whether other commands have created new connection. */
+ /**
+ * We cache the last connection we requested info on, as we want to report the appropriate error for
+ * the node regardless of whether other commands have created a new connection.
+ */
public Target lastRequestInfoConnection;
/**
* Counts the number of attempts we have tried since last time we had
@@ -163,7 +166,7 @@ abstract public class NodeInfo implements Comparable<NodeInfo> {
}
if (prematureCrashCount != count) {
prematureCrashCount = count;
- log.log(Level.FINE, () -> "Premature crash count on " + toString() + " set to " + count);
+ log.log(Level.FINE, () -> "Premature crash count on " + this + " set to " + count);
}
}
public int getPrematureCrashCount() { return prematureCrashCount; }
@@ -311,13 +314,13 @@ abstract public class NodeInfo implements Comparable<NodeInfo> {
}
if (state.getState().equals(State.DOWN) && !reportedState.getState().oneOf("d")) {
downStableStateTime = time;
- log.log(Level.FINE, () -> "Down stable state on " + toString() + " altered to " + time);
+ log.log(Level.FINE, () -> "Down stable state on " + this + " altered to " + time);
if (reportedState.getState() == State.INITIALIZING) {
recentlyObservedUnstableDuringInit = true;
}
} else if (state.getState().equals(State.UP) && !reportedState.getState().oneOf("u")) {
upStableStateTime = time;
- log.log(Level.FINE, () -> "Up stable state on " + toString() + " altered to " + time);
+ log.log(Level.FINE, () -> "Up stable state on " + this + " altered to " + time);
}
if (!state.getState().validReportedNodeState(node.getType())) {
throw new IllegalStateException("Trying to set illegal reported node state: " + state);
@@ -340,14 +343,14 @@ abstract public class NodeInfo implements Comparable<NodeInfo> {
} else {
nextAttemptTime = time + 5000;
}
- log.log(Level.FINEST, () -> "Failed to get state from node " + toString() + ", scheduling next attempt in " + (nextAttemptTime - time) + " ms.");
+ log.log(Level.FINEST, () -> "Failed to get state from node " + this + ", scheduling next attempt in " + (nextAttemptTime - time) + " ms.");
} else {
connectionAttemptCount = 0;
timeOfFirstFailingConnectionAttempt = 0;
reportedState = state;
if (version == 0 || state.getState().equals(State.STOPPING)) {
nextAttemptTime = time + cluster.getPollingFrequency();
- log.log(Level.FINEST, () -> "Scheduling next attempt to get state from " + toString() + " in " + (nextAttemptTime - time) + " ms (polling freq).");
+ log.log(Level.FINEST, () -> "Scheduling next attempt to get state from " + this + " in " + (nextAttemptTime - time) + " ms (polling freq).");
} else {
nextAttemptTime = time;
}
@@ -368,7 +371,7 @@ abstract public class NodeInfo implements Comparable<NodeInfo> {
} catch (Exception e) {
StringWriter sw = new StringWriter();
e.printStackTrace(new PrintWriter(sw));
- log.warning("Attempted to set wanted state with more than just a main state. Extra data stripped. Original data '" + state.serialize(true) + ":\n" + sw.toString());
+ log.warning("Attempted to set wanted state with more than just a main state. Extra data stripped. Original data '" + state.serialize(true) + ":\n" + sw);
}
}
wantedState = newWanted;
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java
index 2025dfef562..c823c94afd1 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java
@@ -13,13 +13,20 @@ import com.yahoo.vespa.clustercontroller.core.hostinfo.HostInfo;
import com.yahoo.vespa.clustercontroller.core.hostinfo.Metrics;
import com.yahoo.vespa.clustercontroller.core.hostinfo.StorageNode;
import com.yahoo.vespa.clustercontroller.utils.staterestapi.requests.SetUnitStateRequest;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
+import java.util.Set;
+import java.util.logging.Logger;
+import java.util.stream.Collectors;
import static com.yahoo.vdslib.state.NodeType.STORAGE;
import static com.yahoo.vdslib.state.State.DOWN;
+import static com.yahoo.vdslib.state.State.MAINTENANCE;
import static com.yahoo.vdslib.state.State.RETIRED;
import static com.yahoo.vdslib.state.State.UP;
import static com.yahoo.vespa.clustercontroller.core.NodeStateChangeChecker.Result.allowSettingOfWantedState;
@@ -27,6 +34,7 @@ import static com.yahoo.vespa.clustercontroller.core.NodeStateChangeChecker.Resu
import static com.yahoo.vespa.clustercontroller.core.NodeStateChangeChecker.Result.createDisallowed;
import static com.yahoo.vespa.clustercontroller.utils.staterestapi.requests.SetUnitStateRequest.Condition.FORCE;
import static com.yahoo.vespa.clustercontroller.utils.staterestapi.requests.SetUnitStateRequest.Condition.SAFE;
+import static java.util.logging.Level.FINE;
/**
* Checks if a node can be upgraded.
@@ -35,8 +43,9 @@ import static com.yahoo.vespa.clustercontroller.utils.staterestapi.requests.SetU
*/
public class NodeStateChangeChecker {
- public static final String BUCKETS_METRIC_NAME = "vds.datastored.bucket_space.buckets_total";
- public static final Map<String, String> BUCKETS_METRIC_DIMENSIONS = Map.of("bucketSpace", "default");
+ private static final Logger log = Logger.getLogger(NodeStateChangeChecker.class.getName());
+ private static final String BUCKETS_METRIC_NAME = "vds.datastored.bucket_space.buckets_total";
+ private static final Map<String, String> BUCKETS_METRIC_DIMENSIONS = Map.of("bucketSpace", "default");
private final int requiredRedundancy;
private final HierarchicalGroupVisiting groupVisiting;
@@ -46,10 +55,12 @@ public class NodeStateChangeChecker {
public NodeStateChangeChecker(ContentCluster cluster, boolean inMoratorium) {
this.requiredRedundancy = cluster.getDistribution().getRedundancy();
- this.groupVisiting = new HierarchicalGroupVisitingAdapter(cluster.getDistribution());
+ this.groupVisiting = new HierarchicalGroupVisiting(cluster.getDistribution());
this.clusterInfo = cluster.clusterInfo();
this.inMoratorium = inMoratorium;
this.maxNumberOfGroupsAllowedToBeDown = cluster.maxNumberOfGroupsAllowedToBeDown();
+ if ( ! groupVisiting.isHierarchical() && maxNumberOfGroupsAllowedToBeDown > 1)
+ throw new IllegalArgumentException("Cannot have both 1 group and maxNumberOfGroupsAllowedToBeDown > 1");
}
public static class Result {
@@ -214,26 +225,34 @@ public class NodeStateChangeChecker {
oldWantedState.getState() + ": " + oldWantedState.getDescription());
}
- Result otherGroupCheck = anotherNodeInAnotherGroupHasWantedState(nodeInfo);
- if (!otherGroupCheck.settingWantedStateIsAllowed()) {
- return otherGroupCheck;
+ if (maxNumberOfGroupsAllowedToBeDown == -1) {
+ var otherGroupCheck = anotherNodeInAnotherGroupHasWantedState(nodeInfo);
+ if (!otherGroupCheck.settingWantedStateIsAllowed()) {
+ return otherGroupCheck;
+ }
+ if (anotherNodeInGroupAlreadyAllowed(nodeInfo, newDescription)) {
+ return allowSettingOfWantedState();
+ }
+ } else {
+ var result = otherNodesHaveWantedState(nodeInfo, newDescription, clusterState);
+ if (result.isPresent())
+ return result.get();
}
if (clusterState.getNodeState(nodeInfo.getNode()).getState() == DOWN) {
- return allowSettingOfWantedState();
- }
-
- if (anotherNodeInGroupAlreadyAllowed(nodeInfo, newDescription)) {
+ log.log(FINE, "node is DOWN, allow");
return allowSettingOfWantedState();
}
Result allNodesAreUpCheck = checkAllNodesAreUp(clusterState);
if (!allNodesAreUpCheck.settingWantedStateIsAllowed()) {
+ log.log(FINE, "allNodesAreUpCheck: " + allNodesAreUpCheck);
return allNodesAreUpCheck;
}
Result checkDistributorsResult = checkDistributors(nodeInfo.getNode(), clusterState.getVersion());
if (!checkDistributorsResult.settingWantedStateIsAllowed()) {
+ log.log(FINE, "checkDistributors: "+ checkDistributorsResult);
return checkDistributorsResult;
}
@@ -268,6 +287,65 @@ public class NodeStateChangeChecker {
}
}
+ /**
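+ * Returns an optional Result, where the return value is:
+ * For flat setup: Optional.of(disallowed) if a wanted state is set on another node, else Optional.empty
+ * For hierarchical setup: If no group has a node with wanted state other than UP: Optional.empty
+ * If some groups have nodes with wanted state other than UP:
+ * if the node is in a group that already has a node with the same state and description: Optional.of(allowed)
+ * else, if no group has a node with the same state and description: Optional.of(disallowed)
+ * else, if fewer than maxNumberOfGroupsAllowedToBeDown groups (not counting the node's own group)
+ * have nodes that are neither retired nor up: Optional.of(allowed), else Optional.of(disallowed)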
+ */
+ private Optional<Result> otherNodesHaveWantedState(StorageNodeInfo nodeInfo, String newDescription, ClusterState clusterState) {
+ Node node = nodeInfo.getNode();
+
+ if (groupVisiting.isHierarchical()) {
+ Set<Integer> groupsWithNodesWantedStateNotUp = groupsWithUserWantedStateNotUp();
+ if (groupsWithNodesWantedStateNotUp.size() == 0) {
+ log.log(FINE, "groupsWithNodesWantedStateNotUp=0");
+ return Optional.empty();
+ }
+
+ Set<Integer> groupsWithSameStateAndDescription = groupsWithSameStateAndDescription(MAINTENANCE, newDescription);
+ if (aGroupContainsNode(groupsWithSameStateAndDescription, node)) {
+ log.log(FINE, "Node is in group with same state and description, allow");
+ return Optional.of(allowSettingOfWantedState());
+ }
+ // There are groups with nodes whose wanted state is not UP, but with another description, probably set by an operator
+ if (groupsWithSameStateAndDescription.size() == 0) {
+ return Optional.of(createDisallowed("Wanted state already set for another node in groups: " +
+ sortSetIntoList(groupsWithNodesWantedStateNotUp)));
+ }
+
+ Set<Integer> retiredAndNotUpGroups = groupsWithNotRetiredAndNotUp(clusterState);
+ int numberOfGroupsToConsider = retiredAndNotUpGroups.size();
+ // Subtract one group if the node is in a group that already has nodes that are neither retired nor up, since
+ // the number of such groups will not increase if we allow the node to go down
+ if (aGroupContainsNode(retiredAndNotUpGroups, node)) {
+ numberOfGroupsToConsider = retiredAndNotUpGroups.size() - 1;
+ }
+ if (numberOfGroupsToConsider < maxNumberOfGroupsAllowedToBeDown) {
+ log.log(FINE, "Allow, retiredAndNotUpGroups=" + retiredAndNotUpGroups);
+ return Optional.of(allowSettingOfWantedState());
+ }
+
+ return Optional.of(createDisallowed(String.format("At most %d groups can have wanted state: %s",
+ maxNumberOfGroupsAllowedToBeDown,
+ sortSetIntoList(retiredAndNotUpGroups))));
+ } else {
+ // Return a disallow-result if there is another node with a wanted state
+ var otherNodeHasWantedState = otherNodeHasWantedState(nodeInfo);
+ if ( ! otherNodeHasWantedState.settingWantedStateIsAllowed())
+ return Optional.of(otherNodeHasWantedState);
+ }
+ return Optional.empty();
+ }
+
+ private ArrayList<Integer> sortSetIntoList(Set<Integer> set) {
+ var sortedList = new ArrayList<>(set);
+ Collections.sort(sortedList);
+ return sortedList;
+ }
+
/** Returns a disallow-result, if there is a node in the group with wanted state != UP. */
private Result otherNodeInGroupHasWantedState(Group group) {
for (var configuredNode : group.getNodes()) {
@@ -354,6 +432,22 @@ public class NodeStateChangeChecker {
return false;
}
+ private boolean aGroupContainsNode(Collection<Integer> groupIndexes, Node node) {
+ for (Group group : getGroupsWithIndexes(groupIndexes)) {
+ if (groupContainsNode(group, node))
+ return true;
+ }
+
+ return false;
+ }
+
+ private List<Group> getGroupsWithIndexes(Collection<Integer> groupIndexes) {
+ return clusterInfo.getStorageNodeInfos().stream()
+ .map(NodeInfo::getGroup)
+ .filter(group -> groupIndexes.contains(group.getIndex()))
+ .collect(Collectors.toList());
+ }
+
private Result checkAllNodesAreUp(ClusterState clusterState) {
// This method verifies both storage nodes and distributors are up (or retired).
// The complicated part is making a summary error message.
@@ -441,4 +535,43 @@ public class NodeStateChangeChecker {
return allowSettingOfWantedState();
}
+ private Set<Integer> groupsWithUserWantedStateNotUp() {
+ return clusterInfo.getAllNodeInfos().stream()
+ .filter(sni -> !UP.equals(sni.getUserWantedState().getState()))
+ .map(NodeInfo::getGroup)
+ .filter(Objects::nonNull)
+ .filter(Group::isLeafGroup)
+ .map(Group::getIndex)
+ .collect(Collectors.toSet());
+ }
+
+ // groups with at least one node with the same state & description
+ private Set<Integer> groupsWithSameStateAndDescription(State state, String newDescription) {
+ return clusterInfo.getAllNodeInfos().stream()
+ .filter(nodeInfo -> {
+ var userWantedState = nodeInfo.getUserWantedState();
+ return userWantedState.getState() == state &&
+ Objects.equals(userWantedState.getDescription(), newDescription);
+ })
+ .map(NodeInfo::getGroup)
+ .filter(Objects::nonNull)
+ .filter(Group::isLeafGroup)
+ .map(Group::getIndex)
+ .collect(Collectors.toSet());
+ }
+
+ // groups with at least one node whose user wanted state or cluster state is neither retired nor up
+ private Set<Integer> groupsWithNotRetiredAndNotUp(ClusterState clusterState) {
+ return clusterInfo.getAllNodeInfos().stream()
+ .filter(nodeInfo -> (nodeInfo.getUserWantedState().getState() != RETIRED
+ && nodeInfo.getUserWantedState().getState() != UP)
+ || (clusterState.getNodeState(nodeInfo.getNode()).getState() != RETIRED
+ && clusterState.getNodeState(nodeInfo.getNode()).getState() != UP))
+ .map(NodeInfo::getGroup)
+ .filter(Objects::nonNull)
+ .filter(Group::isLeafGroup)
+ .map(Group::getIndex)
+ .collect(Collectors.toSet());
+ }
+
}
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateGatherer.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateGatherer.java
index 68e46414c22..6f4d0749f3f 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateGatherer.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateGatherer.java
@@ -4,7 +4,6 @@ package com.yahoo.vespa.clustercontroller.core;
import com.yahoo.jrt.ErrorCode;
import com.yahoo.jrt.Target;
import com.yahoo.vdslib.state.NodeState;
-import com.yahoo.vdslib.state.State;
import com.yahoo.vespa.clustercontroller.core.hostinfo.HostInfo;
import com.yahoo.vespa.clustercontroller.core.listeners.NodeListener;
import java.util.LinkedList;
@@ -12,6 +11,9 @@ import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
+import static com.yahoo.vdslib.state.State.DOWN;
+import static com.yahoo.vdslib.state.State.STOPPING;
+
/**
* Collects the state of all nodes by making remote requests and handling the replies.
*/
@@ -65,20 +67,20 @@ public class NodeStateGatherer {
if (info.getRpcAddress() == null || info.isNotInSlobrok()) { // Cannot query state of node without RPC address or not in slobrok
log.log(Level.FINE, () -> "Not sending getNodeState request to node " + info.getNode() + ": Not in slobrok");
NodeState reportedState = info.getReportedState().clone();
- if (( ! reportedState.getState().equals(State.DOWN) && currentTime - info.lastSeenInSlobrok() > maxSlobrokDisconnectGracePeriod)
- || reportedState.getState().equals(State.STOPPING)) // Don't wait for grace period if we expect node to be stopping
+ if (( ! reportedState.getState().equals(DOWN) && currentTime - info.lastSeenInSlobrok() > maxSlobrokDisconnectGracePeriod)
+ || reportedState.getState().equals(STOPPING)) // Don't wait for grace period if we expect node to be stopping
{
log.log(Level.FINE, () -> "Setting reported state to DOWN "
- + (reportedState.getState().equals(State.STOPPING)
+ + (reportedState.getState().equals(STOPPING)
? "as node completed stopping."
- : "as node has been out of slobrok longer than " + maxSlobrokDisconnectGracePeriod + "."));
+ : "as node has been out of slobrok longer than " + maxSlobrokDisconnectGracePeriod + " ms."));
if (reportedState.getState().oneOf("iur") || ! reportedState.hasDescription()) {
- StringBuilder sb = new StringBuilder().append("Set node down as it has been out of slobrok for ")
- .append(currentTime - info.lastSeenInSlobrok()).append(" ms which is more than the max limit of ")
- .append(maxSlobrokDisconnectGracePeriod).append(" ms.");
- reportedState.setDescription(sb.toString());
+ reportedState.setDescription("Set node down as it has been out of slobrok for " +
+ (currentTime - info.lastSeenInSlobrok()) +
+ " ms which is more than the max limit of " +
+ maxSlobrokDisconnectGracePeriod + " ms.");
}
- reportedState.setState(State.DOWN);
+ reportedState.setState(DOWN);
listener.handleNewNodeState(info, reportedState.clone());
}
info.setReportedState(reportedState, currentTime); // Must reset it to null to get connection attempts counted
@@ -135,7 +137,7 @@ public class NodeStateGatherer {
info.setReportedState(state, currentTime);
} catch (Exception e) {
log.log(Level.WARNING, "Failed to process get node state response", e);
- info.setReportedState(new NodeState(info.getNode().getType(), State.DOWN), currentTime);
+ info.setReportedState(new NodeState(info.getNode().getType(), DOWN), currentTime);
}
// Important: The old host info should be accessible in info.getHostInfo(), see interface.
@@ -152,7 +154,7 @@ public class NodeStateGatherer {
private NodeState handleError(GetNodeStateRequest req, NodeInfo info, long currentTime) {
String prefix = "Failed get node state request: ";
- NodeState newState = new NodeState(info.getNode().getType(), State.DOWN);
+ NodeState newState = new NodeState(info.getNode().getType(), DOWN);
if (req.getReply().getReturnCode() == ErrorCode.TIMEOUT) {
String msg = "RPC timeout";
if (info.getReportedState().getState().oneOf("ui")) {
@@ -177,7 +179,7 @@ public class NodeStateGatherer {
log.log(Level.FINE, "Failed to talk to node " + info + ": " + req.getReply().getReturnCode()
+ " " + req.getReply().getReturnMessage() + ": " + msg);
}
- newState.setState(State.DOWN);
+ newState.setState(DOWN);
} else if (msg.equals("jrt: Connection closed by peer") || msg.equals("Connection reset by peer")) {
msg = "Connection error: Closed at other end. (Node or switch likely shut down)";
if (info.isNotInSlobrok()) {
@@ -189,7 +191,7 @@ public class NodeStateGatherer {
if (log.isLoggable(Level.FINE))
log.log(Level.FINE, "Failed to talk to node " + info + ": " + req.getReply().getReturnCode() + " " + req.getReply().getReturnMessage() + ": " + msg);
}
- newState.setState(State.DOWN).setDescription(msg);
+ newState.setState(DOWN).setDescription(msg);
} else if (msg.equals("Connection timed out")) {
if (info.getReportedState().getState().oneOf("ui")) {
msg = "Connection error: Timeout";
@@ -228,11 +230,11 @@ public class NodeStateGatherer {
} else if (!info.getReportedState().hasDescription() || !info.getReportedState().getDescription().equals(msg)) {
log.log(Level.FINE, () -> "Failed to talk to node " + info + ": " + req.getReply().getReturnCode() + " " + req.getReply().getReturnMessage() + ": " + msg);
}
- newState.setState(State.DOWN).setDescription(msg + ": get node state");
+ newState.setState(DOWN).setDescription(msg + ": get node state");
} else if (req.getReply().getReturnCode() == 75004) {
String msg = "Node refused to answer RPC request and is likely stopping: " + req.getReply().getReturnMessage();
// The node is shutting down and is not accepting requests from anyone
- if (info.getReportedState().getState().equals(State.STOPPING)) {
+ if (info.getReportedState().getState().equals(STOPPING)) {
log.log(Level.FINE, () -> "Failed to get node state from " + info + " because it is still shutting down.");
} else {
if (info.getReportedState().getState().oneOf("ui")) {
@@ -241,7 +243,7 @@ public class NodeStateGatherer {
log.log(Level.FINE, () -> "Failed to talk to node " + info + ": " + req.getReply().getReturnCode() + " " + req.getReply().getReturnMessage() + ": " + msg);
}
}
- newState.setState(State.STOPPING).setDescription(msg);
+ newState.setState(STOPPING).setDescription(msg);
} else {
String msg = "Got unexpected error, assumed to be node issue " + req.getReply().getReturnCode() + ": " + req.getReply().getReturnMessage();
if (info.getReportedState().getState().oneOf("ui")) {
@@ -249,7 +251,7 @@ public class NodeStateGatherer {
} else if (!info.getReportedState().hasDescription() || !info.getReportedState().getDescription().equals(msg)) {
log.log(Level.FINE, () -> "Failed to talk to node " + info + ": " + req.getReply().getReturnCode() + " " + req.getReply().getReturnMessage() + ": " + msg);
}
- newState.setState(State.DOWN).setDescription(msg);
+ newState.setState(DOWN).setDescription(msg);
}
return newState;
}
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/StateChangeHandler.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/StateChangeHandler.java
index 4ab80ec6d7a..28149477e36 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/StateChangeHandler.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/StateChangeHandler.java
@@ -9,13 +9,19 @@ import com.yahoo.vdslib.state.NodeType;
import com.yahoo.vdslib.state.State;
import com.yahoo.vespa.clustercontroller.core.database.DatabaseHandler;
import com.yahoo.vespa.clustercontroller.core.listeners.NodeListener;
-
+import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.logging.Level;
import java.util.logging.Logger;
+import static com.yahoo.vdslib.state.State.DOWN;
+import static com.yahoo.vdslib.state.State.INITIALIZING;
+import static com.yahoo.vdslib.state.State.STOPPING;
+import static java.util.logging.Level.FINE;
+import static java.util.logging.Level.FINEST;
+
/**
* This class gets node state updates and timer events and uses these to decide
* whether a new cluster state should be generated.
@@ -52,9 +58,9 @@ public class StateChangeHandler {
public void handleAllDistributorsInSync(final ClusterState currentState,
final Set<ConfiguredNode> nodes,
final DatabaseHandler database,
- final DatabaseHandler.DatabaseContext dbContext) throws InterruptedException {
+ final DatabaseHandler.DatabaseContext dbContext) {
int startTimestampsReset = 0;
- context.log(log, Level.FINE, "handleAllDistributorsInSync invoked for state version %d", currentState.getVersion());
+ context.log(log, FINE, "handleAllDistributorsInSync invoked for state version %d", currentState.getVersion());
for (NodeType nodeType : NodeType.getTypes()) {
for (ConfiguredNode configuredNode : nodes) {
final Node node = new Node(nodeType, configuredNode.index());
@@ -62,15 +68,15 @@ public class StateChangeHandler {
final NodeState nodeState = currentState.getNodeState(node);
if (nodeInfo != null && nodeState != null) {
if (nodeState.getStartTimestamp() > nodeInfo.getStartTimestamp()) {
- log.log(Level.FINE, () -> String.format("Storing away new start timestamp for node %s (%d)", node, nodeState.getStartTimestamp()));
+ log.log(FINE, () -> String.format("Storing away new start timestamp for node %s (%d)", node, nodeState.getStartTimestamp()));
nodeInfo.setStartTimestamp(nodeState.getStartTimestamp());
}
if (nodeState.getStartTimestamp() > 0) {
- log.log(Level.FINE, "Resetting timestamp in cluster state for node %s", node);
+ log.log(FINE, () -> String.format("Resetting timestamp in cluster state for node %s", node));
++startTimestampsReset;
}
- } else if (log.isLoggable(Level.FINE)) {
- log.log(Level.FINE, node + ": " +
+ } else if (log.isLoggable(FINE)) {
+ log.log(FINE, node + ": " +
(nodeInfo == null ? "null" : nodeInfo.getStartTimestamp()) + ", " +
(nodeState == null ? "null" : nodeState.getStartTimestamp()));
}
@@ -83,7 +89,7 @@ public class StateChangeHandler {
stateMayHaveChanged = true;
database.saveStartTimestamps(dbContext);
} else {
- log.log(Level.FINE, "Found no start timestamps to reset in cluster state.");
+ log.log(FINE, "Found no start timestamps to reset in cluster state.");
}
}
@@ -110,48 +116,45 @@ public class StateChangeHandler {
// TODO nodeListener is only used via updateNodeInfoFromReportedState -> handlePrematureCrash
// TODO this will recursively invoke proposeNewNodeState, which will presumably (i.e. hopefully) be a no-op...
- public void handleNewReportedNodeState(final ClusterState currentClusterState,
- final NodeInfo node,
- final NodeState reportedState,
- final NodeListener nodeListener)
- {
- final NodeState currentState = currentClusterState.getNodeState(node.getNode());
- final Level level = (currentState.equals(reportedState) && node.getVersion() == 0) ? Level.FINEST : Level.FINE;
- if (log.isLoggable(level)) {
- log.log(level, String.format("Got nodestate reply from %s: %s (Current state is %s)",
- node, node.getReportedState().getTextualDifference(reportedState), currentState.toString(true)));
- }
- final long currentTime = timer.getCurrentTimeInMillis();
-
- if (reportedState.getState().equals(State.DOWN)) {
+ public void handleNewReportedNodeState(ClusterState currentClusterState,
+ NodeInfo node,
+ NodeState reportedState,
+ NodeListener nodeListener) {
+ NodeState currentState = currentClusterState.getNodeState(node.getNode());
+ Level level = (currentState.equals(reportedState) && node.getVersion() == 0) ? FINEST : FINE;
+ log.log(level, () -> String.format("Got nodestate reply from %s: %s (Current state is %s)",
+ node, node.getReportedState().getTextualDifference(reportedState), currentState.toString(true)));
+ long currentTime = timer.getCurrentTimeInMillis();
+
+ if (reportedState.getState().equals(DOWN)) {
node.setTimeOfFirstFailingConnectionAttempt(currentTime);
}
// *** LOGGING ONLY
if ( ! reportedState.similarTo(node.getReportedState())) {
- if (reportedState.getState().equals(State.DOWN)) {
+ if (reportedState.getState().equals(DOWN)) {
eventLog.addNodeOnlyEvent(NodeEvent.forBaseline(node, "Failed to get node state: " + reportedState.toString(true), NodeEvent.Type.REPORTED, currentTime), Level.INFO);
} else {
- eventLog.addNodeOnlyEvent(NodeEvent.forBaseline(node, "Now reporting state " + reportedState.toString(true), NodeEvent.Type.REPORTED, currentTime), Level.FINE);
+ eventLog.addNodeOnlyEvent(NodeEvent.forBaseline(node, "Now reporting state " + reportedState.toString(true), NodeEvent.Type.REPORTED, currentTime), FINE);
}
}
- if (reportedState.equals(node.getReportedState()) && ! reportedState.getState().equals(State.INITIALIZING)) {
+ if (reportedState.equals(node.getReportedState()) && ! reportedState.getState().equals(INITIALIZING)) {
return;
}
updateNodeInfoFromReportedState(node, currentState, reportedState, nodeListener);
if (reportedState.getMinUsedBits() != currentState.getMinUsedBits()) {
- final int oldCount = currentState.getMinUsedBits();
- final int newCount = reportedState.getMinUsedBits();
- log.log(Level.FINE,
+ int oldCount = currentState.getMinUsedBits();
+ int newCount = reportedState.getMinUsedBits();
+ log.log(FINE,
() -> String.format("Altering node state to reflect that min distribution bit count has changed from %d to %d", oldCount, newCount));
eventLog.add(NodeEvent.forBaseline(node, String.format("Altered min distribution bit count from %d to %d", oldCount, newCount),
NodeEvent.Type.CURRENT, currentTime), isMaster);
} else {
- log.log(Level.FINE, () -> String.format("Not altering state of %s in cluster state because new state is too similar: %s",
- node, currentState.getTextualDifference(reportedState)));
+ log.log(FINE, () -> String.format("Not altering state of %s in cluster state because new state is too similar: %s",
+ node, currentState.getTextualDifference(reportedState)));
}
stateMayHaveChanged = true;
@@ -162,10 +165,8 @@ public class StateChangeHandler {
eventLog.add(NodeEvent.forBaseline(node, message, NodeEvent.Type.REPORTED, timer.getCurrentTimeInMillis()), isMaster);
}
- public void handleMissingNode(final ClusterState currentClusterState,
- final NodeInfo node,
- final NodeListener nodeListener) {
- final long timeNow = timer.getCurrentTimeInMillis();
+ public void handleMissingNode(ClusterState currentClusterState, NodeInfo node, NodeListener nodeListener) {
+ long timeNow = timer.getCurrentTimeInMillis();
if (node.getLatestNodeStateRequestTime() != null) {
eventLog.add(NodeEvent.forBaseline(node, "Node is no longer in slobrok, but we still have a pending state request.", NodeEvent.Type.REPORTED, timeNow), isMaster);
@@ -173,13 +174,13 @@ public class StateChangeHandler {
eventLog.add(NodeEvent.forBaseline(node, "Node is no longer in slobrok. No pending state request to node.", NodeEvent.Type.REPORTED, timeNow), isMaster);
}
- if (node.getReportedState().getState().equals(State.STOPPING)) {
- log.log(Level.FINE, () -> "Node " + node.getNode() + " is no longer in slobrok. Was in stopping state, so assuming it has shut down normally. Setting node down");
+ if (node.getReportedState().getState().equals(STOPPING)) {
+ log.log(FINE, () -> "Node " + node.getNode() + " is no longer in slobrok. Was in stopping state, so assuming it has shut down normally. Setting node down");
NodeState ns = node.getReportedState().clone();
- ns.setState(State.DOWN);
+ ns.setState(DOWN);
handleNewReportedNodeState(currentClusterState, node, ns.clone(), nodeListener);
} else {
- log.log(Level.FINE, () -> "Node " + node.getNode() + " no longer in slobrok was in state " + node.getReportedState() + ". Waiting to see if it reappears in slobrok");
+ log.log(FINE, () -> "Node " + node.getNode() + " no longer in slobrok was in state " + node.getReportedState() + ". Waiting to see if it reappears in slobrok");
}
stateMayHaveChanged = true;
@@ -192,19 +193,19 @@ public class StateChangeHandler {
* If the newly proposed state differs from the state the node currently has in the system,
* a cluster state regeneration will be triggered.
*/
- public void proposeNewNodeState(final ClusterState currentClusterState, final NodeInfo node, final NodeState proposedState) {
- final NodeState currentState = currentClusterState.getNodeState(node.getNode());
- final NodeState currentReported = node.getReportedState();
+ public void proposeNewNodeState(ClusterState currentClusterState, NodeInfo node, NodeState proposedState) {
+ NodeState currentState = currentClusterState.getNodeState(node.getNode());
- if (currentState.getState().equals(proposedState.getState())) {
+ if (currentState.getState().equals(proposedState.getState()))
return;
- }
+
stateMayHaveChanged = true;
- log.log(Level.FINE, () -> String.format("Got new wanted nodestate for %s: %s", node, currentState.getTextualDifference(proposedState)));
+ log.log(FINE, () -> String.format("Got new wanted nodestate for %s: %s", node, currentState.getTextualDifference(proposedState)));
// Should be checked earlier before state was set in cluster
assert(proposedState.getState().validWantedNodeState(node.getNode().getType()));
long timeNow = timer.getCurrentTimeInMillis();
+ NodeState currentReported = node.getReportedState();
if (proposedState.above(currentReported)) {
eventLog.add(NodeEvent.forBaseline(node, String.format("Wanted state %s, but we cannot force node into that " +
"state yet as it is currently in %s", proposedState, currentReported),
@@ -239,12 +240,9 @@ public class StateChangeHandler {
// generated cluster state. Still a bit of a mine field...
// TODO remove all node state mutation from this function entirely in favor of ClusterStateGenerator!
// `--> this will require adding more event edges and premature crash handling to it. Which is fine.
- public boolean watchTimers(final ContentCluster cluster,
- final ClusterState currentClusterState,
- final NodeListener nodeListener)
- {
+ public boolean watchTimers(ContentCluster cluster, ClusterState currentClusterState, NodeListener nodeListener) {
boolean triggeredAnyTimers = false;
- final long currentTime = timer.getCurrentTimeInMillis();
+ long currentTime = timer.getCurrentTimeInMillis();
for(NodeInfo node : cluster.getNodeInfos()) {
triggeredAnyTimers |= handleTimeDependentOpsForNode(currentClusterState, nodeListener, currentTime, node);
@@ -256,23 +254,17 @@ public class StateChangeHandler {
return triggeredAnyTimers;
}
- private boolean handleTimeDependentOpsForNode(final ClusterState currentClusterState,
- final NodeListener nodeListener,
- final long currentTime,
- final NodeInfo node)
- {
- final NodeState currentStateInSystem = currentClusterState.getNodeState(node.getNode());
- final NodeState lastReportedState = node.getReportedState();
- boolean triggeredAnyTimers = false;
-
- triggeredAnyTimers = reportDownIfOutdatedSlobrokNode(
- currentClusterState, nodeListener, currentTime, node, lastReportedState);
+ private boolean handleTimeDependentOpsForNode(ClusterState currentClusterState,
+ NodeListener nodeListener,
+ long currentTime,
+ NodeInfo node) {
+ NodeState currentStateInSystem = currentClusterState.getNodeState(node.getNode());
+ NodeState lastReportedState = node.getReportedState();
+ boolean triggeredAnyTimers =
+ reportDownIfOutdatedSlobrokNode(currentClusterState, nodeListener, currentTime, node, lastReportedState);
- if (nodeStillUnavailableAfterTransitionTimeExceeded(
- currentTime, node, currentStateInSystem, lastReportedState))
- {
+ if (nodeStillUnavailableAfterTransitionTimeExceeded(currentTime, node, currentStateInSystem, lastReportedState))
triggeredAnyTimers = true;
- }
if (nodeInitProgressHasTimedOut(currentTime, node, currentStateInSystem, lastReportedState)) {
eventLog.add(NodeEvent.forBaseline(node, String.format(
@@ -287,11 +279,11 @@ public class StateChangeHandler {
if (mayResetCrashCounterOnStableUpNode(currentTime, node, lastReportedState)) {
node.setPrematureCrashCount(0);
- log.log(Level.FINE, () -> "Resetting premature crash count on node " + node + " as it has been up for a long time.");
+ log.log(FINE, () -> "Resetting premature crash count on node " + node + " as it has been up for a long time.");
triggeredAnyTimers = true;
} else if (mayResetCrashCounterOnStableDownNode(currentTime, node, lastReportedState)) {
node.setPrematureCrashCount(0);
- log.log(Level.FINE, () -> "Resetting premature crash count on node " + node + " as it has been down for a long time.");
+ log.log(FINE, () -> "Resetting premature crash count on node " + node + " as it has been down for a long time.");
triggeredAnyTimers = true;
}
@@ -299,17 +291,18 @@ public class StateChangeHandler {
}
private boolean nodeInitProgressHasTimedOut(long currentTime, NodeInfo node, NodeState currentStateInSystem, NodeState lastReportedState) {
- return !currentStateInSystem.getState().equals(State.DOWN)
- && node.getWantedState().above(new NodeState(node.getNode().getType(), State.DOWN))
- && lastReportedState.getState().equals(State.INITIALIZING)
+ return !currentStateInSystem.getState().equals(DOWN)
+ && node.getWantedState().above(new NodeState(node.getNode().getType(), DOWN))
+ && lastReportedState.getState().equals(INITIALIZING)
&& maxInitProgressTime != 0
&& node.getInitProgressTime() + maxInitProgressTime <= currentTime
&& node.getNode().getType().equals(NodeType.STORAGE);
}
+ // TODO: Merge this and the below method
private boolean mayResetCrashCounterOnStableDownNode(long currentTime, NodeInfo node, NodeState lastReportedState) {
return node.getDownStableStateTime() + stableStateTimePeriod <= currentTime
- && lastReportedState.getState().equals(State.DOWN)
+ && lastReportedState.getState().equals(DOWN)
&& node.getPrematureCrashCount() <= maxPrematureCrashes
&& node.getPrematureCrashCount() != 0;
}
@@ -328,8 +321,8 @@ public class StateChangeHandler {
NodeState lastReportedState)
{
return currentStateInSystem.getState().equals(State.MAINTENANCE)
- && node.getWantedState().above(new NodeState(node.getNode().getType(), State.DOWN))
- && (lastReportedState.getState().equals(State.DOWN) || node.isNotInSlobrok())
+ && node.getWantedState().above(new NodeState(node.getNode().getType(), DOWN))
+ && (lastReportedState.getState().equals(DOWN) || node.isNotInSlobrok())
&& node.getTransitionTime() + maxTransitionTime.get(node.getNode().getType()) < currentTime;
}
@@ -340,7 +333,7 @@ public class StateChangeHandler {
NodeState lastReportedState)
{
if (node.isNotInSlobrok()
- && !lastReportedState.getState().equals(State.DOWN)
+ && !lastReportedState.getState().equals(DOWN)
&& node.lastSeenInSlobrok() + maxSlobrokDisconnectGracePeriod <= currentTime)
{
final String desc = String.format(
@@ -350,7 +343,7 @@ public class StateChangeHandler {
maxSlobrokDisconnectGracePeriod);
node.abortCurrentNodeStateRequests();
NodeState state = lastReportedState.clone();
- state.setState(State.DOWN);
+ state.setState(DOWN);
if (!state.hasDescription()) {
state.setDescription(desc);
}
@@ -362,10 +355,12 @@ public class StateChangeHandler {
return false;
}
+ private boolean isNotControlledShutdown(NodeState state) { return ! isControlledShutdown(state); }
+
private boolean isControlledShutdown(NodeState state) {
- return (state.getState() == State.STOPPING
- && (state.getDescription().contains("Received signal 15 (SIGTERM - Termination signal)")
- || state.getDescription().contains("controlled shutdown")));
+ return state.getState() == State.STOPPING
+ && List.of("Received signal 15 (SIGTERM - Termination signal)", "controlled shutdown")
+ .stream().anyMatch(marker -> state.getDescription().contains(marker)); // substring match, as in the removed lines
}
/**
@@ -381,14 +376,14 @@ public class StateChangeHandler {
final NodeState reportedState,
final NodeListener nodeListener) {
final long timeNow = timer.getCurrentTimeInMillis();
- log.log(Level.FINE, () -> String.format("Finding new cluster state entry for %s switching state %s", node, currentState.getTextualDifference(reportedState)));
+ log.log(FINE, () -> String.format("Finding new cluster state entry for %s switching state %s", node, currentState.getTextualDifference(reportedState)));
if (handleReportedNodeCrashEdge(node, currentState, reportedState, nodeListener, timeNow)) {
return;
}
if (initializationProgressHasIncreased(currentState, reportedState)) {
node.setInitProgressTime(timeNow);
- log.log(Level.FINEST, () -> "Reset initialize timer on " + node + " to " + node.getInitProgressTime());
+ log.log(FINEST, () -> "Reset initialize timer on " + node + " to " + node.getInitProgressTime());
}
if (handleImplicitCrashEdgeFromReverseInitProgress(node, currentState, reportedState, nodeListener, timeNow)) {
return;
@@ -402,9 +397,9 @@ public class StateChangeHandler {
final NodeState reportedState,
final NodeListener nodeListener,
final long timeNow) {
- if (currentState.getState().equals(State.INITIALIZING)
+ if (currentState.getState().equals(INITIALIZING)
&& reportedState.getState().oneOf("ds")
- && !isControlledShutdown(reportedState))
+ && isNotControlledShutdown(reportedState))
{
eventLog.add(NodeEvent.forBaseline(node, String.format("Stop or crash during initialization. " +
"Premature crash count is now %d.", node.getPrematureCrashCount() + 1),
@@ -421,8 +416,8 @@ public class StateChangeHandler {
final NodeState reportedState,
final NodeListener nodeListener,
final long timeNow) {
- if (currentState.getState().equals(State.INITIALIZING) &&
- (reportedState.getState().equals(State.INITIALIZING) && reportedState.getInitProgress() < currentState.getInitProgress()))
+ if (currentState.getState().equals(INITIALIZING) &&
+ (reportedState.getState().equals(INITIALIZING) && reportedState.getInitProgress() < currentState.getInitProgress()))
{
eventLog.add(NodeEvent.forBaseline(node, String.format(
"Stop or crash during initialization detected from reverse initializing progress." +
@@ -442,8 +437,8 @@ public class StateChangeHandler {
long timeNow) {
if (nodeUpToDownEdge(node, currentState, reportedState)) {
node.setTransitionTime(timeNow);
- if (node.getUpStableStateTime() + stableStateTimePeriod > timeNow && !isControlledShutdown(reportedState)) {
- log.log(Level.FINE, () -> "Stable state: " + node.getUpStableStateTime() + " + " + stableStateTimePeriod + " > " + timeNow);
+ if (node.getUpStableStateTime() + stableStateTimePeriod > timeNow && isNotControlledShutdown(reportedState)) {
+ log.log(FINE, () -> "Stable state: " + node.getUpStableStateTime() + " + " + stableStateTimePeriod + " > " + timeNow);
eventLog.add(NodeEvent.forBaseline(node,
String.format("Stopped or possibly crashed after %d ms, which is before " +
"stable state time period. Premature crash count is now %d.",
@@ -457,20 +452,20 @@ public class StateChangeHandler {
}
private boolean initializationProgressHasIncreased(NodeState currentState, NodeState reportedState) {
- return reportedState.getState().equals(State.INITIALIZING) &&
- (!currentState.getState().equals(State.INITIALIZING) ||
+ return reportedState.getState().equals(INITIALIZING) &&
+ (!currentState.getState().equals(INITIALIZING) ||
reportedState.getInitProgress() > currentState.getInitProgress());
}
private boolean nodeUpToDownEdge(NodeInfo node, NodeState currentState, NodeState reportedState) {
return currentState.getState().oneOf("ur") && reportedState.getState().oneOf("dis")
- && (node.getWantedState().getState().equals(State.RETIRED) || !reportedState.getState().equals(State.INITIALIZING));
+ && (node.getWantedState().getState().equals(State.RETIRED) || !reportedState.getState().equals(INITIALIZING));
}
private boolean handlePrematureCrash(NodeInfo node, NodeListener changeListener) {
node.setPrematureCrashCount(node.getPrematureCrashCount() + 1);
if (disableUnstableNodes && node.getPrematureCrashCount() > maxPrematureCrashes) {
- NodeState wantedState = new NodeState(node.getNode().getType(), State.DOWN)
+ NodeState wantedState = new NodeState(node.getNode().getType(), DOWN)
.setDescription("Disabled by fleet controller as it prematurely shut down " + node.getPrematureCrashCount() + " times in a row");
NodeState oldState = node.getWantedState();
node.setWantedState(wantedState);
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Response.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Response.java
index cfe4c925551..4122abe1521 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Response.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Response.java
@@ -48,6 +48,8 @@ public class Response {
public String getId() { return id; }
@Override
public String getReason() { return reason; }
+ @Override
+ public String toString() { return getId() +": " + getReason(); }
}
public static class Link implements SubUnitList {
private final Map<String, String> links = new LinkedHashMap<>();
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java
index 1c72594377a..01a75034ddf 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java
@@ -72,14 +72,13 @@ public class SetNodeStateRequest extends Request<SetResponse> {
static NodeState getRequestedNodeState(Map<String, UnitState> newStates, Node n) throws StateRestApiException {
UnitState newState = newStates.get("user");
if (newState == null) throw new InvalidContentException("No new user state given in request");
- State state;
- switch (newState.getId().toLowerCase()) {
- case "up": state = State.UP; break;
- case "retired": state = State.RETIRED; break;
- case "maintenance": state = State.MAINTENANCE; break;
- case "down": state = State.DOWN; break;
- default: throw new InvalidContentException("Invalid user state '" + newState.getId() + "' given.");
- }
+ State state = switch (newState.getId().toLowerCase(java.util.Locale.ROOT)) {
+ case "up" -> State.UP;
+ case "retired" -> State.RETIRED;
+ case "maintenance" -> State.MAINTENANCE;
+ case "down" -> State.DOWN;
+ default -> throw new InvalidContentException("Invalid user state '" + newState.getId() + "' given.");
+ };
return new NodeState(n.getType(), state).setDescription(newState.getReason());
}
@@ -191,25 +190,18 @@ public class SetNodeStateRequest extends Request<SetResponse> {
boolean probe) {
Node distributorNode = new Node(NodeType.DISTRIBUTOR, index);
NodeInfo nodeInfo = cluster.getNodeInfo(distributorNode);
- if (nodeInfo == null) {
- throw new IllegalStateException("Missing distributor at index " +
- distributorNode.getIndex());
- }
+ if (nodeInfo == null)
+ throw new IllegalStateException("Missing distributor at index " + distributorNode.getIndex());
State newState;
switch (newStorageWantedState.getState()) {
- case MAINTENANCE:
- newState = State.DOWN;
- break;
- case RETIRED:
- newState = State.UP;
- break;
- default:
+ case MAINTENANCE -> newState = State.DOWN;
+ case RETIRED -> newState = State.UP;
+ default -> {
newState = newStorageWantedState.getState();
- if (!newState.validWantedNodeState(distributorNode.getType())) {
- throw new IllegalStateException("Distributor cannot be set to wanted state " +
- newState);
- }
+ if (!newState.validWantedNodeState(distributorNode.getType()))
+ throw new IllegalStateException("Distributor cannot be set to wanted state " + newState);
+ }
}
NodeState newWantedState = new NodeState(distributorNode.getType(), newState);