diff options
author | Harald Musum <musum@yahooinc.com> | 2023-03-30 08:55:23 +0200 |
---|---|---|
committer | Harald Musum <musum@yahooinc.com> | 2023-03-30 08:55:23 +0200 |
commit | dba0eb02a4811c954b55a81a2126f3c84a59a643 (patch) | |
tree | bd20fd2a04b60e3d2467138a430d5dc75555ca24 | |
parent | 0272c98d3ebd07f488c3d5c7ee71c71f601a84c8 (diff) |
Minor cleanup, no functional changes
6 files changed, 133 insertions, 144 deletions
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterInfo.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterInfo.java index 2cfaf64fe83..0f119d8de50 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterInfo.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterInfo.java @@ -15,7 +15,7 @@ import java.util.Set; import java.util.TreeMap; /** - * Detail information about the current state of all the distributor and storage nodes of the cluster. + * Detailed information about the current state of all the distributor and storage nodes of the cluster. * * @author hakonhall * @author bratseth @@ -127,11 +127,10 @@ public class ClusterInfo { /** Returns the node info object for a given node identifier */ private NodeInfo getInfo(Node node) { - switch (node.getType()) { - case DISTRIBUTOR : return getDistributorNodeInfo(node.getIndex()); - case STORAGE : return getStorageNodeInfo(node.getIndex()); - default : throw new IllegalArgumentException("No node type " + node.getType().toString()); - } + return switch (node.getType()) { + case DISTRIBUTOR -> getDistributorNodeInfo(node.getIndex()); + case STORAGE -> getStorageNodeInfo(node.getIndex()); + }; } } diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStateVersionSpecificRequest.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStateVersionSpecificRequest.java index 70fbbb60e26..6855859c96f 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStateVersionSpecificRequest.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStateVersionSpecificRequest.java @@ -2,7 +2,7 @@ package com.yahoo.vespa.clustercontroller.core; /** - * Base class for distributor/content node node RPC requests that are bound + * Base class for distributor/content node RPC requests that are bound * to a particular cluster state version. */ public abstract class ClusterStateVersionSpecificRequest { diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeInfo.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeInfo.java index 3f1a7ab5d7b..d7aac1c26fa 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeInfo.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeInfo.java @@ -55,7 +55,10 @@ abstract public class NodeInfo implements Comparable<NodeInfo> { private long nextAttemptTime; /** Cached connection to this node. */ private Target connection; - /** We cache last connection we did request info on, as we want to report appropriate error for node regardless of whether other commands have created new connection. */ + /** + * We cache last connection we did request info on, as we want to report appropriate error for + * node regardless of whether other commands have created new connection. + */ public Target lastRequestInfoConnection; /** * Counts the number of attempts we have tried since last time we had @@ -163,7 +166,7 @@ abstract public class NodeInfo implements Comparable<NodeInfo> { } if (prematureCrashCount != count) { prematureCrashCount = count; - log.log(Level.FINE, () -> "Premature crash count on " + toString() + " set to " + count); + log.log(Level.FINE, () -> "Premature crash count on " + this + " set to " + count); } } public int getPrematureCrashCount() { return prematureCrashCount; } @@ -311,13 +314,13 @@ abstract public class NodeInfo implements Comparable<NodeInfo> { } if (state.getState().equals(State.DOWN) && !reportedState.getState().oneOf("d")) { downStableStateTime = time; - log.log(Level.FINE, () -> "Down stable state on " + toString() + " altered to " + time); + log.log(Level.FINE, () -> "Down stable state on " + this + " altered to " + time); if (reportedState.getState() == State.INITIALIZING) { recentlyObservedUnstableDuringInit = true; } } else if (state.getState().equals(State.UP) && !reportedState.getState().oneOf("u")) { upStableStateTime = time; - log.log(Level.FINE, () -> "Up stable state on " + toString() + " altered to " + time); + log.log(Level.FINE, () -> "Up stable state on " + this + " altered to " + time); } if (!state.getState().validReportedNodeState(node.getType())) { throw new IllegalStateException("Trying to set illegal reported node state: " + state); @@ -340,14 +343,14 @@ abstract public class NodeInfo implements Comparable<NodeInfo> { } else { nextAttemptTime = time + 5000; } - log.log(Level.FINEST, () -> "Failed to get state from node " + toString() + ", scheduling next attempt in " + (nextAttemptTime - time) + " ms."); + log.log(Level.FINEST, () -> "Failed to get state from node " + this + ", scheduling next attempt in " + (nextAttemptTime - time) + " ms."); } else { connectionAttemptCount = 0; timeOfFirstFailingConnectionAttempt = 0; reportedState = state; if (version == 0 || state.getState().equals(State.STOPPING)) { nextAttemptTime = time + cluster.getPollingFrequency(); - log.log(Level.FINEST, () -> "Scheduling next attempt to get state from " + toString() + " in " + (nextAttemptTime - time) + " ms (polling freq)."); + log.log(Level.FINEST, () -> "Scheduling next attempt to get state from " + this + " in " + (nextAttemptTime - time) + " ms (polling freq)."); } else { nextAttemptTime = time; } @@ -368,7 +371,7 @@ abstract public class NodeInfo implements Comparable<NodeInfo> { } catch (Exception e) { StringWriter sw = new StringWriter(); e.printStackTrace(new PrintWriter(sw)); - log.warning("Attempted to set wanted state with more than just a main state. Extra data stripped. Original data '" + state.serialize(true) + ":\n" + sw.toString()); + log.warning("Attempted to set wanted state with more than just a main state. Extra data stripped. Original data '" + state.serialize(true) + ":\n" + sw); } } wantedState = newWanted; diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateGatherer.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateGatherer.java index 68e46414c22..6f4d0749f3f 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateGatherer.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateGatherer.java @@ -4,7 +4,6 @@ package com.yahoo.vespa.clustercontroller.core; import com.yahoo.jrt.ErrorCode; import com.yahoo.jrt.Target; import com.yahoo.vdslib.state.NodeState; -import com.yahoo.vdslib.state.State; import com.yahoo.vespa.clustercontroller.core.hostinfo.HostInfo; import com.yahoo.vespa.clustercontroller.core.listeners.NodeListener; import java.util.LinkedList; @@ -12,6 +11,9 @@ import java.util.List; import java.util.logging.Level; import java.util.logging.Logger; +import static com.yahoo.vdslib.state.State.DOWN; +import static com.yahoo.vdslib.state.State.STOPPING; + /** * Collects the state of all nodes by making remote requests and handling the replies. */ @@ -65,20 +67,20 @@ public class NodeStateGatherer { if (info.getRpcAddress() == null || info.isNotInSlobrok()) { // Cannot query state of node without RPC address or not in slobrok log.log(Level.FINE, () -> "Not sending getNodeState request to node " + info.getNode() + ": Not in slobrok"); NodeState reportedState = info.getReportedState().clone(); - if (( ! reportedState.getState().equals(State.DOWN) && currentTime - info.lastSeenInSlobrok() > maxSlobrokDisconnectGracePeriod) - || reportedState.getState().equals(State.STOPPING)) // Don't wait for grace period if we expect node to be stopping + if (( ! reportedState.getState().equals(DOWN) && currentTime - info.lastSeenInSlobrok() > maxSlobrokDisconnectGracePeriod) + || reportedState.getState().equals(STOPPING)) // Don't wait for grace period if we expect node to be stopping { log.log(Level.FINE, () -> "Setting reported state to DOWN " - + (reportedState.getState().equals(State.STOPPING) + + (reportedState.getState().equals(STOPPING) ? "as node completed stopping." - : "as node has been out of slobrok longer than " + maxSlobrokDisconnectGracePeriod + ".")); + : "as node has been out of slobrok longer than " + maxSlobrokDisconnectGracePeriod + " ms.")); if (reportedState.getState().oneOf("iur") || ! reportedState.hasDescription()) { - StringBuilder sb = new StringBuilder().append("Set node down as it has been out of slobrok for ") - .append(currentTime - info.lastSeenInSlobrok()).append(" ms which is more than the max limit of ") - .append(maxSlobrokDisconnectGracePeriod).append(" ms."); - reportedState.setDescription(sb.toString()); + reportedState.setDescription("Set node down as it has been out of slobrok for " + + (currentTime - info.lastSeenInSlobrok()) + + " ms which is more than the max limit of " + + maxSlobrokDisconnectGracePeriod + " ms."); } - reportedState.setState(State.DOWN); + reportedState.setState(DOWN); listener.handleNewNodeState(info, reportedState.clone()); } info.setReportedState(reportedState, currentTime); // Must reset it to null to get connection attempts counted @@ -135,7 +137,7 @@ public class NodeStateGatherer { info.setReportedState(state, currentTime); } catch (Exception e) { log.log(Level.WARNING, "Failed to process get node state response", e); - info.setReportedState(new NodeState(info.getNode().getType(), State.DOWN), currentTime); + info.setReportedState(new NodeState(info.getNode().getType(), DOWN), currentTime); } // Important: The old host info should be accessible in info.getHostInfo(), see interface. @@ -152,7 +154,7 @@ public class NodeStateGatherer { private NodeState handleError(GetNodeStateRequest req, NodeInfo info, long currentTime) { String prefix = "Failed get node state request: "; - NodeState newState = new NodeState(info.getNode().getType(), State.DOWN); + NodeState newState = new NodeState(info.getNode().getType(), DOWN); if (req.getReply().getReturnCode() == ErrorCode.TIMEOUT) { String msg = "RPC timeout"; if (info.getReportedState().getState().oneOf("ui")) { @@ -177,7 +179,7 @@ public class NodeStateGatherer { log.log(Level.FINE, "Failed to talk to node " + info + ": " + req.getReply().getReturnCode() + " " + req.getReply().getReturnMessage() + ": " + msg); } - newState.setState(State.DOWN); + newState.setState(DOWN); } else if (msg.equals("jrt: Connection closed by peer") || msg.equals("Connection reset by peer")) { msg = "Connection error: Closed at other end. (Node or switch likely shut down)"; if (info.isNotInSlobrok()) { @@ -189,7 +191,7 @@ public class NodeStateGatherer { if (log.isLoggable(Level.FINE)) log.log(Level.FINE, "Failed to talk to node " + info + ": " + req.getReply().getReturnCode() + " " + req.getReply().getReturnMessage() + ": " + msg); } - newState.setState(State.DOWN).setDescription(msg); + newState.setState(DOWN).setDescription(msg); } else if (msg.equals("Connection timed out")) { if (info.getReportedState().getState().oneOf("ui")) { msg = "Connection error: Timeout"; @@ -228,11 +230,11 @@ public class NodeStateGatherer { } else if (!info.getReportedState().hasDescription() || !info.getReportedState().getDescription().equals(msg)) { log.log(Level.FINE, () -> "Failed to talk to node " + info + ": " + req.getReply().getReturnCode() + " " + req.getReply().getReturnMessage() + ": " + msg); } - newState.setState(State.DOWN).setDescription(msg + ": get node state"); + newState.setState(DOWN).setDescription(msg + ": get node state"); } else if (req.getReply().getReturnCode() == 75004) { String msg = "Node refused to answer RPC request and is likely stopping: " + req.getReply().getReturnMessage(); // The node is shutting down and is not accepting requests from anyone - if (info.getReportedState().getState().equals(State.STOPPING)) { + if (info.getReportedState().getState().equals(STOPPING)) { log.log(Level.FINE, () -> "Failed to get node state from " + info + " because it is still shutting down."); } else { if (info.getReportedState().getState().oneOf("ui")) { @@ -241,7 +243,7 @@ public class NodeStateGatherer { log.log(Level.FINE, () -> "Failed to talk to node " + info + ": " + req.getReply().getReturnCode() + " " + req.getReply().getReturnMessage() + ": " + msg); } } - newState.setState(State.STOPPING).setDescription(msg); + newState.setState(STOPPING).setDescription(msg); } else { String msg = "Got unexpected error, assumed to be node issue " + req.getReply().getReturnCode() + ": " + req.getReply().getReturnMessage(); if (info.getReportedState().getState().oneOf("ui")) { @@ -249,7 +251,7 @@ public class NodeStateGatherer { } else if (!info.getReportedState().hasDescription() || !info.getReportedState().getDescription().equals(msg)) { log.log(Level.FINE, () -> "Failed to talk to node " + info + ": " + req.getReply().getReturnCode() + " " + req.getReply().getReturnMessage() + ": " + msg); } - newState.setState(State.DOWN).setDescription(msg); + newState.setState(DOWN).setDescription(msg); } return newState; } diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/StateChangeHandler.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/StateChangeHandler.java index 4ab80ec6d7a..a9325788cb6 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/StateChangeHandler.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/StateChangeHandler.java @@ -16,6 +16,12 @@ import java.util.TreeMap; import java.util.logging.Level; import java.util.logging.Logger; +import static com.yahoo.vdslib.state.State.DOWN; +import static com.yahoo.vdslib.state.State.INITIALIZING; +import static com.yahoo.vdslib.state.State.STOPPING; +import static java.util.logging.Level.FINE; +import static java.util.logging.Level.FINEST; + /** * This class gets node state updates and timer events and uses these to decide * whether a new cluster state should be generated. @@ -52,9 +58,9 @@ public class StateChangeHandler { public void handleAllDistributorsInSync(final ClusterState currentState, final Set<ConfiguredNode> nodes, final DatabaseHandler database, - final DatabaseHandler.DatabaseContext dbContext) throws InterruptedException { + final DatabaseHandler.DatabaseContext dbContext) { int startTimestampsReset = 0; - context.log(log, Level.FINE, "handleAllDistributorsInSync invoked for state version %d", currentState.getVersion()); + context.log(log, FINE, "handleAllDistributorsInSync invoked for state version %d", currentState.getVersion()); for (NodeType nodeType : NodeType.getTypes()) { for (ConfiguredNode configuredNode : nodes) { final Node node = new Node(nodeType, configuredNode.index()); @@ -62,15 +68,15 @@ public class StateChangeHandler { final NodeState nodeState = currentState.getNodeState(node); if (nodeInfo != null && nodeState != null) { if (nodeState.getStartTimestamp() > nodeInfo.getStartTimestamp()) { - log.log(Level.FINE, () -> String.format("Storing away new start timestamp for node %s (%d)", node, nodeState.getStartTimestamp())); + log.log(FINE, () -> String.format("Storing away new start timestamp for node %s (%d)", node, nodeState.getStartTimestamp())); nodeInfo.setStartTimestamp(nodeState.getStartTimestamp()); } if (nodeState.getStartTimestamp() > 0) { - log.log(Level.FINE, "Resetting timestamp in cluster state for node %s", node); + log.log(FINE, "Resetting timestamp in cluster state for node %s", node); ++startTimestampsReset; } - } else if (log.isLoggable(Level.FINE)) { - log.log(Level.FINE, node + ": " + + } else if (log.isLoggable(FINE)) { + log.log(FINE, node + ": " + (nodeInfo == null ? "null" : nodeInfo.getStartTimestamp()) + ", " + (nodeState == null ? "null" : nodeState.getStartTimestamp())); } @@ -83,7 +89,7 @@ public class StateChangeHandler { stateMayHaveChanged = true; database.saveStartTimestamps(dbContext); } else { - log.log(Level.FINE, "Found no start timestamps to reset in cluster state."); + log.log(FINE, "Found no start timestamps to reset in cluster state."); } } @@ -110,48 +116,45 @@ public class StateChangeHandler { // TODO nodeListener is only used via updateNodeInfoFromReportedState -> handlePrematureCrash // TODO this will recursively invoke proposeNewNodeState, which will presumably (i.e. hopefully) be a no-op... - public void handleNewReportedNodeState(final ClusterState currentClusterState, - final NodeInfo node, - final NodeState reportedState, - final NodeListener nodeListener) - { - final NodeState currentState = currentClusterState.getNodeState(node.getNode()); - final Level level = (currentState.equals(reportedState) && node.getVersion() == 0) ? Level.FINEST : Level.FINE; - if (log.isLoggable(level)) { - log.log(level, String.format("Got nodestate reply from %s: %s (Current state is %s)", - node, node.getReportedState().getTextualDifference(reportedState), currentState.toString(true))); - } - final long currentTime = timer.getCurrentTimeInMillis(); - - if (reportedState.getState().equals(State.DOWN)) { + public void handleNewReportedNodeState(ClusterState currentClusterState, + NodeInfo node, + NodeState reportedState, + NodeListener nodeListener) { + NodeState currentState = currentClusterState.getNodeState(node.getNode()); + Level level = (currentState.equals(reportedState) && node.getVersion() == 0) ? FINEST : FINE; + log.log(level, () -> String.format("Got nodestate reply from %s: %s (Current state is %s)", + node, node.getReportedState().getTextualDifference(reportedState), currentState.toString(true))); + long currentTime = timer.getCurrentTimeInMillis(); + + if (reportedState.getState().equals(DOWN)) { node.setTimeOfFirstFailingConnectionAttempt(currentTime); } // *** LOGGING ONLY if ( ! reportedState.similarTo(node.getReportedState())) { - if (reportedState.getState().equals(State.DOWN)) { + if (reportedState.getState().equals(DOWN)) { eventLog.addNodeOnlyEvent(NodeEvent.forBaseline(node, "Failed to get node state: " + reportedState.toString(true), NodeEvent.Type.REPORTED, currentTime), Level.INFO); } else { - eventLog.addNodeOnlyEvent(NodeEvent.forBaseline(node, "Now reporting state " + reportedState.toString(true), NodeEvent.Type.REPORTED, currentTime), Level.FINE); + eventLog.addNodeOnlyEvent(NodeEvent.forBaseline(node, "Now reporting state " + reportedState.toString(true), NodeEvent.Type.REPORTED, currentTime), FINE); } } - if (reportedState.equals(node.getReportedState()) && ! reportedState.getState().equals(State.INITIALIZING)) { + if (reportedState.equals(node.getReportedState()) && ! reportedState.getState().equals(INITIALIZING)) { return; } updateNodeInfoFromReportedState(node, currentState, reportedState, nodeListener); if (reportedState.getMinUsedBits() != currentState.getMinUsedBits()) { - final int oldCount = currentState.getMinUsedBits(); - final int newCount = reportedState.getMinUsedBits(); - log.log(Level.FINE, + int oldCount = currentState.getMinUsedBits(); + int newCount = reportedState.getMinUsedBits(); + log.log(FINE, () -> String.format("Altering node state to reflect that min distribution bit count has changed from %d to %d", oldCount, newCount)); eventLog.add(NodeEvent.forBaseline(node, String.format("Altered min distribution bit count from %d to %d", oldCount, newCount), NodeEvent.Type.CURRENT, currentTime), isMaster); } else { - log.log(Level.FINE, () -> String.format("Not altering state of %s in cluster state because new state is too similar: %s", - node, currentState.getTextualDifference(reportedState))); + log.log(FINE, () -> String.format("Not altering state of %s in cluster state because new state is too similar: %s", + node, currentState.getTextualDifference(reportedState))); } stateMayHaveChanged = true; @@ -162,10 +165,8 @@ public class StateChangeHandler { eventLog.add(NodeEvent.forBaseline(node, message, NodeEvent.Type.REPORTED, timer.getCurrentTimeInMillis()), isMaster); } - public void handleMissingNode(final ClusterState currentClusterState, - final NodeInfo node, - final NodeListener nodeListener) { - final long timeNow = timer.getCurrentTimeInMillis(); + public void handleMissingNode(ClusterState currentClusterState, NodeInfo node, NodeListener nodeListener) { + long timeNow = timer.getCurrentTimeInMillis(); if (node.getLatestNodeStateRequestTime() != null) { eventLog.add(NodeEvent.forBaseline(node, "Node is no longer in slobrok, but we still have a pending state request.", NodeEvent.Type.REPORTED, timeNow), isMaster); @@ -173,13 +174,13 @@ public class StateChangeHandler { eventLog.add(NodeEvent.forBaseline(node, "Node is no longer in slobrok. No pending state request to node.", NodeEvent.Type.REPORTED, timeNow), isMaster); } - if (node.getReportedState().getState().equals(State.STOPPING)) { - log.log(Level.FINE, () -> "Node " + node.getNode() + " is no longer in slobrok. Was in stopping state, so assuming it has shut down normally. Setting node down"); + if (node.getReportedState().getState().equals(STOPPING)) { + log.log(FINE, () -> "Node " + node.getNode() + " is no longer in slobrok. Was in stopping state, so assuming it has shut down normally. Setting node down"); NodeState ns = node.getReportedState().clone(); - ns.setState(State.DOWN); + ns.setState(DOWN); handleNewReportedNodeState(currentClusterState, node, ns.clone(), nodeListener); } else { - log.log(Level.FINE, () -> "Node " + node.getNode() + " no longer in slobrok was in state " + node.getReportedState() + ". Waiting to see if it reappears in slobrok"); + log.log(FINE, () -> "Node " + node.getNode() + " no longer in slobrok was in state " + node.getReportedState() + ". Waiting to see if it reappears in slobrok"); } stateMayHaveChanged = true; @@ -192,19 +193,19 @@ public class StateChangeHandler { * If the newly proposed state differs from the state the node currently has in the system, * a cluster state regeneration will be triggered. */ - public void proposeNewNodeState(final ClusterState currentClusterState, final NodeInfo node, final NodeState proposedState) { - final NodeState currentState = currentClusterState.getNodeState(node.getNode()); - final NodeState currentReported = node.getReportedState(); + public void proposeNewNodeState(ClusterState currentClusterState, NodeInfo node, NodeState proposedState) { + NodeState currentState = currentClusterState.getNodeState(node.getNode()); - if (currentState.getState().equals(proposedState.getState())) { + if (currentState.getState().equals(proposedState.getState())) return; - } + stateMayHaveChanged = true; - log.log(Level.FINE, () -> String.format("Got new wanted nodestate for %s: %s", node, currentState.getTextualDifference(proposedState))); + log.log(FINE, () -> String.format("Got new wanted nodestate for %s: %s", node, currentState.getTextualDifference(proposedState))); // Should be checked earlier before state was set in cluster assert(proposedState.getState().validWantedNodeState(node.getNode().getType())); long timeNow = timer.getCurrentTimeInMillis(); + NodeState currentReported = node.getReportedState(); if (proposedState.above(currentReported)) { eventLog.add(NodeEvent.forBaseline(node, String.format("Wanted state %s, but we cannot force node into that " + "state yet as it is currently in %s", proposedState, currentReported), @@ -239,12 +240,9 @@ public class StateChangeHandler { // generated cluster state. Still a bit of a mine field... // TODO remove all node state mutation from this function entirely in favor of ClusterStateGenerator! // `--> this will require adding more event edges and premature crash handling to it. Which is fine. - public boolean watchTimers(final ContentCluster cluster, - final ClusterState currentClusterState, - final NodeListener nodeListener) - { + public boolean watchTimers(ContentCluster cluster, ClusterState currentClusterState, NodeListener nodeListener) { boolean triggeredAnyTimers = false; - final long currentTime = timer.getCurrentTimeInMillis(); + long currentTime = timer.getCurrentTimeInMillis(); for(NodeInfo node : cluster.getNodeInfos()) { triggeredAnyTimers |= handleTimeDependentOpsForNode(currentClusterState, nodeListener, currentTime, node); @@ -256,23 +254,17 @@ public class StateChangeHandler { return triggeredAnyTimers; } - private boolean handleTimeDependentOpsForNode(final ClusterState currentClusterState, - final NodeListener nodeListener, - final long currentTime, - final NodeInfo node) - { - final NodeState currentStateInSystem = currentClusterState.getNodeState(node.getNode()); - final NodeState lastReportedState = node.getReportedState(); - boolean triggeredAnyTimers = false; + private boolean handleTimeDependentOpsForNode(ClusterState currentClusterState, + NodeListener nodeListener, + long currentTime, + NodeInfo node) { + NodeState currentStateInSystem = currentClusterState.getNodeState(node.getNode()); + NodeState lastReportedState = node.getReportedState(); + boolean triggeredAnyTimers = + reportDownIfOutdatedSlobrokNode(currentClusterState, nodeListener, currentTime, node, lastReportedState); - triggeredAnyTimers = reportDownIfOutdatedSlobrokNode( - currentClusterState, nodeListener, currentTime, node, lastReportedState); - - if (nodeStillUnavailableAfterTransitionTimeExceeded( - currentTime, node, currentStateInSystem, lastReportedState)) - { + if (nodeStillUnavailableAfterTransitionTimeExceeded(currentTime, node, currentStateInSystem, lastReportedState)) triggeredAnyTimers = true; - } if (nodeInitProgressHasTimedOut(currentTime, node, currentStateInSystem, lastReportedState)) { eventLog.add(NodeEvent.forBaseline(node, String.format( @@ -287,11 +279,11 @@ public class StateChangeHandler { if (mayResetCrashCounterOnStableUpNode(currentTime, node, lastReportedState)) { node.setPrematureCrashCount(0); - log.log(Level.FINE, () -> "Resetting premature crash count on node " + node + " as it has been up for a long time."); + log.log(FINE, () -> "Resetting premature crash count on node " + node + " as it has been up for a long time."); triggeredAnyTimers = true; } else if (mayResetCrashCounterOnStableDownNode(currentTime, node, lastReportedState)) { node.setPrematureCrashCount(0); - log.log(Level.FINE, () -> "Resetting premature crash count on node " + node + " as it has been down for a long time."); + log.log(FINE, () -> "Resetting premature crash count on node " + node + " as it has been down for a long time."); triggeredAnyTimers = true; } @@ -299,17 +291,18 @@ public class StateChangeHandler { } private boolean nodeInitProgressHasTimedOut(long currentTime, NodeInfo node, NodeState currentStateInSystem, NodeState lastReportedState) { - return !currentStateInSystem.getState().equals(State.DOWN) - && node.getWantedState().above(new NodeState(node.getNode().getType(), State.DOWN)) - && lastReportedState.getState().equals(State.INITIALIZING) + return !currentStateInSystem.getState().equals(DOWN) + && node.getWantedState().above(new NodeState(node.getNode().getType(), DOWN)) + && lastReportedState.getState().equals(INITIALIZING) && maxInitProgressTime != 0 && node.getInitProgressTime() + maxInitProgressTime <= currentTime && node.getNode().getType().equals(NodeType.STORAGE); } + // TODO: Merge this and the below method private boolean mayResetCrashCounterOnStableDownNode(long currentTime, NodeInfo node, NodeState lastReportedState) { return node.getDownStableStateTime() + stableStateTimePeriod <= currentTime - && lastReportedState.getState().equals(State.DOWN) + && lastReportedState.getState().equals(DOWN) && node.getPrematureCrashCount() <= maxPrematureCrashes && node.getPrematureCrashCount() != 0; } @@ -328,8 +321,8 @@ public class StateChangeHandler { NodeState lastReportedState) { return currentStateInSystem.getState().equals(State.MAINTENANCE) - && node.getWantedState().above(new NodeState(node.getNode().getType(), State.DOWN)) - && (lastReportedState.getState().equals(State.DOWN) || node.isNotInSlobrok()) + && node.getWantedState().above(new NodeState(node.getNode().getType(), DOWN)) + && (lastReportedState.getState().equals(DOWN) || node.isNotInSlobrok()) && node.getTransitionTime() + maxTransitionTime.get(node.getNode().getType()) < currentTime; } @@ -340,7 +333,7 @@ public class StateChangeHandler { NodeState lastReportedState) { if (node.isNotInSlobrok() - && !lastReportedState.getState().equals(State.DOWN) + && !lastReportedState.getState().equals(DOWN) && node.lastSeenInSlobrok() + maxSlobrokDisconnectGracePeriod <= currentTime) { final String desc = String.format( @@ -350,7 +343,7 @@ public class StateChangeHandler { maxSlobrokDisconnectGracePeriod); node.abortCurrentNodeStateRequests(); NodeState state = lastReportedState.clone(); - state.setState(State.DOWN); + state.setState(DOWN); if (!state.hasDescription()) { state.setDescription(desc); } @@ -362,10 +355,10 @@ public class StateChangeHandler { return false; } - private boolean isControlledShutdown(NodeState state) { - return (state.getState() == State.STOPPING - && (state.getDescription().contains("Received signal 15 (SIGTERM - Termination signal)") - || state.getDescription().contains("controlled shutdown"))); + private boolean isNotControlledShutdown(NodeState state) { + return (state.getState() != STOPPING + || (!state.getDescription().contains("Received signal 15 (SIGTERM - Termination signal)") + && !state.getDescription().contains("controlled shutdown"))); } /** @@ -381,14 +374,14 @@ public class StateChangeHandler { final NodeState reportedState, final NodeListener nodeListener) { final long timeNow = timer.getCurrentTimeInMillis(); - log.log(Level.FINE, () -> String.format("Finding new cluster state entry for %s switching state %s", node, currentState.getTextualDifference(reportedState))); + log.log(FINE, () -> String.format("Finding new cluster state entry for %s switching state %s", node, currentState.getTextualDifference(reportedState))); if (handleReportedNodeCrashEdge(node, currentState, reportedState, nodeListener, timeNow)) { return; } if (initializationProgressHasIncreased(currentState, reportedState)) { node.setInitProgressTime(timeNow); - log.log(Level.FINEST, () -> "Reset initialize timer on " + node + " to " + node.getInitProgressTime()); + log.log(FINEST, () -> "Reset initialize timer on " + node + " to " + node.getInitProgressTime()); } if (handleImplicitCrashEdgeFromReverseInitProgress(node, currentState, reportedState, nodeListener, timeNow)) { return; @@ -402,9 +395,9 @@ public class StateChangeHandler { final NodeState reportedState, final NodeListener nodeListener, final long timeNow) { - if (currentState.getState().equals(State.INITIALIZING) + if (currentState.getState().equals(INITIALIZING) && reportedState.getState().oneOf("ds") - && !isControlledShutdown(reportedState)) + && isNotControlledShutdown(reportedState)) { eventLog.add(NodeEvent.forBaseline(node, String.format("Stop or crash during initialization. " + "Premature crash count is now %d.", node.getPrematureCrashCount() + 1), @@ -421,8 +414,8 @@ public class StateChangeHandler { final NodeState reportedState, final NodeListener nodeListener, final long timeNow) { - if (currentState.getState().equals(State.INITIALIZING) && - (reportedState.getState().equals(State.INITIALIZING) && reportedState.getInitProgress() < currentState.getInitProgress())) + if (currentState.getState().equals(INITIALIZING) && + (reportedState.getState().equals(INITIALIZING) && reportedState.getInitProgress() < currentState.getInitProgress())) { eventLog.add(NodeEvent.forBaseline(node, String.format( "Stop or crash during initialization detected from reverse initializing progress." + @@ -442,8 +435,8 @@ public class StateChangeHandler { long timeNow) { if (nodeUpToDownEdge(node, currentState, reportedState)) { node.setTransitionTime(timeNow); - if (node.getUpStableStateTime() + stableStateTimePeriod > timeNow && !isControlledShutdown(reportedState)) { - log.log(Level.FINE, () -> "Stable state: " + node.getUpStableStateTime() + " + " + stableStateTimePeriod + " > " + timeNow); + if (node.getUpStableStateTime() + stableStateTimePeriod > timeNow && isNotControlledShutdown(reportedState)) { + log.log(FINE, () -> "Stable state: " + node.getUpStableStateTime() + " + " + stableStateTimePeriod + " > " + timeNow); eventLog.add(NodeEvent.forBaseline(node, String.format("Stopped or possibly crashed after %d ms, which is before " + "stable state time period. Premature crash count is now %d.", @@ -457,20 +450,20 @@ public class StateChangeHandler { } private boolean initializationProgressHasIncreased(NodeState currentState, NodeState reportedState) { - return reportedState.getState().equals(State.INITIALIZING) && - (!currentState.getState().equals(State.INITIALIZING) || + return reportedState.getState().equals(INITIALIZING) && + (!currentState.getState().equals(INITIALIZING) || reportedState.getInitProgress() > currentState.getInitProgress()); } private boolean nodeUpToDownEdge(NodeInfo node, NodeState currentState, NodeState reportedState) { return currentState.getState().oneOf("ur") && reportedState.getState().oneOf("dis") - && (node.getWantedState().getState().equals(State.RETIRED) || !reportedState.getState().equals(State.INITIALIZING)); + && (node.getWantedState().getState().equals(State.RETIRED) || !reportedState.getState().equals(INITIALIZING)); } private boolean handlePrematureCrash(NodeInfo node, NodeListener changeListener) { node.setPrematureCrashCount(node.getPrematureCrashCount() + 1); if (disableUnstableNodes && node.getPrematureCrashCount() > maxPrematureCrashes) { - NodeState wantedState = new NodeState(node.getNode().getType(), State.DOWN) + NodeState wantedState = new NodeState(node.getNode().getType(), DOWN) .setDescription("Disabled by fleet controller as it prematurely shut down " + node.getPrematureCrashCount() + " times in a row"); NodeState oldState = node.getWantedState(); node.setWantedState(wantedState); diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java index 1c72594377a..01a75034ddf 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java @@ -72,14 +72,13 @@ public class SetNodeStateRequest extends Request<SetResponse> { static NodeState getRequestedNodeState(Map<String, UnitState> newStates, Node n) throws StateRestApiException { UnitState newState = newStates.get("user"); if (newState == null) throw new InvalidContentException("No new user state given in request"); - State state; - switch (newState.getId().toLowerCase()) { - case "up": state = State.UP; break; - case "retired": state = State.RETIRED; break; - case "maintenance": state = State.MAINTENANCE; break; - case "down": state = State.DOWN; break; - default: throw new InvalidContentException("Invalid user state '" + newState.getId() + "' given."); - } + State state = switch (newState.getId().toLowerCase()) { + case "up" -> State.UP; + case "retired" -> State.RETIRED; + case "maintenance" -> State.MAINTENANCE; + case "down" -> State.DOWN; + default -> throw new InvalidContentException("Invalid user state '" + newState.getId() + "' given."); + }; return new NodeState(n.getType(), state).setDescription(newState.getReason()); } @@ -191,25 +190,18 @@ public class SetNodeStateRequest extends Request<SetResponse> { boolean probe) { Node distributorNode = new Node(NodeType.DISTRIBUTOR, index); NodeInfo nodeInfo = cluster.getNodeInfo(distributorNode); - if (nodeInfo == null) { - throw new IllegalStateException("Missing distributor at index " + - distributorNode.getIndex()); - } + if (nodeInfo == null) + throw new IllegalStateException("Missing distributor at index " + distributorNode.getIndex()); State newState; switch (newStorageWantedState.getState()) { - case MAINTENANCE: - newState = State.DOWN; - break; - case RETIRED: - newState = State.UP; - break; - default: + case MAINTENANCE -> newState = State.DOWN; + case RETIRED -> newState = State.UP; + default -> { newState = newStorageWantedState.getState(); - if (!newState.validWantedNodeState(distributorNode.getType())) { - throw new IllegalStateException("Distributor cannot be set to wanted state " + - newState); - } + if (!newState.validWantedNodeState(distributorNode.getType())) + throw new IllegalStateException("Distributor cannot be set to wanted state " + newState); + } } NodeState newWantedState = new NodeState(distributorNode.getType(), newState); |