diff options
author | Håkon Hallingstad <hakon@yahoo-inc.com> | 2017-05-21 01:28:02 +0200 |
---|---|---|
committer | Håkon Hallingstad <hakon@yahoo-inc.com> | 2017-05-21 01:28:02 +0200 |
commit | 5a3e16eb58305a61d6accf0fd104159728017729 (patch) | |
tree | 63b81a3fc0072e78982d78ec7bcc44edd11386fd /clustercontroller-core | |
parent | 414a52d969971841abdb4b93e1e955e3266414ec (diff) |
Verify version and reported state
Diffstat (limited to 'clustercontroller-core')
2 files changed, 76 insertions, 11 deletions
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java index 1f2897a1453..7f2d9ba9611 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java @@ -6,6 +6,7 @@ import com.yahoo.vdslib.state.Node; import com.yahoo.vdslib.state.NodeState; import com.yahoo.vdslib.state.NodeType; import com.yahoo.vdslib.state.State; +import com.yahoo.vespa.clustercontroller.core.hostinfo.HostInfo; import com.yahoo.vespa.clustercontroller.core.hostinfo.Metrics; import com.yahoo.vespa.clustercontroller.core.hostinfo.StorageNode; import com.yahoo.vespa.clustercontroller.utils.staterestapi.requests.SetUnitStateRequest; @@ -120,11 +121,17 @@ public class NodeStateChangeChecker { } private Result canSetStateDownPermanently(Node node, ClusterState clusterState) { - NodeInfo nodeInfo = clusterInfo.getNodeInfo(node); + StorageNodeInfo nodeInfo = clusterInfo.getStorageNodeInfo(node.getIndex()); if (nodeInfo == null) { return Result.createDisallowed("Unknown node " + node); } + State reportedState = nodeInfo.getReportedState().getState(); + if (reportedState != State.UP) { + return Result.createDisallowed("Reported state (" + reportedState + + ") is not UP, so no bucket data is available"); + } + NodeState currentState = clusterState.getNodeState(node); if (currentState.getState() != State.RETIRED) { return Result.createDisallowed("Only retired nodes are allowed to be set to DOWN in safe mode - is " @@ -136,8 +143,16 @@ public class NodeStateChangeChecker { return thresholdCheckResult; } - Optional<Metrics.Value> bucketsMetric = clusterInfo.getStorageNodeInfo(node.getIndex()) - .getHostInfo().getMetrics().getValue(BUCKETS_METRIC_NAME); + HostInfo hostInfo = nodeInfo.getHostInfo(); + Integer hostInfoNodeVersion = hostInfo.getClusterStateVersionOrNull(); + int clusterControllerVersion = clusterState.getVersion(); + if (hostInfoNodeVersion == null || hostInfoNodeVersion != clusterControllerVersion) { + return Result.createDisallowed("Cluster controller at version " + clusterControllerVersion + + " got info for storage node " + node.getIndex() + " at a different version " + + hostInfoNodeVersion); + } + + Optional<Metrics.Value> bucketsMetric = hostInfo.getMetrics().getValue(BUCKETS_METRIC_NAME); if (!bucketsMetric.isPresent() || bucketsMetric.get().getLast() == null) { return Result.createDisallowed("Missing last value of the " + BUCKETS_METRIC_NAME + " metric for storage node " + node.getIndex()); diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java index 37c241e7bc6..9dd1d305405 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java @@ -20,9 +20,13 @@ import java.util.HashSet; import java.util.List; import java.util.Set; -import static org.hamcrest.core.StringContains.containsString; import static org.hamcrest.core.Is.is; -import static org.junit.Assert.*; +import static org.hamcrest.core.StringContains.containsString; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotEquals; +import static org.junit.Assert.assertThat; +import static org.junit.Assert.assertTrue; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -413,8 +417,9 @@ public class NodeStateChangeCheckerTest { ContentCluster cluster = createCluster(createNodes(4)); NodeStateChangeChecker nodeStateChangeChecker = createChangeChecker(cluster); - cluster.clusterInfo().getStorageNodeInfo(nodeStorage.getIndex()) - .setHostInfo(createHostInfoWithMetrics(currentClusterStateVersion, 1)); + StorageNodeInfo nodeInfo = cluster.clusterInfo().getStorageNodeInfo(nodeStorage.getIndex()); + nodeInfo.setReportedState(new NodeState(NodeType.STORAGE, State.UP), 0); + nodeInfo.setHostInfo(createHostInfoWithMetrics(currentClusterStateVersion, 1)); NodeStateChangeChecker.Result result = nodeStateChangeChecker.evaluateTransition( nodeStorage, defaultAllUpClusterState(), SetUnitStateRequest.Condition.SAFE, @@ -433,8 +438,9 @@ public class NodeStateChangeCheckerTest { "version:%d distributor:4 storage:4 .%d.s:r", currentClusterStateVersion, nodeStorage.getIndex())); - cluster.clusterInfo().getStorageNodeInfo(nodeStorage.getIndex()) - .setHostInfo(createHostInfoWithMetrics(currentClusterStateVersion, 1)); + StorageNodeInfo nodeInfo = cluster.clusterInfo().getStorageNodeInfo(nodeStorage.getIndex()); + nodeInfo.setReportedState(new NodeState(NodeType.STORAGE, State.UP), 0); + nodeInfo.setHostInfo(createHostInfoWithMetrics(currentClusterStateVersion, 1)); NodeStateChangeChecker.Result result = nodeStateChangeChecker.evaluateTransition( nodeStorage, stateWithRetiredNode, SetUnitStateRequest.Condition.SAFE, @@ -445,6 +451,49 @@ public class NodeStateChangeCheckerTest { } @Test + public void testDownDisallowedByReportedState() { + ContentCluster cluster = createCluster(createNodes(4)); + NodeStateChangeChecker nodeStateChangeChecker = createChangeChecker(cluster); + + ClusterState stateWithRetiredNode = clusterState(String.format( + "version:%d distributor:4 storage:4 .%d.s:r", + currentClusterStateVersion, nodeStorage.getIndex())); + + StorageNodeInfo nodeInfo = cluster.clusterInfo().getStorageNodeInfo(nodeStorage.getIndex()); + nodeInfo.setReportedState(new NodeState(NodeType.STORAGE, State.INITIALIZING), 0); + nodeInfo.setHostInfo(createHostInfoWithMetrics(currentClusterStateVersion, 0)); + + NodeStateChangeChecker.Result result = nodeStateChangeChecker.evaluateTransition( + nodeStorage, stateWithRetiredNode, SetUnitStateRequest.Condition.SAFE, + UP_NODE_STATE, DOWN_NODE_STATE); + assertFalse(result.settingWantedStateIsAllowed()); + assertFalse(result.wantedStateAlreadySet()); + assertEquals("Reported state (Initializing) is not UP, so no bucket data is available", result.getReason()); + } + + @Test + public void testDownDisallowedByVersionMismatch() { + ContentCluster cluster = createCluster(createNodes(4)); + NodeStateChangeChecker nodeStateChangeChecker = createChangeChecker(cluster); + + ClusterState stateWithRetiredNode = clusterState(String.format( + "version:%d distributor:4 storage:4 .%d.s:r", + currentClusterStateVersion, nodeStorage.getIndex())); + + StorageNodeInfo nodeInfo = cluster.clusterInfo().getStorageNodeInfo(nodeStorage.getIndex()); + nodeInfo.setReportedState(new NodeState(NodeType.STORAGE, State.UP), 0); + nodeInfo.setHostInfo(createHostInfoWithMetrics(currentClusterStateVersion - 1, 0)); + + NodeStateChangeChecker.Result result = nodeStateChangeChecker.evaluateTransition( + nodeStorage, stateWithRetiredNode, SetUnitStateRequest.Condition.SAFE, + UP_NODE_STATE, DOWN_NODE_STATE); + assertFalse(result.settingWantedStateIsAllowed()); + assertFalse(result.wantedStateAlreadySet()); + assertEquals("Cluster controller at version 2 got info for storage node 1 at a different version 1", + result.getReason()); + } + + @Test public void testAllowedToSetDown() { ContentCluster cluster = createCluster(createNodes(4)); NodeStateChangeChecker nodeStateChangeChecker = createChangeChecker(cluster); @@ -453,8 +502,9 @@ public class NodeStateChangeCheckerTest { "version:%d distributor:4 storage:4 .%d.s:r", currentClusterStateVersion, nodeStorage.getIndex())); - cluster.clusterInfo().getStorageNodeInfo(nodeStorage.getIndex()) - .setHostInfo(createHostInfoWithMetrics(currentClusterStateVersion, 0)); + StorageNodeInfo nodeInfo = cluster.clusterInfo().getStorageNodeInfo(nodeStorage.getIndex()); + nodeInfo.setReportedState(new NodeState(NodeType.STORAGE, State.UP), 0); + nodeInfo.setHostInfo(createHostInfoWithMetrics(currentClusterStateVersion, 0)); NodeStateChangeChecker.Result result = nodeStateChangeChecker.evaluateTransition( nodeStorage, stateWithRetiredNode, SetUnitStateRequest.Condition.SAFE, |