diff options
author | Tor Brede Vekterli <vekterli@yahoo-inc.com> | 2016-06-17 16:49:39 +0200 |
---|---|---|
committer | Tor Brede Vekterli <vekterli@yahoo-inc.com> | 2016-06-17 16:49:39 +0200 |
commit | 90c3b21d21d83f2cc4b148afe4d4e6278d1f8394 (patch) | |
tree | da4dde815dbcf47eb4744091dab1b66aebc65ccc | |
parent | 916e5a5b8a4574bb2d878c5b07c97b1678df81b3 (diff) |
Don't reintroduce already observed timestamps in cluster state
Also address code review comments.
2 files changed, 60 insertions, 9 deletions
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateGenerator.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateGenerator.java index 3dd1216d5d0..2af9fe1f091 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateGenerator.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateGenerator.java @@ -336,6 +336,16 @@ public class SystemStateGenerator { baseline.setState(wanted.getState()); } } + // Don't reintroduce start timestamp to the node's state if it has already been + // observed by all distributors. This matches how handleNewReportedNodeState() sets timestamps. + // TODO make timestamp semantics clearer. Non-obvious what the two different timestamp stores imply. + // For posterity: reported.getStartTimestamp() is the start timestamp the node itself has stated. + // info.getStartTimestamp() is the timestamp written as having been observed by all distributors + // (which is done in handleAllDistributorsInSync()). + if (reported.getStartTimestamp() <= info.getStartTimestamp()) { + baseline.setStartTimestamp(0); + } + return baseline; } @@ -349,24 +359,32 @@ public class SystemStateGenerator { final Node node = storageNode(i); final NodeInfo info = cluster.getNodeInfo(node); final NodeState currentState = candidateState.getNodeState(node); - if (currentState.getState() == State.DOWN) { - if (mayClearNodeDownState(info)) { - candidateState.setNodeState(node, baselineNodeState(info)); - clearedNodes.add(i); - } + if (mayClearCurrentNodeState(currentState, info)) { + candidateState.setNodeState(node, baselineNodeState(info)); + clearedNodes.add(i); } } return clearedNodes; } - private boolean mayClearNodeDownState(NodeInfo info) { + private boolean mayClearCurrentNodeState(NodeState currentState, NodeInfo info) { + if (currentState.getState() != State.DOWN) { + return false; + } if (info == null) { // Nothing known about node in cluster info; we definitely don't want it // to be taken up at this point. return false; } + // Rationale: we can only enter this statement if the _current_ (generated) state + // of the node is Down. Aside from the group take-down logic, there should not exist + // any other edges in the cluster controller state transition logic where a node + // may be set Down while both its reported state, RPC connectivity and wanted state + // imply that a better state should already have been chosen. Consequently we allow + // the node to have its Down-state cleared. return (info.getReportedState().getState() != State.DOWN - && info.getWantedState().getState().oneOf("ur")); + && !info.isRpcAddressOutdated() + && !info.getWantedState().getState().oneOf("d")); } private ClusterStateView createNextVersionOfClusterStateView(ContentCluster cluster) { @@ -736,7 +754,7 @@ public class SystemStateGenerator { triggeredAnyTimers = true; } - // If node haven't increased its initializing progress within initprogresstime, mark it down. + // If node hasn't increased its initializing progress within initprogresstime, mark it down. if (!currentStateInSystem.getState().equals(State.DOWN) && node.getWantedState().above(new NodeState(node.getNode().getType(), State.DOWN)) && lastReportedState.getState().equals(State.INITIALIZING) diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAutoTakedownTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAutoTakedownTest.java index 93e34b1f772..be60fba234a 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAutoTakedownTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAutoTakedownTest.java @@ -7,6 +7,7 @@ import com.yahoo.vdslib.state.Node; import com.yahoo.vdslib.state.NodeState; import com.yahoo.vdslib.state.NodeType; import com.yahoo.vdslib.state.State; +import com.yahoo.vespa.clustercontroller.core.database.DatabaseHandler; import com.yahoo.vespa.clustercontroller.core.listeners.NodeStateOrHostInfoChangeHandler; import com.yahoo.vespa.clustercontroller.core.listeners.SystemStateListener; import org.junit.Test; @@ -23,6 +24,7 @@ import static org.mockito.Matchers.any; import static org.mockito.Matchers.argThat; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; public class GroupAutoTakedownTest { @@ -308,7 +310,6 @@ public class GroupAutoTakedownTest { ClusterFixture fixture = createFixtureForAllUpHierarchicCluster( DistributionBuilder.withGroups(3).eachWithNodeCount(2), 0.51); - final Node node = new Node(NodeType.STORAGE, 4); final NodeState newState = new NodeState(NodeType.STORAGE, State.INITIALIZING); newState.setInitProgress(0.5); @@ -367,4 +368,36 @@ public class GroupAutoTakedownTest { verboseStateAfterStorageTransition(fixture, 4, State.UP)); } + @Test + public void previously_cleared_start_timestamps_are_not_reintroduced_on_up_edge() throws Exception { + ClusterFixture fixture = createFixtureForAllUpHierarchicCluster( + DistributionBuilder.withGroups(3).eachWithNodeCount(2), 0.51); + + final NodeState newState = new NodeState(NodeType.STORAGE, State.UP); + newState.setStartTimestamp(123456); + + fixture.reportStorageNodeState(4, newState); + + SystemStateListener listener = mock(SystemStateListener.class); + assertTrue(fixture.generator.notifyIfNewSystemState(fixture.cluster, listener)); + + assertEquals("version:1 distributor:6 storage:6 .4.t:123456", fixture.generatedClusterState()); + + DatabaseHandler handler = mock(DatabaseHandler.class); + DatabaseHandler.Context context = mock(DatabaseHandler.Context.class); + when(context.getCluster()).thenReturn(fixture.cluster); + + fixture.generator.handleAllDistributorsInSync(handler, context); + assertTrue(fixture.generator.notifyIfNewSystemState(fixture.cluster, listener)); + + // Timestamp should now be cleared from state + assertEquals("version:2 distributor:6 storage:6", fixture.generatedClusterState()); + + // Trigger a group down+up edge. Timestamp should _not_ be reintroduced since it was previously cleared. + assertEquals("version:3 distributor:6 storage:4", + stateAfterStorageTransition(fixture, 5, State.DOWN)); + assertEquals("version:4 distributor:6 storage:6", + stateAfterStorageTransition(fixture, 5, State.UP)); + } + } |