author    Tor Brede Vekterli <vekterli@yahoo-inc.com>  2016-06-17 16:49:39 +0200
committer Tor Brede Vekterli <vekterli@yahoo-inc.com>  2016-06-17 16:49:39 +0200
commit    90c3b21d21d83f2cc4b148afe4d4e6278d1f8394 (patch)
tree      da4dde815dbcf47eb4744091dab1b66aebc65ccc /clustercontroller-core
parent    916e5a5b8a4574bb2d878c5b07c97b1678df81b3 (diff)
Don't reintroduce already observed timestamps in cluster state
Also address code review comments.
Diffstat (limited to 'clustercontroller-core')
-rw-r--r-- clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateGenerator.java | 34
-rw-r--r-- clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAutoTakedownTest.java | 35
2 files changed, 60 insertions, 9 deletions
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateGenerator.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateGenerator.java
index 3dd1216d5d0..2af9fe1f091 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateGenerator.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateGenerator.java
@@ -336,6 +336,16 @@ public class SystemStateGenerator {
baseline.setState(wanted.getState());
}
}
+ // Don't reintroduce the start timestamp to the node's state if it has already been
+ // observed by all distributors. This matches how handleNewReportedNodeState() sets timestamps.
+ // TODO: make timestamp semantics clearer; it is non-obvious what the two different timestamp stores imply.
+ // For posterity: reported.getStartTimestamp() is the start timestamp the node itself has stated.
+ // info.getStartTimestamp() is the timestamp written as having been observed by all distributors
+ // (which is done in handleAllDistributorsInSync()).
+ if (reported.getStartTimestamp() <= info.getStartTimestamp()) {
+ baseline.setStartTimestamp(0);
+ }
+
return baseline;
}
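To make the new timestamp rule concrete: a node's self-reported start timestamp is published in the generated state only until handleAllDistributorsInSync() records it as observed by every distributor; after that, re-publishing it would reintroduce a timestamp the distributors have already acted on. A minimal sketch of that decision follows, using illustrative names rather than the actual SystemStateGenerator API:

// Sketch only; reportedTs mirrors reported.getStartTimestamp() and
// observedTs mirrors info.getStartTimestamp() from the diff above.
final class StartTimestampRuleSketch {
    static long publishedStartTimestamp(long reportedTs, long observedTs) {
        // Once all distributors have observed the reported timestamp,
        // clear it (0) instead of reintroducing it into the cluster state.
        return (reportedTs <= observedTs) ? 0 : reportedTs;
    }

    public static void main(String[] args) {
        System.out.println(publishedStartTimestamp(123456, 0));      // 123456: not yet observed
        System.out.println(publishedStartTimestamp(123456, 123456)); // 0: already observed by all
    }
}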
@@ -349,24 +359,32 @@ public class SystemStateGenerator {
final Node node = storageNode(i);
final NodeInfo info = cluster.getNodeInfo(node);
final NodeState currentState = candidateState.getNodeState(node);
- if (currentState.getState() == State.DOWN) {
- if (mayClearNodeDownState(info)) {
- candidateState.setNodeState(node, baselineNodeState(info));
- clearedNodes.add(i);
- }
+ if (mayClearCurrentNodeState(currentState, info)) {
+ candidateState.setNodeState(node, baselineNodeState(info));
+ clearedNodes.add(i);
}
}
return clearedNodes;
}
- private boolean mayClearNodeDownState(NodeInfo info) {
+ private boolean mayClearCurrentNodeState(NodeState currentState, NodeInfo info) {
+ if (currentState.getState() != State.DOWN) {
+ return false;
+ }
if (info == null) {
// Nothing known about node in cluster info; we definitely don't want it
// to be taken up at this point.
return false;
}
+ // Rationale: we can only reach this point if the _current_ (generated) state
+ // of the node is Down. Aside from the group take-down logic, there should not exist
+ // any other edge in the cluster controller state transition logic where a node
+ // may be set Down while its reported state, RPC connectivity and wanted state
+ // all imply that a better state should already have been chosen. Consequently we
+ // allow the node to have its Down-state cleared.
return (info.getReportedState().getState() != State.DOWN
- && info.getWantedState().getState().oneOf("ur"));
+ && !info.isRpcAddressOutdated()
+ && !info.getWantedState().getState().oneOf("d"));
}
private ClusterStateView createNextVersionOfClusterStateView(ContentCluster cluster) {
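Note the semantic shift in the predicate above: the old code cleared a Down state only when the wanted state was Up or Retired (oneOf("ur")), whereas the new code clears it for any wanted state other than Down and additionally requires the node's RPC address to be current. A distilled sketch of the new rule, with boolean parameters standing in for the NodeState/NodeInfo queries the real implementation makes:

// Sketch only; each boolean mirrors one query made by mayClearCurrentNodeState().
final class DownClearingSketch {
    static boolean mayClearDownState(boolean generatedStateDown, boolean nodeInfoKnown,
                                     boolean reportedDown, boolean rpcAddressOutdated,
                                     boolean wantedDown) {
        if (!generatedStateDown) return false; // only Down states are candidates for clearing
        if (!nodeInfoKnown) return false;      // unknown node must not be taken up
        return !reportedDown && !rpcAddressOutdated && !wantedDown;
    }

    public static void main(String[] args) {
        // Healthy node whose group was auto-taken down: may be cleared back up.
        System.out.println(mayClearDownState(true, true, false, false, false)); // true
        // Node with an outdated RPC address stays Down.
        System.out.println(mayClearDownState(true, true, false, true, false));  // false
    }
}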
@@ -736,7 +754,7 @@ public class SystemStateGenerator {
triggeredAnyTimers = true;
}
- // If node haven't increased its initializing progress within initprogresstime, mark it down.
+ // If node hasn't increased its initializing progress within initprogresstime, mark it down.
if (!currentStateInSystem.getState().equals(State.DOWN)
&& node.getWantedState().above(new NodeState(node.getNode().getType(), State.DOWN))
&& lastReportedState.getState().equals(State.INITIALIZING)
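The comment fixed in this hunk documents an otherwise unchanged watchdog: a node that reports Initializing but makes no progress within the configured init progress time is marked down. A simplified sketch of such a check, with hypothetical names and time bookkeeping:

// Sketch only; parameter names are illustrative, not the real fields.
final class InitProgressWatchdogSketch {
    static boolean shouldMarkDown(boolean currentlyDown, boolean wantedAboveDown,
                                  boolean initializing, long nowMillis,
                                  long lastProgressMillis, long initProgressTimeMillis) {
        return !currentlyDown
                && wantedAboveDown
                && initializing
                && (nowMillis - lastProgressMillis) > initProgressTimeMillis;
    }
}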
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAutoTakedownTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAutoTakedownTest.java
index 93e34b1f772..be60fba234a 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAutoTakedownTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAutoTakedownTest.java
@@ -7,6 +7,7 @@ import com.yahoo.vdslib.state.Node;
import com.yahoo.vdslib.state.NodeState;
import com.yahoo.vdslib.state.NodeType;
import com.yahoo.vdslib.state.State;
+import com.yahoo.vespa.clustercontroller.core.database.DatabaseHandler;
import com.yahoo.vespa.clustercontroller.core.listeners.NodeStateOrHostInfoChangeHandler;
import com.yahoo.vespa.clustercontroller.core.listeners.SystemStateListener;
import org.junit.Test;
@@ -23,6 +24,7 @@ import static org.mockito.Matchers.any;
import static org.mockito.Matchers.argThat;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
public class GroupAutoTakedownTest {
@@ -308,7 +310,6 @@ public class GroupAutoTakedownTest {
ClusterFixture fixture = createFixtureForAllUpHierarchicCluster(
DistributionBuilder.withGroups(3).eachWithNodeCount(2), 0.51);
- final Node node = new Node(NodeType.STORAGE, 4);
final NodeState newState = new NodeState(NodeType.STORAGE, State.INITIALIZING);
newState.setInitProgress(0.5);
@@ -367,4 +368,36 @@ public class GroupAutoTakedownTest {
verboseStateAfterStorageTransition(fixture, 4, State.UP));
}
+ @Test
+ public void previously_cleared_start_timestamps_are_not_reintroduced_on_up_edge() throws Exception {
+ ClusterFixture fixture = createFixtureForAllUpHierarchicCluster(
+ DistributionBuilder.withGroups(3).eachWithNodeCount(2), 0.51);
+
+ final NodeState newState = new NodeState(NodeType.STORAGE, State.UP);
+ newState.setStartTimestamp(123456);
+
+ fixture.reportStorageNodeState(4, newState);
+
+ SystemStateListener listener = mock(SystemStateListener.class);
+ assertTrue(fixture.generator.notifyIfNewSystemState(fixture.cluster, listener));
+
+ assertEquals("version:1 distributor:6 storage:6 .4.t:123456", fixture.generatedClusterState());
+
+ DatabaseHandler handler = mock(DatabaseHandler.class);
+ DatabaseHandler.Context context = mock(DatabaseHandler.Context.class);
+ when(context.getCluster()).thenReturn(fixture.cluster);
+
+ fixture.generator.handleAllDistributorsInSync(handler, context);
+ assertTrue(fixture.generator.notifyIfNewSystemState(fixture.cluster, listener));
+
+ // Timestamp should now be cleared from state
+ assertEquals("version:2 distributor:6 storage:6", fixture.generatedClusterState());
+
+ // Trigger a group down+up edge. Timestamp should _not_ be reintroduced since it was previously cleared.
+ assertEquals("version:3 distributor:6 storage:4",
+ stateAfterStorageTransition(fixture, 5, State.DOWN));
+ assertEquals("version:4 distributor:6 storage:6",
+ stateAfterStorageTransition(fixture, 5, State.UP));
+ }
+
}