From 336ae2c005c07c1f55295414e921e88a8ec81679 Mon Sep 17 00:00:00 2001 From: Valerij Fredriksen Date: Fri, 1 Dec 2017 11:35:21 +0100 Subject: Add TRANSITIONING state in NodeAdminStateUpdater --- .../admin/nodeadmin/NodeAdminStateUpdater.java | 14 +++-- .../admin/nodeadmin/NodeAdminStateUpdaterTest.java | 70 ++++++++++++++++------ 2 files changed, 59 insertions(+), 25 deletions(-) (limited to 'node-admin') diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java index 906e6b55640..e380e7a2e26 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java @@ -34,6 +34,7 @@ import java.util.stream.Collectors; import static com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdater.State.RESUMED; import static com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN; +import static com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdater.State.TRANSITIONING; /** * Pulls information from node repository and forwards containers to run to node admin. @@ -110,15 +111,15 @@ public class NodeAdminStateUpdater { return this.getClass().getSimpleName() + "@" + Integer.toString(System.identityHashCode(this)); } - public enum State { RESUMED, SUSPENDED_NODE_ADMIN, SUSPENDED} + public enum State { TRANSITIONING, RESUMED, SUSPENDED_NODE_ADMIN, SUSPENDED} public Map getDebugPage() { Map debug = new LinkedHashMap<>(); synchronized (monitor) { debug.put("dockerHostHostName", dockerHostHostName); + debug.put("wantedState", wantedState); + debug.put("currentState", currentState); debug.put("NodeAdmin", nodeAdmin.debugInfo()); - debug.put("Wanted State: ", wantedState); - debug.put("Current State: ", currentState); } return debug; } @@ -185,7 +186,7 @@ public class NodeAdminStateUpdater { log.log(LogLevel.ERROR, "Error while trying to converge to " + wantedStateCopy, e); } - if (wantedStateCopy != RESUMED && currentState == RESUMED) { + if (wantedStateCopy != RESUMED && currentState == TRANSITIONING) { Duration subsystemFreezeDuration = nodeAdmin.subsystemFreezeDuration(); if (subsystemFreezeDuration.compareTo(FREEZE_CONVERGENCE_TIMEOUT) > 0) { // We have spent too much time trying to freeze and node admin is still not frozen. @@ -203,8 +204,9 @@ public class NodeAdminStateUpdater { * with respect to: freeze, Orchestrator, and services running. */ private void convergeState(State wantedState) { - if (currentState == wantedState) { - return; + if (currentState == wantedState) return; + synchronized (monitor) { + currentState = TRANSITIONING; } boolean wantFrozen = wantedState != RESUMED; diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java index ce062702a3b..1058278a02c 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java @@ -13,8 +13,9 @@ import org.junit.Test; import java.io.IOException; import java.time.Duration; import java.util.ArrayList; -import java.util.Arrays; import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.IntStream; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; @@ -50,26 +51,13 @@ public class NodeAdminStateUpdaterTest { @Test public void testStateConvergence() throws IOException { - List containersToRun = new ArrayList<>(); - for (int i = 0; i < 10; i++) { - containersToRun.add( - new ContainerNodeSpec.Builder() - .hostname("host" + i + ".test.yahoo.com") - .nodeState(i % 3 == 0 ? Node.State.active : Node.State.ready) - .nodeType("tenant") - .nodeFlavor("docker") - .minCpuCores(1) - .minMainMemoryAvailableGb(1) - .minDiskAvailableGb(1) - .build()); - } - List activeHostnames = Arrays.asList( - "host0.test.yahoo.com", "host3.test.yahoo.com", "host6.test.yahoo.com", "host9.test.yahoo.com"); + mockNodeRepo(4); + List activeHostnames = nodeRepository.getContainersToRun(parentHostname).stream() + .map(node -> node.hostname) + .collect(Collectors.toList()); List suspendHostnames = new ArrayList<>(activeHostnames); suspendHostnames.add(parentHostname); - when(nodeRepository.getContainersToRun(eq(parentHostname))).thenReturn(containersToRun); - // Initially everything is frozen to force convergence assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED)); when(nodeAdmin.setFrozen(eq(false))).thenReturn(true); @@ -148,7 +136,7 @@ public class NodeAdminStateUpdaterTest { assertTrue(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED)); verify(refresher, times(2)).signalWorkToBeDone(); // No change in desired state verifyNoMoreInteractions(nodeAdmin); - + // Lets try going back to resumed when(nodeAdmin.setFrozen(eq(false))).thenReturn(false).thenReturn(true); // NodeAgents not converged to yet assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED)); @@ -163,6 +151,50 @@ public class NodeAdminStateUpdaterTest { assertTrue(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED)); } + @Test + public void half_transition_revert() throws IOException { + mockNodeRepo(3); + + // Initially everything is frozen to force convergence + when(nodeAdmin.setFrozen(eq(false))).thenReturn(true); + doNothing().when(orchestrator).resume(parentHostname); + tickAfter(0); // The first tick should unfreeze + assertTrue(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED)); + verify(nodeAdmin, times(1)).setFrozen(eq(false)); + + // Let's start suspending, we are able to freeze the nodes, but orchestrator denies suspension + when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofSeconds(1)); + when(nodeAdmin.setFrozen(eq(true))).thenReturn(true); + doThrow(new RuntimeException("Cannot allow to suspend because some reason")) + .when(orchestrator).suspend(eq(parentHostname)); + + assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN)); + tickAfter(0); + verify(nodeAdmin, times(1)).setFrozen(eq(true)); + + // We change our mind, want to remain resumed + assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED)); + tickAfter(0); + assertTrue(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED)); + verify(nodeAdmin, times(2)).setFrozen(eq(false)); // Make sure that we unfreeze! + } + + private void mockNodeRepo(int numberOfNodes) throws IOException { + List containersToRun = IntStream.range(0, numberOfNodes) + .mapToObj(i -> new ContainerNodeSpec.Builder() + .hostname("host" + i + ".test.yahoo.com") + .nodeState(Node.State.active) + .nodeType("tenant") + .nodeFlavor("docker") + .minCpuCores(1) + .minMainMemoryAvailableGb(1) + .minDiskAvailableGb(1) + .build()) + .collect(Collectors.toList()); + + when(nodeRepository.getContainersToRun(eq(parentHostname))).thenReturn(containersToRun); + } + private void tickAfter(int seconds) { clock.advance(Duration.ofSeconds(seconds)); refresher.tick(); -- cgit v1.2.3