diff options
author | hakonhall <hakon@yahoo-inc.com> | 2017-04-05 09:17:25 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-04-05 09:17:25 +0200 |
commit | a3a23a45f9d65a26dbc57299258d48ba880aa6f4 (patch) | |
tree | 7ed533563031480913d53a12c114c56219bf5534 /node-admin/src | |
parent | e9c418d113cee3075d07279f3589fa6a144e593b (diff) | |
parent | 3f2a4dfa290bf68f06169191f6fd828e357f2e07 (diff) |
Merge pull request #2153 from yahoo/freva/suspend-all-on-node-admin-upgrade
Freva/suspend all on node admin upgrade
Diffstat (limited to 'node-admin/src')
5 files changed, 50 insertions, 55 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java index 4748670a4be..cc953ae729b 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java @@ -105,7 +105,7 @@ public class NodeAdminImpl implements NodeAdmin { @Override public boolean setFrozen(boolean wantFrozen) { - // Use filter with count instead of allMatch() because allMatch() will short curcuit on first non-match + // Use filter with count instead of allMatch() because allMatch() will short circuit on first non-match boolean allNodeAgentsConverged = nodeAgents.values().stream() .filter(nodeAgent -> !nodeAgent.setFrozen(wantFrozen)) .count() == 0; @@ -129,6 +129,7 @@ public class NodeAdminImpl implements NodeAdmin { .forEach(NodeAgent::stopServices); } + @Override public Set<String> getListOfHosts() { return nodeAgents.keySet(); } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java index d581a6fadc6..26139575f92 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java @@ -12,6 +12,7 @@ import java.io.IOException; import java.time.Clock; import java.time.Duration; import java.time.Instant; +import java.util.ArrayList; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; @@ -132,9 +133,8 @@ public class NodeAdminStateUpdater extends AbstractComponent { /** * This method attempts to converge NodeAgent's and NodeAdmin's frozen state with their orchestrator * state. When trying to suspend node-admin, this method will first attempt to freeze all NodeAgents and - * NodeAdmin, then asking orchestrator for permission to suspend node-admin app, and finally asking orchestrator - * for permission to suspend all active nodes on this host, if either of the request is denied, - * this method will unfreeze NodeAgents and NodeAdmin. + * NodeAdmin, then asking orchestrator for permission to suspend all active nodes on this host, including + * node-admin itself, if the request is denied, this method will unfreeze NodeAgents and NodeAdmin. */ private void convergeState(State wantedState) { boolean wantFrozen = wantedState != RESUMED; @@ -150,18 +150,6 @@ public class NodeAdminStateUpdater extends AbstractComponent { return; } - if (currentState == RESUMED) { - if (! orchestrator.suspend(dockerHostHostName)) { - nodeAdmin.setFrozen(false); - throw new RuntimeException("Failed to get permission to suspend node-admin, resuming."); - } - - synchronized (monitor) { - currentState = SUSPENDED_NODE_ADMIN; - } - if (wantedState == currentState) return; - } - // Fetch active nodes from node repo before suspending nodes. // It is only possible to suspend active nodes, // the orchestrator will fail if trying to suspend nodes in other states. @@ -175,14 +163,21 @@ public class NodeAdminStateUpdater extends AbstractComponent { throw new RuntimeException("Failed to get nodes from node repo:" + e.getMessage()); } - if (nodesInActiveState.size() > 0) { - orchestrator.suspend(dockerHostHostName, nodesInActiveState).ifPresent(orchestratorResponse -> { + if (currentState == RESUMED) { + List<String> nodesToSuspend = new ArrayList<>(nodesInActiveState); + nodesToSuspend.add(dockerHostHostName); + orchestrator.suspend(dockerHostHostName, nodesToSuspend).ifPresent(orchestratorResponse -> { nodeAdmin.setFrozen(false); throw new RuntimeException("Failed to get permission to suspend, resuming. Reason: " + orchestratorResponse); }); - nodeAdmin.stopNodeAgentServices(nodesInActiveState); + + synchronized (monitor) { + currentState = SUSPENDED_NODE_ADMIN; + } + if (wantedState == currentState) return; } + nodeAdmin.stopNodeAgentServices(nodesInActiveState); synchronized (monitor) { currentState = SUSPENDED; } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java index 883c3e8d0a4..8128bb47eac 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java @@ -54,8 +54,8 @@ public class NodeAgentImpl implements NodeAgent { private final PrefixLogger logger; private DockerImage imageBeingDownloaded = null; - private final String hostname; + private final String hostname; private final ContainerName containerName; private final NodeRepository nodeRepository; private final Orchestrator orchestrator; @@ -69,9 +69,9 @@ public class NodeAgentImpl implements NodeAgent { private final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); private final LinkedList<String> debugMessages = new LinkedList<>(); - private long delaysBetweenEachTickMillis = 30_000; + private long delaysBetweenEachConvergeMillis = 30_000; private int numberOfUnhandledException = 0; - private Instant lastTick; + private Instant lastConverge; private Thread loopThread; @@ -109,7 +109,7 @@ public class NodeAgentImpl implements NodeAgent { this.environment = environment; this.clock = clock; this.aclMaintainer = aclMaintainer; - this.lastTick = clock.instant(); + this.lastConverge = clock.instant(); // If the container is already running, initialize vespaVersion and lastCpuMetric lastCpuMetric = new CpuUsageReporter(clock.instant()); @@ -165,7 +165,7 @@ public class NodeAgentImpl implements NodeAgent { @Override public void start(int intervalMillis) { addDebugMessage("Starting with interval " + intervalMillis + "ms"); - delaysBetweenEachTickMillis = intervalMillis; + delaysBetweenEachConvergeMillis = intervalMillis; if (loopThread != null) { throw new RuntimeException("Can not restart a node agent."); } @@ -374,7 +374,7 @@ public class NodeAgentImpl implements NodeAgent { boolean isFrozenCopy; synchronized (monitor) { while (! workToDoNow) { - long remainder = delaysBetweenEachTickMillis - Duration.between(lastTick, clock.instant()).toMillis(); + long remainder = delaysBetweenEachConvergeMillis - Duration.between(lastConverge, clock.instant()).toMillis(); if (remainder > 0) { try { monitor.wait(remainder); @@ -383,7 +383,7 @@ public class NodeAgentImpl implements NodeAgent { } } else break; } - lastTick = clock.instant(); + lastConverge = clock.instant(); workToDoNow = false; if (isFrozen != wantFrozen) { diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/RunInContainerTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/RunInContainerTest.java index c92cbe9e7e5..b154663092e 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/RunInContainerTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/RunInContainerTest.java @@ -24,6 +24,7 @@ import java.io.StringWriter; import java.net.ServerSocket; import java.nio.charset.StandardCharsets; import java.time.Instant; +import java.util.Arrays; import java.util.Collections; import java.util.Optional; @@ -31,7 +32,6 @@ import static org.hamcrest.Matchers.is; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertThat; import static org.junit.Assert.assertTrue; -import static org.mockito.Matchers.any; import static org.mockito.Matchers.eq; import static org.mockito.Mockito.doNothing; import static org.mockito.Mockito.when; @@ -40,7 +40,6 @@ import static org.mockito.Mockito.when; * @author dybis */ public class RunInContainerTest { - private JDisc container; private int port; @@ -111,19 +110,23 @@ public class RunInContainerTest { @Test public void testGetContainersToRunAPi() throws IOException, InterruptedException { waitForJdiscContainerToServe(); + final String parentHostname = "localhost.test.yahoo.com"; + assertThat(doPutCall("resume"), is(true)); // No nodes are allocated to this host yet, so freezing should be fine, but orchestrator doesnt allow node-admin suspend - when(ComponentsProviderWithMocks.orchestratorMock.suspend("localhost.test.yahoo.com")).thenReturn(false); + when(ComponentsProviderWithMocks.orchestratorMock.suspend(parentHostname, Collections.singletonList(parentHostname))) + .thenReturn(Optional.of("Cannot suspend because...")); assertFalse(doPutCall("suspend/node-admin")); - // Ochestrator changes its mind, allows node-admin to suspend - when(ComponentsProviderWithMocks.orchestratorMock.suspend("localhost.test.yahoo.com")).thenReturn(true); + // Orchestrator changes its mind, allows node-admin to suspend + when(ComponentsProviderWithMocks.nodeRepositoryMock.getContainersToRun()).thenReturn(Collections.emptyList()); + when(ComponentsProviderWithMocks.orchestratorMock.suspend(parentHostname, Collections.singletonList(parentHostname))) + .thenReturn(Optional.empty()); Thread.sleep(50); assertTrue(doPutCall("suspend/node-admin")); // Tick loop should've run several times by now, expect to be suspended - // Lets try to suspend everything now - when(ComponentsProviderWithMocks.nodeRepositoryMock.getContainersToRun()).thenReturn(Collections.emptyList()); + // Lets try to suspend everything now, should be trivial as we have no active containers to stop services at assertFalse(doPutCall("suspend")); Thread.sleep(50); assertTrue(doPutCall("suspend")); @@ -143,25 +146,28 @@ public class RunInContainerTest { .nodeFlavor("docker") .build())); when(ComponentsProviderWithMocks.orchestratorMock.suspend("localhost.test.yahoo.com", - Collections.singletonList("host1.test.yahoo.com"))).thenReturn(Optional.of("Cant suspend because...")); + Arrays.asList("host1.test.yahoo.com", parentHostname))) + .thenReturn(Optional.of("Cant suspend because...")); - // Suspending node-admin is still independent of suspending containers, so this should succeed + // Orchestrator doesn't allow to suspend either the container or the node-admin assertFalse(doPutCall("suspend/node-admin")); Thread.sleep(50); - assertTrue(doPutCall("suspend/node-admin")); + assertFalse(doPutCall("suspend/node-admin")); - // Fails because orchestrator wont allow to suspend the container - assertFalse(doPutCall("suspend")); + when(ComponentsProviderWithMocks.orchestratorMock.suspend("localhost.test.yahoo.com", + Arrays.asList("host1.test.yahoo.com", parentHostname))) + .thenReturn(Optional.empty()); + // Orchestrator successfully suspended everything Thread.sleep(50); - assertFalse(doPutCall("suspend")); + assertTrue(doPutCall("suspend/node-admin")); - when(ComponentsProviderWithMocks.orchestratorMock.suspend("localhost.test.yahoo.com", - Collections.singletonList("host1.test.yahoo.com"))).thenReturn(Optional.empty()); + // Allow stopping services in active nodes doNothing().when(ComponentsProviderWithMocks.dockerOperationsMock) .trySuspendNode(eq(new ContainerName("host1"))); doNothing().when(ComponentsProviderWithMocks.dockerOperationsMock) .stopServicesOnNode(eq(new ContainerName("host1"))); + assertFalse(doPutCall("suspend")); Thread.sleep(50); assertTrue(doPutCall("suspend")); } diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java index ad5b4ce1096..28da1192b3b 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java @@ -17,7 +17,6 @@ import java.util.Optional; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; -import static org.mockito.Matchers.any; import static org.mockito.Matchers.eq; import static org.mockito.Mockito.doThrow; import static org.mockito.Mockito.mock; @@ -58,6 +57,8 @@ public class NodeAdminStateUpdaterTest { } List<String> activeHostnames = Arrays.asList( "host0.test.yahoo.com", "host3.test.yahoo.com", "host6.test.yahoo.com", "host9.test.yahoo.com"); + List<String> suspendHostnames = new ArrayList<>(activeHostnames); + suspendHostnames.add(parentHostname); when(nodeRepository.getContainersToRun()).thenReturn(containersToRun); @@ -83,7 +84,9 @@ public class NodeAdminStateUpdaterTest { verify(refresher, times(1)).signalWorkToBeDone(); // No change in desired state when(nodeAdmin.setFrozen(eq(true))).thenReturn(true); - when(orchestrator.suspend(eq(parentHostname))).thenReturn(false).thenReturn(true); + when(orchestrator.suspend(eq(parentHostname), eq(suspendHostnames))) + .thenReturn(Optional.of("Cannot allow to suspend because some reason")) + .thenReturn(Optional.empty()); tickAfter(35); assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN)); verify(refresher, times(1)).signalWorkToBeDone(); @@ -92,23 +95,13 @@ public class NodeAdminStateUpdaterTest { tickAfter(35); assertTrue(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN)); verify(nodeAdmin, times(1)).setFrozen(eq(false)); - verify(orchestrator, never()).suspend(any(), any()); - - // Lets just suspend everything... - when(orchestrator.suspend(eq(parentHostname), eq(activeHostnames))) - .thenReturn(Optional.of("Cannot allow to suspend becuase some reason")) - .thenReturn(Optional.empty()); - assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED)); - tickAfter(0); // Change in wanted state, no need to wait - verify(refresher, times(2)).signalWorkToBeDone(); - verify(nodeAdmin, times(2)).setFrozen(eq(false)); // Another roll back // At this point orchestrator says its OK to suspend, but something goes wrong when we try to stop services doThrow(new RuntimeException("Failed to stop services")).doNothing().when(nodeAdmin).stopNodeAgentServices(eq(activeHostnames)); - tickAfter(35); assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED)); + tickAfter(0); // Change in wanted state, no need to wait verify(refresher, times(2)).signalWorkToBeDone(); // No change in desired state - verify(nodeAdmin, times(2)).setFrozen(eq(false)); // Make sure we dont roll back + verify(nodeAdmin, times(1)).setFrozen(eq(false)); // Make sure we dont roll back // Finally we are successful in transitioning to frozen tickAfter(35); |