summaryrefslogtreecommitdiffstats
path: root/node-admin
diff options
context:
space:
mode:
authorhakonhall <hakon@yahoo-inc.com>2017-04-05 09:17:25 +0200
committerGitHub <noreply@github.com>2017-04-05 09:17:25 +0200
commita3a23a45f9d65a26dbc57299258d48ba880aa6f4 (patch)
tree7ed533563031480913d53a12c114c56219bf5534 /node-admin
parente9c418d113cee3075d07279f3589fa6a144e593b (diff)
parent3f2a4dfa290bf68f06169191f6fd828e357f2e07 (diff)
Merge pull request #2153 from yahoo/freva/suspend-all-on-node-admin-upgrade
Freva/suspend all on node admin upgrade
Diffstat (limited to 'node-admin')
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java3
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java31
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java14
-rw-r--r--node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/RunInContainerTest.java36
-rw-r--r--node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java21
5 files changed, 50 insertions, 55 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java
index 4748670a4be..cc953ae729b 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java
@@ -105,7 +105,7 @@ public class NodeAdminImpl implements NodeAdmin {
@Override
public boolean setFrozen(boolean wantFrozen) {
- // Use filter with count instead of allMatch() because allMatch() will short curcuit on first non-match
+ // Use filter with count instead of allMatch() because allMatch() will short circuit on first non-match
boolean allNodeAgentsConverged = nodeAgents.values().stream()
.filter(nodeAgent -> !nodeAgent.setFrozen(wantFrozen))
.count() == 0;
@@ -129,6 +129,7 @@ public class NodeAdminImpl implements NodeAdmin {
.forEach(NodeAgent::stopServices);
}
+ @Override
public Set<String> getListOfHosts() {
return nodeAgents.keySet();
}
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java
index d581a6fadc6..26139575f92 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java
@@ -12,6 +12,7 @@ import java.io.IOException;
import java.time.Clock;
import java.time.Duration;
import java.time.Instant;
+import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
@@ -132,9 +133,8 @@ public class NodeAdminStateUpdater extends AbstractComponent {
/**
* This method attempts to converge NodeAgent's and NodeAdmin's frozen state with their orchestrator
* state. When trying to suspend node-admin, this method will first attempt to freeze all NodeAgents and
- * NodeAdmin, then asking orchestrator for permission to suspend node-admin app, and finally asking orchestrator
- * for permission to suspend all active nodes on this host, if either of the request is denied,
- * this method will unfreeze NodeAgents and NodeAdmin.
+ * NodeAdmin, then asking orchestrator for permission to suspend all active nodes on this host, including
+ * node-admin itself, if the request is denied, this method will unfreeze NodeAgents and NodeAdmin.
*/
private void convergeState(State wantedState) {
boolean wantFrozen = wantedState != RESUMED;
@@ -150,18 +150,6 @@ public class NodeAdminStateUpdater extends AbstractComponent {
return;
}
- if (currentState == RESUMED) {
- if (! orchestrator.suspend(dockerHostHostName)) {
- nodeAdmin.setFrozen(false);
- throw new RuntimeException("Failed to get permission to suspend node-admin, resuming.");
- }
-
- synchronized (monitor) {
- currentState = SUSPENDED_NODE_ADMIN;
- }
- if (wantedState == currentState) return;
- }
-
// Fetch active nodes from node repo before suspending nodes.
// It is only possible to suspend active nodes,
// the orchestrator will fail if trying to suspend nodes in other states.
@@ -175,14 +163,21 @@ public class NodeAdminStateUpdater extends AbstractComponent {
throw new RuntimeException("Failed to get nodes from node repo:" + e.getMessage());
}
- if (nodesInActiveState.size() > 0) {
- orchestrator.suspend(dockerHostHostName, nodesInActiveState).ifPresent(orchestratorResponse -> {
+ if (currentState == RESUMED) {
+ List<String> nodesToSuspend = new ArrayList<>(nodesInActiveState);
+ nodesToSuspend.add(dockerHostHostName);
+ orchestrator.suspend(dockerHostHostName, nodesToSuspend).ifPresent(orchestratorResponse -> {
nodeAdmin.setFrozen(false);
throw new RuntimeException("Failed to get permission to suspend, resuming. Reason: " + orchestratorResponse);
});
- nodeAdmin.stopNodeAgentServices(nodesInActiveState);
+
+ synchronized (monitor) {
+ currentState = SUSPENDED_NODE_ADMIN;
+ }
+ if (wantedState == currentState) return;
}
+ nodeAdmin.stopNodeAgentServices(nodesInActiveState);
synchronized (monitor) {
currentState = SUSPENDED;
}
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
index 883c3e8d0a4..8128bb47eac 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
@@ -54,8 +54,8 @@ public class NodeAgentImpl implements NodeAgent {
private final PrefixLogger logger;
private DockerImage imageBeingDownloaded = null;
- private final String hostname;
+ private final String hostname;
private final ContainerName containerName;
private final NodeRepository nodeRepository;
private final Orchestrator orchestrator;
@@ -69,9 +69,9 @@ public class NodeAgentImpl implements NodeAgent {
private final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
private final LinkedList<String> debugMessages = new LinkedList<>();
- private long delaysBetweenEachTickMillis = 30_000;
+ private long delaysBetweenEachConvergeMillis = 30_000;
private int numberOfUnhandledException = 0;
- private Instant lastTick;
+ private Instant lastConverge;
private Thread loopThread;
@@ -109,7 +109,7 @@ public class NodeAgentImpl implements NodeAgent {
this.environment = environment;
this.clock = clock;
this.aclMaintainer = aclMaintainer;
- this.lastTick = clock.instant();
+ this.lastConverge = clock.instant();
// If the container is already running, initialize vespaVersion and lastCpuMetric
lastCpuMetric = new CpuUsageReporter(clock.instant());
@@ -165,7 +165,7 @@ public class NodeAgentImpl implements NodeAgent {
@Override
public void start(int intervalMillis) {
addDebugMessage("Starting with interval " + intervalMillis + "ms");
- delaysBetweenEachTickMillis = intervalMillis;
+ delaysBetweenEachConvergeMillis = intervalMillis;
if (loopThread != null) {
throw new RuntimeException("Can not restart a node agent.");
}
@@ -374,7 +374,7 @@ public class NodeAgentImpl implements NodeAgent {
boolean isFrozenCopy;
synchronized (monitor) {
while (! workToDoNow) {
- long remainder = delaysBetweenEachTickMillis - Duration.between(lastTick, clock.instant()).toMillis();
+ long remainder = delaysBetweenEachConvergeMillis - Duration.between(lastConverge, clock.instant()).toMillis();
if (remainder > 0) {
try {
monitor.wait(remainder);
@@ -383,7 +383,7 @@ public class NodeAgentImpl implements NodeAgent {
}
} else break;
}
- lastTick = clock.instant();
+ lastConverge = clock.instant();
workToDoNow = false;
if (isFrozen != wantFrozen) {
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/RunInContainerTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/RunInContainerTest.java
index c92cbe9e7e5..b154663092e 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/RunInContainerTest.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/RunInContainerTest.java
@@ -24,6 +24,7 @@ import java.io.StringWriter;
import java.net.ServerSocket;
import java.nio.charset.StandardCharsets;
import java.time.Instant;
+import java.util.Arrays;
import java.util.Collections;
import java.util.Optional;
@@ -31,7 +32,6 @@ import static org.hamcrest.Matchers.is;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertThat;
import static org.junit.Assert.assertTrue;
-import static org.mockito.Matchers.any;
import static org.mockito.Matchers.eq;
import static org.mockito.Mockito.doNothing;
import static org.mockito.Mockito.when;
@@ -40,7 +40,6 @@ import static org.mockito.Mockito.when;
* @author dybis
*/
public class RunInContainerTest {
-
private JDisc container;
private int port;
@@ -111,19 +110,23 @@ public class RunInContainerTest {
@Test
public void testGetContainersToRunAPi() throws IOException, InterruptedException {
waitForJdiscContainerToServe();
+ final String parentHostname = "localhost.test.yahoo.com";
+
assertThat(doPutCall("resume"), is(true));
// No nodes are allocated to this host yet, so freezing should be fine, but orchestrator doesnt allow node-admin suspend
- when(ComponentsProviderWithMocks.orchestratorMock.suspend("localhost.test.yahoo.com")).thenReturn(false);
+ when(ComponentsProviderWithMocks.orchestratorMock.suspend(parentHostname, Collections.singletonList(parentHostname)))
+ .thenReturn(Optional.of("Cannot suspend because..."));
assertFalse(doPutCall("suspend/node-admin"));
- // Ochestrator changes its mind, allows node-admin to suspend
- when(ComponentsProviderWithMocks.orchestratorMock.suspend("localhost.test.yahoo.com")).thenReturn(true);
+ // Orchestrator changes its mind, allows node-admin to suspend
+ when(ComponentsProviderWithMocks.nodeRepositoryMock.getContainersToRun()).thenReturn(Collections.emptyList());
+ when(ComponentsProviderWithMocks.orchestratorMock.suspend(parentHostname, Collections.singletonList(parentHostname)))
+ .thenReturn(Optional.empty());
Thread.sleep(50);
assertTrue(doPutCall("suspend/node-admin")); // Tick loop should've run several times by now, expect to be suspended
- // Lets try to suspend everything now
- when(ComponentsProviderWithMocks.nodeRepositoryMock.getContainersToRun()).thenReturn(Collections.emptyList());
+ // Lets try to suspend everything now, should be trivial as we have no active containers to stop services at
assertFalse(doPutCall("suspend"));
Thread.sleep(50);
assertTrue(doPutCall("suspend"));
@@ -143,25 +146,28 @@ public class RunInContainerTest {
.nodeFlavor("docker")
.build()));
when(ComponentsProviderWithMocks.orchestratorMock.suspend("localhost.test.yahoo.com",
- Collections.singletonList("host1.test.yahoo.com"))).thenReturn(Optional.of("Cant suspend because..."));
+ Arrays.asList("host1.test.yahoo.com", parentHostname)))
+ .thenReturn(Optional.of("Cant suspend because..."));
- // Suspending node-admin is still independent of suspending containers, so this should succeed
+ // Orchestrator doesn't allow to suspend either the container or the node-admin
assertFalse(doPutCall("suspend/node-admin"));
Thread.sleep(50);
- assertTrue(doPutCall("suspend/node-admin"));
+ assertFalse(doPutCall("suspend/node-admin"));
- // Fails because orchestrator wont allow to suspend the container
- assertFalse(doPutCall("suspend"));
+ when(ComponentsProviderWithMocks.orchestratorMock.suspend("localhost.test.yahoo.com",
+ Arrays.asList("host1.test.yahoo.com", parentHostname)))
+ .thenReturn(Optional.empty());
+ // Orchestrator successfully suspended everything
Thread.sleep(50);
- assertFalse(doPutCall("suspend"));
+ assertTrue(doPutCall("suspend/node-admin"));
- when(ComponentsProviderWithMocks.orchestratorMock.suspend("localhost.test.yahoo.com",
- Collections.singletonList("host1.test.yahoo.com"))).thenReturn(Optional.empty());
+ // Allow stopping services in active nodes
doNothing().when(ComponentsProviderWithMocks.dockerOperationsMock)
.trySuspendNode(eq(new ContainerName("host1")));
doNothing().when(ComponentsProviderWithMocks.dockerOperationsMock)
.stopServicesOnNode(eq(new ContainerName("host1")));
+ assertFalse(doPutCall("suspend"));
Thread.sleep(50);
assertTrue(doPutCall("suspend"));
}
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java
index ad5b4ce1096..28da1192b3b 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java
@@ -17,7 +17,6 @@ import java.util.Optional;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
-import static org.mockito.Matchers.any;
import static org.mockito.Matchers.eq;
import static org.mockito.Mockito.doThrow;
import static org.mockito.Mockito.mock;
@@ -58,6 +57,8 @@ public class NodeAdminStateUpdaterTest {
}
List<String> activeHostnames = Arrays.asList(
"host0.test.yahoo.com", "host3.test.yahoo.com", "host6.test.yahoo.com", "host9.test.yahoo.com");
+ List<String> suspendHostnames = new ArrayList<>(activeHostnames);
+ suspendHostnames.add(parentHostname);
when(nodeRepository.getContainersToRun()).thenReturn(containersToRun);
@@ -83,7 +84,9 @@ public class NodeAdminStateUpdaterTest {
verify(refresher, times(1)).signalWorkToBeDone(); // No change in desired state
when(nodeAdmin.setFrozen(eq(true))).thenReturn(true);
- when(orchestrator.suspend(eq(parentHostname))).thenReturn(false).thenReturn(true);
+ when(orchestrator.suspend(eq(parentHostname), eq(suspendHostnames)))
+ .thenReturn(Optional.of("Cannot allow to suspend because some reason"))
+ .thenReturn(Optional.empty());
tickAfter(35);
assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN));
verify(refresher, times(1)).signalWorkToBeDone();
@@ -92,23 +95,13 @@ public class NodeAdminStateUpdaterTest {
tickAfter(35);
assertTrue(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN));
verify(nodeAdmin, times(1)).setFrozen(eq(false));
- verify(orchestrator, never()).suspend(any(), any());
-
- // Lets just suspend everything...
- when(orchestrator.suspend(eq(parentHostname), eq(activeHostnames)))
- .thenReturn(Optional.of("Cannot allow to suspend becuase some reason"))
- .thenReturn(Optional.empty());
- assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED));
- tickAfter(0); // Change in wanted state, no need to wait
- verify(refresher, times(2)).signalWorkToBeDone();
- verify(nodeAdmin, times(2)).setFrozen(eq(false)); // Another roll back
// At this point orchestrator says its OK to suspend, but something goes wrong when we try to stop services
doThrow(new RuntimeException("Failed to stop services")).doNothing().when(nodeAdmin).stopNodeAgentServices(eq(activeHostnames));
- tickAfter(35);
assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED));
+ tickAfter(0); // Change in wanted state, no need to wait
verify(refresher, times(2)).signalWorkToBeDone(); // No change in desired state
- verify(nodeAdmin, times(2)).setFrozen(eq(false)); // Make sure we dont roll back
+ verify(nodeAdmin, times(1)).setFrozen(eq(false)); // Make sure we dont roll back
// Finally we are successful in transitioning to frozen
tickAfter(35);