diff options
Diffstat (limited to 'node-admin')
7 files changed, 53 insertions, 24 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperations.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperations.java index 4a19e5fe215..cdfefa6a184 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperations.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperations.java @@ -38,7 +38,7 @@ public interface DockerOperations { /** * Try to suspend node. Suspending a node means the node should be taken offline, - * such that maintenance can be done of the node (upgrading, rebooting, etc), + * such that maintenance of the node can be done (upgrading, rebooting, etc), * and such that we will start serving again as soon as possible afterwards. */ void trySuspendNode(ContainerName containerName); diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java index ba8a2e55587..12c1b9bcf11 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java @@ -145,7 +145,10 @@ public class NodeAdminImpl implements NodeAdmin { // Each container may spend 1-1:30 minutes stopping nodeAgentsByHostname.values().parallelStream() .filter(nodeAgent -> hostnames.contains(nodeAgent.getHostname())) - .forEach(NodeAgent::stopServices); + .forEach(nodeAgent -> { + nodeAgent.suspend(); + nodeAgent.stopServices(); + }); } public int getNumberOfNodeAgents() { diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java index 5ab73788299..90e4b2e6c8b 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java @@ -208,11 +208,12 @@ public class NodeAdminStateUpdaterImpl implements NodeAdminStateUpdater { // We have spent too much time trying to freeze and node admin is still not frozen. // To avoid node agents stalling for too long, we'll force unfrozen ticks now. log.info("Timed out trying to freeze, will force unfreezed ticks"); + fetchContainersToRunFromNodeRepository(); nodeAdmin.setFrozen(false); } + } else if (currentState == RESUMED) { + fetchContainersToRunFromNodeRepository(); } - - fetchContainersToRunFromNodeRepository(); } private void setLastConvergenceException(RuntimeException exception) { @@ -277,20 +278,11 @@ public class NodeAdminStateUpdaterImpl implements NodeAdminStateUpdater { } private void fetchContainersToRunFromNodeRepository() { - synchronized (monitor) { - // Refresh containers to run even if we would like to suspend but have failed to do so yet, - // because it may take a long time to get permission to suspend. - if (currentState != RESUMED) { - log.info("Frozen, skipping fetching info from node repository"); - return; - } - - try { - final List<NodeSpec> containersToRun = nodeRepository.getNodes(dockerHostHostName); - nodeAdmin.refreshContainersToRun(containersToRun); - } catch (Exception e) { - log.log(LogLevel.WARNING, "Failed to update which containers should be running", e); - } + try { + final List<NodeSpec> containersToRun = nodeRepository.getNodes(dockerHostHostName); + nodeAdmin.refreshContainersToRun(containersToRun); + } catch (Exception e) { + log.log(LogLevel.WARNING, "Failed to update which containers should be running", e); } } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java index 92c44969d5e..9b759b208eb 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java @@ -18,9 +18,18 @@ public interface NodeAgent { */ boolean setFrozen(boolean frozen); + /** + * Stop services running on node. Depending on the state of the node, {@link #suspend()} might need to be + * called before calling this method. + */ void stopServices(); /** + * Suspend node. Take node offline (e.g. take node out of VIP, drain traffic, prepare for restart etc.) + */ + void suspend(); + + /** * Returns a map containing all relevant NodeAgent variables and their current values. */ Map<String, Object> debugInfo(); diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java index ad38306547d..43e3051ce84 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java @@ -302,13 +302,23 @@ public class NodeAgentImpl implements NodeAgent { logger.info("Stopping services"); if (containerState == ABSENT) return; try { - dockerOperations.trySuspendNode(containerName); dockerOperations.stopServicesOnNode(containerName); } catch (ContainerNotFoundException e) { containerState = ABSENT; } } + @Override + public void suspend() { + logger.info("Suspending services on node"); + if (containerState == ABSENT) return; + try { + dockerOperations.trySuspendNode(containerName); + } catch (ContainerNotFoundException e) { + containerState = ABSENT; + } + } + private Optional<String> shouldRemoveContainer(NodeSpec node, Container existingContainer) { final Node.State nodeState = node.getState(); if (nodeState == Node.State.dirty || nodeState == Node.State.provisioned) { @@ -344,6 +354,9 @@ public class NodeAgentImpl implements NodeAgent { } try { + if (node.getState() != Node.State.dirty) { + suspend(); + } stopServices(); } catch (Exception e) { logger.info("Failed stopping services, ignoring", e); diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java index cfdb576fb34..fd54773910d 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java @@ -22,6 +22,7 @@ import java.util.stream.IntStream; import static com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdaterImpl.TRANSITION_EXCEPTION_MESSAGE; import static org.junit.Assert.assertEquals; import static org.junit.Assert.fail; +import static org.mockito.Matchers.any; import static org.mockito.Matchers.anyBoolean; import static org.mockito.Matchers.eq; import static org.mockito.Mockito.doNothing; @@ -172,19 +173,28 @@ public class NodeAdminStateUpdaterImplTest { // Let's start suspending, we are able to freeze the nodes, but orchestrator denies suspension when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofSeconds(1)); when(nodeAdmin.setFrozen(eq(true))).thenReturn(true); - doThrow(new RuntimeException(exceptionMsg)) - .when(orchestrator).suspend(eq(parentHostname)); + doThrow(new RuntimeException(exceptionMsg)).when(orchestrator).suspend(eq(parentHostname)); assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, TRANSITION_EXCEPTION_MESSAGE); - tickAfter(0); + tickAfter(30); verify(nodeAdmin, times(1)).setFrozen(eq(true)); + tickAfter(30); + verify(nodeAdmin, times(2)).setFrozen(eq(true)); + verify(nodeAdmin, times(1)).setFrozen(eq(false)); // No new unfreezes during last 2 ticks + verify(nodeAdmin, times(1)).refreshContainersToRun(any()); assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, exceptionMsg); + // Only resume and fetch containers when subsystem freeze duration expires + when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofHours(1)); + tickAfter(30); + verify(nodeAdmin, times(2)).setFrozen(eq(false)); + verify(nodeAdmin, times(2)).refreshContainersToRun(any()); + // We change our mind, want to remain resumed assertResumeStateError(NodeAdminStateUpdater.State.RESUMED, TRANSITION_EXCEPTION_MESSAGE); - tickAfter(0); + tickAfter(30); refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED); - verify(nodeAdmin, times(2)).setFrozen(eq(false)); // Make sure that we unfreeze! + verify(nodeAdmin, times(3)).setFrozen(eq(false)); // Make sure that we unfreeze! } @Test diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java index e0031f9b9b3..28a2b8ff5e0 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java @@ -426,6 +426,8 @@ public class NodeAgentImplTest { verify(dockerOperations, never()).createContainer(eq(containerName), any(), any()); verify(dockerOperations, never()).startContainer(eq(containerName)); + verify(dockerOperations, never()).trySuspendNode(eq(containerName)); + verify(dockerOperations, times(1)).stopServicesOnNode(eq(containerName)); verify(orchestrator, never()).resume(any(String.class)); verify(orchestrator, never()).suspend(any(String.class)); // current Docker image and vespa version should be cleared |