diff options
author | Håkon Hallingstad <hakon@verizonmedia.com> | 2019-03-06 18:14:04 +0100 |
---|---|---|
committer | Håkon Hallingstad <hakon@verizonmedia.com> | 2019-03-06 18:14:04 +0100 |
commit | 24daed22401950b04c596bfdb98c4eacb1e0ace1 (patch) | |
tree | ca77437463dcb18ac2ecdfa7e90b6b8fe1a19aeb /node-admin/src/main/java | |
parent | 9b1d4b5485b23e2526989a9c670c94989c3c3bed (diff) |
Remove Docker containers when suspending host
Diffstat (limited to 'node-admin/src/main/java')
4 files changed, 52 insertions, 55 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdmin.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdmin.java index 6002e7bcd89..ca9083e9d27 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdmin.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdmin.java @@ -52,7 +52,7 @@ public interface NodeAdmin { void start(); /** - * Stop the NodeAgent. Will not delete the storage or stop the container. + * Stop the NodeAgents. Will not delete the storage or stop the container. */ void stop(); } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java index e6a3d740273..e9eacddb060 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java @@ -79,7 +79,7 @@ public class NodeAdminImpl implements NodeAdmin { // Stop and remove NodeAgents that should no longer be running diff(nodeAgentWithSchedulerByHostname.keySet(), nodeAgentContextsByHostname.keySet()) - .forEach(hostname -> nodeAgentWithSchedulerByHostname.remove(hostname).stop()); + .forEach(hostname -> nodeAgentWithSchedulerByHostname.remove(hostname).stopForRemoval()); // Start NodeAgent for hostnames that should be running, but aren't yet diff(nodeAgentContextsByHostname.keySet(), nodeAgentWithSchedulerByHostname.keySet()).forEach(hostname -> { @@ -156,10 +156,7 @@ public class NodeAdminImpl implements NodeAdmin { hostnames.parallelStream() .filter(nodeAgentWithSchedulerByHostname::containsKey) .map(nodeAgentWithSchedulerByHostname::get) - .forEach(nodeAgent -> { - nodeAgent.suspend(); - nodeAgent.stopServices(); - }); + .forEach(NodeAgentWithScheduler::stopForHostSuspension); } @Override @@ -170,7 +167,7 @@ public class NodeAdminImpl implements NodeAdmin { @Override public void stop() { // Stop all node-agents in parallel, will block until the last NodeAgent is stopped - nodeAgentWithSchedulerByHostname.values().parallelStream().forEach(NodeAgent::stop); + nodeAgentWithSchedulerByHostname.values().parallelStream().forEach(NodeAgent::stopForRemoval); } // Set-difference. Returns minuend minus subtrahend. @@ -189,10 +186,9 @@ public class NodeAdminImpl implements NodeAdmin { this.nodeAgentScheduler = nodeAgentScheduler; } - @Override public void stopServices() { nodeAgent.stopServices(); } - @Override public void suspend() { nodeAgent.suspend(); } @Override public void start() { nodeAgent.start(); } - @Override public void stop() { nodeAgent.stop(); } + @Override public void stopForHostSuspension() { nodeAgent.stopForHostSuspension(); } + @Override public void stopForRemoval() { nodeAgent.stopForRemoval(); } @Override public void updateContainerNodeMetrics() { nodeAgent.updateContainerNodeMetrics(); } @Override public boolean isDownloadingImage() { return nodeAgent.isDownloadingImage(); } @Override public int getAndResetNumberOfUnhandledExceptions() { return nodeAgent.getAndResetNumberOfUnhandledExceptions(); } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java index 10076c4f48a..d62cb8e45d9 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java @@ -9,18 +9,6 @@ package com.yahoo.vespa.hosted.node.admin.nodeagent; * @author bakksjo */ public interface NodeAgent { - - /** - * Stop services running on node. Depending on the state of the node, {@link #suspend()} might need to be - * called before calling this method. - */ - void stopServices(); - - /** - * Suspend node. Take node offline (e.g. take node out of VIP, drain traffic, prepare for restart etc.) - */ - void suspend(); - /** * Starts the agent. After this method is called, the agent will asynchronously maintain the node, continuously * striving to make the current state equal to the wanted state. @@ -28,11 +16,16 @@ public interface NodeAgent { void start(); /** + * Stop the node in anticipation of host suspension, e.g. reboot or docker upgrade. + */ + void stopForHostSuspension(); + + /** * Signals to the agent that the node is at the end of its lifecycle and no longer needs a managing agent. * Cleans up any resources the agent owns, such as threads, connections etc. Cleanup is synchronous; when this * method returns, no more actions will be taken by the agent. */ - void stop(); + void stopForRemoval(); /** * Updates metric receiver with the latest node-agent stats diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java index 6e3a507c649..781b036e484 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java @@ -137,7 +137,7 @@ public class NodeAgentImpl implements NodeAgent { } @Override - public void stop() { + public void stopForRemoval() { if (!terminated.compareAndSet(false, true)) { throw new RuntimeException("Can not re-stop a node agent."); } @@ -213,16 +213,21 @@ public class NodeAgentImpl implements NodeAgent { private Optional<Container> removeContainerIfNeededUpdateContainerState( NodeAgentContext context, Optional<Container> existingContainer) { - return existingContainer - .flatMap(container -> removeContainerIfNeeded(context, container)) - .map(container -> { - shouldRestartServices(context.node()).ifPresent(restartReason -> { - context.log(logger, "Will restart services: " + restartReason); - restartServices(context, container); - currentRestartGeneration = context.node().getWantedRestartGeneration(); - }); - return container; - }); + if (existingContainer.isPresent()) { + Optional<String> reason = shouldRemoveContainer(context.node(), existingContainer.get()); + if (reason.isPresent()) { + removeContainer(context, existingContainer.get(), reason.get(), false); + return Optional.empty(); + } + + shouldRestartServices(context.node()).ifPresent(restartReason -> { + context.log(logger, "Will restart services: " + restartReason); + restartServices(context, existingContainer.get()); + currentRestartGeneration = context.node().getWantedRestartGeneration(); + }); + } + + return existingContainer; } private Optional<String> shouldRestartServices(NodeSpec node) { @@ -245,8 +250,7 @@ public class NodeAgentImpl implements NodeAgent { } } - @Override - public void stopServices() { + private void stopServices() { NodeAgentContext context = contextSupplier.currentContext(); context.log(logger, "Stopping services"); if (containerState == ABSENT) return; @@ -259,6 +263,11 @@ public class NodeAgentImpl implements NodeAgent { } @Override + public void stopForHostSuspension() { + NodeAgentContext context = contextSupplier.currentContext(); + getContainer(context).ifPresent(container -> removeContainer(context, container, "suspending host", true)); + } + public void suspend() { NodeAgentContext context = contextSupplier.currentContext(); context.log(logger, "Suspending services on node"); @@ -306,33 +315,32 @@ public class NodeAgentImpl implements NodeAgent { return Optional.empty(); } - private Optional<Container> removeContainerIfNeeded(NodeAgentContext context, Container existingContainer) { - Optional<String> removeReason = shouldRemoveContainer(context.node(), existingContainer); - if (removeReason.isPresent()) { - context.log(logger, "Will remove container: " + removeReason.get()); + private void removeContainer(NodeAgentContext context, Container existingContainer, String reason, boolean alreadySuspended) { + context.log(logger, "Will remove container: " + reason); - if (existingContainer.state.isRunning()) { + if (existingContainer.state.isRunning()) { + if (!alreadySuspended) { orchestratorSuspendNode(context); + } - try { - if (context.node().getState() != NodeState.dirty) { - suspend(); - } - stopServices(); - } catch (Exception e) { - context.log(logger, LogLevel.WARNING, "Failed stopping services, ignoring", e); + try { + if (context.node().getState() != NodeState.dirty) { + suspend(); } + stopServices(); + } catch (Exception e) { + context.log(logger, LogLevel.WARNING, "Failed stopping services, ignoring", e); } - storageMaintainer.handleCoreDumpsForContainer(context, Optional.of(existingContainer)); - dockerOperations.removeContainer(context, existingContainer); - currentRebootGeneration = context.node().getWantedRebootGeneration(); - containerState = ABSENT; - context.log(logger, "Container successfully removed, new containerState is " + containerState); - return Optional.empty(); } - return Optional.of(existingContainer); + + storageMaintainer.handleCoreDumpsForContainer(context, Optional.of(existingContainer)); + dockerOperations.removeContainer(context, existingContainer); + currentRebootGeneration = context.node().getWantedRebootGeneration(); + containerState = ABSENT; + context.log(logger, "Container successfully removed, new containerState is " + containerState); } + private void updateContainerIfNeeded(NodeAgentContext context, Container existingContainer) { ContainerResources wantedContainerResources = getContainerResources(context.node()); if (wantedContainerResources.equalsCpu(existingContainer.resources)) return; |