summary | refs | log | tree | commit | diff | stats
path: root/node-admin/src/main/java
diff options
context:
space:
mode:
author Håkon Hallingstad <hakon@verizonmedia.com> 2019-03-06 18:14:04 +0100
committer Håkon Hallingstad <hakon@verizonmedia.com> 2019-03-06 18:14:04 +0100
commit 24daed22401950b04c596bfdb98c4eacb1e0ace1 (patch)
tree ca77437463dcb18ac2ecdfa7e90b6b8fe1a19aeb /node-admin/src/main/java
parent 9b1d4b5485b23e2526989a9c670c94989c3c3bed (diff)
Remove Docker containers when suspending host
Diffstat (limited to 'node-admin/src/main/java')
-rw-r--r-- node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdmin.java 2
-rw-r--r-- node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java 14
-rw-r--r-- node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java 19
-rw-r--r-- node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java 72
4 files changed, 52 insertions, 55 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdmin.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdmin.java
index 6002e7bcd89..ca9083e9d27 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdmin.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdmin.java
@@ -52,7 +52,7 @@ public interface NodeAdmin {
void start();
/**
- * Stop the NodeAgent. Will not delete the storage or stop the container.
+ * Stop the NodeAgents. Will not delete the storage or stop the container.
*/
void stop();
}
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java
index e6a3d740273..e9eacddb060 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java
@@ -79,7 +79,7 @@ public class NodeAdminImpl implements NodeAdmin {
// Stop and remove NodeAgents that should no longer be running
diff(nodeAgentWithSchedulerByHostname.keySet(), nodeAgentContextsByHostname.keySet())
- .forEach(hostname -> nodeAgentWithSchedulerByHostname.remove(hostname).stop());
+ .forEach(hostname -> nodeAgentWithSchedulerByHostname.remove(hostname).stopForRemoval());
// Start NodeAgent for hostnames that should be running, but aren't yet
diff(nodeAgentContextsByHostname.keySet(), nodeAgentWithSchedulerByHostname.keySet()).forEach(hostname -> {
@@ -156,10 +156,7 @@ public class NodeAdminImpl implements NodeAdmin {
hostnames.parallelStream()
.filter(nodeAgentWithSchedulerByHostname::containsKey)
.map(nodeAgentWithSchedulerByHostname::get)
- .forEach(nodeAgent -> {
- nodeAgent.suspend();
- nodeAgent.stopServices();
- });
+ .forEach(NodeAgentWithScheduler::stopForHostSuspension);
}
@Override
@@ -170,7 +167,7 @@ public class NodeAdminImpl implements NodeAdmin {
@Override
public void stop() {
// Stop all node-agents in parallel, will block until the last NodeAgent is stopped
- nodeAgentWithSchedulerByHostname.values().parallelStream().forEach(NodeAgent::stop);
+ nodeAgentWithSchedulerByHostname.values().parallelStream().forEach(NodeAgent::stopForRemoval);
}
// Set-difference. Returns minuend minus subtrahend.
@@ -189,10 +186,9 @@ public class NodeAdminImpl implements NodeAdmin {
this.nodeAgentScheduler = nodeAgentScheduler;
}
- @Override public void stopServices() { nodeAgent.stopServices(); }
- @Override public void suspend() { nodeAgent.suspend(); }
@Override public void start() { nodeAgent.start(); }
- @Override public void stop() { nodeAgent.stop(); }
+ @Override public void stopForHostSuspension() { nodeAgent.stopForHostSuspension(); }
+ @Override public void stopForRemoval() { nodeAgent.stopForRemoval(); }
@Override public void updateContainerNodeMetrics() { nodeAgent.updateContainerNodeMetrics(); }
@Override public boolean isDownloadingImage() { return nodeAgent.isDownloadingImage(); }
@Override public int getAndResetNumberOfUnhandledExceptions() { return nodeAgent.getAndResetNumberOfUnhandledExceptions(); }
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java
index 10076c4f48a..d62cb8e45d9 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java
@@ -9,18 +9,6 @@ package com.yahoo.vespa.hosted.node.admin.nodeagent;
* @author bakksjo
*/
public interface NodeAgent {
-
- /**
- * Stop services running on node. Depending on the state of the node, {@link #suspend()} might need to be
- * called before calling this method.
- */
- void stopServices();
-
- /**
- * Suspend node. Take node offline (e.g. take node out of VIP, drain traffic, prepare for restart etc.)
- */
- void suspend();
-
/**
* Starts the agent. After this method is called, the agent will asynchronously maintain the node, continuously
* striving to make the current state equal to the wanted state.
@@ -28,11 +16,16 @@ public interface NodeAgent {
void start();
/**
+ * Stop the node in anticipation of host suspension, e.g. reboot or docker upgrade.
+ */
+ void stopForHostSuspension();
+
+ /**
* Signals to the agent that the node is at the end of its lifecycle and no longer needs a managing agent.
* Cleans up any resources the agent owns, such as threads, connections etc. Cleanup is synchronous; when this
* method returns, no more actions will be taken by the agent.
*/
- void stop();
+ void stopForRemoval();
/**
* Updates metric receiver with the latest node-agent stats
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
index 6e3a507c649..781b036e484 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
@@ -137,7 +137,7 @@ public class NodeAgentImpl implements NodeAgent {
}
@Override
- public void stop() {
+ public void stopForRemoval() {
if (!terminated.compareAndSet(false, true)) {
throw new RuntimeException("Can not re-stop a node agent.");
}
@@ -213,16 +213,21 @@ public class NodeAgentImpl implements NodeAgent {
private Optional<Container> removeContainerIfNeededUpdateContainerState(
NodeAgentContext context, Optional<Container> existingContainer) {
- return existingContainer
- .flatMap(container -> removeContainerIfNeeded(context, container))
- .map(container -> {
- shouldRestartServices(context.node()).ifPresent(restartReason -> {
- context.log(logger, "Will restart services: " + restartReason);
- restartServices(context, container);
- currentRestartGeneration = context.node().getWantedRestartGeneration();
- });
- return container;
- });
+ if (existingContainer.isPresent()) {
+ Optional<String> reason = shouldRemoveContainer(context.node(), existingContainer.get());
+ if (reason.isPresent()) {
+ removeContainer(context, existingContainer.get(), reason.get(), false);
+ return Optional.empty();
+ }
+
+ shouldRestartServices(context.node()).ifPresent(restartReason -> {
+ context.log(logger, "Will restart services: " + restartReason);
+ restartServices(context, existingContainer.get());
+ currentRestartGeneration = context.node().getWantedRestartGeneration();
+ });
+ }
+
+ return existingContainer;
}
private Optional<String> shouldRestartServices(NodeSpec node) {
@@ -245,8 +250,7 @@ public class NodeAgentImpl implements NodeAgent {
}
}
- @Override
- public void stopServices() {
+ private void stopServices() {
NodeAgentContext context = contextSupplier.currentContext();
context.log(logger, "Stopping services");
if (containerState == ABSENT) return;
@@ -259,6 +263,11 @@ public class NodeAgentImpl implements NodeAgent {
}
@Override
+ public void stopForHostSuspension() {
+ NodeAgentContext context = contextSupplier.currentContext();
+ getContainer(context).ifPresent(container -> removeContainer(context, container, "suspending host", true));
+ }
+
public void suspend() {
NodeAgentContext context = contextSupplier.currentContext();
context.log(logger, "Suspending services on node");
@@ -306,33 +315,32 @@ public class NodeAgentImpl implements NodeAgent {
return Optional.empty();
}
- private Optional<Container> removeContainerIfNeeded(NodeAgentContext context, Container existingContainer) {
- Optional<String> removeReason = shouldRemoveContainer(context.node(), existingContainer);
- if (removeReason.isPresent()) {
- context.log(logger, "Will remove container: " + removeReason.get());
+ private void removeContainer(NodeAgentContext context, Container existingContainer, String reason, boolean alreadySuspended) {
+ context.log(logger, "Will remove container: " + reason);
- if (existingContainer.state.isRunning()) {
+ if (existingContainer.state.isRunning()) {
+ if (!alreadySuspended) {
orchestratorSuspendNode(context);
+ }
- try {
- if (context.node().getState() != NodeState.dirty) {
- suspend();
- }
- stopServices();
- } catch (Exception e) {
- context.log(logger, LogLevel.WARNING, "Failed stopping services, ignoring", e);
+ try {
+ if (context.node().getState() != NodeState.dirty) {
+ suspend();
}
+ stopServices();
+ } catch (Exception e) {
+ context.log(logger, LogLevel.WARNING, "Failed stopping services, ignoring", e);
}
- storageMaintainer.handleCoreDumpsForContainer(context, Optional.of(existingContainer));
- dockerOperations.removeContainer(context, existingContainer);
- currentRebootGeneration = context.node().getWantedRebootGeneration();
- containerState = ABSENT;
- context.log(logger, "Container successfully removed, new containerState is " + containerState);
- return Optional.empty();
}
- return Optional.of(existingContainer);
+
+ storageMaintainer.handleCoreDumpsForContainer(context, Optional.of(existingContainer));
+ dockerOperations.removeContainer(context, existingContainer);
+ currentRebootGeneration = context.node().getWantedRebootGeneration();
+ containerState = ABSENT;
+ context.log(logger, "Container successfully removed, new containerState is " + containerState);
}
+
private void updateContainerIfNeeded(NodeAgentContext context, Container existingContainer) {
ContainerResources wantedContainerResources = getContainerResources(context.node());
if (wantedContainerResources.equalsCpu(existingContainer.resources)) return;