diff options
author | Valerij Fredriksen <valerij92@gmail.com> | 2018-09-18 23:18:09 +0200 |
---|---|---|
committer | Valerij Fredriksen <valerij92@gmail.com> | 2018-09-18 23:18:09 +0200 |
commit | 0029766a9a08c8371b96e4dd7a14b45a162583ed (patch) | |
tree | 2ddf450785a51dda3b6b274f9285a636475def6c | |
parent | 94f6d92f81de41938788e15a19a473105bdc4ff4 (diff) |
Improve container is missing error handling
3 files changed, 22 insertions, 24 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperations.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperations.java index a76733a6947..4a19e5fe215 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperations.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperations.java @@ -36,6 +36,11 @@ public interface DockerOperations { void stopServicesOnNode(ContainerName containerName); + /** + * Try to suspend node. Suspending a node means the node should be taken offline, + * such that maintenance can be done of the node (upgrading, rebooting, etc), + * and such that we will start serving again as soon as possible afterwards. + */ void trySuspendNode(ContainerName containerName); Optional<ContainerStats> getContainerStats(ContainerName containerName); diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperationsImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperationsImpl.java index fdf825e7a24..d27034c6f76 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperationsImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperationsImpl.java @@ -215,26 +215,6 @@ public class DockerOperationsImpl implements DockerOperations { } /** - * Try to suspend node. Suspending a node means the node should be taken offline, - * such that maintenance can be done of the node (upgrading, rebooting, etc), - * and such that we will start serving again as soon as possible afterwards. - * <p> - * Any failures are logged and ignored. - */ - @Override - public void trySuspendNode(ContainerName containerName) { - try { - // TODO: Change to waiting w/o timeout (need separate thread that we can stop). - executeCommandInContainer(containerName, nodeProgram, "suspend"); - } catch (RuntimeException e) { - PrefixLogger logger = PrefixLogger.getNodeAgentLogger(DockerOperationsImpl.class, containerName); - // It's bad to continue as-if nothing happened, but on the other hand if we do not proceed to - // remove container, we will not be able to upgrade to fix any problems in the suspend logic! - logger.warning("Failed trying to suspend container " + containerName.asString(), e); - } - } - - /** * For macvlan: * <p> * Due to a bug in docker (https://github.com/docker/libnetwork/issues/1443), we need to manually set @@ -302,7 +282,6 @@ public class DockerOperationsImpl implements DockerOperations { Arrays.toString(wrappedCommand), containerName.asString(), containerPid), e); throw new RuntimeException(e); } - } @Override @@ -321,6 +300,11 @@ public class DockerOperationsImpl implements DockerOperations { } @Override + public void trySuspendNode(ContainerName containerName) { + executeCommandInContainer(containerName, nodeProgram, "suspend"); + } + + @Override public Optional<ContainerStats> getContainerStats(ContainerName containerName) { return docker.getContainerStats(containerName); } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java index 92138317867..56159f79622 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java @@ -7,6 +7,7 @@ import com.yahoo.vespa.hosted.dockerapi.Container; import com.yahoo.vespa.hosted.dockerapi.ContainerName; import com.yahoo.vespa.hosted.dockerapi.ContainerResources; import com.yahoo.vespa.hosted.dockerapi.ContainerStats; +import com.yahoo.vespa.hosted.dockerapi.exception.ContainerNotFoundException; import com.yahoo.vespa.hosted.dockerapi.exception.DockerException; import com.yahoo.vespa.hosted.dockerapi.exception.DockerExecTimeoutException; import com.yahoo.vespa.hosted.dockerapi.DockerImage; @@ -293,8 +294,13 @@ public class NodeAgentImpl implements NodeAgent { @Override public void stopServices() { logger.info("Stopping services for " + containerName); - dockerOperations.trySuspendNode(containerName); - dockerOperations.stopServicesOnNode(containerName); + if (containerState == ABSENT) return; + try { + dockerOperations.trySuspendNode(containerName); + dockerOperations.stopServicesOnNode(containerName); + } catch (ContainerNotFoundException e) { + containerState = ABSENT; + } } private Optional<String> shouldRemoveContainer(NodeSpec node, Container existingContainer) { @@ -403,9 +409,12 @@ public class NodeAgentImpl implements NodeAgent { converged = true; } catch (OrchestratorException e) { logger.info(e.getMessage()); + } catch (ContainerNotFoundException e) { + containerState = ABSENT; + logger.warning("Container unexpectedly gone, resetting containerState to " + containerState); } catch (DockerException e) { numberOfUnhandledException++; - logger.error("Caught a DockerException, resetting containerState to " + containerState, e); + logger.error("Caught a DockerException", e); } catch (Exception e) { numberOfUnhandledException++; logger.error("Unhandled exception, ignoring.", e); |