aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorValerij Fredriksen <valerij92@gmail.com>2018-09-18 23:18:09 +0200
committerValerij Fredriksen <valerij92@gmail.com>2018-09-18 23:18:09 +0200
commit0029766a9a08c8371b96e4dd7a14b45a162583ed (patch)
tree2ddf450785a51dda3b6b274f9285a636475def6c
parent94f6d92f81de41938788e15a19a473105bdc4ff4 (diff)
Improve container is missing error handling
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperations.java5
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperationsImpl.java26
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java15
3 files changed, 22 insertions, 24 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperations.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperations.java
index a76733a6947..4a19e5fe215 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperations.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperations.java
@@ -36,6 +36,11 @@ public interface DockerOperations {
void stopServicesOnNode(ContainerName containerName);
+ /**
+ * Try to suspend node. Suspending a node means the node should be taken offline,
+ * such that maintenance can be done of the node (upgrading, rebooting, etc),
+ * and such that we will start serving again as soon as possible afterwards.
+ */
void trySuspendNode(ContainerName containerName);
Optional<ContainerStats> getContainerStats(ContainerName containerName);
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperationsImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperationsImpl.java
index fdf825e7a24..d27034c6f76 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperationsImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperationsImpl.java
@@ -215,26 +215,6 @@ public class DockerOperationsImpl implements DockerOperations {
}
/**
- * Try to suspend node. Suspending a node means the node should be taken offline,
- * such that maintenance can be done of the node (upgrading, rebooting, etc),
- * and such that we will start serving again as soon as possible afterwards.
- * <p>
- * Any failures are logged and ignored.
- */
- @Override
- public void trySuspendNode(ContainerName containerName) {
- try {
- // TODO: Change to waiting w/o timeout (need separate thread that we can stop).
- executeCommandInContainer(containerName, nodeProgram, "suspend");
- } catch (RuntimeException e) {
- PrefixLogger logger = PrefixLogger.getNodeAgentLogger(DockerOperationsImpl.class, containerName);
- // It's bad to continue as-if nothing happened, but on the other hand if we do not proceed to
- // remove container, we will not be able to upgrade to fix any problems in the suspend logic!
- logger.warning("Failed trying to suspend container " + containerName.asString(), e);
- }
- }
-
- /**
* For macvlan:
* <p>
* Due to a bug in docker (https://github.com/docker/libnetwork/issues/1443), we need to manually set
@@ -302,7 +282,6 @@ public class DockerOperationsImpl implements DockerOperations {
Arrays.toString(wrappedCommand), containerName.asString(), containerPid), e);
throw new RuntimeException(e);
}
-
}
@Override
@@ -321,6 +300,11 @@ public class DockerOperationsImpl implements DockerOperations {
}
@Override
+ public void trySuspendNode(ContainerName containerName) {
+ executeCommandInContainer(containerName, nodeProgram, "suspend");
+ }
+
+ @Override
public Optional<ContainerStats> getContainerStats(ContainerName containerName) {
return docker.getContainerStats(containerName);
}
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
index 92138317867..56159f79622 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
@@ -7,6 +7,7 @@ import com.yahoo.vespa.hosted.dockerapi.Container;
import com.yahoo.vespa.hosted.dockerapi.ContainerName;
import com.yahoo.vespa.hosted.dockerapi.ContainerResources;
import com.yahoo.vespa.hosted.dockerapi.ContainerStats;
+import com.yahoo.vespa.hosted.dockerapi.exception.ContainerNotFoundException;
import com.yahoo.vespa.hosted.dockerapi.exception.DockerException;
import com.yahoo.vespa.hosted.dockerapi.exception.DockerExecTimeoutException;
import com.yahoo.vespa.hosted.dockerapi.DockerImage;
@@ -293,8 +294,13 @@ public class NodeAgentImpl implements NodeAgent {
@Override
public void stopServices() {
logger.info("Stopping services for " + containerName);
- dockerOperations.trySuspendNode(containerName);
- dockerOperations.stopServicesOnNode(containerName);
+ if (containerState == ABSENT) return;
+ try {
+ dockerOperations.trySuspendNode(containerName);
+ dockerOperations.stopServicesOnNode(containerName);
+ } catch (ContainerNotFoundException e) {
+ containerState = ABSENT;
+ }
}
private Optional<String> shouldRemoveContainer(NodeSpec node, Container existingContainer) {
@@ -403,9 +409,12 @@ public class NodeAgentImpl implements NodeAgent {
converged = true;
} catch (OrchestratorException e) {
logger.info(e.getMessage());
+ } catch (ContainerNotFoundException e) {
+ containerState = ABSENT;
+ logger.warning("Container unexpectedly gone, resetting containerState to " + containerState);
} catch (DockerException e) {
numberOfUnhandledException++;
- logger.error("Caught a DockerException, resetting containerState to " + containerState, e);
+ logger.error("Caught a DockerException", e);
} catch (Exception e) {
numberOfUnhandledException++;
logger.error("Unhandled exception, ignoring.", e);