diff options
author | Valerij Fredriksen <valerijf@oath.com> | 2018-03-15 14:21:09 +0100 |
---|---|---|
committer | Valerij Fredriksen <valerijf@oath.com> | 2018-03-15 14:21:09 +0100 |
commit | 1f7cdac6b200a73fd8f999340fad55e8b5d84f55 (patch) | |
tree | 282a5526d56f27164e167c4138bfa8dde0a0ac30 /node-admin | |
parent | 53f858167a81a54b6b16bf46fc58769a62bc5b71 (diff) |
Retry container start
Diffstat (limited to 'node-admin')
2 files changed, 84 insertions, 25 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java index f08c91ec9c0..4fb1f81e042 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java @@ -49,6 +49,7 @@ import java.util.concurrent.atomic.AtomicBoolean; import java.util.function.Consumer; import static com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentImpl.ContainerState.ABSENT; +import static com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentImpl.ContainerState.STARTING; import static com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentImpl.ContainerState.UNKNOWN; /** @@ -93,19 +94,21 @@ public class NodeAgentImpl implements NodeAgent { private final ScheduledExecutorService filebeatRestarter = Executors.newScheduledThreadPool(1, ThreadFactoryFactory.getDaemonThreadFactory("filebeatrestarter")); private Consumer<String> serviceRestarter; - private Future<?> currentFilebeatRestarter; + private Optional<Future<?>> currentFilebeatRestarter = Optional.empty(); private boolean resumeScriptRun = false; /** * ABSENT means container is definitely absent - A container that was absent will not suddenly appear without * NodeAgent explicitly starting it. + * STARTING state is set just before we attempt to start a container, if successful we move to the next state. * Otherwise we can't be certain. A container that was running a minute ago may no longer be running without * NodeAgent doing anything (container could have crashed). Therefore we always have to ask docker daemon * to get updated state of the container. */ enum ContainerState { ABSENT, + STARTING, UNKNOWN } @@ -225,8 +228,15 @@ public class NodeAgentImpl implements NodeAgent { logger.info("Stopped"); } - private void runLocalResumeScriptIfNeeded() { + void runLocalResumeScriptIfNeeded(ContainerNodeSpec nodeSpec) { if (! resumeScriptRun) { + storageMaintainer.writeMetricsConfig(containerName, nodeSpec); + storageMaintainer.writeFilebeatConfig(containerName, nodeSpec); + aclMaintainer.run(); + stopFilebeatSchedulerIfNeeded(); + currentFilebeatRestarter = Optional.of(filebeatRestarter.scheduleWithFixedDelay( + () -> serviceRestarter.accept("filebeat"), 1, 1, TimeUnit.DAYS)); + addDebugMessage("Starting optional node program resume command"); dockerOperations.resumeNode(containerName); resumeScriptRun = true; @@ -245,8 +255,8 @@ public class NodeAgentImpl implements NodeAgent { // update reboot gen with wanted gen if set, we ignore reboot for Docker nodes but // want the two to be equal in node repo .withRebootGeneration(nodeSpec.wantedRebootGeneration.orElse(0L)) - .withDockerImage(nodeSpec.wantedDockerImage.filter(node -> containerState != ABSENT).orElse(new DockerImage(""))) - .withVespaVersion(nodeSpec.wantedVespaVersion.filter(node -> containerState != ABSENT).orElse("")); + .withDockerImage(nodeSpec.wantedDockerImage.filter(node -> containerState == UNKNOWN).orElse(new DockerImage(""))) + .withVespaVersion(nodeSpec.wantedVespaVersion.filter(node -> containerState == UNKNOWN).orElse("")); publishStateToNodeRepoIfChanged(currentNodeAttributes, wantedNodeAttributes); } @@ -265,15 +275,9 @@ public class NodeAgentImpl implements NodeAgent { createContainerData(nodeSpec); dockerOperations.createContainer(containerName, nodeSpec); dockerOperations.startContainer(containerName, nodeSpec); - aclMaintainer.run(); lastCpuMetric = new CpuUsageReporter(); - currentFilebeatRestarter = filebeatRestarter.scheduleWithFixedDelay(() -> serviceRestarter.accept("filebeat"), 1, 1, TimeUnit.DAYS); - storageMaintainer.writeMetricsConfig(containerName, nodeSpec); - storageMaintainer.writeFilebeatConfig(containerName, nodeSpec); - resumeScriptRun = false; - containerState = UNKNOWN; logger.info("Container successfully started, new containerState is " + containerState); } @@ -336,6 +340,8 @@ public class NodeAgentImpl implements NodeAgent { return Optional.of("Container should be running with different resource allocation, wanted: " + wantedContainerResources + ", actual: " + existingContainer.resources); } + + if (containerState == STARTING) return Optional.of("Container failed to start"); return Optional.empty(); } @@ -355,7 +361,7 @@ public class NodeAgentImpl implements NodeAgent { logger.info("Failed stopping services, ignoring", e); } } - if (currentFilebeatRestarter != null) currentFilebeatRestarter.cancel(true); + stopFilebeatSchedulerIfNeeded(); dockerOperations.removeContainer(existingContainer, nodeSpec); containerState = ABSENT; logger.info("Container successfully removed, new containerState is " + containerState); @@ -419,13 +425,7 @@ public class NodeAgentImpl implements NodeAgent { logger.info(e.getMessage()); addDebugMessage(e.getMessage()); } catch (DockerException e) { - // When a new version of node-admin app is released, there is a brief period of time when both - // new and old version run together. If one of them stats/stops/deletes the container it manages, - // the other's assumption of containerState may become incorrect. It'll then start making invalid - // requests, for example to start a container that is already running, the containerState should - // therefore be reset if we get an exception from docker. numberOfUnhandledException++; - containerState = UNKNOWN; logger.error("Caught a DockerException, resetting containerState to " + containerState, e); } catch (Exception e) { numberOfUnhandledException++; @@ -486,10 +486,12 @@ public class NodeAgentImpl implements NodeAgent { container = removeContainerIfNeededUpdateContainerState(nodeSpec, container); if (! container.isPresent()) { storageMaintainer.handleCoreDumpsForContainer(containerName, nodeSpec, false); + containerState = STARTING; startContainer(nodeSpec); + containerState = UNKNOWN; } - runLocalResumeScriptIfNeeded(); + runLocalResumeScriptIfNeeded(nodeSpec); // Because it's more important to stop a bad release from rolling out in prod, // we put the resume call last. So if we fail after updating the node repo attributes // but before resume, the app may go through the tenant pipeline but will halt in prod. @@ -524,10 +526,17 @@ public class NodeAgentImpl implements NodeAgent { } } + private void stopFilebeatSchedulerIfNeeded() { + if (currentFilebeatRestarter.isPresent()) { + currentFilebeatRestarter.get().cancel(true); + currentFilebeatRestarter = Optional.empty(); + } + } + @SuppressWarnings("unchecked") public void updateContainerNodeMetrics() { final ContainerNodeSpec nodeSpec = lastNodeSpec; - if (nodeSpec == null || containerState == ABSENT) return; + if (nodeSpec == null || containerState != UNKNOWN) return; Optional<Docker.ContainerStats> containerStats = dockerOperations.getContainerStats(containerName); if (!containerStats.isPresent()) return; diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java index 9d379babeca..45bb4bbf720 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java @@ -9,6 +9,7 @@ import com.yahoo.vespa.hosted.dockerapi.ContainerName; import com.yahoo.vespa.hosted.dockerapi.ContainerResources; import com.yahoo.vespa.hosted.dockerapi.ContainerStatsImpl; import com.yahoo.vespa.hosted.dockerapi.Docker; +import com.yahoo.vespa.hosted.dockerapi.DockerException; import com.yahoo.vespa.hosted.dockerapi.DockerImage; import com.yahoo.vespa.hosted.dockerapi.metrics.MetricReceiverWrapper; import com.yahoo.vespa.hosted.node.admin.ContainerNodeSpec; @@ -52,6 +53,7 @@ import static org.mockito.Mockito.doThrow; import static org.mockito.Mockito.inOrder; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.never; +import static org.mockito.Mockito.reset; import static org.mockito.Mockito.spy; import static org.mockito.Mockito.times; import static org.mockito.Mockito.verify; @@ -551,6 +553,49 @@ public class NodeAgentImplTest { } @Test + public void start_container_subtask_failure_leads_to_container_restart() throws IOException { + final long restartGeneration = 1; + final long rebootGeneration = 0; + final ContainerNodeSpec nodeSpec = nodeSpecBuilder + .wantedDockerImage(dockerImage) + .nodeState(Node.State.active) + .wantedVespaVersion(vespaVersion) + .wantedRestartGeneration(restartGeneration) + .currentRestartGeneration(restartGeneration) + .wantedRebootGeneration(rebootGeneration) + .build(); + + NodeAgentImpl nodeAgent = spy(makeNodeAgent(null, false)); + + when(nodeRepository.getContainerNodeSpec(hostName)).thenReturn(Optional.of(nodeSpec)); + when(pathResolver.getApplicationStoragePathForNodeAdmin()).thenReturn(Files.createTempDirectory("foo")); + when(pathResolver.getApplicationStoragePathForHost()).thenReturn(Files.createTempDirectory("bar")); + when(dockerOperations.pullImageAsyncIfNeeded(eq(dockerImage))).thenReturn(false); + when(storageMaintainer.getDiskUsageFor(eq(containerName))).thenReturn(Optional.of(201326592000L)); + doThrow(new DockerException("Failed to set up network")).doNothing().when(dockerOperations).startContainer(eq(containerName), eq(nodeSpec)); + + try { + nodeAgent.converge(); + fail("Expected to get DockerException"); + } catch (DockerException ignored) { } + + verify(dockerOperations, never()).removeContainer(any(), any()); + verify(dockerOperations, times(1)).createContainer(eq(containerName), eq(nodeSpec)); + verify(dockerOperations, times(1)).startContainer(eq(containerName), eq(nodeSpec)); + verify(nodeAgent, never()).runLocalResumeScriptIfNeeded(any()); + + // The docker container was actually started and is running, but subsequent exec calls to set up + // networking failed + mockGetContainer(dockerImage, true); + nodeAgent.converge(); + + verify(dockerOperations, times(1)).removeContainer(any(), eq(nodeSpec)); + verify(dockerOperations, times(2)).createContainer(eq(containerName), eq(nodeSpec)); + verify(dockerOperations, times(2)).startContainer(eq(containerName), eq(nodeSpec)); + verify(nodeAgent, times(1)).runLocalResumeScriptIfNeeded(any()); + } + + @Test @SuppressWarnings("unchecked") public void testGetRelevantMetrics() throws Exception { final ObjectMapper objectMapper = new ObjectMapper(); @@ -689,6 +734,17 @@ public class NodeAgentImplTest { } private NodeAgentImpl makeNodeAgent(DockerImage dockerImage, boolean isRunning) { + mockGetContainer(dockerImage, isRunning); + + when(dockerOperations.getContainerStats(any())).thenReturn(Optional.of(emptyContainerStats)); + doNothing().when(storageMaintainer).writeFilebeatConfig(any(), any()); + doNothing().when(storageMaintainer).writeMetricsConfig(any(), any()); + + return new NodeAgentImpl(hostName, nodeRepository, orchestrator, dockerOperations, + storageMaintainer, aclMaintainer, environment, clock, NODE_AGENT_SCAN_INTERVAL); + } + + private void mockGetContainer(DockerImage dockerImage, boolean isRunning) { Optional<Container> container = dockerImage != null ? Optional.of(new Container( hostName, @@ -699,12 +755,6 @@ public class NodeAgentImplTest { isRunning ? 1 : 0)) : Optional.empty(); - when(dockerOperations.getContainerStats(any())).thenReturn(Optional.of(emptyContainerStats)); when(dockerOperations.getContainer(eq(containerName))).thenReturn(container); - doNothing().when(storageMaintainer).writeFilebeatConfig(any(), any()); - doNothing().when(storageMaintainer).writeMetricsConfig(any(), any()); - - return new NodeAgentImpl(hostName, nodeRepository, orchestrator, dockerOperations, - storageMaintainer, aclMaintainer, environment, clock, NODE_AGENT_SCAN_INTERVAL); } } |