From 53f858167a81a54b6b16bf46fc58769a62bc5b71 Mon Sep 17 00:00:00 2001 From: Valerij Fredriksen Date: Thu, 15 Mar 2018 13:09:21 +0100 Subject: Fix unstable test --- .../node/admin/integrationTests/CallOrderVerifier.java | 12 ++++++++++-- .../hosted/node/admin/integrationTests/DockerFailTest.java | 3 +-- 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'node-admin') diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/CallOrderVerifier.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/CallOrderVerifier.java index 8511c86ed88..0464289101b 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/CallOrderVerifier.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/CallOrderVerifier.java @@ -32,11 +32,19 @@ public class CallOrderVerifier { } public void assertInOrder(String... functionCalls) { - assertInOrderWithAssertMessage("", functionCalls); + assertInOrder(waitForCallOrderTimeout, functionCalls); + } + + public void assertInOrder(long timeout, String... functionCalls) { + assertInOrderWithAssertMessage(timeout, "", functionCalls); } public void assertInOrderWithAssertMessage(String assertMessage, String... functionCalls) { - boolean inOrder = verifyInOrder(waitForCallOrderTimeout, functionCalls); + assertInOrderWithAssertMessage(waitForCallOrderTimeout, assertMessage, functionCalls); + } + + public void assertInOrderWithAssertMessage(long timeout, String assertMessage, String... functionCalls) { + boolean inOrder = verifyInOrder(timeout, functionCalls); if ( ! inOrder && ! assertMessage.isEmpty()) System.err.println(assertMessage); assertTrue(toString(), inOrder); diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerFailTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerFailTest.java index 2c8cea4c8de..e52aeeb419c 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerFailTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerFailTest.java @@ -4,7 +4,6 @@ package com.yahoo.vespa.hosted.node.admin.integrationTests; import com.yahoo.vespa.hosted.dockerapi.ContainerName; import com.yahoo.vespa.hosted.dockerapi.DockerImage; import com.yahoo.vespa.hosted.node.admin.ContainerNodeSpec; -import com.yahoo.vespa.hosted.node.admin.docker.DockerOperationsImpl; import com.yahoo.vespa.hosted.provision.Node; import org.junit.Test; @@ -35,7 +34,7 @@ public class DockerFailTest { Thread.sleep(10); } - dockerTester.callOrderVerifier.assertInOrder( + dockerTester.callOrderVerifier.assertInOrder(1200, "createContainerCommand with DockerImage { imageId=dockerImage }, HostName: host1.test.yahoo.com, ContainerName { name=host1 }", "executeInContainerAsRoot with ContainerName { name=host1 }, args: [" + DockerTester.NODE_PROGRAM + ", resume]"); -- cgit v1.2.3 From 1f7cdac6b200a73fd8f999340fad55e8b5d84f55 Mon Sep 17 00:00:00 2001 From: Valerij Fredriksen Date: Thu, 15 Mar 2018 14:21:09 +0100 Subject: Retry container start --- .../hosted/node/admin/nodeagent/NodeAgentImpl.java | 47 +++++++++------- .../node/admin/nodeagent/NodeAgentImplTest.java | 62 +++++++++++++++++++--- 2 files changed, 84 insertions(+), 25 deletions(-) (limited to 'node-admin') diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java index f08c91ec9c0..4fb1f81e042 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java @@ -49,6 +49,7 @@ import java.util.concurrent.atomic.AtomicBoolean; import java.util.function.Consumer; import static com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentImpl.ContainerState.ABSENT; +import static com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentImpl.ContainerState.STARTING; import static com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentImpl.ContainerState.UNKNOWN; /** @@ -93,19 +94,21 @@ public class NodeAgentImpl implements NodeAgent { private final ScheduledExecutorService filebeatRestarter = Executors.newScheduledThreadPool(1, ThreadFactoryFactory.getDaemonThreadFactory("filebeatrestarter")); private Consumer serviceRestarter; - private Future currentFilebeatRestarter; + private Optional> currentFilebeatRestarter = Optional.empty(); private boolean resumeScriptRun = false; /** * ABSENT means container is definitely absent - A container that was absent will not suddenly appear without * NodeAgent explicitly starting it. + * STARTING state is set just before we attempt to start a container, if successful we move to the next state. * Otherwise we can't be certain. A container that was running a minute ago may no longer be running without * NodeAgent doing anything (container could have crashed). Therefore we always have to ask docker daemon * to get updated state of the container. */ enum ContainerState { ABSENT, + STARTING, UNKNOWN } @@ -225,8 +228,15 @@ public class NodeAgentImpl implements NodeAgent { logger.info("Stopped"); } - private void runLocalResumeScriptIfNeeded() { + void runLocalResumeScriptIfNeeded(ContainerNodeSpec nodeSpec) { if (! resumeScriptRun) { + storageMaintainer.writeMetricsConfig(containerName, nodeSpec); + storageMaintainer.writeFilebeatConfig(containerName, nodeSpec); + aclMaintainer.run(); + stopFilebeatSchedulerIfNeeded(); + currentFilebeatRestarter = Optional.of(filebeatRestarter.scheduleWithFixedDelay( + () -> serviceRestarter.accept("filebeat"), 1, 1, TimeUnit.DAYS)); + addDebugMessage("Starting optional node program resume command"); dockerOperations.resumeNode(containerName); resumeScriptRun = true; @@ -245,8 +255,8 @@ public class NodeAgentImpl implements NodeAgent { // update reboot gen with wanted gen if set, we ignore reboot for Docker nodes but // want the two to be equal in node repo .withRebootGeneration(nodeSpec.wantedRebootGeneration.orElse(0L)) - .withDockerImage(nodeSpec.wantedDockerImage.filter(node -> containerState != ABSENT).orElse(new DockerImage(""))) - .withVespaVersion(nodeSpec.wantedVespaVersion.filter(node -> containerState != ABSENT).orElse("")); + .withDockerImage(nodeSpec.wantedDockerImage.filter(node -> containerState == UNKNOWN).orElse(new DockerImage(""))) + .withVespaVersion(nodeSpec.wantedVespaVersion.filter(node -> containerState == UNKNOWN).orElse("")); publishStateToNodeRepoIfChanged(currentNodeAttributes, wantedNodeAttributes); } @@ -265,15 +275,9 @@ public class NodeAgentImpl implements NodeAgent { createContainerData(nodeSpec); dockerOperations.createContainer(containerName, nodeSpec); dockerOperations.startContainer(containerName, nodeSpec); - aclMaintainer.run(); lastCpuMetric = new CpuUsageReporter(); - currentFilebeatRestarter = filebeatRestarter.scheduleWithFixedDelay(() -> serviceRestarter.accept("filebeat"), 1, 1, TimeUnit.DAYS); - storageMaintainer.writeMetricsConfig(containerName, nodeSpec); - storageMaintainer.writeFilebeatConfig(containerName, nodeSpec); - resumeScriptRun = false; - containerState = UNKNOWN; logger.info("Container successfully started, new containerState is " + containerState); } @@ -336,6 +340,8 @@ public class NodeAgentImpl implements NodeAgent { return Optional.of("Container should be running with different resource allocation, wanted: " + wantedContainerResources + ", actual: " + existingContainer.resources); } + + if (containerState == STARTING) return Optional.of("Container failed to start"); return Optional.empty(); } @@ -355,7 +361,7 @@ public class NodeAgentImpl implements NodeAgent { logger.info("Failed stopping services, ignoring", e); } } - if (currentFilebeatRestarter != null) currentFilebeatRestarter.cancel(true); + stopFilebeatSchedulerIfNeeded(); dockerOperations.removeContainer(existingContainer, nodeSpec); containerState = ABSENT; logger.info("Container successfully removed, new containerState is " + containerState); @@ -419,13 +425,7 @@ public class NodeAgentImpl implements NodeAgent { logger.info(e.getMessage()); addDebugMessage(e.getMessage()); } catch (DockerException e) { - // When a new version of node-admin app is released, there is a brief period of time when both - // new and old version run together. If one of them stats/stops/deletes the container it manages, - // the other's assumption of containerState may become incorrect. It'll then start making invalid - // requests, for example to start a container that is already running, the containerState should - // therefore be reset if we get an exception from docker. numberOfUnhandledException++; - containerState = UNKNOWN; logger.error("Caught a DockerException, resetting containerState to " + containerState, e); } catch (Exception e) { numberOfUnhandledException++; @@ -486,10 +486,12 @@ public class NodeAgentImpl implements NodeAgent { container = removeContainerIfNeededUpdateContainerState(nodeSpec, container); if (! container.isPresent()) { storageMaintainer.handleCoreDumpsForContainer(containerName, nodeSpec, false); + containerState = STARTING; startContainer(nodeSpec); + containerState = UNKNOWN; } - runLocalResumeScriptIfNeeded(); + runLocalResumeScriptIfNeeded(nodeSpec); // Because it's more important to stop a bad release from rolling out in prod, // we put the resume call last. So if we fail after updating the node repo attributes // but before resume, the app may go through the tenant pipeline but will halt in prod. @@ -524,10 +526,17 @@ public class NodeAgentImpl implements NodeAgent { } } + private void stopFilebeatSchedulerIfNeeded() { + if (currentFilebeatRestarter.isPresent()) { + currentFilebeatRestarter.get().cancel(true); + currentFilebeatRestarter = Optional.empty(); + } + } + @SuppressWarnings("unchecked") public void updateContainerNodeMetrics() { final ContainerNodeSpec nodeSpec = lastNodeSpec; - if (nodeSpec == null || containerState == ABSENT) return; + if (nodeSpec == null || containerState != UNKNOWN) return; Optional containerStats = dockerOperations.getContainerStats(containerName); if (!containerStats.isPresent()) return; diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java index 9d379babeca..45bb4bbf720 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java @@ -9,6 +9,7 @@ import com.yahoo.vespa.hosted.dockerapi.ContainerName; import com.yahoo.vespa.hosted.dockerapi.ContainerResources; import com.yahoo.vespa.hosted.dockerapi.ContainerStatsImpl; import com.yahoo.vespa.hosted.dockerapi.Docker; +import com.yahoo.vespa.hosted.dockerapi.DockerException; import com.yahoo.vespa.hosted.dockerapi.DockerImage; import com.yahoo.vespa.hosted.dockerapi.metrics.MetricReceiverWrapper; import com.yahoo.vespa.hosted.node.admin.ContainerNodeSpec; @@ -52,6 +53,7 @@ import static org.mockito.Mockito.doThrow; import static org.mockito.Mockito.inOrder; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.never; +import static org.mockito.Mockito.reset; import static org.mockito.Mockito.spy; import static org.mockito.Mockito.times; import static org.mockito.Mockito.verify; @@ -550,6 +552,49 @@ public class NodeAgentImplTest { verify(nodeAgent, times(3)).converge(); } + @Test + public void start_container_subtask_failure_leads_to_container_restart() throws IOException { + final long restartGeneration = 1; + final long rebootGeneration = 0; + final ContainerNodeSpec nodeSpec = nodeSpecBuilder + .wantedDockerImage(dockerImage) + .nodeState(Node.State.active) + .wantedVespaVersion(vespaVersion) + .wantedRestartGeneration(restartGeneration) + .currentRestartGeneration(restartGeneration) + .wantedRebootGeneration(rebootGeneration) + .build(); + + NodeAgentImpl nodeAgent = spy(makeNodeAgent(null, false)); + + when(nodeRepository.getContainerNodeSpec(hostName)).thenReturn(Optional.of(nodeSpec)); + when(pathResolver.getApplicationStoragePathForNodeAdmin()).thenReturn(Files.createTempDirectory("foo")); + when(pathResolver.getApplicationStoragePathForHost()).thenReturn(Files.createTempDirectory("bar")); + when(dockerOperations.pullImageAsyncIfNeeded(eq(dockerImage))).thenReturn(false); + when(storageMaintainer.getDiskUsageFor(eq(containerName))).thenReturn(Optional.of(201326592000L)); + doThrow(new DockerException("Failed to set up network")).doNothing().when(dockerOperations).startContainer(eq(containerName), eq(nodeSpec)); + + try { + nodeAgent.converge(); + fail("Expected to get DockerException"); + } catch (DockerException ignored) { } + + verify(dockerOperations, never()).removeContainer(any(), any()); + verify(dockerOperations, times(1)).createContainer(eq(containerName), eq(nodeSpec)); + verify(dockerOperations, times(1)).startContainer(eq(containerName), eq(nodeSpec)); + verify(nodeAgent, never()).runLocalResumeScriptIfNeeded(any()); + + // The docker container was actually started and is running, but subsequent exec calls to set up + // networking failed + mockGetContainer(dockerImage, true); + nodeAgent.converge(); + + verify(dockerOperations, times(1)).removeContainer(any(), eq(nodeSpec)); + verify(dockerOperations, times(2)).createContainer(eq(containerName), eq(nodeSpec)); + verify(dockerOperations, times(2)).startContainer(eq(containerName), eq(nodeSpec)); + verify(nodeAgent, times(1)).runLocalResumeScriptIfNeeded(any()); + } + @Test @SuppressWarnings("unchecked") public void testGetRelevantMetrics() throws Exception { @@ -689,6 +734,17 @@ public class NodeAgentImplTest { } private NodeAgentImpl makeNodeAgent(DockerImage dockerImage, boolean isRunning) { + mockGetContainer(dockerImage, isRunning); + + when(dockerOperations.getContainerStats(any())).thenReturn(Optional.of(emptyContainerStats)); + doNothing().when(storageMaintainer).writeFilebeatConfig(any(), any()); + doNothing().when(storageMaintainer).writeMetricsConfig(any(), any()); + + return new NodeAgentImpl(hostName, nodeRepository, orchestrator, dockerOperations, + storageMaintainer, aclMaintainer, environment, clock, NODE_AGENT_SCAN_INTERVAL); + } + + private void mockGetContainer(DockerImage dockerImage, boolean isRunning) { Optional container = dockerImage != null ? Optional.of(new Container( hostName, @@ -699,12 +755,6 @@ public class NodeAgentImplTest { isRunning ? 1 : 0)) : Optional.empty(); - when(dockerOperations.getContainerStats(any())).thenReturn(Optional.of(emptyContainerStats)); when(dockerOperations.getContainer(eq(containerName))).thenReturn(container); - doNothing().when(storageMaintainer).writeFilebeatConfig(any(), any()); - doNothing().when(storageMaintainer).writeMetricsConfig(any(), any()); - - return new NodeAgentImpl(hostName, nodeRepository, orchestrator, dockerOperations, - storageMaintainer, aclMaintainer, environment, clock, NODE_AGENT_SCAN_INTERVAL); } } -- cgit v1.2.3