summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java47
-rw-r--r--node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/CallOrderVerifier.java12
-rw-r--r--node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerFailTest.java3
-rw-r--r--node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java62
4 files changed, 95 insertions, 29 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
index f08c91ec9c0..4fb1f81e042 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
@@ -49,6 +49,7 @@ import java.util.concurrent.atomic.AtomicBoolean;
import java.util.function.Consumer;
import static com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentImpl.ContainerState.ABSENT;
+import static com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentImpl.ContainerState.STARTING;
import static com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentImpl.ContainerState.UNKNOWN;
/**
@@ -93,19 +94,21 @@ public class NodeAgentImpl implements NodeAgent {
private final ScheduledExecutorService filebeatRestarter =
Executors.newScheduledThreadPool(1, ThreadFactoryFactory.getDaemonThreadFactory("filebeatrestarter"));
private Consumer<String> serviceRestarter;
- private Future<?> currentFilebeatRestarter;
+ private Optional<Future<?>> currentFilebeatRestarter = Optional.empty();
private boolean resumeScriptRun = false;
/**
* ABSENT means container is definitely absent - A container that was absent will not suddenly appear without
* NodeAgent explicitly starting it.
+ * STARTING state is set just before we attempt to start a container, if successful we move to the next state.
* Otherwise we can't be certain. A container that was running a minute ago may no longer be running without
* NodeAgent doing anything (container could have crashed). Therefore we always have to ask docker daemon
* to get updated state of the container.
*/
enum ContainerState {
ABSENT,
+ STARTING,
UNKNOWN
}
@@ -225,8 +228,15 @@ public class NodeAgentImpl implements NodeAgent {
logger.info("Stopped");
}
- private void runLocalResumeScriptIfNeeded() {
+ void runLocalResumeScriptIfNeeded(ContainerNodeSpec nodeSpec) {
if (! resumeScriptRun) {
+ storageMaintainer.writeMetricsConfig(containerName, nodeSpec);
+ storageMaintainer.writeFilebeatConfig(containerName, nodeSpec);
+ aclMaintainer.run();
+ stopFilebeatSchedulerIfNeeded();
+ currentFilebeatRestarter = Optional.of(filebeatRestarter.scheduleWithFixedDelay(
+ () -> serviceRestarter.accept("filebeat"), 1, 1, TimeUnit.DAYS));
+
addDebugMessage("Starting optional node program resume command");
dockerOperations.resumeNode(containerName);
resumeScriptRun = true;
@@ -245,8 +255,8 @@ public class NodeAgentImpl implements NodeAgent {
// update reboot gen with wanted gen if set, we ignore reboot for Docker nodes but
// want the two to be equal in node repo
.withRebootGeneration(nodeSpec.wantedRebootGeneration.orElse(0L))
- .withDockerImage(nodeSpec.wantedDockerImage.filter(node -> containerState != ABSENT).orElse(new DockerImage("")))
- .withVespaVersion(nodeSpec.wantedVespaVersion.filter(node -> containerState != ABSENT).orElse(""));
+ .withDockerImage(nodeSpec.wantedDockerImage.filter(node -> containerState == UNKNOWN).orElse(new DockerImage("")))
+ .withVespaVersion(nodeSpec.wantedVespaVersion.filter(node -> containerState == UNKNOWN).orElse(""));
publishStateToNodeRepoIfChanged(currentNodeAttributes, wantedNodeAttributes);
}
@@ -265,15 +275,9 @@ public class NodeAgentImpl implements NodeAgent {
createContainerData(nodeSpec);
dockerOperations.createContainer(containerName, nodeSpec);
dockerOperations.startContainer(containerName, nodeSpec);
- aclMaintainer.run();
lastCpuMetric = new CpuUsageReporter();
- currentFilebeatRestarter = filebeatRestarter.scheduleWithFixedDelay(() -> serviceRestarter.accept("filebeat"), 1, 1, TimeUnit.DAYS);
- storageMaintainer.writeMetricsConfig(containerName, nodeSpec);
- storageMaintainer.writeFilebeatConfig(containerName, nodeSpec);
-
resumeScriptRun = false;
- containerState = UNKNOWN;
logger.info("Container successfully started, new containerState is " + containerState);
}
@@ -336,6 +340,8 @@ public class NodeAgentImpl implements NodeAgent {
return Optional.of("Container should be running with different resource allocation, wanted: " +
wantedContainerResources + ", actual: " + existingContainer.resources);
}
+
+ if (containerState == STARTING) return Optional.of("Container failed to start");
return Optional.empty();
}
@@ -355,7 +361,7 @@ public class NodeAgentImpl implements NodeAgent {
logger.info("Failed stopping services, ignoring", e);
}
}
- if (currentFilebeatRestarter != null) currentFilebeatRestarter.cancel(true);
+ stopFilebeatSchedulerIfNeeded();
dockerOperations.removeContainer(existingContainer, nodeSpec);
containerState = ABSENT;
logger.info("Container successfully removed, new containerState is " + containerState);
@@ -419,13 +425,7 @@ public class NodeAgentImpl implements NodeAgent {
logger.info(e.getMessage());
addDebugMessage(e.getMessage());
} catch (DockerException e) {
- // When a new version of node-admin app is released, there is a brief period of time when both
- // new and old version run together. If one of them stats/stops/deletes the container it manages,
- // the other's assumption of containerState may become incorrect. It'll then start making invalid
- // requests, for example to start a container that is already running, the containerState should
- // therefore be reset if we get an exception from docker.
numberOfUnhandledException++;
- containerState = UNKNOWN;
logger.error("Caught a DockerException, resetting containerState to " + containerState, e);
} catch (Exception e) {
numberOfUnhandledException++;
@@ -486,10 +486,12 @@ public class NodeAgentImpl implements NodeAgent {
container = removeContainerIfNeededUpdateContainerState(nodeSpec, container);
if (! container.isPresent()) {
storageMaintainer.handleCoreDumpsForContainer(containerName, nodeSpec, false);
+ containerState = STARTING;
startContainer(nodeSpec);
+ containerState = UNKNOWN;
}
- runLocalResumeScriptIfNeeded();
+ runLocalResumeScriptIfNeeded(nodeSpec);
// Because it's more important to stop a bad release from rolling out in prod,
// we put the resume call last. So if we fail after updating the node repo attributes
// but before resume, the app may go through the tenant pipeline but will halt in prod.
@@ -524,10 +526,17 @@ public class NodeAgentImpl implements NodeAgent {
}
}
+ private void stopFilebeatSchedulerIfNeeded() {
+ if (currentFilebeatRestarter.isPresent()) {
+ currentFilebeatRestarter.get().cancel(true);
+ currentFilebeatRestarter = Optional.empty();
+ }
+ }
+
@SuppressWarnings("unchecked")
public void updateContainerNodeMetrics() {
final ContainerNodeSpec nodeSpec = lastNodeSpec;
- if (nodeSpec == null || containerState == ABSENT) return;
+ if (nodeSpec == null || containerState != UNKNOWN) return;
Optional<Docker.ContainerStats> containerStats = dockerOperations.getContainerStats(containerName);
if (!containerStats.isPresent()) return;
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/CallOrderVerifier.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/CallOrderVerifier.java
index 8511c86ed88..0464289101b 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/CallOrderVerifier.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/CallOrderVerifier.java
@@ -32,11 +32,19 @@ public class CallOrderVerifier {
}
public void assertInOrder(String... functionCalls) {
- assertInOrderWithAssertMessage("", functionCalls);
+ assertInOrder(waitForCallOrderTimeout, functionCalls);
+ }
+
+ public void assertInOrder(long timeout, String... functionCalls) {
+ assertInOrderWithAssertMessage(timeout, "", functionCalls);
}
public void assertInOrderWithAssertMessage(String assertMessage, String... functionCalls) {
- boolean inOrder = verifyInOrder(waitForCallOrderTimeout, functionCalls);
+ assertInOrderWithAssertMessage(waitForCallOrderTimeout, assertMessage, functionCalls);
+ }
+
+ public void assertInOrderWithAssertMessage(long timeout, String assertMessage, String... functionCalls) {
+ boolean inOrder = verifyInOrder(timeout, functionCalls);
if ( ! inOrder && ! assertMessage.isEmpty())
System.err.println(assertMessage);
assertTrue(toString(), inOrder);
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerFailTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerFailTest.java
index 2c8cea4c8de..e52aeeb419c 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerFailTest.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerFailTest.java
@@ -4,7 +4,6 @@ package com.yahoo.vespa.hosted.node.admin.integrationTests;
import com.yahoo.vespa.hosted.dockerapi.ContainerName;
import com.yahoo.vespa.hosted.dockerapi.DockerImage;
import com.yahoo.vespa.hosted.node.admin.ContainerNodeSpec;
-import com.yahoo.vespa.hosted.node.admin.docker.DockerOperationsImpl;
import com.yahoo.vespa.hosted.provision.Node;
import org.junit.Test;
@@ -35,7 +34,7 @@ public class DockerFailTest {
Thread.sleep(10);
}
- dockerTester.callOrderVerifier.assertInOrder(
+ dockerTester.callOrderVerifier.assertInOrder(1200,
"createContainerCommand with DockerImage { imageId=dockerImage }, HostName: host1.test.yahoo.com, ContainerName { name=host1 }",
"executeInContainerAsRoot with ContainerName { name=host1 }, args: [" + DockerTester.NODE_PROGRAM + ", resume]");
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
index 9d379babeca..45bb4bbf720 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
@@ -9,6 +9,7 @@ import com.yahoo.vespa.hosted.dockerapi.ContainerName;
import com.yahoo.vespa.hosted.dockerapi.ContainerResources;
import com.yahoo.vespa.hosted.dockerapi.ContainerStatsImpl;
import com.yahoo.vespa.hosted.dockerapi.Docker;
+import com.yahoo.vespa.hosted.dockerapi.DockerException;
import com.yahoo.vespa.hosted.dockerapi.DockerImage;
import com.yahoo.vespa.hosted.dockerapi.metrics.MetricReceiverWrapper;
import com.yahoo.vespa.hosted.node.admin.ContainerNodeSpec;
@@ -52,6 +53,7 @@ import static org.mockito.Mockito.doThrow;
import static org.mockito.Mockito.inOrder;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.never;
+import static org.mockito.Mockito.reset;
import static org.mockito.Mockito.spy;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;
@@ -551,6 +553,49 @@ public class NodeAgentImplTest {
}
@Test
+ public void start_container_subtask_failure_leads_to_container_restart() throws IOException {
+ final long restartGeneration = 1;
+ final long rebootGeneration = 0;
+ final ContainerNodeSpec nodeSpec = nodeSpecBuilder
+ .wantedDockerImage(dockerImage)
+ .nodeState(Node.State.active)
+ .wantedVespaVersion(vespaVersion)
+ .wantedRestartGeneration(restartGeneration)
+ .currentRestartGeneration(restartGeneration)
+ .wantedRebootGeneration(rebootGeneration)
+ .build();
+
+ NodeAgentImpl nodeAgent = spy(makeNodeAgent(null, false));
+
+ when(nodeRepository.getContainerNodeSpec(hostName)).thenReturn(Optional.of(nodeSpec));
+ when(pathResolver.getApplicationStoragePathForNodeAdmin()).thenReturn(Files.createTempDirectory("foo"));
+ when(pathResolver.getApplicationStoragePathForHost()).thenReturn(Files.createTempDirectory("bar"));
+ when(dockerOperations.pullImageAsyncIfNeeded(eq(dockerImage))).thenReturn(false);
+ when(storageMaintainer.getDiskUsageFor(eq(containerName))).thenReturn(Optional.of(201326592000L));
+ doThrow(new DockerException("Failed to set up network")).doNothing().when(dockerOperations).startContainer(eq(containerName), eq(nodeSpec));
+
+ try {
+ nodeAgent.converge();
+ fail("Expected to get DockerException");
+ } catch (DockerException ignored) { }
+
+ verify(dockerOperations, never()).removeContainer(any(), any());
+ verify(dockerOperations, times(1)).createContainer(eq(containerName), eq(nodeSpec));
+ verify(dockerOperations, times(1)).startContainer(eq(containerName), eq(nodeSpec));
+ verify(nodeAgent, never()).runLocalResumeScriptIfNeeded(any());
+
+ // The docker container was actually started and is running, but subsequent exec calls to set up
+ // networking failed
+ mockGetContainer(dockerImage, true);
+ nodeAgent.converge();
+
+ verify(dockerOperations, times(1)).removeContainer(any(), eq(nodeSpec));
+ verify(dockerOperations, times(2)).createContainer(eq(containerName), eq(nodeSpec));
+ verify(dockerOperations, times(2)).startContainer(eq(containerName), eq(nodeSpec));
+ verify(nodeAgent, times(1)).runLocalResumeScriptIfNeeded(any());
+ }
+
+ @Test
@SuppressWarnings("unchecked")
public void testGetRelevantMetrics() throws Exception {
final ObjectMapper objectMapper = new ObjectMapper();
@@ -689,6 +734,17 @@ public class NodeAgentImplTest {
}
private NodeAgentImpl makeNodeAgent(DockerImage dockerImage, boolean isRunning) {
+ mockGetContainer(dockerImage, isRunning);
+
+ when(dockerOperations.getContainerStats(any())).thenReturn(Optional.of(emptyContainerStats));
+ doNothing().when(storageMaintainer).writeFilebeatConfig(any(), any());
+ doNothing().when(storageMaintainer).writeMetricsConfig(any(), any());
+
+ return new NodeAgentImpl(hostName, nodeRepository, orchestrator, dockerOperations,
+ storageMaintainer, aclMaintainer, environment, clock, NODE_AGENT_SCAN_INTERVAL);
+ }
+
+ private void mockGetContainer(DockerImage dockerImage, boolean isRunning) {
Optional<Container> container = dockerImage != null ?
Optional.of(new Container(
hostName,
@@ -699,12 +755,6 @@ public class NodeAgentImplTest {
isRunning ? 1 : 0)) :
Optional.empty();
- when(dockerOperations.getContainerStats(any())).thenReturn(Optional.of(emptyContainerStats));
when(dockerOperations.getContainer(eq(containerName))).thenReturn(container);
- doNothing().when(storageMaintainer).writeFilebeatConfig(any(), any());
- doNothing().when(storageMaintainer).writeMetricsConfig(any(), any());
-
- return new NodeAgentImpl(hostName, nodeRepository, orchestrator, dockerOperations,
- storageMaintainer, aclMaintainer, environment, clock, NODE_AGENT_SCAN_INTERVAL);
}
}