summaryrefslogtreecommitdiffstats
path: root/node-admin
diff options
context:
space:
mode:
authorValerij Fredriksen <valerijf@oath.com>2018-03-15 14:21:09 +0100
committerValerij Fredriksen <valerijf@oath.com>2018-03-15 14:21:09 +0100
commit1f7cdac6b200a73fd8f999340fad55e8b5d84f55 (patch)
tree282a5526d56f27164e167c4138bfa8dde0a0ac30 /node-admin
parent53f858167a81a54b6b16bf46fc58769a62bc5b71 (diff)
Retry container start
Diffstat (limited to 'node-admin')
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java47
-rw-r--r--node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java62
2 files changed, 84 insertions, 25 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
index f08c91ec9c0..4fb1f81e042 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
@@ -49,6 +49,7 @@ import java.util.concurrent.atomic.AtomicBoolean;
import java.util.function.Consumer;
import static com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentImpl.ContainerState.ABSENT;
+import static com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentImpl.ContainerState.STARTING;
import static com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentImpl.ContainerState.UNKNOWN;
/**
@@ -93,19 +94,21 @@ public class NodeAgentImpl implements NodeAgent {
private final ScheduledExecutorService filebeatRestarter =
Executors.newScheduledThreadPool(1, ThreadFactoryFactory.getDaemonThreadFactory("filebeatrestarter"));
private Consumer<String> serviceRestarter;
- private Future<?> currentFilebeatRestarter;
+ private Optional<Future<?>> currentFilebeatRestarter = Optional.empty();
private boolean resumeScriptRun = false;
/**
* ABSENT means container is definitely absent - A container that was absent will not suddenly appear without
* NodeAgent explicitly starting it.
+ * STARTING state is set just before we attempt to start a container, if successful we move to the next state.
* Otherwise we can't be certain. A container that was running a minute ago may no longer be running without
* NodeAgent doing anything (container could have crashed). Therefore we always have to ask docker daemon
* to get updated state of the container.
*/
enum ContainerState {
ABSENT,
+ STARTING,
UNKNOWN
}
@@ -225,8 +228,15 @@ public class NodeAgentImpl implements NodeAgent {
logger.info("Stopped");
}
- private void runLocalResumeScriptIfNeeded() {
+ void runLocalResumeScriptIfNeeded(ContainerNodeSpec nodeSpec) {
if (! resumeScriptRun) {
+ storageMaintainer.writeMetricsConfig(containerName, nodeSpec);
+ storageMaintainer.writeFilebeatConfig(containerName, nodeSpec);
+ aclMaintainer.run();
+ stopFilebeatSchedulerIfNeeded();
+ currentFilebeatRestarter = Optional.of(filebeatRestarter.scheduleWithFixedDelay(
+ () -> serviceRestarter.accept("filebeat"), 1, 1, TimeUnit.DAYS));
+
addDebugMessage("Starting optional node program resume command");
dockerOperations.resumeNode(containerName);
resumeScriptRun = true;
@@ -245,8 +255,8 @@ public class NodeAgentImpl implements NodeAgent {
// update reboot gen with wanted gen if set, we ignore reboot for Docker nodes but
// want the two to be equal in node repo
.withRebootGeneration(nodeSpec.wantedRebootGeneration.orElse(0L))
- .withDockerImage(nodeSpec.wantedDockerImage.filter(node -> containerState != ABSENT).orElse(new DockerImage("")))
- .withVespaVersion(nodeSpec.wantedVespaVersion.filter(node -> containerState != ABSENT).orElse(""));
+ .withDockerImage(nodeSpec.wantedDockerImage.filter(node -> containerState == UNKNOWN).orElse(new DockerImage("")))
+ .withVespaVersion(nodeSpec.wantedVespaVersion.filter(node -> containerState == UNKNOWN).orElse(""));
publishStateToNodeRepoIfChanged(currentNodeAttributes, wantedNodeAttributes);
}
@@ -265,15 +275,9 @@ public class NodeAgentImpl implements NodeAgent {
createContainerData(nodeSpec);
dockerOperations.createContainer(containerName, nodeSpec);
dockerOperations.startContainer(containerName, nodeSpec);
- aclMaintainer.run();
lastCpuMetric = new CpuUsageReporter();
- currentFilebeatRestarter = filebeatRestarter.scheduleWithFixedDelay(() -> serviceRestarter.accept("filebeat"), 1, 1, TimeUnit.DAYS);
- storageMaintainer.writeMetricsConfig(containerName, nodeSpec);
- storageMaintainer.writeFilebeatConfig(containerName, nodeSpec);
-
resumeScriptRun = false;
- containerState = UNKNOWN;
logger.info("Container successfully started, new containerState is " + containerState);
}
@@ -336,6 +340,8 @@ public class NodeAgentImpl implements NodeAgent {
return Optional.of("Container should be running with different resource allocation, wanted: " +
wantedContainerResources + ", actual: " + existingContainer.resources);
}
+
+ if (containerState == STARTING) return Optional.of("Container failed to start");
return Optional.empty();
}
@@ -355,7 +361,7 @@ public class NodeAgentImpl implements NodeAgent {
logger.info("Failed stopping services, ignoring", e);
}
}
- if (currentFilebeatRestarter != null) currentFilebeatRestarter.cancel(true);
+ stopFilebeatSchedulerIfNeeded();
dockerOperations.removeContainer(existingContainer, nodeSpec);
containerState = ABSENT;
logger.info("Container successfully removed, new containerState is " + containerState);
@@ -419,13 +425,7 @@ public class NodeAgentImpl implements NodeAgent {
logger.info(e.getMessage());
addDebugMessage(e.getMessage());
} catch (DockerException e) {
- // When a new version of node-admin app is released, there is a brief period of time when both
- // new and old version run together. If one of them stats/stops/deletes the container it manages,
- // the other's assumption of containerState may become incorrect. It'll then start making invalid
- // requests, for example to start a container that is already running, the containerState should
- // therefore be reset if we get an exception from docker.
numberOfUnhandledException++;
- containerState = UNKNOWN;
logger.error("Caught a DockerException, resetting containerState to " + containerState, e);
} catch (Exception e) {
numberOfUnhandledException++;
@@ -486,10 +486,12 @@ public class NodeAgentImpl implements NodeAgent {
container = removeContainerIfNeededUpdateContainerState(nodeSpec, container);
if (! container.isPresent()) {
storageMaintainer.handleCoreDumpsForContainer(containerName, nodeSpec, false);
+ containerState = STARTING;
startContainer(nodeSpec);
+ containerState = UNKNOWN;
}
- runLocalResumeScriptIfNeeded();
+ runLocalResumeScriptIfNeeded(nodeSpec);
// Because it's more important to stop a bad release from rolling out in prod,
// we put the resume call last. So if we fail after updating the node repo attributes
// but before resume, the app may go through the tenant pipeline but will halt in prod.
@@ -524,10 +526,17 @@ public class NodeAgentImpl implements NodeAgent {
}
}
+ private void stopFilebeatSchedulerIfNeeded() {
+ if (currentFilebeatRestarter.isPresent()) {
+ currentFilebeatRestarter.get().cancel(true);
+ currentFilebeatRestarter = Optional.empty();
+ }
+ }
+
@SuppressWarnings("unchecked")
public void updateContainerNodeMetrics() {
final ContainerNodeSpec nodeSpec = lastNodeSpec;
- if (nodeSpec == null || containerState == ABSENT) return;
+ if (nodeSpec == null || containerState != UNKNOWN) return;
Optional<Docker.ContainerStats> containerStats = dockerOperations.getContainerStats(containerName);
if (!containerStats.isPresent()) return;
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
index 9d379babeca..45bb4bbf720 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
@@ -9,6 +9,7 @@ import com.yahoo.vespa.hosted.dockerapi.ContainerName;
import com.yahoo.vespa.hosted.dockerapi.ContainerResources;
import com.yahoo.vespa.hosted.dockerapi.ContainerStatsImpl;
import com.yahoo.vespa.hosted.dockerapi.Docker;
+import com.yahoo.vespa.hosted.dockerapi.DockerException;
import com.yahoo.vespa.hosted.dockerapi.DockerImage;
import com.yahoo.vespa.hosted.dockerapi.metrics.MetricReceiverWrapper;
import com.yahoo.vespa.hosted.node.admin.ContainerNodeSpec;
@@ -52,6 +53,7 @@ import static org.mockito.Mockito.doThrow;
import static org.mockito.Mockito.inOrder;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.never;
+import static org.mockito.Mockito.reset;
import static org.mockito.Mockito.spy;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;
@@ -551,6 +553,49 @@ public class NodeAgentImplTest {
}
@Test
+ public void start_container_subtask_failure_leads_to_container_restart() throws IOException {
+ final long restartGeneration = 1;
+ final long rebootGeneration = 0;
+ final ContainerNodeSpec nodeSpec = nodeSpecBuilder
+ .wantedDockerImage(dockerImage)
+ .nodeState(Node.State.active)
+ .wantedVespaVersion(vespaVersion)
+ .wantedRestartGeneration(restartGeneration)
+ .currentRestartGeneration(restartGeneration)
+ .wantedRebootGeneration(rebootGeneration)
+ .build();
+
+ NodeAgentImpl nodeAgent = spy(makeNodeAgent(null, false));
+
+ when(nodeRepository.getContainerNodeSpec(hostName)).thenReturn(Optional.of(nodeSpec));
+ when(pathResolver.getApplicationStoragePathForNodeAdmin()).thenReturn(Files.createTempDirectory("foo"));
+ when(pathResolver.getApplicationStoragePathForHost()).thenReturn(Files.createTempDirectory("bar"));
+ when(dockerOperations.pullImageAsyncIfNeeded(eq(dockerImage))).thenReturn(false);
+ when(storageMaintainer.getDiskUsageFor(eq(containerName))).thenReturn(Optional.of(201326592000L));
+ doThrow(new DockerException("Failed to set up network")).doNothing().when(dockerOperations).startContainer(eq(containerName), eq(nodeSpec));
+
+ try {
+ nodeAgent.converge();
+ fail("Expected to get DockerException");
+ } catch (DockerException ignored) { }
+
+ verify(dockerOperations, never()).removeContainer(any(), any());
+ verify(dockerOperations, times(1)).createContainer(eq(containerName), eq(nodeSpec));
+ verify(dockerOperations, times(1)).startContainer(eq(containerName), eq(nodeSpec));
+ verify(nodeAgent, never()).runLocalResumeScriptIfNeeded(any());
+
+ // The docker container was actually started and is running, but subsequent exec calls to set up
+ // networking failed
+ mockGetContainer(dockerImage, true);
+ nodeAgent.converge();
+
+ verify(dockerOperations, times(1)).removeContainer(any(), eq(nodeSpec));
+ verify(dockerOperations, times(2)).createContainer(eq(containerName), eq(nodeSpec));
+ verify(dockerOperations, times(2)).startContainer(eq(containerName), eq(nodeSpec));
+ verify(nodeAgent, times(1)).runLocalResumeScriptIfNeeded(any());
+ }
+
+ @Test
@SuppressWarnings("unchecked")
public void testGetRelevantMetrics() throws Exception {
final ObjectMapper objectMapper = new ObjectMapper();
@@ -689,6 +734,17 @@ public class NodeAgentImplTest {
}
private NodeAgentImpl makeNodeAgent(DockerImage dockerImage, boolean isRunning) {
+ mockGetContainer(dockerImage, isRunning);
+
+ when(dockerOperations.getContainerStats(any())).thenReturn(Optional.of(emptyContainerStats));
+ doNothing().when(storageMaintainer).writeFilebeatConfig(any(), any());
+ doNothing().when(storageMaintainer).writeMetricsConfig(any(), any());
+
+ return new NodeAgentImpl(hostName, nodeRepository, orchestrator, dockerOperations,
+ storageMaintainer, aclMaintainer, environment, clock, NODE_AGENT_SCAN_INTERVAL);
+ }
+
+ private void mockGetContainer(DockerImage dockerImage, boolean isRunning) {
Optional<Container> container = dockerImage != null ?
Optional.of(new Container(
hostName,
@@ -699,12 +755,6 @@ public class NodeAgentImplTest {
isRunning ? 1 : 0)) :
Optional.empty();
- when(dockerOperations.getContainerStats(any())).thenReturn(Optional.of(emptyContainerStats));
when(dockerOperations.getContainer(eq(containerName))).thenReturn(container);
- doNothing().when(storageMaintainer).writeFilebeatConfig(any(), any());
- doNothing().when(storageMaintainer).writeMetricsConfig(any(), any());
-
- return new NodeAgentImpl(hostName, nodeRepository, orchestrator, dockerOperations,
- storageMaintainer, aclMaintainer, environment, clock, NODE_AGENT_SCAN_INTERVAL);
}
}