summaryrefslogtreecommitdiffstats
path: root/node-admin
diff options
context:
space:
mode:
authorValerij Fredriksen <freva@users.noreply.github.com>2017-06-23 12:39:10 +0200
committerGitHub <noreply@github.com>2017-06-23 12:39:10 +0200
commitdb248f137d72467667fb08b986ecd484f15eefc3 (patch)
treea8dfa202b9dc2f9b7a9ef35fc2328f70a916d2a3 /node-admin
parentc26ede5e0e0b1b54314464f6a99eebc70ac885c5 (diff)
parentcae79eb731f2d07868d9f818c05fd3118ba6f301 (diff)
Merge pull request #2873 from yahoo/freva/reset-container-state-on-docker-exception
Freva/reset container state on docker exception
Diffstat (limited to 'node-admin')
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperationsImpl.java2
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java58
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/provider/ComponentsProviderImpl.java2
-rw-r--r--node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/ComponentsProviderWithMocks.java2
-rw-r--r--node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java2
-rw-r--r--node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java2
6 files changed, 35 insertions, 33 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperationsImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperationsImpl.java
index 9dd593ade0f..d8ba6e645a2 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperationsImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperationsImpl.java
@@ -27,8 +27,6 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;
-import static com.yahoo.vespa.defaults.Defaults.getDefaults;
-
/**
* Class that wraps the Docker class and have some tools related to running programs in docker.
*
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
index c217128675b..8169b9daee8 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
@@ -6,6 +6,7 @@ import com.yahoo.concurrent.ThreadFactoryFactory;
import com.yahoo.vespa.hosted.dockerapi.Container;
import com.yahoo.vespa.hosted.dockerapi.ContainerName;
import com.yahoo.vespa.hosted.dockerapi.Docker;
+import com.yahoo.vespa.hosted.dockerapi.DockerException;
import com.yahoo.vespa.hosted.dockerapi.DockerExecTimeoutException;
import com.yahoo.vespa.hosted.dockerapi.DockerImage;
import com.yahoo.vespa.hosted.dockerapi.ProcessResult;
@@ -42,8 +43,7 @@ import java.util.concurrent.atomic.AtomicBoolean;
import java.util.function.Consumer;
import static com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentImpl.ContainerState.ABSENT;
-import static com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentImpl.ContainerState.RUNNING;
-import static com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentImpl.ContainerState.RUNNING_HOWEVER_RESUME_SCRIPT_NOT_RUN;
+import static com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentImpl.ContainerState.UNKNOWN;
/**
* @author dybis
@@ -66,7 +66,6 @@ public class NodeAgentImpl implements NodeAgent {
private final Orchestrator orchestrator;
private final DockerOperations dockerOperations;
private final Optional<StorageMaintainer> storageMaintainer;
- private final MetricReceiverWrapper metricReceiver;
private final Environment environment;
private final Clock clock;
private final Optional<AclMaintainer> aclMaintainer;
@@ -85,13 +84,21 @@ public class NodeAgentImpl implements NodeAgent {
private final Consumer<String> serviceRestarter;
private Future<?> currentFilebeatRestarter;
+ private boolean resumeScriptRun = false;
+
+ /**
+ * ABSENT means container is definitely absent - A container that was absent will not suddenly appear without
+ * NodeAgent explicitly starting it.
+ * Otherwise we can't be certain. A container that was running a minute ago may no longer be running without
+ * NodeAgent doing anything (container could have crashed). Therefore we always have to ask docker daemon
+ * to get updated state of the container.
+ */
enum ContainerState {
ABSENT,
- RUNNING_HOWEVER_RESUME_SCRIPT_NOT_RUN,
- RUNNING
+ UNKNOWN
}
- private ContainerState containerState = ABSENT;
+ private ContainerState containerState = UNKNOWN;
// The attributes of the last successful node repo attribute update for this node. Used to avoid redundant calls.
private NodeAttributes lastAttributesSet = null;
@@ -104,7 +111,6 @@ public class NodeAgentImpl implements NodeAgent {
final Orchestrator orchestrator,
final DockerOperations dockerOperations,
final Optional<StorageMaintainer> storageMaintainer,
- final MetricReceiverWrapper metricReceiver,
final Environment environment,
final Clock clock,
final Optional<AclMaintainer> aclMaintainer) {
@@ -115,7 +121,6 @@ public class NodeAgentImpl implements NodeAgent {
this.dockerOperations = dockerOperations;
this.storageMaintainer = storageMaintainer;
this.logger = PrefixLogger.getNodeAgentLogger(NodeAgentImpl.class, containerName);
- this.metricReceiver = metricReceiver;
this.environment = environment;
this.clock = clock;
this.aclMaintainer = aclMaintainer;
@@ -132,13 +137,6 @@ public class NodeAgentImpl implements NodeAgent {
logger.error("Failed to restart service " + service, e);
}
};
-
- // If the container is already running, initialize vespaVersion and lastCpuMetric
- dockerOperations.getContainer(containerName)
- .ifPresent(container -> {
- containerState = RUNNING_HOWEVER_RESUME_SCRIPT_NOT_RUN;
- logger.info("Container is already running, setting containerState to " + containerState);
- });
}
@Override
@@ -219,14 +217,11 @@ public class NodeAgentImpl implements NodeAgent {
}
private void runLocalResumeScriptIfNeeded(final ContainerNodeSpec nodeSpec) {
- if (containerState != RUNNING_HOWEVER_RESUME_SCRIPT_NOT_RUN) {
- return;
+ if (! resumeScriptRun) {
+ addDebugMessage("Starting optional node program resume command");
+ dockerOperations.resumeNode(containerName);
+ resumeScriptRun = true;
}
-
- addDebugMessage("Starting optional node program resume command");
- dockerOperations.resumeNode(containerName);
- containerState = RUNNING;
- logger.info("Resume command successfully executed, new containerState is " + containerState);
}
private void updateNodeRepoWithCurrentAttributes(final ContainerNodeSpec nodeSpec) {
@@ -264,10 +259,8 @@ public class NodeAgentImpl implements NodeAgent {
maintainer.writeFilebeatConfig(containerName, nodeSpec);
});
-
- addDebugMessage("startContainerIfNeeded: containerState " + containerState + " -> " +
- RUNNING_HOWEVER_RESUME_SCRIPT_NOT_RUN);
- containerState = RUNNING_HOWEVER_RESUME_SCRIPT_NOT_RUN;
+ resumeScriptRun = false;
+ containerState = UNKNOWN;
logger.info("Container successfully started, new containerState is " + containerState);
}
@@ -409,6 +402,15 @@ public class NodeAgentImpl implements NodeAgent {
} catch (OrchestratorException e) {
logger.info(e.getMessage());
addDebugMessage(e.getMessage());
+ } catch (DockerException e) {
+ // When a new version of node-admin app is released, there is a brief period of time when both
+ // new and old version run together. If one of them stats/stops/deletes the container it manages,
+ // the other's assumption of containerState may become incorrect. It'll then start making invalid
+ // requests, for example to start a container that is already running, the containerState should
+ // therefore be reset if we get an exception from docker.
+ numberOfUnhandledException++;
+ containerState = UNKNOWN;
+ logger.error("Caught a DockerExecption, resetting containerState to " + containerState, e);
} catch (Exception e) {
numberOfUnhandledException++;
logger.error("Unhandled exception, ignoring.", e);
@@ -586,7 +588,9 @@ public class NodeAgentImpl implements NodeAgent {
private Optional<Container> getContainer() {
if (containerState == ABSENT) return Optional.empty();
- return dockerOperations.getContainer(containerName);
+ Optional<Container> container = dockerOperations.getContainer(containerName);
+ if (! container.isPresent()) containerState = ABSENT;
+ return container;
}
@Override
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/provider/ComponentsProviderImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/provider/ComponentsProviderImpl.java
index 7c0f9f94277..0e8a4c4106d 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/provider/ComponentsProviderImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/provider/ComponentsProviderImpl.java
@@ -69,7 +69,7 @@ public class ComponentsProviderImpl implements ComponentsProvider {
Function<String, NodeAgent> nodeAgentFactory =
(hostName) -> new NodeAgentImpl(hostName, nodeRepository, orchestrator, dockerOperations,
- storageMaintainer, metricReceiver, environment, clock, aclMaintainer);
+ storageMaintainer, environment, clock, aclMaintainer);
NodeAdmin nodeAdmin = new NodeAdminImpl(dockerOperations, nodeAgentFactory, storageMaintainer,
NODE_AGENT_SCAN_INTERVAL_MILLIS, metricReceiver, aclMaintainer, clock);
nodeAdminStateUpdater = new NodeAdminStateUpdater(nodeRepository, nodeAdmin, clock, orchestrator, baseHostName);
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/ComponentsProviderWithMocks.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/ComponentsProviderWithMocks.java
index b0add5115bf..12d304712f8 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/ComponentsProviderWithMocks.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/ComponentsProviderWithMocks.java
@@ -34,7 +34,7 @@ public class ComponentsProviderWithMocks implements ComponentsProvider {
private final MetricReceiverWrapper mr = new MetricReceiverWrapper(MetricReceiver.nullImplementation);
private final Function<String, NodeAgent> nodeAgentFactory =
(hostName) -> new NodeAgentImpl(hostName, nodeRepositoryMock, orchestratorMock,
- dockerOperationsMock, Optional.empty(), mr, environment, Clock.systemUTC(), Optional.empty());
+ dockerOperationsMock, Optional.empty(), environment, Clock.systemUTC(), Optional.empty());
private final NodeAdmin nodeAdmin = new NodeAdminImpl(dockerOperationsMock, nodeAgentFactory, Optional.empty(), 100, mr, Optional.empty(), Clock.systemUTC());
private final NodeAdminStateUpdater nodeAdminStateUpdater = new NodeAdminStateUpdater(nodeRepositoryMock, nodeAdmin, Clock.systemUTC(), orchestratorMock, "localhost.test.yahoo.com");
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java
index 38ce96006f7..473d306e5bd 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java
@@ -58,7 +58,7 @@ public class DockerTester implements AutoCloseable {
MetricReceiverWrapper mr = new MetricReceiverWrapper(MetricReceiver.nullImplementation);
final DockerOperations dockerOperations = new DockerOperationsImpl(dockerMock, environment);
Function<String, NodeAgent> nodeAgentFactory = (hostName) -> new NodeAgentImpl(hostName, nodeRepositoryMock,
- orchestratorMock, dockerOperations, Optional.of(storageMaintainer), mr, environment, clock, Optional.empty());
+ orchestratorMock, dockerOperations, Optional.of(storageMaintainer), environment, clock, Optional.empty());
nodeAdmin = new NodeAdminImpl(dockerOperations, nodeAgentFactory, Optional.of(storageMaintainer), 100, mr, Optional.empty(), Clock.systemUTC());
updater = new NodeAdminStateUpdater(nodeRepositoryMock, nodeAdmin, clock, orchestratorMock, "basehostname");
updater.start(5);
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
index 9e684b4d797..0264e2ac102 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
@@ -593,6 +593,6 @@ public class NodeAgentImplTest {
doNothing().when(storageMaintainer).writeMetricsConfig(any(), any());
return new NodeAgentImpl(hostName, nodeRepository, orchestrator, dockerOperations,
- Optional.of(storageMaintainer), metricReceiver, environment, clock, Optional.of(aclMaintainer));
+ Optional.of(storageMaintainer), environment, clock, Optional.of(aclMaintainer));
}
}