diff options
author | Valerij Fredriksen <freva@users.noreply.github.com> | 2017-06-23 12:39:10 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-06-23 12:39:10 +0200 |
commit | db248f137d72467667fb08b986ecd484f15eefc3 (patch) | |
tree | a8dfa202b9dc2f9b7a9ef35fc2328f70a916d2a3 /node-admin | |
parent | c26ede5e0e0b1b54314464f6a99eebc70ac885c5 (diff) | |
parent | cae79eb731f2d07868d9f818c05fd3118ba6f301 (diff) |
Merge pull request #2873 from yahoo/freva/reset-container-state-on-docker-exception
Freva/reset container state on docker exception
Diffstat (limited to 'node-admin')
6 files changed, 35 insertions, 33 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperationsImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperationsImpl.java index 9dd593ade0f..d8ba6e645a2 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperationsImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperationsImpl.java @@ -27,8 +27,6 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Stream; -import static com.yahoo.vespa.defaults.Defaults.getDefaults; - /** * Class that wraps the Docker class and have some tools related to running programs in docker. * diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java index c217128675b..8169b9daee8 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java @@ -6,6 +6,7 @@ import com.yahoo.concurrent.ThreadFactoryFactory; import com.yahoo.vespa.hosted.dockerapi.Container; import com.yahoo.vespa.hosted.dockerapi.ContainerName; import com.yahoo.vespa.hosted.dockerapi.Docker; +import com.yahoo.vespa.hosted.dockerapi.DockerException; import com.yahoo.vespa.hosted.dockerapi.DockerExecTimeoutException; import com.yahoo.vespa.hosted.dockerapi.DockerImage; import com.yahoo.vespa.hosted.dockerapi.ProcessResult; @@ -42,8 +43,7 @@ import java.util.concurrent.atomic.AtomicBoolean; import java.util.function.Consumer; import static com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentImpl.ContainerState.ABSENT; -import static com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentImpl.ContainerState.RUNNING; -import static com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentImpl.ContainerState.RUNNING_HOWEVER_RESUME_SCRIPT_NOT_RUN; +import static com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentImpl.ContainerState.UNKNOWN; /** * @author dybis @@ -66,7 +66,6 @@ public class NodeAgentImpl implements NodeAgent { private final Orchestrator orchestrator; private final DockerOperations dockerOperations; private final Optional<StorageMaintainer> storageMaintainer; - private final MetricReceiverWrapper metricReceiver; private final Environment environment; private final Clock clock; private final Optional<AclMaintainer> aclMaintainer; @@ -85,13 +84,21 @@ public class NodeAgentImpl implements NodeAgent { private final Consumer<String> serviceRestarter; private Future<?> currentFilebeatRestarter; + private boolean resumeScriptRun = false; + + /** + * ABSENT means container is definitely absent - A container that was absent will not suddenly appear without + * NodeAgent explicitly starting it. + * Otherwise we can't be certain. A container that was running a minute ago may no longer be running without + * NodeAgent doing anything (container could have crashed). Therefore we always have to ask docker daemon + * to get updated state of the container. + */ enum ContainerState { ABSENT, - RUNNING_HOWEVER_RESUME_SCRIPT_NOT_RUN, - RUNNING + UNKNOWN } - private ContainerState containerState = ABSENT; + private ContainerState containerState = UNKNOWN; // The attributes of the last successful node repo attribute update for this node. Used to avoid redundant calls. private NodeAttributes lastAttributesSet = null; @@ -104,7 +111,6 @@ public class NodeAgentImpl implements NodeAgent { final Orchestrator orchestrator, final DockerOperations dockerOperations, final Optional<StorageMaintainer> storageMaintainer, - final MetricReceiverWrapper metricReceiver, final Environment environment, final Clock clock, final Optional<AclMaintainer> aclMaintainer) { @@ -115,7 +121,6 @@ public class NodeAgentImpl implements NodeAgent { this.dockerOperations = dockerOperations; this.storageMaintainer = storageMaintainer; this.logger = PrefixLogger.getNodeAgentLogger(NodeAgentImpl.class, containerName); - this.metricReceiver = metricReceiver; this.environment = environment; this.clock = clock; this.aclMaintainer = aclMaintainer; @@ -132,13 +137,6 @@ public class NodeAgentImpl implements NodeAgent { logger.error("Failed to restart service " + service, e); } }; - - // If the container is already running, initialize vespaVersion and lastCpuMetric - dockerOperations.getContainer(containerName) - .ifPresent(container -> { - containerState = RUNNING_HOWEVER_RESUME_SCRIPT_NOT_RUN; - logger.info("Container is already running, setting containerState to " + containerState); - }); } @Override @@ -219,14 +217,11 @@ public class NodeAgentImpl implements NodeAgent { } private void runLocalResumeScriptIfNeeded(final ContainerNodeSpec nodeSpec) { - if (containerState != RUNNING_HOWEVER_RESUME_SCRIPT_NOT_RUN) { - return; + if (! resumeScriptRun) { + addDebugMessage("Starting optional node program resume command"); + dockerOperations.resumeNode(containerName); + resumeScriptRun = true; } - - addDebugMessage("Starting optional node program resume command"); - dockerOperations.resumeNode(containerName); - containerState = RUNNING; - logger.info("Resume command successfully executed, new containerState is " + containerState); } private void updateNodeRepoWithCurrentAttributes(final ContainerNodeSpec nodeSpec) { @@ -264,10 +259,8 @@ public class NodeAgentImpl implements NodeAgent { maintainer.writeFilebeatConfig(containerName, nodeSpec); }); - - addDebugMessage("startContainerIfNeeded: containerState " + containerState + " -> " + - RUNNING_HOWEVER_RESUME_SCRIPT_NOT_RUN); - containerState = RUNNING_HOWEVER_RESUME_SCRIPT_NOT_RUN; + resumeScriptRun = false; + containerState = UNKNOWN; logger.info("Container successfully started, new containerState is " + containerState); } @@ -409,6 +402,15 @@ public class NodeAgentImpl implements NodeAgent { } catch (OrchestratorException e) { logger.info(e.getMessage()); addDebugMessage(e.getMessage()); + } catch (DockerException e) { + // When a new version of node-admin app is released, there is a brief period of time when both + // new and old version run together. If one of them stats/stops/deletes the container it manages, + // the other's assumption of containerState may become incorrect. It'll then start making invalid + // requests, for example to start a container that is already running, the containerState should + // therefore be reset if we get an exception from docker. + numberOfUnhandledException++; + containerState = UNKNOWN; + logger.error("Caught a DockerExecption, resetting containerState to " + containerState, e); } catch (Exception e) { numberOfUnhandledException++; logger.error("Unhandled exception, ignoring.", e); @@ -586,7 +588,9 @@ public class NodeAgentImpl implements NodeAgent { private Optional<Container> getContainer() { if (containerState == ABSENT) return Optional.empty(); - return dockerOperations.getContainer(containerName); + Optional<Container> container = dockerOperations.getContainer(containerName); + if (! container.isPresent()) containerState = ABSENT; + return container; } @Override diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/provider/ComponentsProviderImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/provider/ComponentsProviderImpl.java index 7c0f9f94277..0e8a4c4106d 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/provider/ComponentsProviderImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/provider/ComponentsProviderImpl.java @@ -69,7 +69,7 @@ public class ComponentsProviderImpl implements ComponentsProvider { Function<String, NodeAgent> nodeAgentFactory = (hostName) -> new NodeAgentImpl(hostName, nodeRepository, orchestrator, dockerOperations, - storageMaintainer, metricReceiver, environment, clock, aclMaintainer); + storageMaintainer, environment, clock, aclMaintainer); NodeAdmin nodeAdmin = new NodeAdminImpl(dockerOperations, nodeAgentFactory, storageMaintainer, NODE_AGENT_SCAN_INTERVAL_MILLIS, metricReceiver, aclMaintainer, clock); nodeAdminStateUpdater = new NodeAdminStateUpdater(nodeRepository, nodeAdmin, clock, orchestrator, baseHostName); diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/ComponentsProviderWithMocks.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/ComponentsProviderWithMocks.java index b0add5115bf..12d304712f8 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/ComponentsProviderWithMocks.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/ComponentsProviderWithMocks.java @@ -34,7 +34,7 @@ public class ComponentsProviderWithMocks implements ComponentsProvider { private final MetricReceiverWrapper mr = new MetricReceiverWrapper(MetricReceiver.nullImplementation); private final Function<String, NodeAgent> nodeAgentFactory = (hostName) -> new NodeAgentImpl(hostName, nodeRepositoryMock, orchestratorMock, - dockerOperationsMock, Optional.empty(), mr, environment, Clock.systemUTC(), Optional.empty()); + dockerOperationsMock, Optional.empty(), environment, Clock.systemUTC(), Optional.empty()); private final NodeAdmin nodeAdmin = new NodeAdminImpl(dockerOperationsMock, nodeAgentFactory, Optional.empty(), 100, mr, Optional.empty(), Clock.systemUTC()); private final NodeAdminStateUpdater nodeAdminStateUpdater = new NodeAdminStateUpdater(nodeRepositoryMock, nodeAdmin, Clock.systemUTC(), orchestratorMock, "localhost.test.yahoo.com"); diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java index 38ce96006f7..473d306e5bd 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java @@ -58,7 +58,7 @@ public class DockerTester implements AutoCloseable { MetricReceiverWrapper mr = new MetricReceiverWrapper(MetricReceiver.nullImplementation); final DockerOperations dockerOperations = new DockerOperationsImpl(dockerMock, environment); Function<String, NodeAgent> nodeAgentFactory = (hostName) -> new NodeAgentImpl(hostName, nodeRepositoryMock, - orchestratorMock, dockerOperations, Optional.of(storageMaintainer), mr, environment, clock, Optional.empty()); + orchestratorMock, dockerOperations, Optional.of(storageMaintainer), environment, clock, Optional.empty()); nodeAdmin = new NodeAdminImpl(dockerOperations, nodeAgentFactory, Optional.of(storageMaintainer), 100, mr, Optional.empty(), Clock.systemUTC()); updater = new NodeAdminStateUpdater(nodeRepositoryMock, nodeAdmin, clock, orchestratorMock, "basehostname"); updater.start(5); diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java index 9e684b4d797..0264e2ac102 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java @@ -593,6 +593,6 @@ public class NodeAgentImplTest { doNothing().when(storageMaintainer).writeMetricsConfig(any(), any()); return new NodeAgentImpl(hostName, nodeRepository, orchestrator, dockerOperations, - Optional.of(storageMaintainer), metricReceiver, environment, clock, Optional.of(aclMaintainer)); + Optional.of(storageMaintainer), environment, clock, Optional.of(aclMaintainer)); } } |