diff options
author | Valerij Fredriksen <valerijf@oath.com> | 2018-11-07 15:44:33 +0100 |
---|---|---|
committer | Valerij Fredriksen <valerijf@oath.com> | 2018-11-07 15:44:33 +0100 |
commit | a106d7a01e5bbbc8782c70707d98f25cb78540cd (patch) | |
tree | 49f995cba2b8faffa887c3408177f1ad051379cf /node-admin | |
parent | 4524732e995a2fe1ffe983b2f0f65a2feb20970f (diff) |
Store current reboot and restart generation in memory
Diffstat (limited to 'node-admin')
2 files changed, 34 insertions, 9 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java index dbb4e26b18e..7a6ac75ecb8 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java @@ -83,6 +83,9 @@ public class NodeAgentImpl implements NodeAgent { private DockerImage imageBeingDownloaded = null; private Instant lastConverge; + private long currentRebootGeneration = 0; + private Optional<Long> currentRestartGeneration = Optional.empty(); + private final Thread loopThread; private final ScheduledExecutorService filebeatRestarter = Executors.newScheduledThreadPool(1, ThreadFactoryFactory.getDaemonThreadFactory("filebeatrestarter")); @@ -223,14 +226,14 @@ public class NodeAgentImpl implements NodeAgent { final NodeAttributes currentNodeAttributes = new NodeAttributes(); final NodeAttributes newNodeAttributes = new NodeAttributes(); - if (!Objects.equals(node.getCurrentRestartGeneration(), node.getWantedRestartGeneration())) { + if (!Objects.equals(node.getCurrentRestartGeneration(), currentRestartGeneration)) { currentNodeAttributes.withRestartGeneration(node.getCurrentRestartGeneration()); - newNodeAttributes.withRestartGeneration(node.getWantedRestartGeneration()); + newNodeAttributes.withRestartGeneration(currentRestartGeneration); } - if (!Objects.equals(node.getCurrentRebootGeneration(), node.getWantedRebootGeneration())) { + if (!Objects.equals(node.getCurrentRebootGeneration(), currentRebootGeneration)) { currentNodeAttributes.withRebootGeneration(node.getCurrentRebootGeneration()); - newNodeAttributes.withRebootGeneration(node.getWantedRebootGeneration()); + newNodeAttributes.withRebootGeneration(currentRebootGeneration); } Optional<DockerImage> actualDockerImage = node.getWantedDockerImage().filter(n -> containerState == UNKNOWN); @@ -268,6 +271,7 @@ public class NodeAgentImpl implements NodeAgent { shouldRestartServices(node).ifPresent(restartReason -> { context.log(logger, "Will restart services: " + restartReason); restartServices(node, container); + currentRestartGeneration = node.getWantedRestartGeneration(); }); return container; }); @@ -277,9 +281,9 @@ public class NodeAgentImpl implements NodeAgent { if (!node.getWantedRestartGeneration().isPresent()) return Optional.empty(); // Restart generation is only optional because it does not exist for unallocated nodes - if (node.getCurrentRestartGeneration().get() < node.getWantedRestartGeneration().get()) { + if (currentRestartGeneration.get() < node.getWantedRestartGeneration().get()) { return Optional.of("Restart requested - wanted restart generation has been bumped: " - + node.getCurrentRestartGeneration().get() + " -> " + node.getWantedRestartGeneration().get()); + + currentRestartGeneration.get() + " -> " + node.getWantedRestartGeneration().get()); } return Optional.empty(); } @@ -341,9 +345,9 @@ public class NodeAgentImpl implements NodeAgent { wantedContainerResources + ", actual: " + existingContainer.resources); } - if (node.getCurrentRebootGeneration() < node.getWantedRebootGeneration()) { + if (currentRebootGeneration < node.getWantedRebootGeneration()) { return Optional.of(String.format("Container reboot wanted. Current: %d, Wanted: %d", - node.getCurrentRebootGeneration(), node.getWantedRebootGeneration())); + currentRebootGeneration, node.getWantedRebootGeneration())); } if (containerState == STARTING) return Optional.of("Container failed to start"); @@ -372,6 +376,7 @@ public class NodeAgentImpl implements NodeAgent { stopFilebeatSchedulerIfNeeded(); storageMaintainer.handleCoreDumpsForContainer(context, node, Optional.of(existingContainer)); dockerOperations.removeContainer(context, existingContainer); + currentRebootGeneration = node.getWantedRebootGeneration(); containerState = ABSENT; context.log(logger, "Container successfully removed, new containerState is " + containerState); return Optional.empty(); @@ -463,6 +468,12 @@ public class NodeAgentImpl implements NodeAgent { if (!node.equals(lastNode)) { logChangesToNodeSpec(lastNode, node); + if (currentRebootGeneration < node.getCurrentRebootGeneration()) + currentRebootGeneration = node.getCurrentRebootGeneration(); + + if (currentRestartGeneration.isPresent() != node.getCurrentRestartGeneration().isPresent()) + currentRestartGeneration = node.getCurrentRestartGeneration(); + // Every time the node spec changes, we should clear the metrics for this container as the dimensions // will change and we will be reporting duplicate metrics. if (container.map(c -> c.state.isRunning()).orElse(false)) { diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java index 5dbc3ecd1e9..83ee9b57918 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java @@ -19,6 +19,7 @@ import com.yahoo.vespa.hosted.node.admin.maintenance.acl.AclMaintainer; import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeRepository; import com.yahoo.vespa.hosted.node.admin.configserver.orchestrator.Orchestrator; import com.yahoo.vespa.hosted.node.admin.maintenance.identity.AthenzCredentialsMaintainer; +import com.yahoo.vespa.hosted.node.admin.nodeadmin.ConvergenceException; import com.yahoo.vespa.hosted.provision.Node; import org.junit.Test; import org.mockito.InOrder; @@ -312,13 +313,26 @@ public class NodeAgentImplTest { when(nodeRepository.getOptionalNode(hostName)).thenReturn(Optional.of(node)); when(dockerOperations.pullImageAsyncIfNeeded(eq(dockerImage))).thenReturn(false); when(storageMaintainer.getDiskUsageFor(eq(context))).thenReturn(Optional.of(201326592000L)); + doThrow(new ConvergenceException("Connection refused")).doNothing() + .when(healthChecker).verifyHealth(eq(context)); - nodeAgent.converge(); + try { + nodeAgent.converge(); + } catch (ConvergenceException ignored) {} + // First time we fail to resume because health verification fails verify(orchestrator, times(1)).suspend(eq(hostName)); verify(dockerOperations, times(1)).removeContainer(eq(context), any()); verify(dockerOperations, times(1)).createContainer(eq(context), eq(node), any()); verify(dockerOperations, times(1)).startContainer(eq(context)); + verify(orchestrator, never()).resume(eq(hostName)); + verify(nodeRepository, never()).updateNodeAttributes(any(), any()); + + nodeAgent.converge(); + + // Do not reboot the container again + verify(dockerOperations, times(1)).removeContainer(eq(context), any()); + verify(dockerOperations, times(1)).createContainer(eq(context), eq(node), any()); verify(orchestrator, times(1)).resume(eq(hostName)); verify(nodeRepository, times(1)).updateNodeAttributes(eq(hostName), eq(new NodeAttributes() .withRebootGeneration(wantedRebootGeneration))); |