summary | refs | log | tree | commit | diff | stats
path: root/node-admin
diff options
context:
space:
mode:
author: Valerij Fredriksen <valerijf@oath.com> 2018-11-07 15:44:33 +0100
committer: Valerij Fredriksen <valerijf@oath.com> 2018-11-07 15:44:33 +0100
commit: a106d7a01e5bbbc8782c70707d98f25cb78540cd (patch)
tree: 49f995cba2b8faffa887c3408177f1ad051379cf /node-admin
parent: 4524732e995a2fe1ffe983b2f0f65a2feb20970f (diff)
Store current reboot and restart generation in memory
Diffstat (limited to 'node-admin')
-rw-r--r-- node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java | 27
-rw-r--r-- node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java | 16
2 files changed, 34 insertions, 9 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
index dbb4e26b18e..7a6ac75ecb8 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
@@ -83,6 +83,9 @@ public class NodeAgentImpl implements NodeAgent {
private DockerImage imageBeingDownloaded = null;
private Instant lastConverge;
+ private long currentRebootGeneration = 0;
+ private Optional<Long> currentRestartGeneration = Optional.empty();
+
private final Thread loopThread;
private final ScheduledExecutorService filebeatRestarter =
Executors.newScheduledThreadPool(1, ThreadFactoryFactory.getDaemonThreadFactory("filebeatrestarter"));
@@ -223,14 +226,14 @@ public class NodeAgentImpl implements NodeAgent {
final NodeAttributes currentNodeAttributes = new NodeAttributes();
final NodeAttributes newNodeAttributes = new NodeAttributes();
- if (!Objects.equals(node.getCurrentRestartGeneration(), node.getWantedRestartGeneration())) {
+ if (!Objects.equals(node.getCurrentRestartGeneration(), currentRestartGeneration)) {
currentNodeAttributes.withRestartGeneration(node.getCurrentRestartGeneration());
- newNodeAttributes.withRestartGeneration(node.getWantedRestartGeneration());
+ newNodeAttributes.withRestartGeneration(currentRestartGeneration);
}
- if (!Objects.equals(node.getCurrentRebootGeneration(), node.getWantedRebootGeneration())) {
+ if (!Objects.equals(node.getCurrentRebootGeneration(), currentRebootGeneration)) {
currentNodeAttributes.withRebootGeneration(node.getCurrentRebootGeneration());
- newNodeAttributes.withRebootGeneration(node.getWantedRebootGeneration());
+ newNodeAttributes.withRebootGeneration(currentRebootGeneration);
}
Optional<DockerImage> actualDockerImage = node.getWantedDockerImage().filter(n -> containerState == UNKNOWN);
@@ -268,6 +271,7 @@ public class NodeAgentImpl implements NodeAgent {
shouldRestartServices(node).ifPresent(restartReason -> {
context.log(logger, "Will restart services: " + restartReason);
restartServices(node, container);
+ currentRestartGeneration = node.getWantedRestartGeneration();
});
return container;
});
@@ -277,9 +281,9 @@ public class NodeAgentImpl implements NodeAgent {
if (!node.getWantedRestartGeneration().isPresent()) return Optional.empty();
// Restart generation is only optional because it does not exist for unallocated nodes
- if (node.getCurrentRestartGeneration().get() < node.getWantedRestartGeneration().get()) {
+ if (currentRestartGeneration.get() < node.getWantedRestartGeneration().get()) {
return Optional.of("Restart requested - wanted restart generation has been bumped: "
- + node.getCurrentRestartGeneration().get() + " -> " + node.getWantedRestartGeneration().get());
+ + currentRestartGeneration.get() + " -> " + node.getWantedRestartGeneration().get());
}
return Optional.empty();
}
@@ -341,9 +345,9 @@ public class NodeAgentImpl implements NodeAgent {
wantedContainerResources + ", actual: " + existingContainer.resources);
}
- if (node.getCurrentRebootGeneration() < node.getWantedRebootGeneration()) {
+ if (currentRebootGeneration < node.getWantedRebootGeneration()) {
return Optional.of(String.format("Container reboot wanted. Current: %d, Wanted: %d",
- node.getCurrentRebootGeneration(), node.getWantedRebootGeneration()));
+ currentRebootGeneration, node.getWantedRebootGeneration()));
}
if (containerState == STARTING) return Optional.of("Container failed to start");
@@ -372,6 +376,7 @@ public class NodeAgentImpl implements NodeAgent {
stopFilebeatSchedulerIfNeeded();
storageMaintainer.handleCoreDumpsForContainer(context, node, Optional.of(existingContainer));
dockerOperations.removeContainer(context, existingContainer);
+ currentRebootGeneration = node.getWantedRebootGeneration();
containerState = ABSENT;
context.log(logger, "Container successfully removed, new containerState is " + containerState);
return Optional.empty();
@@ -463,6 +468,12 @@ public class NodeAgentImpl implements NodeAgent {
if (!node.equals(lastNode)) {
logChangesToNodeSpec(lastNode, node);
+ if (currentRebootGeneration < node.getCurrentRebootGeneration())
+ currentRebootGeneration = node.getCurrentRebootGeneration();
+
+ if (currentRestartGeneration.isPresent() != node.getCurrentRestartGeneration().isPresent())
+ currentRestartGeneration = node.getCurrentRestartGeneration();
+
// Every time the node spec changes, we should clear the metrics for this container as the dimensions
// will change and we will be reporting duplicate metrics.
if (container.map(c -> c.state.isRunning()).orElse(false)) {
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
index 5dbc3ecd1e9..83ee9b57918 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
@@ -19,6 +19,7 @@ import com.yahoo.vespa.hosted.node.admin.maintenance.acl.AclMaintainer;
import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeRepository;
import com.yahoo.vespa.hosted.node.admin.configserver.orchestrator.Orchestrator;
import com.yahoo.vespa.hosted.node.admin.maintenance.identity.AthenzCredentialsMaintainer;
+import com.yahoo.vespa.hosted.node.admin.nodeadmin.ConvergenceException;
import com.yahoo.vespa.hosted.provision.Node;
import org.junit.Test;
import org.mockito.InOrder;
@@ -312,13 +313,26 @@ public class NodeAgentImplTest {
when(nodeRepository.getOptionalNode(hostName)).thenReturn(Optional.of(node));
when(dockerOperations.pullImageAsyncIfNeeded(eq(dockerImage))).thenReturn(false);
when(storageMaintainer.getDiskUsageFor(eq(context))).thenReturn(Optional.of(201326592000L));
+ doThrow(new ConvergenceException("Connection refused")).doNothing()
+ .when(healthChecker).verifyHealth(eq(context));
- nodeAgent.converge();
+ try {
+ nodeAgent.converge();
+ } catch (ConvergenceException ignored) {}
+ // First time we fail to resume because health verification fails
verify(orchestrator, times(1)).suspend(eq(hostName));
verify(dockerOperations, times(1)).removeContainer(eq(context), any());
verify(dockerOperations, times(1)).createContainer(eq(context), eq(node), any());
verify(dockerOperations, times(1)).startContainer(eq(context));
+ verify(orchestrator, never()).resume(eq(hostName));
+ verify(nodeRepository, never()).updateNodeAttributes(any(), any());
+
+ nodeAgent.converge();
+
+ // Do not reboot the container again
+ verify(dockerOperations, times(1)).removeContainer(eq(context), any());
+ verify(dockerOperations, times(1)).createContainer(eq(context), eq(node), any());
verify(orchestrator, times(1)).resume(eq(hostName));
verify(nodeRepository, times(1)).updateNodeAttributes(eq(hostName), eq(new NodeAttributes()
.withRebootGeneration(wantedRebootGeneration)));