summaryrefslogtreecommitdiffstats
path: root/node-admin
diff options
context:
space:
mode:
authorValerij Fredriksen <freva@users.noreply.github.com>2020-01-16 18:48:59 +0100
committerGitHub <noreply@github.com>2020-01-16 18:48:59 +0100
commit64b473d3b6f00623f109711d162cf963a8b4b563 (patch)
treecbd40df3dc61ecbe110309a9bf080aa011d469f4 /node-admin
parent21045fc38e60f44ac26b6e39a0f3a639cb94c793 (diff)
parent91bc981ee8e4857827a8606dff4005c36fdd7f1d (diff)
Merge pull request #11818 from vespa-engine/freva/reboot-on-memory-change
Reboot on memory change
Diffstat (limited to 'node-admin')
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java73
-rw-r--r--node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java128
2 files changed, 87 insertions, 114 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
index 0e651383adc..fdb9428dc3f 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
@@ -26,6 +26,8 @@ import com.yahoo.vespa.hosted.node.admin.maintenance.identity.CredentialsMaintai
import com.yahoo.vespa.hosted.node.admin.nodeadmin.ConvergenceException;
import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.concurrent.atomic.AtomicBoolean;
@@ -197,6 +199,8 @@ public class NodeAgentImpl implements NodeAgent {
dockerOperations.createContainer(context, containerData, getContainerResources(context));
dockerOperations.startContainer(context);
+ currentRebootGeneration = context.node().wantedRebootGeneration();
+ currentRestartGeneration = context.node().wantedRestartGeneration();
hasStartedServices = true; // Automatically started with the container
hasResumedNode = false;
context.log(logger, "Container successfully started, new containerState is " + containerState);
@@ -205,15 +209,17 @@ public class NodeAgentImpl implements NodeAgent {
private Optional<Container> removeContainerIfNeededUpdateContainerState(
NodeAgentContext context, Optional<Container> existingContainer) {
if (existingContainer.isPresent()) {
- Optional<String> reason = shouldRemoveContainer(context, existingContainer.get());
- if (reason.isPresent()) {
- removeContainer(context, existingContainer.get(), reason.get(), false);
+ List<String> reasons = shouldRemoveContainer(context, existingContainer.get());
+ if (!reasons.isEmpty()) {
+ removeContainer(context, existingContainer.get(), reasons, false);
return Optional.empty();
}
shouldRestartServices(context, existingContainer.get()).ifPresent(restartReason -> {
context.log(logger, "Will restart services: " + restartReason);
- restartServices(context, existingContainer.get());
+ orchestratorSuspendNode(context);
+
+ dockerOperations.restartVespa(context);
currentRestartGeneration = context.node().wantedRestartGeneration();
});
}
@@ -223,7 +229,7 @@ public class NodeAgentImpl implements NodeAgent {
private Optional<String> shouldRestartServices( NodeAgentContext context, Container existingContainer) {
NodeSpec node = context.node();
- if (node.wantedRestartGeneration().isEmpty()) return Optional.empty();
+ if (!existingContainer.state.isRunning() || node.state() != NodeState.active) return Optional.empty();
// Restart generation is only optional because it does not exist for unallocated nodes
if (currentRestartGeneration.get() < node.wantedRestartGeneration().get()) {
@@ -231,25 +237,9 @@ public class NodeAgentImpl implements NodeAgent {
+ currentRestartGeneration.get() + " -> " + node.wantedRestartGeneration().get());
}
- // Restart services if wanted memory changes (searchnode and container needs to be restarted to pick up changes)
- ContainerResources wantedContainerResources = getContainerResources(context);
- if (!wantedContainerResources.equalsMemory(existingContainer.resources)) {
- return Optional.of("Container should be running with different memory allocation, wanted: " +
- wantedContainerResources.toStringMemory() + ", actual: " + existingContainer.resources.toStringMemory());
- }
-
return Optional.empty();
}
- private void restartServices(NodeAgentContext context, Container existingContainer) {
- if (existingContainer.state.isRunning() && context.node().state() == NodeState.active) {
- context.log(logger, "Restarting services");
- // Since we are restarting the services we need to suspend the node.
- orchestratorSuspendNode(context);
- dockerOperations.restartVespa(context);
- }
- }
-
private void stopServicesIfNeeded(NodeAgentContext context) {
if (hasStartedServices && context.node().owner().isEmpty())
stopServices(context);
@@ -268,7 +258,7 @@ public class NodeAgentImpl implements NodeAgent {
@Override
public void stopForHostSuspension(NodeAgentContext context) {
- getContainer(context).ifPresent(container -> removeContainer(context, container, "suspending host", true));
+ getContainer(context).ifPresent(container -> removeContainer(context, container, List.of("Suspending host"), true));
}
public void suspend(NodeAgentContext context) {
@@ -286,31 +276,40 @@ public class NodeAgentImpl implements NodeAgent {
}
}
- private Optional<String> shouldRemoveContainer(NodeAgentContext context, Container existingContainer) {
+ private List<String> shouldRemoveContainer(NodeAgentContext context, Container existingContainer) {
final NodeState nodeState = context.node().state();
- if (nodeState == NodeState.dirty || nodeState == NodeState.provisioned) {
- return Optional.of("Node in state " + nodeState + ", container should no longer be running");
- }
+ List<String> reasons = new ArrayList<>();
+ if (nodeState == NodeState.dirty || nodeState == NodeState.provisioned)
+ reasons.add("Node in state " + nodeState + ", container should no longer be running");
+
if (context.node().wantedDockerImage().isPresent() &&
!context.node().wantedDockerImage().get().equals(existingContainer.image)) {
- return Optional.of("The node is supposed to run a new Docker image: "
+ reasons.add("The node is supposed to run a new Docker image: "
+ existingContainer.image.asString() + " -> " + context.node().wantedDockerImage().get().asString());
}
- if (!existingContainer.state.isRunning()) {
- return Optional.of("Container no longer running");
- }
+
+ if (!existingContainer.state.isRunning())
+ reasons.add("Container no longer running");
if (currentRebootGeneration < context.node().wantedRebootGeneration()) {
- return Optional.of(String.format("Container reboot wanted. Current: %d, Wanted: %d",
+ reasons.add(String.format("Container reboot wanted. Current: %d, Wanted: %d",
currentRebootGeneration, context.node().wantedRebootGeneration()));
}
- if (containerState == STARTING) return Optional.of("Container failed to start");
- return Optional.empty();
+ ContainerResources wantedContainerResources = getContainerResources(context);
+ if (!wantedContainerResources.equalsMemory(existingContainer.resources)) {
+ reasons.add("Container should be running with different memory allocation, wanted: " +
+ wantedContainerResources.toStringMemory() + ", actual: " + existingContainer.resources.toStringMemory());
+ }
+
+ if (containerState == STARTING)
+ reasons.add("Container failed to start");
+
+ return reasons;
}
- private void removeContainer(NodeAgentContext context, Container existingContainer, String reason, boolean alreadySuspended) {
- context.log(logger, "Will remove container: " + reason);
+ private void removeContainer(NodeAgentContext context, Container existingContainer, List<String> reasons, boolean alreadySuspended) {
+ context.log(logger, "Will remove container: " + String.join(", ", reasons));
if (existingContainer.state.isRunning()) {
if (!alreadySuspended) {
@@ -329,7 +328,6 @@ public class NodeAgentImpl implements NodeAgent {
storageMaintainer.handleCoreDumpsForContainer(context, Optional.of(existingContainer));
dockerOperations.removeContainer(context, existingContainer);
- currentRebootGeneration = context.node().wantedRebootGeneration();
containerState = ABSENT;
context.log(logger, "Container successfully removed, new containerState is " + containerState);
}
@@ -343,7 +341,8 @@ public class NodeAgentImpl implements NodeAgent {
orchestratorSuspendNode(context);
- dockerOperations.updateContainer(context, wantedContainerResources);
+ // Only update CPU resources
+ dockerOperations.updateContainer(context, wantedContainerResources.withMemoryBytes(existingContainer.resources.memoryBytes()));
}
private ContainerResources getContainerResources(NodeAgentContext context) {
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
index f07c63c9f4c..9e1f84f07a1 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
@@ -4,6 +4,7 @@ package com.yahoo.vespa.hosted.node.admin.nodeagent;
import com.yahoo.component.Version;
import com.yahoo.config.provision.ApplicationId;
import com.yahoo.config.provision.DockerImage;
+import com.yahoo.config.provision.NodeResources;
import com.yahoo.config.provision.NodeType;
import com.yahoo.vespa.flags.Flags;
import com.yahoo.vespa.flags.InMemoryFlagSource;
@@ -44,9 +45,7 @@ import static org.mockito.Mockito.when;
* @author Øyvind Bakksjø
*/
public class NodeAgentImplTest {
- private static final double MIN_CPU_CORES = 2;
- private static final double MIN_MAIN_MEMORY_AVAILABLE_GB = 16;
- private static final double MIN_DISK_AVAILABLE_GB = 250;
+ private static final NodeResources resources = new NodeResources(2, 16, 250, 1, NodeResources.DiskSpeed.fast, NodeResources.StorageType.local);
private static final Version vespaVersion = Version.fromString("1.2.3");
private final String hostName = "host1.test.yahoo.com";
@@ -54,9 +53,7 @@ public class NodeAgentImplTest {
.hostname(hostName)
.type(NodeType.tenant)
.flavor("docker")
- .vcpu(MIN_CPU_CORES)
- .memoryGb(MIN_MAIN_MEMORY_AVAILABLE_GB)
- .diskGb(MIN_DISK_AVAILABLE_GB);
+ .resources(resources);
private final NodeAgentContextSupplier contextSupplier = mock(NodeAgentContextSupplier.class);
private final DockerImage dockerImage = DockerImage.fromString("dockerImage");
@@ -73,11 +70,10 @@ public class NodeAgentImplTest {
@Test
public void upToDateContainerIsUntouched() {
final NodeSpec node = nodeBuilder
- .wantedDockerImage(dockerImage)
- .currentDockerImage(dockerImage)
.state(NodeState.active)
- .wantedVespaVersion(vespaVersion)
- .currentVespaVersion(vespaVersion)
+ .wantedDockerImage(dockerImage).currentDockerImage(dockerImage)
+ .wantedVespaVersion(vespaVersion).currentVespaVersion(vespaVersion)
+ .wantedRestartGeneration(1).currentRestartGeneration(1)
.build();
NodeAgentContext context = createContext(node);
@@ -102,11 +98,10 @@ public class NodeAgentImplTest {
@Test
public void verifyRemoveOldFilesIfDiskFull() {
final NodeSpec node = nodeBuilder
- .wantedDockerImage(dockerImage)
- .currentDockerImage(dockerImage)
.state(NodeState.active)
- .wantedVespaVersion(vespaVersion)
- .currentVespaVersion(vespaVersion)
+ .wantedDockerImage(dockerImage).currentDockerImage(dockerImage)
+ .wantedVespaVersion(vespaVersion).currentVespaVersion(vespaVersion)
+ .wantedRestartGeneration(1).currentRestartGeneration(1)
.build();
NodeAgentContext context = createContext(node);
@@ -123,11 +118,10 @@ public class NodeAgentImplTest {
public void startsAfterStoppingServices() {
final InOrder inOrder = inOrder(dockerOperations);
final NodeSpec node = nodeBuilder
- .wantedDockerImage(dockerImage)
- .currentDockerImage(dockerImage)
.state(NodeState.active)
- .wantedVespaVersion(vespaVersion)
- .currentVespaVersion(vespaVersion)
+ .wantedDockerImage(dockerImage).currentDockerImage(dockerImage)
+ .wantedVespaVersion(vespaVersion).currentVespaVersion(vespaVersion)
+ .wantedRestartGeneration(1).currentRestartGeneration(1)
.build();
NodeAgentContext context = createContext(node);
@@ -159,13 +153,11 @@ public class NodeAgentImplTest {
@Test
public void absentContainerCausesStart() {
- final Optional<Long> restartGeneration = Optional.of(1L);
final NodeSpec node = nodeBuilder
- .wantedDockerImage(dockerImage)
.state(NodeState.active)
+ .wantedDockerImage(dockerImage)
.wantedVespaVersion(vespaVersion)
- .wantedRestartGeneration(restartGeneration.get())
- .currentRestartGeneration(restartGeneration.get())
+ .wantedRestartGeneration(1).currentRestartGeneration(1)
.build();
NodeAgentContext context = createContext(node);
@@ -197,11 +189,10 @@ public class NodeAgentImplTest {
public void containerIsNotStoppedIfNewImageMustBePulled() {
final DockerImage newDockerImage = DockerImage.fromString("new-image");
final NodeSpec node = nodeBuilder
- .wantedDockerImage(newDockerImage)
- .currentDockerImage(dockerImage)
.state(NodeState.active)
- .wantedVespaVersion(vespaVersion)
- .currentVespaVersion(vespaVersion)
+ .wantedDockerImage(newDockerImage).currentDockerImage(dockerImage)
+ .wantedVespaVersion(vespaVersion).currentVespaVersion(vespaVersion)
+ .wantedRestartGeneration(1).currentRestartGeneration(1)
.build();
NodeAgentContext context = createContext(node);
@@ -224,11 +215,10 @@ public class NodeAgentImplTest {
@Test
public void containerIsUpdatedIfCpuChanged() {
NodeSpec.Builder specBuilder = nodeBuilder
- .wantedDockerImage(dockerImage)
- .currentDockerImage(dockerImage)
.state(NodeState.active)
- .wantedVespaVersion(vespaVersion)
- .currentVespaVersion(vespaVersion);
+ .wantedDockerImage(dockerImage).currentDockerImage(dockerImage)
+ .wantedVespaVersion(vespaVersion).currentVespaVersion(vespaVersion)
+ .wantedRestartGeneration(1).currentRestartGeneration(1);
NodeAgentContext firstContext = createContext(specBuilder.build());
NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true);
@@ -269,15 +259,12 @@ public class NodeAgentImplTest {
}
@Test
- public void vespaIsRestartedIfMemoryChanged() {
+ public void containerIsRecreatedIfMemoryChanged() {
NodeSpec.Builder specBuilder = nodeBuilder
- .wantedDockerImage(dockerImage)
- .currentDockerImage(dockerImage)
.state(NodeState.active)
- .wantedVespaVersion(vespaVersion)
- .currentVespaVersion(vespaVersion)
- .wantedRestartGeneration(1)
- .currentRestartGeneration(1);
+ .wantedDockerImage(dockerImage).currentDockerImage(dockerImage)
+ .wantedVespaVersion(vespaVersion).currentVespaVersion(vespaVersion)
+ .wantedRestartGeneration(2).currentRestartGeneration(1);
NodeAgentContext firstContext = createContext(specBuilder.build());
NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true);
@@ -291,11 +278,12 @@ public class NodeAgentImplTest {
ContainerResources resourcesAfterThird = ContainerResources.from(0, 2, 20);
mockGetContainer(dockerImage, resourcesAfterThird, true);
- InOrder inOrder = inOrder(orchestrator, dockerOperations);
+ InOrder inOrder = inOrder(orchestrator, dockerOperations, nodeRepository);
inOrder.verify(orchestrator).resume(any(String.class));
- inOrder.verify(dockerOperations, never()).removeContainer(any(), any());
+ inOrder.verify(dockerOperations).removeContainer(eq(secondContext), any());
inOrder.verify(dockerOperations, never()).updateContainer(any(), any());
- inOrder.verify(dockerOperations).restartVespa(eq(secondContext));
+ inOrder.verify(dockerOperations, never()).restartVespa(any());
+ inOrder.verify(nodeRepository).updateNodeAttributes(eq(hostName), eq(new NodeAttributes().withRestartGeneration(2)));
nodeAgent.doConverge(secondContext);
inOrder.verify(orchestrator).resume(any(String.class));
@@ -305,16 +293,11 @@ public class NodeAgentImplTest {
@Test
public void noRestartIfOrchestratorSuspendFails() {
- final long wantedRestartGeneration = 2;
- final long currentRestartGeneration = 1;
final NodeSpec node = nodeBuilder
- .wantedDockerImage(dockerImage)
- .currentDockerImage(dockerImage)
.state(NodeState.active)
- .wantedVespaVersion(vespaVersion)
- .currentVespaVersion(vespaVersion)
- .wantedRestartGeneration(wantedRestartGeneration)
- .currentRestartGeneration(currentRestartGeneration)
+ .wantedDockerImage(dockerImage).currentDockerImage(dockerImage)
+ .wantedVespaVersion(vespaVersion).currentVespaVersion(vespaVersion)
+ .wantedRestartGeneration(2).currentRestartGeneration(1)
.build();
NodeAgentContext context = createContext(node);
@@ -338,15 +321,12 @@ public class NodeAgentImplTest {
@Test
public void recreatesContainerIfRebootWanted() {
final long wantedRebootGeneration = 2;
- final long currentRebootGeneration = 1;
final NodeSpec node = nodeBuilder
- .wantedDockerImage(dockerImage)
- .currentDockerImage(dockerImage)
.state(NodeState.active)
- .wantedVespaVersion(vespaVersion)
- .currentVespaVersion(vespaVersion)
- .wantedRebootGeneration(wantedRebootGeneration)
- .currentRebootGeneration(currentRebootGeneration)
+ .wantedDockerImage(dockerImage).currentDockerImage(dockerImage)
+ .wantedVespaVersion(vespaVersion).currentVespaVersion(vespaVersion)
+ .wantedRestartGeneration(1).currentRestartGeneration(1)
+ .wantedRebootGeneration(wantedRebootGeneration).currentRebootGeneration(1)
.build();
NodeAgentContext context = createContext(node);
@@ -383,11 +363,9 @@ public class NodeAgentImplTest {
@Test
public void failedNodeRunningContainerShouldStillBeRunning() {
final NodeSpec node = nodeBuilder
- .wantedDockerImage(dockerImage)
- .currentDockerImage(dockerImage)
.state(NodeState.failed)
- .wantedVespaVersion(vespaVersion)
- .currentVespaVersion(vespaVersion)
+ .wantedDockerImage(dockerImage).currentDockerImage(dockerImage)
+ .wantedVespaVersion(vespaVersion).currentVespaVersion(vespaVersion)
.build();
NodeAgentContext context = createContext(node);
@@ -429,11 +407,9 @@ public class NodeAgentImplTest {
@Test
public void inactiveNodeRunningContainerShouldStillBeRunning() {
final NodeSpec node = nodeBuilder
- .wantedDockerImage(dockerImage)
- .currentDockerImage(dockerImage)
.state(NodeState.inactive)
- .wantedVespaVersion(vespaVersion)
- .currentVespaVersion(vespaVersion)
+ .wantedDockerImage(dockerImage).currentDockerImage(dockerImage)
+ .wantedVespaVersion(vespaVersion).currentVespaVersion(vespaVersion)
.build();
NodeAgentContext context = createContext(node);
@@ -453,8 +429,8 @@ public class NodeAgentImplTest {
@Test
public void reservedNodeDoesNotUpdateNodeRepoWithVersion() {
final NodeSpec node = nodeBuilder
- .wantedDockerImage(dockerImage)
.state(NodeState.reserved)
+ .wantedDockerImage(dockerImage)
.wantedVespaVersion(vespaVersion)
.build();
@@ -470,13 +446,11 @@ public class NodeAgentImplTest {
private void nodeRunningContainerIsTakenDownAndCleanedAndRecycled(NodeState nodeState, Optional<Long> wantedRestartGeneration) {
wantedRestartGeneration.ifPresent(restartGeneration -> nodeBuilder
- .wantedRestartGeneration(restartGeneration)
- .currentRestartGeneration(restartGeneration));
+ .wantedRestartGeneration(restartGeneration).currentRestartGeneration(restartGeneration));
final NodeSpec node = nodeBuilder
- .wantedDockerImage(dockerImage)
- .currentDockerImage(dockerImage)
.state(nodeState)
+ .wantedDockerImage(dockerImage).currentDockerImage(dockerImage)
.build();
NodeAgentContext context = createContext(node);
@@ -532,9 +506,8 @@ public class NodeAgentImplTest {
@Test
public void testRestartDeadContainerAfterNodeAdminRestart() {
final NodeSpec node = nodeBuilder
- .currentDockerImage(dockerImage)
- .wantedDockerImage(dockerImage)
.state(NodeState.active)
+ .currentDockerImage(dockerImage).wantedDockerImage(dockerImage)
.currentVespaVersion(vespaVersion)
.build();
@@ -554,10 +527,10 @@ public class NodeAgentImplTest {
@Test
public void resumeProgramRunsUntilSuccess() {
final NodeSpec node = nodeBuilder
- .wantedDockerImage(dockerImage)
- .currentDockerImage(dockerImage)
.state(NodeState.active)
+ .wantedDockerImage(dockerImage).currentDockerImage(dockerImage)
.currentVespaVersion(vespaVersion)
+ .wantedRestartGeneration(1).currentRestartGeneration(1)
.build();
NodeAgentContext context = createContext(node);
@@ -591,9 +564,10 @@ public class NodeAgentImplTest {
@Test
public void start_container_subtask_failure_leads_to_container_restart() {
final NodeSpec node = nodeBuilder
- .wantedDockerImage(dockerImage)
.state(NodeState.active)
+ .wantedDockerImage(dockerImage)
.wantedVespaVersion(vespaVersion)
+ .wantedRestartGeneration(1).currentRestartGeneration(1)
.build();
NodeAgentContext context = createContext(node);
@@ -627,9 +601,9 @@ public class NodeAgentImplTest {
@Test
public void testRunningConfigServer() {
final NodeSpec node = nodeBuilder
+ .state(NodeState.active)
.type(NodeType.config)
.wantedDockerImage(dockerImage)
- .state(NodeState.active)
.wantedVespaVersion(vespaVersion)
.build();
@@ -669,11 +643,11 @@ public class NodeAgentImplTest {
private void verifyThatContainerIsStopped(NodeState nodeState, Optional<ApplicationId> owner) {
NodeSpec.Builder nodeBuilder = new NodeSpec.Builder()
+ .resources(resources)
.hostname(hostName)
.type(NodeType.tenant)
.flavor("docker")
- .currentDockerImage(dockerImage)
- .wantedDockerImage(dockerImage)
+ .wantedDockerImage(dockerImage).currentDockerImage(dockerImage)
.state(nodeState);
owner.ifPresent(nodeBuilder::owner);
@@ -706,7 +680,7 @@ public class NodeAgentImplTest {
}
private void mockGetContainer(DockerImage dockerImage, boolean isRunning) {
- mockGetContainer(dockerImage, ContainerResources.from(0, MIN_CPU_CORES, MIN_MAIN_MEMORY_AVAILABLE_GB), isRunning);
+ mockGetContainer(dockerImage, ContainerResources.from(0, resources.vcpu(), resources.memoryGb()), isRunning);
}
private void mockGetContainer(DockerImage dockerImage, ContainerResources containerResources, boolean isRunning) {