diff options
author | Valerij Fredriksen <freva@users.noreply.github.com> | 2020-01-16 18:48:59 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-01-16 18:48:59 +0100 |
commit | 64b473d3b6f00623f109711d162cf963a8b4b563 (patch) | |
tree | cbd40df3dc61ecbe110309a9bf080aa011d469f4 /node-admin | |
parent | 21045fc38e60f44ac26b6e39a0f3a639cb94c793 (diff) | |
parent | 91bc981ee8e4857827a8606dff4005c36fdd7f1d (diff) |
Merge pull request #11818 from vespa-engine/freva/reboot-on-memory-change
Reboot on memory change
Diffstat (limited to 'node-admin')
2 files changed, 87 insertions, 114 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java index 0e651383adc..fdb9428dc3f 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java @@ -26,6 +26,8 @@ import com.yahoo.vespa.hosted.node.admin.maintenance.identity.CredentialsMaintai import com.yahoo.vespa.hosted.node.admin.nodeadmin.ConvergenceException; import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; import java.util.Objects; import java.util.Optional; import java.util.concurrent.atomic.AtomicBoolean; @@ -197,6 +199,8 @@ public class NodeAgentImpl implements NodeAgent { dockerOperations.createContainer(context, containerData, getContainerResources(context)); dockerOperations.startContainer(context); + currentRebootGeneration = context.node().wantedRebootGeneration(); + currentRestartGeneration = context.node().wantedRestartGeneration(); hasStartedServices = true; // Automatically started with the container hasResumedNode = false; context.log(logger, "Container successfully started, new containerState is " + containerState); @@ -205,15 +209,17 @@ public class NodeAgentImpl implements NodeAgent { private Optional<Container> removeContainerIfNeededUpdateContainerState( NodeAgentContext context, Optional<Container> existingContainer) { if (existingContainer.isPresent()) { - Optional<String> reason = shouldRemoveContainer(context, existingContainer.get()); - if (reason.isPresent()) { - removeContainer(context, existingContainer.get(), reason.get(), false); + List<String> reasons = shouldRemoveContainer(context, existingContainer.get()); + if (!reasons.isEmpty()) { + removeContainer(context, existingContainer.get(), reasons, false); return Optional.empty(); } shouldRestartServices(context, existingContainer.get()).ifPresent(restartReason -> { context.log(logger, "Will restart services: " + restartReason); - restartServices(context, existingContainer.get()); + orchestratorSuspendNode(context); + + dockerOperations.restartVespa(context); currentRestartGeneration = context.node().wantedRestartGeneration(); }); } @@ -223,7 +229,7 @@ public class NodeAgentImpl implements NodeAgent { private Optional<String> shouldRestartServices( NodeAgentContext context, Container existingContainer) { NodeSpec node = context.node(); - if (node.wantedRestartGeneration().isEmpty()) return Optional.empty(); + if (!existingContainer.state.isRunning() || node.state() != NodeState.active) return Optional.empty(); // Restart generation is only optional because it does not exist for unallocated nodes if (currentRestartGeneration.get() < node.wantedRestartGeneration().get()) { @@ -231,25 +237,9 @@ public class NodeAgentImpl implements NodeAgent { + currentRestartGeneration.get() + " -> " + node.wantedRestartGeneration().get()); } - // Restart services if wanted memory changes (searchnode and container needs to be restarted to pick up changes) - ContainerResources wantedContainerResources = getContainerResources(context); - if (!wantedContainerResources.equalsMemory(existingContainer.resources)) { - return Optional.of("Container should be running with different memory allocation, wanted: " + - wantedContainerResources.toStringMemory() + ", actual: " + existingContainer.resources.toStringMemory()); - } - return Optional.empty(); } - private void restartServices(NodeAgentContext context, Container existingContainer) { - if (existingContainer.state.isRunning() && context.node().state() == NodeState.active) { - context.log(logger, "Restarting services"); - // Since we are restarting the services we need to suspend the node. - orchestratorSuspendNode(context); - dockerOperations.restartVespa(context); - } - } - private void stopServicesIfNeeded(NodeAgentContext context) { if (hasStartedServices && context.node().owner().isEmpty()) stopServices(context); @@ -268,7 +258,7 @@ public class NodeAgentImpl implements NodeAgent { @Override public void stopForHostSuspension(NodeAgentContext context) { - getContainer(context).ifPresent(container -> removeContainer(context, container, "suspending host", true)); + getContainer(context).ifPresent(container -> removeContainer(context, container, List.of("Suspending host"), true)); } public void suspend(NodeAgentContext context) { @@ -286,31 +276,40 @@ public class NodeAgentImpl implements NodeAgent { } } - private Optional<String> shouldRemoveContainer(NodeAgentContext context, Container existingContainer) { + private List<String> shouldRemoveContainer(NodeAgentContext context, Container existingContainer) { final NodeState nodeState = context.node().state(); - if (nodeState == NodeState.dirty || nodeState == NodeState.provisioned) { - return Optional.of("Node in state " + nodeState + ", container should no longer be running"); - } + List<String> reasons = new ArrayList<>(); + if (nodeState == NodeState.dirty || nodeState == NodeState.provisioned) + reasons.add("Node in state " + nodeState + ", container should no longer be running"); + if (context.node().wantedDockerImage().isPresent() && !context.node().wantedDockerImage().get().equals(existingContainer.image)) { - return Optional.of("The node is supposed to run a new Docker image: " + reasons.add("The node is supposed to run a new Docker image: " + existingContainer.image.asString() + " -> " + context.node().wantedDockerImage().get().asString()); } - if (!existingContainer.state.isRunning()) { - return Optional.of("Container no longer running"); - } + + if (!existingContainer.state.isRunning()) + reasons.add("Container no longer running"); if (currentRebootGeneration < context.node().wantedRebootGeneration()) { - return Optional.of(String.format("Container reboot wanted. Current: %d, Wanted: %d", + reasons.add(String.format("Container reboot wanted. Current: %d, Wanted: %d", currentRebootGeneration, context.node().wantedRebootGeneration())); } - if (containerState == STARTING) return Optional.of("Container failed to start"); - return Optional.empty(); + ContainerResources wantedContainerResources = getContainerResources(context); + if (!wantedContainerResources.equalsMemory(existingContainer.resources)) { + reasons.add("Container should be running with different memory allocation, wanted: " + + wantedContainerResources.toStringMemory() + ", actual: " + existingContainer.resources.toStringMemory()); + } + + if (containerState == STARTING) + reasons.add("Container failed to start"); + + return reasons; } - private void removeContainer(NodeAgentContext context, Container existingContainer, String reason, boolean alreadySuspended) { - context.log(logger, "Will remove container: " + reason); + private void removeContainer(NodeAgentContext context, Container existingContainer, List<String> reasons, boolean alreadySuspended) { + context.log(logger, "Will remove container: " + String.join(", ", reasons)); if (existingContainer.state.isRunning()) { if (!alreadySuspended) { @@ -329,7 +328,6 @@ public class NodeAgentImpl implements NodeAgent { storageMaintainer.handleCoreDumpsForContainer(context, Optional.of(existingContainer)); dockerOperations.removeContainer(context, existingContainer); - currentRebootGeneration = context.node().wantedRebootGeneration(); containerState = ABSENT; context.log(logger, "Container successfully removed, new containerState is " + containerState); } @@ -343,7 +341,8 @@ public class NodeAgentImpl implements NodeAgent { orchestratorSuspendNode(context); - dockerOperations.updateContainer(context, wantedContainerResources); + // Only update CPU resources + dockerOperations.updateContainer(context, wantedContainerResources.withMemoryBytes(existingContainer.resources.memoryBytes())); } private ContainerResources getContainerResources(NodeAgentContext context) { diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java index f07c63c9f4c..9e1f84f07a1 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java @@ -4,6 +4,7 @@ package com.yahoo.vespa.hosted.node.admin.nodeagent; import com.yahoo.component.Version; import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.DockerImage; +import com.yahoo.config.provision.NodeResources; import com.yahoo.config.provision.NodeType; import com.yahoo.vespa.flags.Flags; import com.yahoo.vespa.flags.InMemoryFlagSource; @@ -44,9 +45,7 @@ import static org.mockito.Mockito.when; * @author Øyvind Bakksjø */ public class NodeAgentImplTest { - private static final double MIN_CPU_CORES = 2; - private static final double MIN_MAIN_MEMORY_AVAILABLE_GB = 16; - private static final double MIN_DISK_AVAILABLE_GB = 250; + private static final NodeResources resources = new NodeResources(2, 16, 250, 1, NodeResources.DiskSpeed.fast, NodeResources.StorageType.local); private static final Version vespaVersion = Version.fromString("1.2.3"); private final String hostName = "host1.test.yahoo.com"; @@ -54,9 +53,7 @@ public class NodeAgentImplTest { .hostname(hostName) .type(NodeType.tenant) .flavor("docker") - .vcpu(MIN_CPU_CORES) - .memoryGb(MIN_MAIN_MEMORY_AVAILABLE_GB) - .diskGb(MIN_DISK_AVAILABLE_GB); + .resources(resources); private final NodeAgentContextSupplier contextSupplier = mock(NodeAgentContextSupplier.class); private final DockerImage dockerImage = DockerImage.fromString("dockerImage"); @@ -73,11 +70,10 @@ public class NodeAgentImplTest { @Test public void upToDateContainerIsUntouched() { final NodeSpec node = nodeBuilder - .wantedDockerImage(dockerImage) - .currentDockerImage(dockerImage) .state(NodeState.active) - .wantedVespaVersion(vespaVersion) - .currentVespaVersion(vespaVersion) + .wantedDockerImage(dockerImage).currentDockerImage(dockerImage) + .wantedVespaVersion(vespaVersion).currentVespaVersion(vespaVersion) + .wantedRestartGeneration(1).currentRestartGeneration(1) .build(); NodeAgentContext context = createContext(node); @@ -102,11 +98,10 @@ public class NodeAgentImplTest { @Test public void verifyRemoveOldFilesIfDiskFull() { final NodeSpec node = nodeBuilder - .wantedDockerImage(dockerImage) - .currentDockerImage(dockerImage) .state(NodeState.active) - .wantedVespaVersion(vespaVersion) - .currentVespaVersion(vespaVersion) + .wantedDockerImage(dockerImage).currentDockerImage(dockerImage) + .wantedVespaVersion(vespaVersion).currentVespaVersion(vespaVersion) + .wantedRestartGeneration(1).currentRestartGeneration(1) .build(); NodeAgentContext context = createContext(node); @@ -123,11 +118,10 @@ public class NodeAgentImplTest { public void startsAfterStoppingServices() { final InOrder inOrder = inOrder(dockerOperations); final NodeSpec node = nodeBuilder - .wantedDockerImage(dockerImage) - .currentDockerImage(dockerImage) .state(NodeState.active) - .wantedVespaVersion(vespaVersion) - .currentVespaVersion(vespaVersion) + .wantedDockerImage(dockerImage).currentDockerImage(dockerImage) + .wantedVespaVersion(vespaVersion).currentVespaVersion(vespaVersion) + .wantedRestartGeneration(1).currentRestartGeneration(1) .build(); NodeAgentContext context = createContext(node); @@ -159,13 +153,11 @@ public class NodeAgentImplTest { @Test public void absentContainerCausesStart() { - final Optional<Long> restartGeneration = Optional.of(1L); final NodeSpec node = nodeBuilder - .wantedDockerImage(dockerImage) .state(NodeState.active) + .wantedDockerImage(dockerImage) .wantedVespaVersion(vespaVersion) - .wantedRestartGeneration(restartGeneration.get()) - .currentRestartGeneration(restartGeneration.get()) + .wantedRestartGeneration(1).currentRestartGeneration(1) .build(); NodeAgentContext context = createContext(node); @@ -197,11 +189,10 @@ public class NodeAgentImplTest { public void containerIsNotStoppedIfNewImageMustBePulled() { final DockerImage newDockerImage = DockerImage.fromString("new-image"); final NodeSpec node = nodeBuilder - .wantedDockerImage(newDockerImage) - .currentDockerImage(dockerImage) .state(NodeState.active) - .wantedVespaVersion(vespaVersion) - .currentVespaVersion(vespaVersion) + .wantedDockerImage(newDockerImage).currentDockerImage(dockerImage) + .wantedVespaVersion(vespaVersion).currentVespaVersion(vespaVersion) + .wantedRestartGeneration(1).currentRestartGeneration(1) .build(); NodeAgentContext context = createContext(node); @@ -224,11 +215,10 @@ public class NodeAgentImplTest { @Test public void containerIsUpdatedIfCpuChanged() { NodeSpec.Builder specBuilder = nodeBuilder - .wantedDockerImage(dockerImage) - .currentDockerImage(dockerImage) .state(NodeState.active) - .wantedVespaVersion(vespaVersion) - .currentVespaVersion(vespaVersion); + .wantedDockerImage(dockerImage).currentDockerImage(dockerImage) + .wantedVespaVersion(vespaVersion).currentVespaVersion(vespaVersion) + .wantedRestartGeneration(1).currentRestartGeneration(1); NodeAgentContext firstContext = createContext(specBuilder.build()); NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true); @@ -269,15 +259,12 @@ public class NodeAgentImplTest { } @Test - public void vespaIsRestartedIfMemoryChanged() { + public void containerIsRecreatedIfMemoryChanged() { NodeSpec.Builder specBuilder = nodeBuilder - .wantedDockerImage(dockerImage) - .currentDockerImage(dockerImage) .state(NodeState.active) - .wantedVespaVersion(vespaVersion) - .currentVespaVersion(vespaVersion) - .wantedRestartGeneration(1) - .currentRestartGeneration(1); + .wantedDockerImage(dockerImage).currentDockerImage(dockerImage) + .wantedVespaVersion(vespaVersion).currentVespaVersion(vespaVersion) + .wantedRestartGeneration(2).currentRestartGeneration(1); NodeAgentContext firstContext = createContext(specBuilder.build()); NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true); @@ -291,11 +278,12 @@ public class NodeAgentImplTest { ContainerResources resourcesAfterThird = ContainerResources.from(0, 2, 20); mockGetContainer(dockerImage, resourcesAfterThird, true); - InOrder inOrder = inOrder(orchestrator, dockerOperations); + InOrder inOrder = inOrder(orchestrator, dockerOperations, nodeRepository); inOrder.verify(orchestrator).resume(any(String.class)); - inOrder.verify(dockerOperations, never()).removeContainer(any(), any()); + inOrder.verify(dockerOperations).removeContainer(eq(secondContext), any()); inOrder.verify(dockerOperations, never()).updateContainer(any(), any()); - inOrder.verify(dockerOperations).restartVespa(eq(secondContext)); + inOrder.verify(dockerOperations, never()).restartVespa(any()); + inOrder.verify(nodeRepository).updateNodeAttributes(eq(hostName), eq(new NodeAttributes().withRestartGeneration(2))); nodeAgent.doConverge(secondContext); inOrder.verify(orchestrator).resume(any(String.class)); @@ -305,16 +293,11 @@ public class NodeAgentImplTest { @Test public void noRestartIfOrchestratorSuspendFails() { - final long wantedRestartGeneration = 2; - final long currentRestartGeneration = 1; final NodeSpec node = nodeBuilder - .wantedDockerImage(dockerImage) - .currentDockerImage(dockerImage) .state(NodeState.active) - .wantedVespaVersion(vespaVersion) - .currentVespaVersion(vespaVersion) - .wantedRestartGeneration(wantedRestartGeneration) - .currentRestartGeneration(currentRestartGeneration) + .wantedDockerImage(dockerImage).currentDockerImage(dockerImage) + .wantedVespaVersion(vespaVersion).currentVespaVersion(vespaVersion) + .wantedRestartGeneration(2).currentRestartGeneration(1) .build(); NodeAgentContext context = createContext(node); @@ -338,15 +321,12 @@ public class NodeAgentImplTest { @Test public void recreatesContainerIfRebootWanted() { final long wantedRebootGeneration = 2; - final long currentRebootGeneration = 1; final NodeSpec node = nodeBuilder - .wantedDockerImage(dockerImage) - .currentDockerImage(dockerImage) .state(NodeState.active) - .wantedVespaVersion(vespaVersion) - .currentVespaVersion(vespaVersion) - .wantedRebootGeneration(wantedRebootGeneration) - .currentRebootGeneration(currentRebootGeneration) + .wantedDockerImage(dockerImage).currentDockerImage(dockerImage) + .wantedVespaVersion(vespaVersion).currentVespaVersion(vespaVersion) + .wantedRestartGeneration(1).currentRestartGeneration(1) + .wantedRebootGeneration(wantedRebootGeneration).currentRebootGeneration(1) .build(); NodeAgentContext context = createContext(node); @@ -383,11 +363,9 @@ public class NodeAgentImplTest { @Test public void failedNodeRunningContainerShouldStillBeRunning() { final NodeSpec node = nodeBuilder - .wantedDockerImage(dockerImage) - .currentDockerImage(dockerImage) .state(NodeState.failed) - .wantedVespaVersion(vespaVersion) - .currentVespaVersion(vespaVersion) + .wantedDockerImage(dockerImage).currentDockerImage(dockerImage) + .wantedVespaVersion(vespaVersion).currentVespaVersion(vespaVersion) .build(); NodeAgentContext context = createContext(node); @@ -429,11 +407,9 @@ public class NodeAgentImplTest { @Test public void inactiveNodeRunningContainerShouldStillBeRunning() { final NodeSpec node = nodeBuilder - .wantedDockerImage(dockerImage) - .currentDockerImage(dockerImage) .state(NodeState.inactive) - .wantedVespaVersion(vespaVersion) - .currentVespaVersion(vespaVersion) + .wantedDockerImage(dockerImage).currentDockerImage(dockerImage) + .wantedVespaVersion(vespaVersion).currentVespaVersion(vespaVersion) .build(); NodeAgentContext context = createContext(node); @@ -453,8 +429,8 @@ public class NodeAgentImplTest { @Test public void reservedNodeDoesNotUpdateNodeRepoWithVersion() { final NodeSpec node = nodeBuilder - .wantedDockerImage(dockerImage) .state(NodeState.reserved) + .wantedDockerImage(dockerImage) .wantedVespaVersion(vespaVersion) .build(); @@ -470,13 +446,11 @@ public class NodeAgentImplTest { private void nodeRunningContainerIsTakenDownAndCleanedAndRecycled(NodeState nodeState, Optional<Long> wantedRestartGeneration) { wantedRestartGeneration.ifPresent(restartGeneration -> nodeBuilder - .wantedRestartGeneration(restartGeneration) - .currentRestartGeneration(restartGeneration)); + .wantedRestartGeneration(restartGeneration).currentRestartGeneration(restartGeneration)); final NodeSpec node = nodeBuilder - .wantedDockerImage(dockerImage) - .currentDockerImage(dockerImage) .state(nodeState) + .wantedDockerImage(dockerImage).currentDockerImage(dockerImage) .build(); NodeAgentContext context = createContext(node); @@ -532,9 +506,8 @@ public class NodeAgentImplTest { @Test public void testRestartDeadContainerAfterNodeAdminRestart() { final NodeSpec node = nodeBuilder - .currentDockerImage(dockerImage) - .wantedDockerImage(dockerImage) .state(NodeState.active) + .currentDockerImage(dockerImage).wantedDockerImage(dockerImage) .currentVespaVersion(vespaVersion) .build(); @@ -554,10 +527,10 @@ public class NodeAgentImplTest { @Test public void resumeProgramRunsUntilSuccess() { final NodeSpec node = nodeBuilder - .wantedDockerImage(dockerImage) - .currentDockerImage(dockerImage) .state(NodeState.active) + .wantedDockerImage(dockerImage).currentDockerImage(dockerImage) .currentVespaVersion(vespaVersion) + .wantedRestartGeneration(1).currentRestartGeneration(1) .build(); NodeAgentContext context = createContext(node); @@ -591,9 +564,10 @@ public class NodeAgentImplTest { @Test public void start_container_subtask_failure_leads_to_container_restart() { final NodeSpec node = nodeBuilder - .wantedDockerImage(dockerImage) .state(NodeState.active) + .wantedDockerImage(dockerImage) .wantedVespaVersion(vespaVersion) + .wantedRestartGeneration(1).currentRestartGeneration(1) .build(); NodeAgentContext context = createContext(node); @@ -627,9 +601,9 @@ public class NodeAgentImplTest { @Test public void testRunningConfigServer() { final NodeSpec node = nodeBuilder + .state(NodeState.active) .type(NodeType.config) .wantedDockerImage(dockerImage) - .state(NodeState.active) .wantedVespaVersion(vespaVersion) .build(); @@ -669,11 +643,11 @@ public class NodeAgentImplTest { private void verifyThatContainerIsStopped(NodeState nodeState, Optional<ApplicationId> owner) { NodeSpec.Builder nodeBuilder = new NodeSpec.Builder() + .resources(resources) .hostname(hostName) .type(NodeType.tenant) .flavor("docker") - .currentDockerImage(dockerImage) - .wantedDockerImage(dockerImage) + .wantedDockerImage(dockerImage).currentDockerImage(dockerImage) .state(nodeState); owner.ifPresent(nodeBuilder::owner); @@ -706,7 +680,7 @@ public class NodeAgentImplTest { } private void mockGetContainer(DockerImage dockerImage, boolean isRunning) { - mockGetContainer(dockerImage, ContainerResources.from(0, MIN_CPU_CORES, MIN_MAIN_MEMORY_AVAILABLE_GB), isRunning); + mockGetContainer(dockerImage, ContainerResources.from(0, resources.vcpu(), resources.memoryGb()), isRunning); } private void mockGetContainer(DockerImage dockerImage, ContainerResources containerResources, boolean isRunning) { |