diff options
author | Håkon Hallingstad <hakon@verizonmedia.com> | 2020-02-23 16:41:10 +0100 |
---|---|---|
committer | Håkon Hallingstad <hakon@verizonmedia.com> | 2020-02-23 16:41:10 +0100 |
commit | f91ce5cbbe344834af616ca8c20eafafdbe720bb (patch) | |
tree | 56d56004568e35c658c43b9da6d0740bd1e6fff1 /node-admin | |
parent | 5c105fa6ffe3ed27c633e48f17206d2b31307936 (diff) |
Avoid resume unless suspended
This is to reduce the contention for the application lock in Orchestrator.
Diffstat (limited to 'node-admin')
2 files changed, 26 insertions, 12 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java index 2f92ef8affe..3b6145aa6a8 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java @@ -76,6 +76,7 @@ public class NodeAgentImpl implements NodeAgent { private boolean hasResumedNode = false; private boolean hasStartedServices = true; private Optional<Instant> firstSuccessfulHealthCheckInstant = Optional.empty(); + private boolean suspendedInOrchestrator = false; private int numberOfUnhandledException = 0; private long currentRebootGeneration = 0; @@ -397,6 +398,7 @@ public class NodeAgentImpl implements NodeAgent { public void converge(NodeAgentContext context) { try { doConverge(context); + context.log(logger, LogLevel.INFO, "Converged"); } catch (ConvergenceException e) { context.log(logger, e.getMessage()); } catch (ContainerNotFoundException e) { @@ -411,7 +413,7 @@ public class NodeAgentImpl implements NodeAgent { } } - // Public for testing + // Non-private for testing void doConverge(NodeAgentContext context) { NodeSpec node = context.node(); Optional<Container> container = getContainer(context); @@ -488,8 +490,11 @@ public class NodeAgentImpl implements NodeAgent { // - Slobrok and internal orchestrator state is used to determine whether // to allow upgrade (suspend). updateNodeRepoWithCurrentAttributes(context); - context.log(logger, "Call resume against Orchestrator"); - orchestrator.resume(context.hostname().value()); + if (suspendedInOrchestrator || node.allowedToBeDown().orElse(false)) { + context.log(logger, "Call resume against Orchestrator"); + orchestrator.resume(context.hostname().value()); + suspendedInOrchestrator = false; + } break; case provisioned: nodeRepository.setNodeState(context.hostname().value(), NodeState.dirty); @@ -562,6 +567,7 @@ public class NodeAgentImpl implements NodeAgent { context.log(logger, "Ask Orchestrator for permission to suspend node"); try { orchestrator.suspend(context.hostname().value()); + suspendedInOrchestrator = true; } catch (OrchestratorException e) { // Ensure the ACLs are up to date: The reason we're unable to suspend may be because some other // node is unable to resume because the ACL rules of SOME Docker container is wrong... diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java index f65ad379a63..06d15a3f577 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java @@ -80,6 +80,7 @@ public class NodeAgentImplTest { .wantedDockerImage(dockerImage).currentDockerImage(dockerImage) .wantedVespaVersion(vespaVersion).currentVespaVersion(vespaVersion) .wantedRestartGeneration(1).currentRestartGeneration(1) + .allowedToBeDown(false) .build(); NodeAgentContext context = createContext(node); @@ -98,7 +99,7 @@ public class NodeAgentImplTest { // TODO: Verify this isn't run unless 1st time inOrder.verify(dockerOperations, never()).startServices(eq(context)); inOrder.verify(dockerOperations, times(1)).resumeNode(eq(context)); - inOrder.verify(orchestrator).resume(hostName); + inOrder.verify(orchestrator, never()).resume(hostName); } @Test @@ -188,7 +189,7 @@ public class NodeAgentImplTest { inOrder.verify(healthChecker, times(1)).verifyHealth(eq(context)); inOrder.verify(nodeRepository).updateNodeAttributes( hostName, new NodeAttributes().withDockerImage(dockerImage).withVespaVersion(dockerImage.tagAsVersion())); - inOrder.verify(orchestrator).resume(hostName); + inOrder.verify(orchestrator, never()).resume(hostName); } @Test @@ -224,7 +225,8 @@ public class NodeAgentImplTest { .state(NodeState.active) .wantedDockerImage(dockerImage).currentDockerImage(dockerImage) .wantedVespaVersion(vespaVersion).currentVespaVersion(vespaVersion) - .wantedRestartGeneration(1).currentRestartGeneration(1); + .wantedRestartGeneration(1).currentRestartGeneration(1) + .allowedToBeDown(false); NodeAgentContext firstContext = createContext(specBuilder.build()); NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true); @@ -232,17 +234,20 @@ public class NodeAgentImplTest { when(dockerOperations.pullImageAsyncIfNeeded(any())).thenReturn(true); when(storageMaintainer.getDiskUsageFor(any())).thenReturn(Optional.of(201326592000L)); + InOrder inOrder = inOrder(orchestrator, dockerOperations); + nodeAgent.doConverge(firstContext); + inOrder.verify(orchestrator, never()).resume(any(String.class)); + NodeAgentContext secondContext = createContext(specBuilder.diskGb(200).build()); nodeAgent.doConverge(secondContext); + inOrder.verify(orchestrator, never()).resume(any(String.class)); + NodeAgentContext thirdContext = new NodeAgentContextImpl.Builder(specBuilder.vcpu(5).build()).cpuSpeedUp(1.25).build(); nodeAgent.doConverge(thirdContext); ContainerResources resourcesAfterThird = ContainerResources.from(0, 4, 16); mockGetContainer(dockerImage, resourcesAfterThird, true); - InOrder inOrder = inOrder(orchestrator, dockerOperations); - inOrder.verify(orchestrator).resume(any(String.class)); - inOrder.verify(orchestrator).resume(any(String.class)); inOrder.verify(orchestrator).suspend(any(String.class)); inOrder.verify(dockerOperations).updateContainer(eq(thirdContext), eq(resourcesAfterThird)); inOrder.verify(dockerOperations, never()).removeContainer(any(), any()); @@ -254,7 +259,7 @@ public class NodeAgentImplTest { inOrder.verify(orchestrator, never()).suspend(any(String.class)); inOrder.verify(dockerOperations, never()).updateContainer(eq(thirdContext), any()); inOrder.verify(dockerOperations, never()).removeContainer(any(), any()); - inOrder.verify(orchestrator).resume(any(String.class)); + inOrder.verify(orchestrator, never()).resume(any(String.class)); // Set the feature flag flagSource.withDoubleFlag(Flags.CONTAINER_CPU_CAP.id(), 2.3); @@ -537,6 +542,7 @@ public class NodeAgentImplTest { .wantedDockerImage(dockerImage).currentDockerImage(dockerImage) .currentVespaVersion(vespaVersion) .wantedRestartGeneration(1).currentRestartGeneration(1) + .allowedToBeDown(true) .build(); NodeAgentContext context = createContext(node); @@ -611,6 +617,7 @@ public class NodeAgentImplTest { .type(NodeType.config) .wantedDockerImage(dockerImage) .wantedVespaVersion(vespaVersion) + .allowedToBeDown(true) .build(); NodeAgentContext context = createContext(node); @@ -689,6 +696,7 @@ public class NodeAgentImplTest { clock.advance(Duration.ofSeconds(31)); nodeAgent.doConverge(context); + inOrder.verify(orchestrator).suspend(hostName); inOrder.verify(dockerOperations).updateContainer(eq(context), eq(ContainerResources.from(0, 2, 16))); inOrder.verify(dockerOperations, never()).removeContainer(any(), any()); inOrder.verify(dockerOperations, never()).startContainer(any()); @@ -699,7 +707,7 @@ public class NodeAgentImplTest { inOrder.verify(orchestrator, never()).suspend(any(String.class)); inOrder.verify(dockerOperations, never()).updateContainer(eq(context), any()); inOrder.verify(dockerOperations, never()).removeContainer(any(), any()); - inOrder.verify(orchestrator).resume(any(String.class)); + inOrder.verify(orchestrator, never()).resume(any(String.class)); } @Test @@ -724,7 +732,7 @@ public class NodeAgentImplTest { inOrder.verify(orchestrator, never()).suspend(any(String.class)); inOrder.verify(dockerOperations, never()).updateContainer(eq(context), any()); inOrder.verify(dockerOperations, never()).removeContainer(any(), any()); - inOrder.verify(orchestrator).resume(any(String.class)); + inOrder.verify(orchestrator, never()).resume(any(String.class)); } private void verifyThatContainerIsStopped(NodeState nodeState, Optional<ApplicationId> owner) { |