From f3689e3c0254681002106800519b9e0b71cfabd5 Mon Sep 17 00:00:00 2001 From: Valerij Fredriksen Date: Mon, 3 Feb 2020 17:34:15 +0100 Subject: Start container with uncapped CPU and start capping 30s after first successful health check --- .../vespa/hosted/dockerapi/ContainerResources.java | 4 + .../hosted/node/admin/nodeagent/NodeAgentImpl.java | 78 +++++++++++---- .../node/admin/integrationTests/DockerTester.java | 9 +- .../node/admin/nodeagent/NodeAgentImplTest.java | 106 ++++++++++++++++++++- 4 files changed, 171 insertions(+), 26 deletions(-) diff --git a/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/ContainerResources.java b/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/ContainerResources.java index ffd5dffece9..c7b9d342043 100644 --- a/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/ContainerResources.java +++ b/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/ContainerResources.java @@ -124,4 +124,8 @@ public class ContainerResources { public ContainerResources withMemoryBytes(long memoryBytes) { return new ContainerResources(cpus, cpuShares, memoryBytes); } + + public ContainerResources withUnlimitedCpus() { + return new ContainerResources(0, 0, memoryBytes); + } } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java index 68178418e62..6e382794828 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java @@ -26,6 +26,9 @@ import com.yahoo.vespa.hosted.node.admin.maintenance.identity.CredentialsMaintai import com.yahoo.vespa.hosted.node.admin.nodeadmin.ConvergenceException; import java.nio.file.Path; +import java.time.Clock; +import java.time.Duration; +import java.time.Instant; import java.util.ArrayList; import java.util.List; import java.util.Objects; @@ -47,11 +50,10 @@ public class NodeAgentImpl implements NodeAgent { // This is used as a definition of 1 GB when comparing flavor specs in node-repo private static final long BYTES_IN_GB = 1_000_000_000L; - private static final Logger logger = Logger.getLogger(NodeAgentImpl.class.getName()); + // Container is started with uncapped CPU and is kept that way until the first successful health check + this duration + private static final Duration DEFAULT_WARM_UP_DURATION = Duration.ofSeconds(30); - private final AtomicBoolean terminated = new AtomicBoolean(false); - private boolean hasResumedNode = false; - private boolean hasStartedServices = true; + private static final Logger logger = Logger.getLogger(NodeAgentImpl.class.getName()); private final NodeAgentContextSupplier contextSupplier; private final NodeRepository nodeRepository; @@ -61,12 +63,19 @@ public class NodeAgentImpl implements NodeAgent { private final Optional credentialsMaintainer; private final Optional aclMaintainer; private final Optional healthChecker; + private final Clock clock; + private final Duration warmUpDuration; private final DoubleFlag containerCpuCap; private Thread loopThread; private ContainerState containerState = UNKNOWN; private NodeSpec lastNode; + private final AtomicBoolean terminated = new AtomicBoolean(false); + private boolean hasResumedNode = false; + private boolean hasStartedServices = true; + private Optional firstSuccessfulHealthCheckInstant = Optional.empty(); + private int numberOfUnhandledException = 0; private long currentRebootGeneration = 0; private Optional currentRestartGeneration = Optional.empty(); @@ -87,16 +96,18 @@ public class NodeAgentImpl implements NodeAgent { // Created in NodeAdminImpl - public NodeAgentImpl( - final NodeAgentContextSupplier contextSupplier, - final NodeRepository nodeRepository, - final Orchestrator orchestrator, - final DockerOperations dockerOperations, - final StorageMaintainer storageMaintainer, - final FlagSource flagSource, - final Optional credentialsMaintainer, - final Optional aclMaintainer, - final Optional healthChecker) { + public NodeAgentImpl(NodeAgentContextSupplier contextSupplier, NodeRepository nodeRepository, + Orchestrator orchestrator, DockerOperations dockerOperations, StorageMaintainer storageMaintainer, + FlagSource flagSource, Optional credentialsMaintainer, + Optional aclMaintainer, Optional healthChecker, Clock clock) { + this(contextSupplier, nodeRepository, orchestrator, dockerOperations, storageMaintainer, flagSource, credentialsMaintainer, + aclMaintainer, healthChecker, clock, DEFAULT_WARM_UP_DURATION); + } + + public NodeAgentImpl(NodeAgentContextSupplier contextSupplier, NodeRepository nodeRepository, + Orchestrator orchestrator, DockerOperations dockerOperations, StorageMaintainer storageMaintainer, + FlagSource flagSource, Optional credentialsMaintainer, + Optional aclMaintainer, Optional healthChecker, Clock clock, Duration warmUpDuration) { this.contextSupplier = contextSupplier; this.nodeRepository = nodeRepository; this.orchestrator = orchestrator; @@ -105,6 +116,8 @@ public class NodeAgentImpl implements NodeAgent { this.credentialsMaintainer = credentialsMaintainer; this.aclMaintainer = aclMaintainer; this.healthChecker = healthChecker; + this.clock = clock; + this.warmUpDuration = warmUpDuration; this.containerCpuCap = Flags.CONTAINER_CPU_CAP.bindTo(flagSource); } @@ -194,9 +207,11 @@ public class NodeAgentImpl implements NodeAgent { } } - private void startContainer(NodeAgentContext context) { + private Container startContainer(NodeAgentContext context) { ContainerData containerData = createContainerData(context); - dockerOperations.createContainer(context, containerData, getContainerResources(context)); + ContainerResources wantedResources = warmUpDuration.isNegative() ? + getContainerResources(context) : getContainerResources(context).withUnlimitedCpus(); + dockerOperations.createContainer(context, containerData, wantedResources); dockerOperations.startContainer(context); currentRebootGeneration = context.node().wantedRebootGeneration(); @@ -204,6 +219,8 @@ public class NodeAgentImpl implements NodeAgent { hasStartedServices = true; // Automatically started with the container hasResumedNode = false; context.log(logger, "Container successfully started, new containerState is " + containerState); + return dockerOperations.getContainer(context).orElseThrow(() -> + new ConvergenceException("Did not find container that was just started")); } private Optional removeContainerIfNeededUpdateContainerState( @@ -250,6 +267,7 @@ public class NodeAgentImpl implements NodeAgent { if (containerState == ABSENT) return; try { hasStartedServices = hasResumedNode = false; + firstSuccessfulHealthCheckInstant = Optional.empty(); dockerOperations.stopServices(context); } catch (ContainerNotFoundException e) { containerState = ABSENT; @@ -333,9 +351,16 @@ public class NodeAgentImpl implements NodeAgent { } - private void updateContainerIfNeeded(NodeAgentContext context, Container existingContainer) { + /** Returns true iff container is correctly sized */ + private Container updateContainerIfNeeded(NodeAgentContext context, Container existingContainer) { ContainerResources wantedContainerResources = getContainerResources(context); - if (wantedContainerResources.equalsCpu(existingContainer.resources)) return; + + if (healthChecker.isPresent() && firstSuccessfulHealthCheckInstant + .map(clock.instant().minus(warmUpDuration)::isBefore) + .orElse(true)) + return existingContainer; + + if (wantedContainerResources.equalsCpu(existingContainer.resources)) return existingContainer; context.log(logger, "Container should be running with different CPU allocation, wanted: %s, current: %s", wantedContainerResources.toStringCpu(), existingContainer.resources.toStringCpu()); @@ -343,6 +368,8 @@ public class NodeAgentImpl implements NodeAgent { // Only update CPU resources dockerOperations.updateContainer(context, wantedContainerResources.withMemoryBytes(existingContainer.resources.memoryBytes())); + return dockerOperations.getContainer(context).orElseThrow(() -> + new ConvergenceException("Did not find container that was just updated")); } private ContainerResources getContainerResources(NodeAgentContext context) { @@ -430,16 +457,25 @@ public class NodeAgentImpl implements NodeAgent { credentialsMaintainer.ifPresent(maintainer -> maintainer.converge(context)); if (container.isEmpty()) { containerState = STARTING; - startContainer(context); + container = Optional.of(startContainer(context)); containerState = UNKNOWN; } else { - updateContainerIfNeeded(context, container.get()); + container = Optional.of(updateContainerIfNeeded(context, container.get())); } aclMaintainer.ifPresent(maintainer -> maintainer.converge(context)); startServicesIfNeeded(context); resumeNodeIfNeeded(context); - healthChecker.ifPresent(checker -> checker.verifyHealth(context)); + if (healthChecker.isPresent()) { + healthChecker.get().verifyHealth(context); + if (firstSuccessfulHealthCheckInstant.isEmpty()) + firstSuccessfulHealthCheckInstant = Optional.of(clock.instant()); + + Duration timeLeft = Duration.between(firstSuccessfulHealthCheckInstant.get(), clock.instant()); + if (!container.get().resources.equalsCpu(getContainerResources(context))) + throw new ConvergenceException("Refusing to resume until warm up period ends (" + + (timeLeft.isNegative() ? " next tick" : "in " + timeLeft) + ")"); + } // Because it's more important to stop a bad release from rolling out in prod, // we put the resume call last. So if we fail after updating the node repo attributes diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java index 653a6f7f091..ea41ab21b9d 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java @@ -86,18 +86,19 @@ public class DockerTester implements AutoCloseable { .build(); nodeRepository.updateNodeRepositoryNode(hostSpec); + Clock clock = Clock.systemUTC(); + Metrics metrics = new Metrics(); FileSystem fileSystem = TestFileSystem.create(); DockerOperations dockerOperations = new DockerOperationsImpl(docker, terminal, ipAddresses, flagSource); - Metrics metrics = new Metrics(); NodeAgentFactory nodeAgentFactory = (contextSupplier, nodeContext) -> new NodeAgentImpl( contextSupplier, nodeRepository, orchestrator, dockerOperations, storageMaintainer, flagSource, - Optional.empty(), Optional.empty(), Optional.empty()); - nodeAdmin = new NodeAdminImpl(nodeAgentFactory, metrics, Clock.systemUTC(), Duration.ofMillis(10), Duration.ZERO); + Optional.empty(), Optional.empty(), Optional.empty(), clock, Duration.ofSeconds(-1)); + nodeAdmin = new NodeAdminImpl(nodeAgentFactory, metrics, clock, Duration.ofMillis(10), Duration.ZERO); NodeAgentContextFactory nodeAgentContextFactory = (nodeSpec, acl) -> new NodeAgentContextImpl.Builder(nodeSpec).acl(acl).pathToContainerStorageFromFileSystem(fileSystem).build(); nodeAdminStateUpdater = new NodeAdminStateUpdater(nodeAgentContextFactory, nodeRepository, orchestrator, - nodeAdmin, HOST_HOSTNAME, Clock.systemUTC()); + nodeAdmin, HOST_HOSTNAME, clock); loopThread = new Thread(() -> { nodeAdminStateUpdater.start(); diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java index 9e1f84f07a1..2f24a68c079 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java @@ -6,6 +6,7 @@ import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.DockerImage; import com.yahoo.config.provision.NodeResources; import com.yahoo.config.provision.NodeType; +import com.yahoo.test.ManualClock; import com.yahoo.vespa.flags.Flags; import com.yahoo.vespa.flags.InMemoryFlagSource; import com.yahoo.vespa.hosted.dockerapi.Container; @@ -26,12 +27,17 @@ import com.yahoo.vespa.hosted.node.admin.nodeadmin.ConvergenceException; import org.junit.Test; import org.mockito.InOrder; +import java.time.Duration; +import java.time.Instant; import java.util.Optional; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.eq; import static org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.doNothing; import static org.mockito.Mockito.doThrow; import static org.mockito.Mockito.inOrder; import static org.mockito.Mockito.mock; @@ -65,6 +71,7 @@ public class NodeAgentImplTest { private final HealthChecker healthChecker = mock(HealthChecker.class); private final CredentialsMaintainer credentialsMaintainer = mock(CredentialsMaintainer.class); private final InMemoryFlagSource flagSource = new InMemoryFlagSource(); + private final ManualClock clock = new ManualClock(Instant.now()); @Test @@ -641,6 +648,86 @@ public class NodeAgentImplTest { verifyThatContainerIsStopped(NodeState.inactive, Optional.of(ApplicationId.defaultId())); } + @Test + public void initial_cpu_cap_test() { + NodeSpec.Builder specBuilder = nodeBuilder + .state(NodeState.active) + .wantedDockerImage(dockerImage).currentDockerImage(dockerImage) + .wantedVespaVersion(vespaVersion).currentVespaVersion(vespaVersion) + .wantedRestartGeneration(1).currentRestartGeneration(1); + + NodeAgentContext context = createContext(specBuilder.build()); + NodeAgentImpl nodeAgent = makeNodeAgent(null, false, Duration.ofSeconds(30)); + + when(storageMaintainer.getDiskUsageFor(any())).thenReturn(Optional.of(201326592000L)); + + InOrder inOrder = inOrder(orchestrator, dockerOperations); + + ConvergenceException healthCheckException = new ConvergenceException("Not yet up"); + doThrow(healthCheckException).when(healthChecker).verifyHealth(any()); + for (int i = 0; i < 3; i++) { + try { + nodeAgent.doConverge(context); + fail("Expected to fail with health check exception"); + } catch (ConvergenceException e) { + assertEquals(healthCheckException, e); + } + clock.advance(Duration.ofSeconds(30)); + } + + doNothing().when(healthChecker).verifyHealth(any()); + try { + nodeAgent.doConverge(context); + fail("Expected to fail due to warm up period not yet done"); + } catch (ConvergenceException e) { + assertTrue(e.getMessage().startsWith("Refusing to resume until warm up period ends")); + } + inOrder.verify(orchestrator, never()).resume(any()); + inOrder.verify(orchestrator, never()).suspend(any()); + inOrder.verify(dockerOperations, never()).updateContainer(any(), any()); + + + clock.advance(Duration.ofSeconds(31)); + nodeAgent.doConverge(context); + + inOrder.verify(dockerOperations).updateContainer(eq(context), eq(ContainerResources.from(0, 2, 16))); + inOrder.verify(dockerOperations, never()).removeContainer(any(), any()); + inOrder.verify(dockerOperations, never()).startContainer(any()); + inOrder.verify(orchestrator).resume(any(String.class)); + + // No changes + nodeAgent.converge(context); + inOrder.verify(orchestrator, never()).suspend(any(String.class)); + inOrder.verify(dockerOperations, never()).updateContainer(eq(context), any()); + inOrder.verify(dockerOperations, never()).removeContainer(any(), any()); + inOrder.verify(orchestrator).resume(any(String.class)); + } + + @Test + public void resumes_normally_if_container_is_already_capped_on_start() { + NodeSpec.Builder specBuilder = nodeBuilder + .state(NodeState.active) + .wantedDockerImage(dockerImage).currentDockerImage(dockerImage) + .wantedVespaVersion(vespaVersion).currentVespaVersion(vespaVersion) + .wantedRestartGeneration(1).currentRestartGeneration(1); + + NodeAgentContext context = createContext(specBuilder.build()); + NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true, Duration.ofSeconds(30)); + mockGetContainer(dockerImage, ContainerResources.from(0, 2, 16), true); + + when(storageMaintainer.getDiskUsageFor(any())).thenReturn(Optional.of(201326592000L)); + + InOrder inOrder = inOrder(orchestrator, dockerOperations); + + nodeAgent.doConverge(context); + + nodeAgent.converge(context); + inOrder.verify(orchestrator, never()).suspend(any(String.class)); + inOrder.verify(dockerOperations, never()).updateContainer(eq(context), any()); + inOrder.verify(dockerOperations, never()).removeContainer(any(), any()); + inOrder.verify(orchestrator).resume(any(String.class)); + } + private void verifyThatContainerIsStopped(NodeState nodeState, Optional owner) { NodeSpec.Builder nodeBuilder = new NodeSpec.Builder() .resources(resources) @@ -672,11 +759,28 @@ public class NodeAgentImplTest { } private NodeAgentImpl makeNodeAgent(DockerImage dockerImage, boolean isRunning) { + return makeNodeAgent(dockerImage, isRunning, Duration.ofSeconds(-1)); + } + + private NodeAgentImpl makeNodeAgent(DockerImage dockerImage, boolean isRunning, Duration warmUpDuration) { mockGetContainer(dockerImage, isRunning); + doAnswer(invoc -> { + NodeAgentContext context = invoc.getArgument(0, NodeAgentContext.class); + ContainerResources resources = invoc.getArgument(2, ContainerResources.class); + mockGetContainer(context.node().wantedDockerImage().get(), resources, true); + return null; + }).when(dockerOperations).createContainer(any(), any(), any()); + + doAnswer(invoc -> { + NodeAgentContext context = invoc.getArgument(0, NodeAgentContext.class); + ContainerResources resources = invoc.getArgument(1, ContainerResources.class); + mockGetContainer(context.node().wantedDockerImage().get(), resources, true); + return null; + }).when(dockerOperations).updateContainer(any(), any()); return new NodeAgentImpl(contextSupplier, nodeRepository, orchestrator, dockerOperations, storageMaintainer, flagSource, Optional.of(credentialsMaintainer), Optional.of(aclMaintainer), - Optional.of(healthChecker)); + Optional.of(healthChecker), clock, warmUpDuration); } private void mockGetContainer(DockerImage dockerImage, boolean isRunning) { -- cgit v1.2.3 From c80937058b7fb2b04e370480dba9c759a16b76ad Mon Sep 17 00:00:00 2001 From: Valerij Fredriksen Date: Mon, 3 Feb 2020 17:44:14 +0100 Subject: Fix time remaining calculation --- .../com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java | 2 +- .../com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java index 6e382794828..488e97eec22 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java @@ -471,7 +471,7 @@ public class NodeAgentImpl implements NodeAgent { if (firstSuccessfulHealthCheckInstant.isEmpty()) firstSuccessfulHealthCheckInstant = Optional.of(clock.instant()); - Duration timeLeft = Duration.between(firstSuccessfulHealthCheckInstant.get(), clock.instant()); + Duration timeLeft = Duration.between(clock.instant(), firstSuccessfulHealthCheckInstant.get().plus(warmUpDuration)); if (!container.get().resources.equalsCpu(getContainerResources(context))) throw new ConvergenceException("Refusing to resume until warm up period ends (" + (timeLeft.isNegative() ? " next tick" : "in " + timeLeft) + ")"); diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java index 2f24a68c079..f65ad379a63 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java @@ -32,7 +32,6 @@ import java.time.Instant; import java.util.Optional; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.eq; @@ -680,7 +679,7 @@ public class NodeAgentImplTest { nodeAgent.doConverge(context); fail("Expected to fail due to warm up period not yet done"); } catch (ConvergenceException e) { - assertTrue(e.getMessage().startsWith("Refusing to resume until warm up period ends")); + assertEquals("Refusing to resume until warm up period ends (in PT30S)", e.getMessage()); } inOrder.verify(orchestrator, never()).resume(any()); inOrder.verify(orchestrator, never()).suspend(any()); -- cgit v1.2.3 From a09f4224f7cbf2a6f7578ba85d50e2fa901ceff7 Mon Sep 17 00:00:00 2001 From: Valerij Fredriksen Date: Mon, 3 Feb 2020 18:52:35 +0100 Subject: Remove invalid doc --- .../java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java | 1 - 1 file changed, 1 deletion(-) diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java index 488e97eec22..790bbd03e45 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java @@ -351,7 +351,6 @@ public class NodeAgentImpl implements NodeAgent { } - /** Returns true iff container is correctly sized */ private Container updateContainerIfNeeded(NodeAgentContext context, Container existingContainer) { ContainerResources wantedContainerResources = getContainerResources(context); -- cgit v1.2.3