aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorValerij Fredriksen <freva@users.noreply.github.com>2020-02-03 18:53:16 +0100
committerGitHub <noreply@github.com>2020-02-03 18:53:16 +0100
commit99ab443250858f95d5646d80332306f857a47568 (patch)
tree15dae17c472bcfe16d916ec71f000b6dcaad0b8b
parent6ad62b3385c54a367616ea7a07ba2165ce8961ff (diff)
parenta09f4224f7cbf2a6f7578ba85d50e2fa901ceff7 (diff)
Merge pull request #12054 from vespa-engine/freva/start-unthrottled
Start container with uncapped CPU and start capping 30s after first s…
-rw-r--r--docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/ContainerResources.java4
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java77
-rw-r--r--node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java9
-rw-r--r--node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java105
4 files changed, 169 insertions, 26 deletions
diff --git a/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/ContainerResources.java b/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/ContainerResources.java
index ffd5dffece9..c7b9d342043 100644
--- a/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/ContainerResources.java
+++ b/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/ContainerResources.java
@@ -124,4 +124,8 @@ public class ContainerResources {
public ContainerResources withMemoryBytes(long memoryBytes) {
return new ContainerResources(cpus, cpuShares, memoryBytes);
}
+
+ public ContainerResources withUnlimitedCpus() {
+ return new ContainerResources(0, 0, memoryBytes);
+ }
}
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
index 68178418e62..790bbd03e45 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
@@ -26,6 +26,9 @@ import com.yahoo.vespa.hosted.node.admin.maintenance.identity.CredentialsMaintai
import com.yahoo.vespa.hosted.node.admin.nodeadmin.ConvergenceException;
import java.nio.file.Path;
+import java.time.Clock;
+import java.time.Duration;
+import java.time.Instant;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
@@ -47,11 +50,10 @@ public class NodeAgentImpl implements NodeAgent {
// This is used as a definition of 1 GB when comparing flavor specs in node-repo
private static final long BYTES_IN_GB = 1_000_000_000L;
- private static final Logger logger = Logger.getLogger(NodeAgentImpl.class.getName());
+ // Container is started with uncapped CPU and is kept that way until the first successful health check + this duration
+ private static final Duration DEFAULT_WARM_UP_DURATION = Duration.ofSeconds(30);
- private final AtomicBoolean terminated = new AtomicBoolean(false);
- private boolean hasResumedNode = false;
- private boolean hasStartedServices = true;
+ private static final Logger logger = Logger.getLogger(NodeAgentImpl.class.getName());
private final NodeAgentContextSupplier contextSupplier;
private final NodeRepository nodeRepository;
@@ -61,12 +63,19 @@ public class NodeAgentImpl implements NodeAgent {
private final Optional<CredentialsMaintainer> credentialsMaintainer;
private final Optional<AclMaintainer> aclMaintainer;
private final Optional<HealthChecker> healthChecker;
+ private final Clock clock;
+ private final Duration warmUpDuration;
private final DoubleFlag containerCpuCap;
private Thread loopThread;
private ContainerState containerState = UNKNOWN;
private NodeSpec lastNode;
+ private final AtomicBoolean terminated = new AtomicBoolean(false);
+ private boolean hasResumedNode = false;
+ private boolean hasStartedServices = true;
+ private Optional<Instant> firstSuccessfulHealthCheckInstant = Optional.empty();
+
private int numberOfUnhandledException = 0;
private long currentRebootGeneration = 0;
private Optional<Long> currentRestartGeneration = Optional.empty();
@@ -87,16 +96,18 @@ public class NodeAgentImpl implements NodeAgent {
// Created in NodeAdminImpl
- public NodeAgentImpl(
- final NodeAgentContextSupplier contextSupplier,
- final NodeRepository nodeRepository,
- final Orchestrator orchestrator,
- final DockerOperations dockerOperations,
- final StorageMaintainer storageMaintainer,
- final FlagSource flagSource,
- final Optional<CredentialsMaintainer> credentialsMaintainer,
- final Optional<AclMaintainer> aclMaintainer,
- final Optional<HealthChecker> healthChecker) {
+ public NodeAgentImpl(NodeAgentContextSupplier contextSupplier, NodeRepository nodeRepository,
+ Orchestrator orchestrator, DockerOperations dockerOperations, StorageMaintainer storageMaintainer,
+ FlagSource flagSource, Optional<CredentialsMaintainer> credentialsMaintainer,
+ Optional<AclMaintainer> aclMaintainer, Optional<HealthChecker> healthChecker, Clock clock) {
+ this(contextSupplier, nodeRepository, orchestrator, dockerOperations, storageMaintainer, flagSource, credentialsMaintainer,
+ aclMaintainer, healthChecker, clock, DEFAULT_WARM_UP_DURATION);
+ }
+
+ public NodeAgentImpl(NodeAgentContextSupplier contextSupplier, NodeRepository nodeRepository,
+ Orchestrator orchestrator, DockerOperations dockerOperations, StorageMaintainer storageMaintainer,
+ FlagSource flagSource, Optional<CredentialsMaintainer> credentialsMaintainer,
+ Optional<AclMaintainer> aclMaintainer, Optional<HealthChecker> healthChecker, Clock clock, Duration warmUpDuration) {
this.contextSupplier = contextSupplier;
this.nodeRepository = nodeRepository;
this.orchestrator = orchestrator;
@@ -105,6 +116,8 @@ public class NodeAgentImpl implements NodeAgent {
this.credentialsMaintainer = credentialsMaintainer;
this.aclMaintainer = aclMaintainer;
this.healthChecker = healthChecker;
+ this.clock = clock;
+ this.warmUpDuration = warmUpDuration;
this.containerCpuCap = Flags.CONTAINER_CPU_CAP.bindTo(flagSource);
}
@@ -194,9 +207,11 @@ public class NodeAgentImpl implements NodeAgent {
}
}
- private void startContainer(NodeAgentContext context) {
+ private Container startContainer(NodeAgentContext context) {
ContainerData containerData = createContainerData(context);
- dockerOperations.createContainer(context, containerData, getContainerResources(context));
+ ContainerResources wantedResources = warmUpDuration.isNegative() ?
+ getContainerResources(context) : getContainerResources(context).withUnlimitedCpus();
+ dockerOperations.createContainer(context, containerData, wantedResources);
dockerOperations.startContainer(context);
currentRebootGeneration = context.node().wantedRebootGeneration();
@@ -204,6 +219,8 @@ public class NodeAgentImpl implements NodeAgent {
hasStartedServices = true; // Automatically started with the container
hasResumedNode = false;
context.log(logger, "Container successfully started, new containerState is " + containerState);
+ return dockerOperations.getContainer(context).orElseThrow(() ->
+ new ConvergenceException("Did not find container that was just started"));
}
private Optional<Container> removeContainerIfNeededUpdateContainerState(
@@ -250,6 +267,7 @@ public class NodeAgentImpl implements NodeAgent {
if (containerState == ABSENT) return;
try {
hasStartedServices = hasResumedNode = false;
+ firstSuccessfulHealthCheckInstant = Optional.empty();
dockerOperations.stopServices(context);
} catch (ContainerNotFoundException e) {
containerState = ABSENT;
@@ -333,9 +351,15 @@ public class NodeAgentImpl implements NodeAgent {
}
- private void updateContainerIfNeeded(NodeAgentContext context, Container existingContainer) {
+ private Container updateContainerIfNeeded(NodeAgentContext context, Container existingContainer) {
ContainerResources wantedContainerResources = getContainerResources(context);
- if (wantedContainerResources.equalsCpu(existingContainer.resources)) return;
+
+ if (healthChecker.isPresent() && firstSuccessfulHealthCheckInstant
+ .map(clock.instant().minus(warmUpDuration)::isBefore)
+ .orElse(true))
+ return existingContainer;
+
+ if (wantedContainerResources.equalsCpu(existingContainer.resources)) return existingContainer;
context.log(logger, "Container should be running with different CPU allocation, wanted: %s, current: %s",
wantedContainerResources.toStringCpu(), existingContainer.resources.toStringCpu());
@@ -343,6 +367,8 @@ public class NodeAgentImpl implements NodeAgent {
// Only update CPU resources
dockerOperations.updateContainer(context, wantedContainerResources.withMemoryBytes(existingContainer.resources.memoryBytes()));
+ return dockerOperations.getContainer(context).orElseThrow(() ->
+ new ConvergenceException("Did not find container that was just updated"));
}
private ContainerResources getContainerResources(NodeAgentContext context) {
@@ -430,16 +456,25 @@ public class NodeAgentImpl implements NodeAgent {
credentialsMaintainer.ifPresent(maintainer -> maintainer.converge(context));
if (container.isEmpty()) {
containerState = STARTING;
- startContainer(context);
+ container = Optional.of(startContainer(context));
containerState = UNKNOWN;
} else {
- updateContainerIfNeeded(context, container.get());
+ container = Optional.of(updateContainerIfNeeded(context, container.get()));
}
aclMaintainer.ifPresent(maintainer -> maintainer.converge(context));
startServicesIfNeeded(context);
resumeNodeIfNeeded(context);
- healthChecker.ifPresent(checker -> checker.verifyHealth(context));
+ if (healthChecker.isPresent()) {
+ healthChecker.get().verifyHealth(context);
+ if (firstSuccessfulHealthCheckInstant.isEmpty())
+ firstSuccessfulHealthCheckInstant = Optional.of(clock.instant());
+
+ Duration timeLeft = Duration.between(clock.instant(), firstSuccessfulHealthCheckInstant.get().plus(warmUpDuration));
+ if (!container.get().resources.equalsCpu(getContainerResources(context)))
+ throw new ConvergenceException("Refusing to resume until warm up period ends (" +
+ (timeLeft.isNegative() ? " next tick" : "in " + timeLeft) + ")");
+ }
// Because it's more important to stop a bad release from rolling out in prod,
// we put the resume call last. So if we fail after updating the node repo attributes
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java
index 653a6f7f091..ea41ab21b9d 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java
@@ -86,18 +86,19 @@ public class DockerTester implements AutoCloseable {
.build();
nodeRepository.updateNodeRepositoryNode(hostSpec);
+ Clock clock = Clock.systemUTC();
+ Metrics metrics = new Metrics();
FileSystem fileSystem = TestFileSystem.create();
DockerOperations dockerOperations = new DockerOperationsImpl(docker, terminal, ipAddresses, flagSource);
- Metrics metrics = new Metrics();
NodeAgentFactory nodeAgentFactory = (contextSupplier, nodeContext) -> new NodeAgentImpl(
contextSupplier, nodeRepository, orchestrator, dockerOperations, storageMaintainer, flagSource,
- Optional.empty(), Optional.empty(), Optional.empty());
- nodeAdmin = new NodeAdminImpl(nodeAgentFactory, metrics, Clock.systemUTC(), Duration.ofMillis(10), Duration.ZERO);
+ Optional.empty(), Optional.empty(), Optional.empty(), clock, Duration.ofSeconds(-1));
+ nodeAdmin = new NodeAdminImpl(nodeAgentFactory, metrics, clock, Duration.ofMillis(10), Duration.ZERO);
NodeAgentContextFactory nodeAgentContextFactory = (nodeSpec, acl) ->
new NodeAgentContextImpl.Builder(nodeSpec).acl(acl).pathToContainerStorageFromFileSystem(fileSystem).build();
nodeAdminStateUpdater = new NodeAdminStateUpdater(nodeAgentContextFactory, nodeRepository, orchestrator,
- nodeAdmin, HOST_HOSTNAME, Clock.systemUTC());
+ nodeAdmin, HOST_HOSTNAME, clock);
loopThread = new Thread(() -> {
nodeAdminStateUpdater.start();
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
index 9e1f84f07a1..f65ad379a63 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
@@ -6,6 +6,7 @@ import com.yahoo.config.provision.ApplicationId;
import com.yahoo.config.provision.DockerImage;
import com.yahoo.config.provision.NodeResources;
import com.yahoo.config.provision.NodeType;
+import com.yahoo.test.ManualClock;
import com.yahoo.vespa.flags.Flags;
import com.yahoo.vespa.flags.InMemoryFlagSource;
import com.yahoo.vespa.hosted.dockerapi.Container;
@@ -26,12 +27,16 @@ import com.yahoo.vespa.hosted.node.admin.nodeadmin.ConvergenceException;
import org.junit.Test;
import org.mockito.InOrder;
+import java.time.Duration;
+import java.time.Instant;
import java.util.Optional;
+import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.eq;
import static org.mockito.Mockito.doAnswer;
+import static org.mockito.Mockito.doNothing;
import static org.mockito.Mockito.doThrow;
import static org.mockito.Mockito.inOrder;
import static org.mockito.Mockito.mock;
@@ -65,6 +70,7 @@ public class NodeAgentImplTest {
private final HealthChecker healthChecker = mock(HealthChecker.class);
private final CredentialsMaintainer credentialsMaintainer = mock(CredentialsMaintainer.class);
private final InMemoryFlagSource flagSource = new InMemoryFlagSource();
+ private final ManualClock clock = new ManualClock(Instant.now());
@Test
@@ -641,6 +647,86 @@ public class NodeAgentImplTest {
verifyThatContainerIsStopped(NodeState.inactive, Optional.of(ApplicationId.defaultId()));
}
+ @Test
+ public void initial_cpu_cap_test() {
+ NodeSpec.Builder specBuilder = nodeBuilder
+ .state(NodeState.active)
+ .wantedDockerImage(dockerImage).currentDockerImage(dockerImage)
+ .wantedVespaVersion(vespaVersion).currentVespaVersion(vespaVersion)
+ .wantedRestartGeneration(1).currentRestartGeneration(1);
+
+ NodeAgentContext context = createContext(specBuilder.build());
+ NodeAgentImpl nodeAgent = makeNodeAgent(null, false, Duration.ofSeconds(30));
+
+ when(storageMaintainer.getDiskUsageFor(any())).thenReturn(Optional.of(201326592000L));
+
+ InOrder inOrder = inOrder(orchestrator, dockerOperations);
+
+ ConvergenceException healthCheckException = new ConvergenceException("Not yet up");
+ doThrow(healthCheckException).when(healthChecker).verifyHealth(any());
+ for (int i = 0; i < 3; i++) {
+ try {
+ nodeAgent.doConverge(context);
+ fail("Expected to fail with health check exception");
+ } catch (ConvergenceException e) {
+ assertEquals(healthCheckException, e);
+ }
+ clock.advance(Duration.ofSeconds(30));
+ }
+
+ doNothing().when(healthChecker).verifyHealth(any());
+ try {
+ nodeAgent.doConverge(context);
+ fail("Expected to fail due to warm up period not yet done");
+ } catch (ConvergenceException e) {
+ assertEquals("Refusing to resume until warm up period ends (in PT30S)", e.getMessage());
+ }
+ inOrder.verify(orchestrator, never()).resume(any());
+ inOrder.verify(orchestrator, never()).suspend(any());
+ inOrder.verify(dockerOperations, never()).updateContainer(any(), any());
+
+
+ clock.advance(Duration.ofSeconds(31));
+ nodeAgent.doConverge(context);
+
+ inOrder.verify(dockerOperations).updateContainer(eq(context), eq(ContainerResources.from(0, 2, 16)));
+ inOrder.verify(dockerOperations, never()).removeContainer(any(), any());
+ inOrder.verify(dockerOperations, never()).startContainer(any());
+ inOrder.verify(orchestrator).resume(any(String.class));
+
+ // No changes
+ nodeAgent.converge(context);
+ inOrder.verify(orchestrator, never()).suspend(any(String.class));
+ inOrder.verify(dockerOperations, never()).updateContainer(eq(context), any());
+ inOrder.verify(dockerOperations, never()).removeContainer(any(), any());
+ inOrder.verify(orchestrator).resume(any(String.class));
+ }
+
+ @Test
+ public void resumes_normally_if_container_is_already_capped_on_start() {
+ NodeSpec.Builder specBuilder = nodeBuilder
+ .state(NodeState.active)
+ .wantedDockerImage(dockerImage).currentDockerImage(dockerImage)
+ .wantedVespaVersion(vespaVersion).currentVespaVersion(vespaVersion)
+ .wantedRestartGeneration(1).currentRestartGeneration(1);
+
+ NodeAgentContext context = createContext(specBuilder.build());
+ NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true, Duration.ofSeconds(30));
+ mockGetContainer(dockerImage, ContainerResources.from(0, 2, 16), true);
+
+ when(storageMaintainer.getDiskUsageFor(any())).thenReturn(Optional.of(201326592000L));
+
+ InOrder inOrder = inOrder(orchestrator, dockerOperations);
+
+ nodeAgent.doConverge(context);
+
+ nodeAgent.converge(context);
+ inOrder.verify(orchestrator, never()).suspend(any(String.class));
+ inOrder.verify(dockerOperations, never()).updateContainer(eq(context), any());
+ inOrder.verify(dockerOperations, never()).removeContainer(any(), any());
+ inOrder.verify(orchestrator).resume(any(String.class));
+ }
+
private void verifyThatContainerIsStopped(NodeState nodeState, Optional<ApplicationId> owner) {
NodeSpec.Builder nodeBuilder = new NodeSpec.Builder()
.resources(resources)
@@ -672,11 +758,28 @@ public class NodeAgentImplTest {
}
private NodeAgentImpl makeNodeAgent(DockerImage dockerImage, boolean isRunning) {
+ return makeNodeAgent(dockerImage, isRunning, Duration.ofSeconds(-1));
+ }
+
+ private NodeAgentImpl makeNodeAgent(DockerImage dockerImage, boolean isRunning, Duration warmUpDuration) {
mockGetContainer(dockerImage, isRunning);
+ doAnswer(invoc -> {
+ NodeAgentContext context = invoc.getArgument(0, NodeAgentContext.class);
+ ContainerResources resources = invoc.getArgument(2, ContainerResources.class);
+ mockGetContainer(context.node().wantedDockerImage().get(), resources, true);
+ return null;
+ }).when(dockerOperations).createContainer(any(), any(), any());
+
+ doAnswer(invoc -> {
+ NodeAgentContext context = invoc.getArgument(0, NodeAgentContext.class);
+ ContainerResources resources = invoc.getArgument(1, ContainerResources.class);
+ mockGetContainer(context.node().wantedDockerImage().get(), resources, true);
+ return null;
+ }).when(dockerOperations).updateContainer(any(), any());
return new NodeAgentImpl(contextSupplier, nodeRepository, orchestrator, dockerOperations,
storageMaintainer, flagSource, Optional.of(credentialsMaintainer), Optional.of(aclMaintainer),
- Optional.of(healthChecker));
+ Optional.of(healthChecker), clock, warmUpDuration);
}
private void mockGetContainer(DockerImage dockerImage, boolean isRunning) {