From 57cbdcc67d9b0f35f05325d6d72d4a0cc8187a4b Mon Sep 17 00:00:00 2001 From: Jon Bratseth Date: Wed, 17 Jun 2020 10:27:06 +0200 Subject: Test SpareCapacityMaintainer --- .../com/yahoo/vespa/hosted/provision/NodeList.java | 14 +- .../provision/maintenance/CapacityChecker.java | 37 ++-- .../maintenance/MaintenanceDeployment.java | 24 ++- .../maintenance/NodeRepositoryMaintenance.java | 25 ++- .../hosted/provision/maintenance/Rebalancer.java | 2 +- .../maintenance/SpareCapacityMaintainer.java | 20 +- .../provision/persistence/NodeSerializer.java | 14 +- .../hosted/provision/testutils/MockDeployer.java | 65 ++++++- .../provision/maintenance/CapacityCheckerTest.java | 1 + .../maintenance/CapacityCheckerTester.java | 20 +- .../maintenance/SpareCapacityMaintainerTest.java | 214 +++++++++++++++++++++ 11 files changed, 368 insertions(+), 68 deletions(-) create mode 100644 node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainerTest.java (limited to 'node-repository/src') diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java index 1cc2abef734..2a6db1ffe6e 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java @@ -32,7 +32,7 @@ public class NodeList extends AbstractFilteringList { /** Returns the subset of nodes which are retired */ public NodeList retired() { - return matching(node -> node.allocation().get().membership().retired()); + return matching(node -> node.allocation().isPresent() && node.allocation().get().membership().retired()); } /** Returns the subset of nodes that are being deprovisioned */ @@ -42,7 +42,7 @@ public class NodeList extends AbstractFilteringList { /** Returns the subset of nodes which are removable */ public NodeList removable() { - return matching(node -> node.allocation().get().isRemovable()); + return matching(node -> node.allocation().isPresent() && node.allocation().get().isRemovable()); } /** Returns the subset of nodes having exactly the given resources */ @@ -146,6 +146,16 @@ public class NodeList extends AbstractFilteringList { return matching(node -> nodeStates.contains(node.state())); } + /** Returns the subset of nodes in the active state */ + public NodeList active() { + return matching(node -> node.state()== Node.State.active); + } + + /** Returns the subset of nodes which wantToRetire set true */ + public NodeList wantToRetire() { + return matching((node -> node.status().wantToRetire())); + } + /** Returns the parent nodes of the given child nodes */ public NodeList parentsOf(Collection children) { return children.stream() diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityChecker.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityChecker.java index 97810c0b329..1c078dbd0c4 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityChecker.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityChecker.java @@ -206,20 +206,24 @@ public class CapacityChecker { var containedAllocations = collateAllocations(nodechildren); var resourceMap = new HashMap<>(availableResources); List validAllocationTargets = allHosts.stream() - .filter(h -> !hostsToRemove.contains(h)) - .collect(Collectors.toList()); - if (validAllocationTargets.size() == 
0) { + .filter(h -> !hostsToRemove.contains(h)) + .collect(Collectors.toList()); + if (validAllocationTargets.size() == 0) return Optional.of(HostRemovalFailure.none()); - } allocationHistory = new AllocationHistory(); for (var host : hostsToRemove) { Optional unallocatedNode = tryAllocateNodes(nodechildren.get(host), - validAllocationTargets, resourceMap, containedAllocations, true); + validAllocationTargets, + resourceMap, + containedAllocations, + true); if (unallocatedNode.isPresent()) { AllocationFailureReasonList failures = collateAllocationFailures(unallocatedNode.get(), - validAllocationTargets, resourceMap, containedAllocations); + validAllocationTargets, + resourceMap, + containedAllocations); return Optional.of(HostRemovalFailure.create(host, unallocatedNode.get(), failures)); } } @@ -332,6 +336,11 @@ public class CapacityChecker { public List hostsCausingFailure; public HostRemovalFailure failureReason; + @Override + public String toString() { + return "failure path: " + failureReason + " upon removing " + hostsCausingFailure; + } + } /** @@ -346,17 +355,15 @@ public class CapacityChecker { public AllocationFailureReasonList allocationFailures; public static HostRemovalFailure none() { - return new HostRemovalFailure( - Optional.empty(), - Optional.empty(), - new AllocationFailureReasonList(List.of())); + return new HostRemovalFailure(Optional.empty(), + Optional.empty(), + new AllocationFailureReasonList(List.of())); } public static HostRemovalFailure create(Node host, Node tenant, AllocationFailureReasonList failureReasons) { - return new HostRemovalFailure( - Optional.of(host), - Optional.of(tenant), - failureReasons); + return new HostRemovalFailure(Optional.of(host), + Optional.of(tenant), + failureReasons); } private HostRemovalFailure(Optional host, Optional tenant, AllocationFailureReasonList allocationFailures) { @@ -367,7 +374,7 @@ public class CapacityChecker { @Override public String toString() { - if (host.isEmpty() || tenant.isEmpty()) return "No removal candidates exists."; + if (host.isEmpty() || tenant.isEmpty()) return "No removal candidates exists"; return String.format( "Failure to remove host %s" + "\n\tNo new host found for tenant %s:" + diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java index db331e88d64..2b4775cbeb1 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java @@ -147,9 +147,12 @@ class MaintenanceDeployment implements Closeable { /** * Try to deploy to make this move. 
* + * @param verifyTarget true to only make this move if the node ends up at the expected target host, + * false if we should perform it as long as it moves from the source host * @return true if the move was done, false if it couldn't be */ - public boolean execute(Agent agent, Deployer deployer, Metric metric, NodeRepository nodeRepository) { + public boolean execute(boolean verifyTarget, + Agent agent, Deployer deployer, Metric metric, NodeRepository nodeRepository) { if (isEmpty()) return false; ApplicationId application = node.allocation().get().owner(); try (MaintenanceDeployment deployment = new MaintenanceDeployment(application, deployer, metric, nodeRepository)) { @@ -161,16 +164,19 @@ class MaintenanceDeployment implements Closeable { Optional expectedNewNode = Optional.empty(); try { if ( ! deployment.prepare()) return false; - expectedNewNode = - nodeRepository.getNodes(application, Node.State.reserved).stream() - .filter(n -> !n.hostname().equals(node.hostname())) - .filter(n -> n.allocation().get().membership().cluster().id().equals(node.allocation().get().membership().cluster().id())) - .findAny(); - if (expectedNewNode.isEmpty()) return false; - if ( ! expectedNewNode.get().hasParent(toHost.hostname())) return false; + if (verifyTarget) { + expectedNewNode = + nodeRepository.getNodes(application, Node.State.reserved).stream() + .filter(n -> !n.hostname().equals(node.hostname())) + .filter(n -> n.allocation().get().membership().cluster().id().equals(node.allocation().get().membership().cluster().id())) + .findAny(); + if (expectedNewNode.isEmpty()) return false; + if (!expectedNewNode.get().hasParent(toHost.hostname())) return false; + } if ( ! deployment.activate()) return false; - log.info(agent + " redeployed " + application + " to " + this); + log.info(agent + " redeployed " + application + " to " + + ( verifyTarget ? 
this : "move " + (node.hostname() + " from " + fromHost))); return true; } finally { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java index c3f8afae4a4..7875afeb28c 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java @@ -168,25 +168,24 @@ public class NodeRepositoryMaintenance extends AbstractComponent { private final NodeFailer.ThrottlePolicy throttlePolicy; DefaultTimes(Zone zone) { - failGrace = Duration.ofMinutes(30); - periodicRedeployInterval = Duration.ofMinutes(30); - // Don't redeploy in test environments - redeployMaintainerInterval = Duration.ofMinutes(1); - operatorChangeRedeployInterval = Duration.ofMinutes(1); + autoscalingInterval = Duration.ofMinutes(5); + dynamicProvisionerInterval = Duration.ofMinutes(5); failedExpirerInterval = Duration.ofMinutes(10); - provisionedExpiry = Duration.ofHours(4); - spareCapacityMaintenanceInterval = Duration.ofMinutes(10); - metricsInterval = Duration.ofMinutes(1); + failGrace = Duration.ofMinutes(30); infrastructureProvisionInterval = Duration.ofMinutes(1); - throttlePolicy = NodeFailer.ThrottlePolicy.hosted; loadBalancerExpirerInterval = Duration.ofMinutes(5); - reservationExpiry = Duration.ofMinutes(15); // Need to be long enough for deployment to be finished for all config model versions - dynamicProvisionerInterval = Duration.ofMinutes(5); + metricsInterval = Duration.ofMinutes(1); + nodeMetricsCollectionInterval = Duration.ofMinutes(1); + operatorChangeRedeployInterval = Duration.ofMinutes(1); osUpgradeActivatorInterval = zone.system().isCd() ? Duration.ofSeconds(30) : Duration.ofMinutes(5); + periodicRedeployInterval = Duration.ofMinutes(30); + provisionedExpiry = Duration.ofHours(4); rebalancerInterval = Duration.ofMinutes(40); - nodeMetricsCollectionInterval = Duration.ofMinutes(1); - autoscalingInterval = Duration.ofMinutes(5); + redeployMaintainerInterval = Duration.ofMinutes(1); + reservationExpiry = Duration.ofMinutes(15); // Need to be long enough for deployment to be finished for all config model versions scalingSuggestionsInterval = Duration.ofMinutes(31); + spareCapacityMaintenanceInterval = Duration.ofMinutes(10); + throttlePolicy = NodeFailer.ThrottlePolicy.hosted; if (zone.environment().equals(Environment.prod) && ! zone.system().isCd()) { inactiveExpiry = Duration.ofHours(4); // enough time for the application owner to discover and redeploy diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java index e10ac68fde0..3df20fa9d08 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java @@ -46,7 +46,7 @@ public class Rebalancer extends NodeRepositoryMaintainer { NodeList allNodes = nodeRepository().list(); updateSkewMetric(allNodes); if ( ! 
zoneIsStable(allNodes)) return; - findBestMove(allNodes).execute(Agent.Rebalancer, deployer, metric, nodeRepository()); + findBestMove(allNodes).execute(true, Agent.Rebalancer, deployer, metric, nodeRepository()); } /** We do this here rather than in MetricsReporter because it is expensive and frequent updates are unnecessary */ diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java index 4179d6d3f83..a396bbda5c9 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java @@ -3,6 +3,7 @@ package com.yahoo.vespa.hosted.provision.maintenance; import com.yahoo.config.provision.Deployer; import com.yahoo.config.provision.NodeResources; +import com.yahoo.config.provision.NodeType; import com.yahoo.jdisc.Metric; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeList; @@ -68,11 +69,9 @@ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer { int spareHostCapacity = failurePath.get().hostsCausingFailure.size() - 1; if (spareHostCapacity == 0) { Move move = findMitigatingMove(failurePath.get()); - boolean success = move.execute(Agent.SpareCapacityMaintainer, deployer, metric, nodeRepository()); - if (success) { - // This may strictly be a (noble) lie: We succeeded in taking one step to mitigate, - // but not necessarily *all* steps needed to justify adding 1 to spare capacity. - // However, we alert on this being 0 and we don't want to do that as long as we're not stuck. + if (moving(move)) { + // We succeeded or are in the process of taking a step to mitigate. + // Report with the assumption this will eventually succeed to avoid alerting before we're stuck spareHostCapacity++; } } @@ -80,6 +79,12 @@ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer { } } + private boolean moving(Move move) { + if (move.isEmpty()) return false; + if (move.node().allocation().get().membership().retired()) return true; // Move already in progress + return move.execute(false, Agent.SpareCapacityMaintainer, deployer, metric, nodeRepository()); + } + private Move findMitigatingMove(CapacityChecker.HostFailurePath failurePath) { Optional nodeWhichCantMove = failurePath.failureReason.tenant; if (nodeWhichCantMove.isEmpty()) return Move.empty(); @@ -237,9 +242,10 @@ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer { @Override public List next() { - next = null; if ( ! 
hasNext()) throw new IllegalStateException("No more elements"); - return next; + var current = next; + next = null; + return current; } private boolean hasOneAtPosition(int position, int number) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java index 15be7796187..37842115949 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java @@ -386,15 +386,16 @@ public class NodeSerializer { case "operator" : return Agent.operator; case "application" : return Agent.application; case "system" : return Agent.system; - case "NodeFailer" : return Agent.NodeFailer; - case "Rebalancer" : return Agent.Rebalancer; case "DirtyExpirer" : return Agent.DirtyExpirer; + case "DynamicProvisioningMaintainer" : return Agent.DynamicProvisioningMaintainer; case "FailedExpirer" : return Agent.FailedExpirer; case "InactiveExpirer" : return Agent.InactiveExpirer; + case "NodeFailer" : return Agent.NodeFailer; case "ProvisionedExpirer" : return Agent.ProvisionedExpirer; + case "Rebalancer" : return Agent.Rebalancer; case "ReservationExpirer" : return Agent.ReservationExpirer; - case "DynamicProvisioningMaintainer" : return Agent.DynamicProvisioningMaintainer; case "RetiringUpgrader" : return Agent.RetiringUpgrader; + case "SpareCapacityMaintainer": return Agent.SpareCapacityMaintainer; } throw new IllegalArgumentException("Unknown node event agent '" + eventAgentField.asString() + "'"); } @@ -403,15 +404,16 @@ public class NodeSerializer { case operator : return "operator"; case application : return "application"; case system : return "system"; - case NodeFailer : return "NodeFailer"; - case Rebalancer : return "Rebalancer"; case DirtyExpirer : return "DirtyExpirer"; + case DynamicProvisioningMaintainer : return "DynamicProvisioningMaintainer"; case FailedExpirer : return "FailedExpirer"; case InactiveExpirer : return "InactiveExpirer"; + case NodeFailer : return "NodeFailer"; case ProvisionedExpirer : return "ProvisionedExpirer"; + case Rebalancer : return "Rebalancer"; case ReservationExpirer : return "ReservationExpirer"; - case DynamicProvisioningMaintainer : return "DynamicProvisioningMaintainer"; case RetiringUpgrader: return "RetiringUpgrader"; + case SpareCapacityMaintainer: return "SpareCapacityMaintainer"; } throw new IllegalArgumentException("Serialized form of '" + agent + "' not defined"); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java index 5ec5c2c08e8..95f98a351a1 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java @@ -10,6 +10,9 @@ import com.yahoo.config.provision.Deployment; import com.yahoo.config.provision.HostFilter; import com.yahoo.config.provision.HostSpec; import com.yahoo.transaction.NestedTransaction; +import com.yahoo.vespa.hosted.provision.Node; +import com.yahoo.vespa.hosted.provision.NodeRepository; +import com.yahoo.vespa.hosted.provision.node.Agent; import com.yahoo.vespa.hosted.provision.provisioning.NodeRepositoryProvisioner; import java.time.Clock; @@ -27,18 +30,22 @@ import 
java.util.stream.Collectors; */ public class MockDeployer implements Deployer { + // For actual deploy mode private final NodeRepositoryProvisioner provisioner; private final Map applications; - private final Map lastDeployTimes = new HashMap<>(); + // For mock deploy anything, changing wantToRetire to retired only + private final NodeRepository nodeRepository; /** The number of redeployments done to this */ public int redeployments = 0; + private final Map lastDeployTimes = new HashMap<>(); private final Clock clock; private final ReentrantLock lock = new ReentrantLock(); private boolean failActivate = false; + /** Create a mock deployer which returns empty on every deploy request. */ @Inject @SuppressWarnings("unused") public MockDeployer() { @@ -46,15 +53,30 @@ public class MockDeployer implements Deployer { } /** - * Create a mock deployer which contains a substitute for an application repository, fullfilled to + * Create a mock deployer which returns a deployment on every request, + * and fulfills it by not actually deploying but only changing any wantToRetire nodes + * for the application to retired. + */ + public MockDeployer(NodeRepository nodeRepository) { + this.provisioner = null; + this.applications = Map.of(); + this.nodeRepository = nodeRepository; + + this.clock = nodeRepository.clock(); + } + + /** + * Create a mock deployer which contains a substitute for an application repository, filled to * be able to call provision with the right parameters. */ public MockDeployer(NodeRepositoryProvisioner provisioner, Clock clock, Map applications) { this.provisioner = provisioner; - this.clock = clock; this.applications = new HashMap<>(applications); + this.nodeRepository = null; + + this.clock = clock; } public ReentrantLock lock() { return lock; } @@ -74,9 +96,13 @@ public class MockDeployer implements Deployer { throw new RuntimeException(e); } try { - return Optional.ofNullable(applications.get(id)) - .map(application -> new MockDeployment(provisioner, application)); - } finally { + if (provisioner != null) + return Optional.ofNullable(applications.get(id)) + .map(application -> new MockDeployment(provisioner, application)); + else + return Optional.of(new RetiringOnlyMockDeployment(nodeRepository, id)); + } + finally { lock.unlock(); } } @@ -135,6 +161,33 @@ public class MockDeployer implements Deployer { } + public class RetiringOnlyMockDeployment implements Deployment { + + private final NodeRepository nodeRepository; + private final ApplicationId applicationId; + + private RetiringOnlyMockDeployment(NodeRepository nodeRepository, ApplicationId applicationId) { + this.nodeRepository = nodeRepository; + this.applicationId = applicationId; + } + + @Override + public void prepare() { } + + @Override + public void activate() { + redeployments++; + lastDeployTimes.put(applicationId, clock.instant()); + + for (Node node : nodeRepository.list().owner(applicationId).active().wantToRetire().asList()) + nodeRepository.write(node.retire(nodeRepository.clock().instant()), nodeRepository.lock(node)); + } + + @Override + public void restart(HostFilter filter) {} + + } + /** An application context which substitutes for an application repository */ public static class ApplicationContext { diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityCheckerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityCheckerTest.java index 5813585554d..5e72cfc53ac 100644 --- 
a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityCheckerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityCheckerTest.java @@ -29,6 +29,7 @@ public class CapacityCheckerTest { var failurePath = tester.capacityChecker.worstCaseHostLossLeadingToFailure(); assertTrue(failurePath.isPresent()); assertTrue(tester.nodeRepository.getNodes(NodeType.host).containsAll(failurePath.get().hostsCausingFailure)); + assertEquals(5, failurePath.get().hostsCausingFailure.size()); } @Test diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityCheckerTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityCheckerTester.java index 464bdd6405a..62e9a227109 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityCheckerTester.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityCheckerTester.java @@ -45,6 +45,7 @@ import java.util.stream.IntStream; * @author mgimle */ public class CapacityCheckerTester { + public static final Zone zone = new Zone(Environment.prod, RegionName.from("us-east")); // Components with state @@ -138,8 +139,7 @@ public class CapacityCheckerTester { .mapToObj(n -> String.format("%04X::%04X", hostindex, n)) .collect(Collectors.toSet()); - NodeResources nr = containingNodeResources(childResources, - excessCapacity); + NodeResources nr = containingNodeResources(childResources, excessCapacity); Node node = nodeRepository.createNode(hostname, hostname, new IP.Config(Set.of("::"), availableIps), Optional.empty(), new Flavor(nr), Optional.empty(), NodeType.host); @@ -159,7 +159,8 @@ public class CapacityCheckerTester { Set availableIps = IntStream.range(0, ips) .mapToObj(n -> String.format("%04X::%04X", hostid, n)) .collect(Collectors.toSet()); - Node node = nodeRepository.createNode(hostname, hostname, + Node node = nodeRepository.createNode(hostname, + hostname, new IP.Config(Set.of("::"), availableIps), Optional.empty(), new Flavor(capacity), Optional.empty(), NodeType.host); hosts.add(node); @@ -175,8 +176,8 @@ public class CapacityCheckerTester { ); createNodes(childrenPerHost, numDistinctChildren, childResources, - numHosts, hostExcessCapacity, hostExcessIps, - numEmptyHosts, emptyHostExcessCapacity, emptyHostExcessIps); + numHosts, hostExcessCapacity, hostExcessIps, + numEmptyHosts, emptyHostExcessCapacity, emptyHostExcessIps); } void createNodes(int childrenPerHost, int numDistinctChildren, List childResources, int numHosts, NodeResources hostExcessCapacity, int hostExcessIps, @@ -264,10 +265,11 @@ public class CapacityCheckerTester { owner = ApplicationId.from(nodeModel.owner.tenant, nodeModel.owner.application, nodeModel.owner.instance); } - NodeResources.DiskSpeed diskSpeed; - NodeResources nr = new NodeResources(nodeModel.minCpuCores, nodeModel.minMainMemoryAvailableGb, - nodeModel.minDiskAvailableGb, nodeModel.bandwidth * 1000, - nodeModel.fastDisk ? NodeResources.DiskSpeed.fast : NodeResources.DiskSpeed.slow); + NodeResources nr = new NodeResources(nodeModel.minCpuCores, + nodeModel.minMainMemoryAvailableGb, + nodeModel.minDiskAvailableGb, + nodeModel.bandwidth * 1000, + nodeModel.fastDisk ? 
NodeResources.DiskSpeed.fast : NodeResources.DiskSpeed.slow); Flavor f = new Flavor(nr); Node node = nodeRepository.createNode(nodeModel.id, nodeModel.hostname, diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainerTest.java new file mode 100644 index 00000000000..7d7b3910cdb --- /dev/null +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainerTest.java @@ -0,0 +1,214 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.hosted.provision.maintenance; + +import com.yahoo.config.provision.ApplicationId; +import com.yahoo.config.provision.ClusterMembership; +import com.yahoo.config.provision.ClusterSpec; +import com.yahoo.config.provision.DockerImage; +import com.yahoo.config.provision.Environment; +import com.yahoo.config.provision.Flavor; +import com.yahoo.config.provision.NodeFlavors; +import com.yahoo.config.provision.NodeResources; +import com.yahoo.config.provision.NodeType; +import com.yahoo.config.provision.RegionName; +import com.yahoo.config.provision.Zone; +import com.yahoo.test.ManualClock; +import com.yahoo.transaction.NestedTransaction; +import com.yahoo.vespa.curator.mock.MockCurator; +import com.yahoo.vespa.hosted.provision.Node; +import com.yahoo.vespa.hosted.provision.NodeRepository; +import com.yahoo.vespa.hosted.provision.node.Agent; +import com.yahoo.vespa.hosted.provision.node.IP; +import com.yahoo.vespa.hosted.provision.provisioning.EmptyProvisionServiceProvider; +import com.yahoo.vespa.hosted.provision.provisioning.FlavorConfigBuilder; +import com.yahoo.vespa.hosted.provision.testutils.MockDeployer; +import com.yahoo.vespa.hosted.provision.testutils.MockNameResolver; +import org.junit.Test; + +import java.time.Duration; +import java.time.Instant; +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +import static org.junit.Assert.assertEquals; + +/** + * @author bratseth + */ +public class SpareCapacityMaintainerTest { + + @Test + public void testEmpty() { + var tester = new SpareCapacityMaintainerTester(); + tester.maintainer.maintain(); + assertEquals(0, tester.deployer.redeployments); + assertEquals(0, tester.nodeRepository.list().retired().size()); + } + + @Test + public void testOneSpare() { + var tester = new SpareCapacityMaintainerTester(); + tester.addHosts(2, new NodeResources(10, 100, 1000, 1)); + tester.addNodes(0, 1, new NodeResources(10, 100, 1000, 1), 0); + tester.maintainer.maintain(); + assertEquals(0, tester.deployer.redeployments); + assertEquals(0, tester.nodeRepository.list().retired().size()); + assertEquals(1, tester.metric.values.get("spareHostCapacity")); + } + + @Test + public void testTwoSpares() { + var tester = new SpareCapacityMaintainerTester(); + tester.addHosts(3, new NodeResources(10, 100, 1000, 1)); + tester.addNodes(0, 1, new NodeResources(10, 100, 1000, 1), 0); + tester.maintainer.maintain(); + assertEquals(0, tester.deployer.redeployments); + assertEquals(0, tester.nodeRepository.list().retired().size()); + assertEquals(2, tester.metric.values.get("spareHostCapacity")); + } + + @Test + public void testNoSpares() { + var tester = new SpareCapacityMaintainerTester(); + tester.addHosts(2, new NodeResources(10, 
100, 1000, 1)); + tester.addNodes(0, 2, new NodeResources(10, 100, 1000, 1), 0); + tester.maintainer.maintain(); + assertEquals(0, tester.deployer.redeployments); + assertEquals(0, tester.nodeRepository.list().retired().size()); + assertEquals(0, tester.metric.values.get("spareHostCapacity")); + } + + @Test + public void testAllWorksAsSpares() { + var tester = new SpareCapacityMaintainerTester(); + tester.addHosts(4, new NodeResources(10, 100, 1000, 1)); + tester.addNodes(0, 2, new NodeResources(5, 50, 500, 0.5), 0); + tester.addNodes(1, 2, new NodeResources(5, 50, 500, 0.5), 2); + tester.maintainer.maintain(); + assertEquals(0, tester.deployer.redeployments); + assertEquals(0, tester.nodeRepository.list().retired().size()); + assertEquals(2, tester.metric.values.get("spareHostCapacity")); + } + + @Test + public void testMoveIsNeededToHaveSpares() { + // Moving application id 1 and 2 to the same nodes frees up spares for application 0 + var tester = new SpareCapacityMaintainerTester(); + tester.addHosts(6, new NodeResources(10, 100, 1000, 1)); + tester.addNodes(0, 2, new NodeResources(10, 100, 1000, 1), 0); + tester.addNodes(1, 2, new NodeResources(5, 50, 500, 0.5), 2); + tester.addNodes(2, 2, new NodeResources(5, 50, 500, 0.5), 4); + tester.maintainer.maintain(); + assertEquals(1, tester.deployer.redeployments); + assertEquals(1, tester.nodeRepository.list().retired().size()); + assertEquals(1, tester.metric.values.get("spareHostCapacity")); + + // Maintaining again is a no-op since the node to move is already retired + tester.maintainer.maintain(); + assertEquals(1, tester.deployer.redeployments); + assertEquals(1, tester.nodeRepository.list().retired().size()); + assertEquals(1, tester.metric.values.get("spareHostCapacity")); + } + + private static class SpareCapacityMaintainerTester { + + NodeRepository nodeRepository; + MockDeployer deployer; + TestMetric metric = new TestMetric(); + SpareCapacityMaintainer maintainer; + private int hostIndex = 0; + private int nodeIndex = 0; + + private SpareCapacityMaintainerTester() { + NodeFlavors flavors = new NodeFlavors(new FlavorConfigBuilder().build()); + nodeRepository = new NodeRepository(flavors, + new EmptyProvisionServiceProvider().getHostResourcesCalculator(), + new MockCurator(), + new ManualClock(), + new Zone(Environment.prod, RegionName.from("us-east-3")), + new MockNameResolver().mockAnyLookup(), + DockerImage.fromString("docker-registry.domain.tld:8080/dist/vespa"), true, false); + deployer = new MockDeployer(nodeRepository); + maintainer = new SpareCapacityMaintainer(deployer, nodeRepository, metric, Duration.ofMinutes(1)); + } + + private void addHosts(int count, NodeResources resources) { + List hosts = new ArrayList<>(); + for (int i = 0; i < count; i++) { + Node host = nodeRepository.createNode("host" + hostIndex, + "host" + hostIndex + ".yahoo.com", + ipConfig(hostIndex + nodeIndex, true), + Optional.empty(), + new Flavor(resources), + Optional.empty(), + NodeType.host); + hosts.add(host); + hostIndex++; + } + hosts = nodeRepository.addNodes(hosts, Agent.system); + hosts = nodeRepository.setReady(hosts, Agent.system, "Test"); + var transaction = new NestedTransaction(); + nodeRepository.activate(hosts, transaction); + transaction.commit(); + } + + private void addNodes(int id, int count, NodeResources resources, int hostOffset) { + List nodes = new ArrayList<>(); + ApplicationId application = ApplicationId.from("tenant" + id, "application" + id, "default"); + for (int i = 0; i < count; i++) { + ClusterMembership membership = 
ClusterMembership.from(ClusterSpec.specification(ClusterSpec.Type.content, ClusterSpec.Id.from("cluster" + id)) + .group(ClusterSpec.Group.from(0)) + .vespaVersion("7") + .build(), + i); + Node node = nodeRepository.createNode("node" + nodeIndex, + "node" + nodeIndex + ".yahoo.com", + ipConfig(hostIndex + nodeIndex, false), + Optional.of("host" + ( hostOffset + i) + ".yahoo.com"), + new Flavor(resources), + Optional.empty(), + NodeType.tenant); + node = node.allocate(application, membership, node.resources(), Instant.now()); + nodes.add(node); + nodeIndex++; + } + nodes = nodeRepository.addNodes(nodes, Agent.system); + for (int i = 0; i < count; i++) { + Node node = nodes.get(i); + ClusterMembership membership = ClusterMembership.from(ClusterSpec.specification(ClusterSpec.Type.content, ClusterSpec.Id.from("cluster" + id)) + .group(ClusterSpec.Group.from(0)) + .vespaVersion("7") + .build(), + i); + node = node.allocate(application, membership, node.resources(), Instant.now()); + nodes.set(i, node); + } + nodes = nodeRepository.reserve(nodes); + var transaction = new NestedTransaction(); + nodes = nodeRepository.activate(nodes, transaction); + transaction.commit(); + } + + private IP.Config ipConfig(int id, boolean host) { + return new IP.Config(Set.of(String.format("%04X::%04X", id, 0)), + host ? IntStream.range(0, 10) + .mapToObj(n -> String.format("%04X::%04X", id, n)) + .collect(Collectors.toSet()) + : Set.of()); + } + + private void dumpState() { + for (Node host : nodeRepository.list().hosts().asList()) { + System.out.println("Host " + host.hostname() + " " + host.resources()); + for (Node node : nodeRepository.list().childrenOf(host).asList()) + System.out.println(" Node " + node.hostname() + " " + node.resources() + " allocation " +node.allocation()); + } + } + + } + +} -- cgit v1.2.3
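The NodeList changes above make retired() and removable() tolerant of unallocated nodes (the old predicates called allocation().get() unconditionally) and add active() and wantToRetire() filters. A minimal, generic sketch of the same null-safe filtering idea, using hypothetical record types rather than the Vespa classes:

    import java.util.List;
    import java.util.Optional;

    // Hypothetical stand-ins for Node and Allocation, only to illustrate the predicate.
    record Allocation(boolean retired, boolean removable) { }
    record SimpleNode(String hostname, Optional<Allocation> allocation) { }

    class NodeFilters {

        /** Nodes without an allocation are never retired; Optional.map avoids the unguarded get() call. */
        static List<SimpleNode> retired(List<SimpleNode> nodes) {
            return nodes.stream()
                        .filter(node -> node.allocation().map(Allocation::retired).orElse(false))
                        .toList();
        }

    }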
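The next() fix in SpareCapacityMaintainer follows the usual look-ahead iterator idiom: hasNext() computes and caches the next element, and next() hands out the cached value before clearing it (the earlier version cleared the cache first and so returned null). A self-contained sketch of that idiom with assumed names, not the iterator class in the patch, and using NoSuchElementException as the java.util.Iterator contract specifies:

    import java.util.Iterator;
    import java.util.NoSuchElementException;
    import java.util.function.Supplier;

    /** Look-ahead iterator over a supplier that returns null when exhausted (an assumption of this sketch). */
    class LookaheadIterator<T> implements Iterator<T> {

        private final Supplier<T> source;
        private T next; // cached by hasNext(), handed out by next()

        LookaheadIterator(Supplier<T> source) { this.source = source; }

        @Override
        public boolean hasNext() {
            if (next == null)
                next = source.get(); // compute lazily and cache
            return next != null;
        }

        @Override
        public T next() {
            if ( ! hasNext()) throw new NoSuchElementException("No more elements");
            T current = next; // return the cached element ...
            next = null;      // ... and clear it so the next hasNext() advances
            return current;
        }

    }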
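The maintainer's metric logic in this patch reads: spare capacity is the number of hosts whose simultaneous loss leads to failure, minus one, and when that is zero a mitigating move that has been started or is already in progress is counted optimistically so the metric does not alert while mitigation is underway. A simplified, self-contained sketch of that accounting; the types, method names, and the value returned when no failure path exists are assumptions, not the Vespa API:

    import java.util.List;
    import java.util.Optional;
    import java.util.function.BooleanSupplier;

    class SpareCapacityAccounting {

        // Hypothetical stand-in for CapacityChecker.HostFailurePath.
        record FailurePath(List<String> hostsCausingFailure) { }

        /** Returns the value to report for the spare-capacity metric. */
        static int spareHostCapacity(Optional<FailurePath> failurePath,
                                     BooleanSupplier mitigationStartedOrInProgress) {
            if (failurePath.isEmpty()) return -1; // no failure path found; a real implementation might skip reporting (assumption)
            int spare = failurePath.get().hostsCausingFailure().size() - 1;
            if (spare == 0 && mitigationStartedOrInProgress.getAsBoolean())
                spare++; // assume the move eventually succeeds, to avoid alerting while not actually stuck
            return spare;
        }

    }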