diff options
author | Jon Bratseth <bratseth@gmail.com> | 2020-06-09 17:51:55 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@gmail.com> | 2020-06-09 17:51:55 +0200 |
commit | 7f55a0f05ea9ab8d684e278f832a175ab543b9df (patch) | |
tree | b663b895df4948f26d1dcacb499665ffaf385124 /node-repository | |
parent | 629759da8d37bca6c982413da9bedc1df171b895 (diff) |
Spare capacity maintenance skeleton
Diffstat (limited to 'node-repository')
9 files changed, 122 insertions, 93 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java index b41820a461b..bec35e7ee4f 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java @@ -102,7 +102,7 @@ public class NodeRepository extends AbstractComponent { private final DockerImages dockerImages; private final JobControl jobControl; private final Applications applications; - private final boolean canProvisionHostsWhenRequired; + private final boolean canProvisionHosts; /** * Creates a node repository from a zookeeper provider. @@ -136,7 +136,7 @@ public class NodeRepository extends AbstractComponent { NameResolver nameResolver, DockerImage dockerImage, boolean useCuratorClientCache, - boolean canProvisionHostsWhenRequired) { + boolean canProvisionHosts) { this.db = new CuratorDatabaseClient(flavors, curator, clock, zone, useCuratorClientCache); this.zone = zone; this.clock = clock; @@ -149,7 +149,7 @@ public class NodeRepository extends AbstractComponent { this.dockerImages = new DockerImages(db, dockerImage); this.jobControl = new JobControl(db); this.applications = new Applications(db); - this.canProvisionHostsWhenRequired = canProvisionHostsWhenRequired; + this.canProvisionHosts = canProvisionHosts; // read and write all nodes to make sure they are stored in the latest version of the serialized format for (State state : State.values()) @@ -800,16 +800,14 @@ public class NodeRepository extends AbstractComponent { if (host.status().wantToRetire()) return false; if (host.allocation().map(alloc -> alloc.membership().retired()).orElse(false)) return false; - if ( canProvisionHostsWhenRequired()) + if ( canProvisionHosts()) return EnumSet.of(State.active, State.ready, State.provisioned).contains(host.state()); else return host.state() == State.active; } - 
/** Returns whether this has the ability to conjure hosts when required */ - public boolean canProvisionHostsWhenRequired() { - return canProvisionHostsWhenRequired; - } + /** Returns whether this repository can provision hosts on demand */ + public boolean canProvisionHosts() { return canProvisionHosts; } /** Returns the time keeper of this system */ public Clock clock() { return clock; } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityChecker.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityChecker.java index ca8399da629..0ab343f0795 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityChecker.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityChecker.java @@ -11,6 +11,9 @@ import java.util.*; import java.util.function.Function; import java.util.stream.Collectors; +/** + * @author mgimle + */ public class CapacityChecker { private List<Node> hosts; @@ -42,15 +45,15 @@ public class CapacityChecker { } public List<Node> nodesFromHostnames(List<String> hostnames) { - List<Node> nodes = hostnames.stream() - .filter(h -> nodeMap.containsKey(h)) - .map(h -> nodeMap.get(h)) - .collect(Collectors.toList()); + List<Node> nodes = hostnames.stream().filter(h -> nodeMap.containsKey(h)) + .map(h -> nodeMap.get(h)) + .collect(Collectors.toList()); + if (nodes.size() != hostnames.size()) { Set<String> notFoundNodes = new HashSet<>(hostnames); notFoundNodes.removeAll(nodes.stream().map(Node::hostname).collect(Collectors.toList())); throw new IllegalArgumentException(String.format("Host(s) not found: [ %s ]", - String.join(", ", notFoundNodes))); + String.join(", ", notFoundNodes))); } return nodes; @@ -92,9 +95,9 @@ public class CapacityChecker { if (hosts.size() == 0) return Optional.empty(); List<Node> parentRemovalPriorityList = heuristic.entrySet().stream() - 
.sorted(Comparator.comparingInt(Map.Entry::getValue)) - .map(Map.Entry::getKey) - .collect(Collectors.toList()); + .sorted(Comparator.comparingInt(Map.Entry::getValue)) + .map(Map.Entry::getKey) + .collect(Collectors.toList()); for (int i = 1; i <= parentRemovalPriorityList.size(); i++) { List<Node> hostsToRemove = parentRemovalPriorityList.subList(0, i); @@ -116,12 +119,12 @@ public class CapacityChecker { private Map<Node, List<Node>> constructNodeChildrenMap(List<Node> tenants, List<Node> hosts, Map<String, Node> hostnameToNode) { Map<Node, List<Node>> nodeChildren = tenants.stream() - .filter(n -> n.parentHostname().isPresent()) - .filter(n -> hostnameToNode.containsKey(n.parentHostname().get())) - .collect(Collectors.groupingBy( - n -> hostnameToNode.get(n.parentHostname().orElseThrow()))); + .filter(n -> n.parentHostname().isPresent()) + .filter(n -> hostnameToNode.containsKey(n.parentHostname().get())) + .collect(Collectors.groupingBy(n -> hostnameToNode.get(n.parentHostname().orElseThrow()))); - for (var host : hosts) nodeChildren.putIfAbsent(host, List.of()); + for (var host : hosts) + nodeChildren.putIfAbsent(host, List.of()); return nodeChildren; } @@ -149,10 +152,8 @@ public class CapacityChecker { private Map<Node, Integer> computeMaximalRepeatedRemovals(List<Node> hosts, Map<Node, List<Node>> nodeChildren, Map<Node, AllocationResources> availableResources) { - Map<Node, Integer> timesNodeCanBeRemoved = hosts.stream().collect(Collectors.toMap( - Function.identity(), - __ -> Integer.MAX_VALUE - )); + Map<Node, Integer> timesNodeCanBeRemoved = hosts.stream().collect(Collectors.toMap(Function.identity(), + __ -> Integer.MAX_VALUE)); for (Node host : hosts) { List<Node> children = nodeChildren.get(host); if (children.size() == 0) continue; @@ -326,8 +327,10 @@ public class CapacityChecker { * as well as the specific host and tenant which caused it. 
*/ public static class HostFailurePath { + public List<Node> hostsCausingFailure; public HostRemovalFailure failureReason; + } /** @@ -336,6 +339,7 @@ public class CapacityChecker { * will be empty. */ public static class HostRemovalFailure { + public Optional<Node> host; public Optional<Node> tenant; public AllocationFailureReasonList allocationFailures; @@ -406,6 +410,7 @@ public class CapacityChecker { public AllocationResources subtract(AllocationResources other) { return new AllocationResources(this.nodeResources.subtract(other.nodeResources), this.availableIPs - other.availableIPs); } + } /** @@ -449,6 +454,7 @@ public class CapacityChecker { return String.format("[%s]", String.join(", ", reasons)); } + } /** @@ -487,6 +493,7 @@ public class CapacityChecker { insufficientVcpu(), insufficientMemoryGb(), insufficientDiskGb(), incompatibleDiskSpeed(), incompatibleStorageType(), insufficientAvailableIps(), violatesParentHostPolicy()); } + } public static class AllocationHistory { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityReportMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityReportMaintainer.java deleted file mode 100644 index f6cadabec54..00000000000 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityReportMaintainer.java +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
-package com.yahoo.vespa.hosted.provision.maintenance; - -import com.yahoo.jdisc.Metric; -import com.yahoo.vespa.hosted.provision.Node; -import com.yahoo.vespa.hosted.provision.NodeRepository; - -import java.time.Duration; -import java.util.List; -import java.util.Objects; -import java.util.Optional; -import java.util.logging.Level; -import java.util.logging.Logger; -import java.util.stream.Collectors; - -/** - * Performs analysis on the node repository to produce metrics that pertain to the capacity of the node repository. - * These metrics include: - * Spare host capacity, or how many hosts the repository can stand to lose without ending up in a situation where it's - * unable to find a new home for orphaned tenants. - * Overcommitted hosts, which tracks if there are any hosts whose capacity is less than the sum of its children's. - * - * @author mgimle - */ -public class CapacityReportMaintainer extends NodeRepositoryMaintainer { - - private final Metric metric; - private final NodeRepository nodeRepository; - private static final Logger log = Logger.getLogger(CapacityReportMaintainer.class.getName()); - - CapacityReportMaintainer(NodeRepository nodeRepository, - Metric metric, - Duration interval) { - super(nodeRepository, interval); - this.nodeRepository = nodeRepository; - this.metric = Objects.requireNonNull(metric); - } - - @Override - protected void maintain() { - if (nodeRepository.zone().getCloud().dynamicProvisioning()) return; // Hosts and nodes are 1-1 - - CapacityChecker capacityChecker = new CapacityChecker(this.nodeRepository); - List<Node> overcommittedHosts = capacityChecker.findOvercommittedHosts(); - if (overcommittedHosts.size() != 0) { - log.log(Level.WARNING, String.format("%d nodes are overcommitted! 
[ %s ]", overcommittedHosts.size(), - overcommittedHosts.stream().map(Node::hostname).collect(Collectors.joining(", ")))); - } - metric.set("overcommittedHosts", overcommittedHosts.size(), null); - - Optional<CapacityChecker.HostFailurePath> failurePath = capacityChecker.worstCaseHostLossLeadingToFailure(); - if (failurePath.isPresent()) { - int worstCaseHostLoss = failurePath.get().hostsCausingFailure.size(); - metric.set("spareHostCapacity", worstCaseHostLoss - 1, null); - } - } - -} diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java index 8a82c74dd17..caf845d36cb 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java @@ -47,7 +47,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent { private final InfrastructureProvisioner infrastructureProvisioner; private final Optional<LoadBalancerExpirer> loadBalancerExpirer; private final Optional<DynamicProvisioningMaintainer> dynamicProvisioningMaintainer; - private final CapacityReportMaintainer capacityReportMaintainer; + private final SpareCapacityMaintainer spareCapacityMaintainer; private final OsUpgradeActivator osUpgradeActivator; private final Rebalancer rebalancer; private final NodeMetricsDbMaintainer nodeMetricsDbMaintainer; @@ -88,7 +88,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent { new LoadBalancerExpirer(nodeRepository, defaults.loadBalancerExpirerInterval, lbService)); dynamicProvisioningMaintainer = provisionServiceProvider.getHostProvisioner().map(hostProvisioner -> new DynamicProvisioningMaintainer(nodeRepository, defaults.dynamicProvisionerInterval, hostProvisioner, flagSource)); - capacityReportMaintainer = new 
CapacityReportMaintainer(nodeRepository, metric, defaults.capacityReportInterval); + spareCapacityMaintainer = new SpareCapacityMaintainer(deployer, nodeRepository, metric, clock, defaults.spareCapacityMaintenanceInterval); osUpgradeActivator = new OsUpgradeActivator(nodeRepository, defaults.osUpgradeActivatorInterval); rebalancer = new Rebalancer(deployer, nodeRepository, metric, clock, defaults.rebalancerInterval); nodeMetricsDbMaintainer = new NodeMetricsDbMaintainer(nodeRepository, nodeMetrics, nodeMetricsDb, defaults.nodeMetricsCollectionInterval); @@ -110,7 +110,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent { failedExpirer.close(); dirtyExpirer.close(); nodeRebooter.close(); - capacityReportMaintainer.close(); + spareCapacityMaintainer.close(); provisionedExpirer.close(); metricsReporter.close(); infrastructureProvisioner.close(); @@ -153,7 +153,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent { private final Duration failedExpirerInterval; private final Duration dirtyExpiry; private final Duration provisionedExpiry; - private final Duration capacityReportInterval; + private final Duration spareCapacityMaintenanceInterval; private final Duration metricsInterval; private final Duration retiredInterval; private final Duration infrastructureProvisionInterval; @@ -175,7 +175,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent { operatorChangeRedeployInterval = Duration.ofMinutes(1); failedExpirerInterval = Duration.ofMinutes(10); provisionedExpiry = Duration.ofHours(4); - capacityReportInterval = Duration.ofMinutes(10); + spareCapacityMaintenanceInterval = Duration.ofMinutes(10); metricsInterval = Duration.ofMinutes(1); infrastructureProvisionInterval = Duration.ofMinutes(1); throttlePolicy = NodeFailer.ThrottlePolicy.hosted; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java 
b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java index 12990447eee..2cb46a6a78e 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java @@ -41,7 +41,7 @@ public class Rebalancer extends NodeRepositoryMaintainer { @Override protected void maintain() { - if (nodeRepository().canProvisionHostsWhenRequired()) return; // All nodes will be allocated on new hosts, so rebalancing makes no sense + if ( ! nodeRepository().zone().getCloud().allowHostSharing()) return; // Rebalancing not necessary if (nodeRepository().zone().environment().isTest()) return; // Test zones have short lived deployments, no need to rebalance // Work with an unlocked snapshot as this can take a long time and full consistency is not needed diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java new file mode 100644 index 00000000000..05bfac47fb9 --- /dev/null +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java @@ -0,0 +1,81 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+package com.yahoo.vespa.hosted.provision.maintenance; + +import com.yahoo.config.provision.Deployer; +import com.yahoo.jdisc.Metric; +import com.yahoo.vespa.hosted.provision.Node; +import com.yahoo.vespa.hosted.provision.NodeList; +import com.yahoo.vespa.hosted.provision.NodeRepository; + +import java.time.Clock; +import java.time.Duration; +import java.util.List; +import java.util.Optional; +import java.util.logging.Level; +import java.util.stream.Collectors; + +/** + * A maintainer which attempts to ensure there is spare capacity available in chunks which can fit + * all node resource configuration in use, such that the system is able to quickly replace a failed node + * if necessary. + * + * This also emits the following metrics: + * - Overcommitted hosts: Hosts whose capacity is less than the sum of its children's + * - Spare host capacity, or how many hosts the repository can stand to lose without ending up in a situation where it's + * unable to find a new home for orphaned tenants. + * + * @author mgimle + * @author bratseth + */ +public class SpareCapacityMaintainer extends NodeRepositoryMaintainer { + + private final Deployer deployer; + private final Metric metric; + private final Clock clock; + + public SpareCapacityMaintainer(Deployer deployer, + NodeRepository nodeRepository, + Metric metric, + Clock clock, + Duration interval) { + super(nodeRepository, interval); + this.deployer = deployer; + this.metric = metric; + this.clock = clock; + } + + @Override + protected void maintain() { + if ( ! nodeRepository().zone().getCloud().allowHostSharing()) return; + + CapacityChecker capacityChecker = new CapacityChecker(nodeRepository()); + + List<Node> overcommittedHosts = capacityChecker.findOvercommittedHosts(); + if (overcommittedHosts.size() != 0) { + log.log(Level.WARNING, String.format("%d nodes are overcommitted! 
[ %s ]", + overcommittedHosts.size(), + overcommittedHosts.stream().map(Node::hostname).collect(Collectors.joining(", ")))); + } + metric.set("overcommittedHosts", overcommittedHosts.size(), null); + + Optional<CapacityChecker.HostFailurePath> failurePath = capacityChecker.worstCaseHostLossLeadingToFailure(); + if (failurePath.isPresent()) { + int worstCaseHostLoss = failurePath.get().hostsCausingFailure.size(); + metric.set("spareHostCapacity", worstCaseHostLoss - 1, null); + if (worstCaseHostLoss <= 1) { + Optional<Node> moveCandidate = identifyMoveCandidate(failurePath.get()); + if (moveCandidate.isPresent()) + move(moveCandidate.get()); + } + } + } + + private Optional<Node> identifyMoveCandidate(CapacityChecker.HostFailurePath failurePath) { + return Optional.empty(); + } + + private void move(Node node) { + + } + +} diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java index caecf8edf2f..d3e5f60599f 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java @@ -58,7 +58,7 @@ public class GroupPreparer { // active config model which is changed on activate public List<Node> prepare(ApplicationId application, ClusterSpec cluster, NodeSpec requestedNodes, List<Node> surplusActiveNodes, MutableInteger highestIndex, int spareCount, int wantedGroups) { - boolean dynamicProvisioningEnabled = nodeRepository.canProvisionHostsWhenRequired() && nodeRepository.zone().getCloud().dynamicProvisioning(); + boolean dynamicProvisioningEnabled = nodeRepository.canProvisionHosts() && nodeRepository.zone().getCloud().dynamicProvisioning(); boolean allocateFully = dynamicProvisioningEnabled && preprovisionCapacityFlag.value().isEmpty(); try (Mutex lock = 
nodeRepository.lock(application)) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/HostCapacityResponse.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/HostCapacityResponse.java index 12a29707303..7e81a9cc002 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/HostCapacityResponse.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/HostCapacityResponse.java @@ -20,6 +20,7 @@ import java.util.Optional; * @author mgimle */ public class HostCapacityResponse extends HttpResponse { + private final StringBuilder text; private final Slime slime; private final CapacityChecker capacityChecker; diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/maintenance.json b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/maintenance.json index e041a7b8b54..6bb30d90218 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/maintenance.json +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/maintenance.json @@ -4,9 +4,6 @@ "name": "AutoscalingMaintainer" }, { - "name": "CapacityReportMaintainer" - }, - { "name": "DirtyExpirer" }, { @@ -56,6 +53,9 @@ }, { "name":"ScalingSuggestionsMaintainer" + }, + { + "name": "SpareCapacityMaintainer" } ], "inactive": [ |