about | summary | refs | log | tree | commit | diff | stats
path: root/node-repository
diff options
context:
space:
mode:
author Jon Bratseth <bratseth@gmail.com> 2020-06-09 17:51:55 +0200
committer Jon Bratseth <bratseth@gmail.com> 2020-06-09 17:51:55 +0200
commit 7f55a0f05ea9ab8d684e278f832a175ab543b9df (patch)
tree b663b895df4948f26d1dcacb499665ffaf385124 /node-repository
parent 629759da8d37bca6c982413da9bedc1df171b895 (diff)
Spare capacity maintenance skeleton
Diffstat (limited to 'node-repository')
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java 14
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityChecker.java 41
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityReportMaintainer.java 58
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java 10
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java 2
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java 81
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java 2
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/HostCapacityResponse.java 1
-rw-r--r-- node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/maintenance.json 6
9 files changed, 122 insertions, 93 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java
index b41820a461b..bec35e7ee4f 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java
@@ -102,7 +102,7 @@ public class NodeRepository extends AbstractComponent {
private final DockerImages dockerImages;
private final JobControl jobControl;
private final Applications applications;
- private final boolean canProvisionHostsWhenRequired;
+ private final boolean canProvisionHosts;
/**
* Creates a node repository from a zookeeper provider.
@@ -136,7 +136,7 @@ public class NodeRepository extends AbstractComponent {
NameResolver nameResolver,
DockerImage dockerImage,
boolean useCuratorClientCache,
- boolean canProvisionHostsWhenRequired) {
+ boolean canProvisionHosts) {
this.db = new CuratorDatabaseClient(flavors, curator, clock, zone, useCuratorClientCache);
this.zone = zone;
this.clock = clock;
@@ -149,7 +149,7 @@ public class NodeRepository extends AbstractComponent {
this.dockerImages = new DockerImages(db, dockerImage);
this.jobControl = new JobControl(db);
this.applications = new Applications(db);
- this.canProvisionHostsWhenRequired = canProvisionHostsWhenRequired;
+ this.canProvisionHosts = canProvisionHosts;
// read and write all nodes to make sure they are stored in the latest version of the serialized format
for (State state : State.values())
@@ -800,16 +800,14 @@ public class NodeRepository extends AbstractComponent {
if (host.status().wantToRetire()) return false;
if (host.allocation().map(alloc -> alloc.membership().retired()).orElse(false)) return false;
- if ( canProvisionHostsWhenRequired())
+ if ( canProvisionHosts())
return EnumSet.of(State.active, State.ready, State.provisioned).contains(host.state());
else
return host.state() == State.active;
}
- /** Returns whether this has the ability to conjure hosts when required */
- public boolean canProvisionHostsWhenRequired() {
- return canProvisionHostsWhenRequired;
- }
+ /** Returns whether this repository can provision hosts on demand */
+ public boolean canProvisionHosts() { return canProvisionHosts; }
/** Returns the time keeper of this system */
public Clock clock() { return clock; }
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityChecker.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityChecker.java
index ca8399da629..0ab343f0795 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityChecker.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityChecker.java
@@ -11,6 +11,9 @@ import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;
+/**
+ * @author mgimle
+ */
public class CapacityChecker {
private List<Node> hosts;
@@ -42,15 +45,15 @@ public class CapacityChecker {
}
public List<Node> nodesFromHostnames(List<String> hostnames) {
- List<Node> nodes = hostnames.stream()
- .filter(h -> nodeMap.containsKey(h))
- .map(h -> nodeMap.get(h))
- .collect(Collectors.toList());
+ List<Node> nodes = hostnames.stream().filter(h -> nodeMap.containsKey(h))
+ .map(h -> nodeMap.get(h))
+ .collect(Collectors.toList());
+
if (nodes.size() != hostnames.size()) {
Set<String> notFoundNodes = new HashSet<>(hostnames);
notFoundNodes.removeAll(nodes.stream().map(Node::hostname).collect(Collectors.toList()));
throw new IllegalArgumentException(String.format("Host(s) not found: [ %s ]",
- String.join(", ", notFoundNodes)));
+ String.join(", ", notFoundNodes)));
}
return nodes;
@@ -92,9 +95,9 @@ public class CapacityChecker {
if (hosts.size() == 0) return Optional.empty();
List<Node> parentRemovalPriorityList = heuristic.entrySet().stream()
- .sorted(Comparator.comparingInt(Map.Entry::getValue))
- .map(Map.Entry::getKey)
- .collect(Collectors.toList());
+ .sorted(Comparator.comparingInt(Map.Entry::getValue))
+ .map(Map.Entry::getKey)
+ .collect(Collectors.toList());
for (int i = 1; i <= parentRemovalPriorityList.size(); i++) {
List<Node> hostsToRemove = parentRemovalPriorityList.subList(0, i);
@@ -116,12 +119,12 @@ public class CapacityChecker {
private Map<Node, List<Node>> constructNodeChildrenMap(List<Node> tenants, List<Node> hosts, Map<String, Node> hostnameToNode) {
Map<Node, List<Node>> nodeChildren = tenants.stream()
- .filter(n -> n.parentHostname().isPresent())
- .filter(n -> hostnameToNode.containsKey(n.parentHostname().get()))
- .collect(Collectors.groupingBy(
- n -> hostnameToNode.get(n.parentHostname().orElseThrow())));
+ .filter(n -> n.parentHostname().isPresent())
+ .filter(n -> hostnameToNode.containsKey(n.parentHostname().get()))
+ .collect(Collectors.groupingBy(n -> hostnameToNode.get(n.parentHostname().orElseThrow())));
- for (var host : hosts) nodeChildren.putIfAbsent(host, List.of());
+ for (var host : hosts)
+ nodeChildren.putIfAbsent(host, List.of());
return nodeChildren;
}
@@ -149,10 +152,8 @@ public class CapacityChecker {
private Map<Node, Integer> computeMaximalRepeatedRemovals(List<Node> hosts,
Map<Node, List<Node>> nodeChildren,
Map<Node, AllocationResources> availableResources) {
- Map<Node, Integer> timesNodeCanBeRemoved = hosts.stream().collect(Collectors.toMap(
- Function.identity(),
- __ -> Integer.MAX_VALUE
- ));
+ Map<Node, Integer> timesNodeCanBeRemoved = hosts.stream().collect(Collectors.toMap(Function.identity(),
+ __ -> Integer.MAX_VALUE));
for (Node host : hosts) {
List<Node> children = nodeChildren.get(host);
if (children.size() == 0) continue;
@@ -326,8 +327,10 @@ public class CapacityChecker {
* as well as the specific host and tenant which caused it.
*/
public static class HostFailurePath {
+
public List<Node> hostsCausingFailure;
public HostRemovalFailure failureReason;
+
}
/**
@@ -336,6 +339,7 @@ public class CapacityChecker {
* will be empty.
*/
public static class HostRemovalFailure {
+
public Optional<Node> host;
public Optional<Node> tenant;
public AllocationFailureReasonList allocationFailures;
@@ -406,6 +410,7 @@ public class CapacityChecker {
public AllocationResources subtract(AllocationResources other) {
return new AllocationResources(this.nodeResources.subtract(other.nodeResources), this.availableIPs - other.availableIPs);
}
+
}
/**
@@ -449,6 +454,7 @@ public class CapacityChecker {
return String.format("[%s]", String.join(", ", reasons));
}
+
}
/**
@@ -487,6 +493,7 @@ public class CapacityChecker {
insufficientVcpu(), insufficientMemoryGb(), insufficientDiskGb(), incompatibleDiskSpeed(),
incompatibleStorageType(), insufficientAvailableIps(), violatesParentHostPolicy());
}
+
}
public static class AllocationHistory {
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityReportMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityReportMaintainer.java
deleted file mode 100644
index f6cadabec54..00000000000
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityReportMaintainer.java
+++ /dev/null
@@ -1,58 +0,0 @@
-// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.hosted.provision.maintenance;
-
-import com.yahoo.jdisc.Metric;
-import com.yahoo.vespa.hosted.provision.Node;
-import com.yahoo.vespa.hosted.provision.NodeRepository;
-
-import java.time.Duration;
-import java.util.List;
-import java.util.Objects;
-import java.util.Optional;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-import java.util.stream.Collectors;
-
-/**
- * Performs analysis on the node repository to produce metrics that pertain to the capacity of the node repository.
- * These metrics include:
- * Spare host capacity, or how many hosts the repository can stand to lose without ending up in a situation where it's
- * unable to find a new home for orphaned tenants.
- * Overcommitted hosts, which tracks if there are any hosts whose capacity is less than the sum of its children's.
- *
- * @author mgimle
- */
-public class CapacityReportMaintainer extends NodeRepositoryMaintainer {
-
- private final Metric metric;
- private final NodeRepository nodeRepository;
- private static final Logger log = Logger.getLogger(CapacityReportMaintainer.class.getName());
-
- CapacityReportMaintainer(NodeRepository nodeRepository,
- Metric metric,
- Duration interval) {
- super(nodeRepository, interval);
- this.nodeRepository = nodeRepository;
- this.metric = Objects.requireNonNull(metric);
- }
-
- @Override
- protected void maintain() {
- if (nodeRepository.zone().getCloud().dynamicProvisioning()) return; // Hosts and nodes are 1-1
-
- CapacityChecker capacityChecker = new CapacityChecker(this.nodeRepository);
- List<Node> overcommittedHosts = capacityChecker.findOvercommittedHosts();
- if (overcommittedHosts.size() != 0) {
- log.log(Level.WARNING, String.format("%d nodes are overcommitted! [ %s ]", overcommittedHosts.size(),
- overcommittedHosts.stream().map(Node::hostname).collect(Collectors.joining(", "))));
- }
- metric.set("overcommittedHosts", overcommittedHosts.size(), null);
-
- Optional<CapacityChecker.HostFailurePath> failurePath = capacityChecker.worstCaseHostLossLeadingToFailure();
- if (failurePath.isPresent()) {
- int worstCaseHostLoss = failurePath.get().hostsCausingFailure.size();
- metric.set("spareHostCapacity", worstCaseHostLoss - 1, null);
- }
- }
-
-}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
index 8a82c74dd17..caf845d36cb 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
@@ -47,7 +47,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
private final InfrastructureProvisioner infrastructureProvisioner;
private final Optional<LoadBalancerExpirer> loadBalancerExpirer;
private final Optional<DynamicProvisioningMaintainer> dynamicProvisioningMaintainer;
- private final CapacityReportMaintainer capacityReportMaintainer;
+ private final SpareCapacityMaintainer spareCapacityMaintainer;
private final OsUpgradeActivator osUpgradeActivator;
private final Rebalancer rebalancer;
private final NodeMetricsDbMaintainer nodeMetricsDbMaintainer;
@@ -88,7 +88,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
new LoadBalancerExpirer(nodeRepository, defaults.loadBalancerExpirerInterval, lbService));
dynamicProvisioningMaintainer = provisionServiceProvider.getHostProvisioner().map(hostProvisioner ->
new DynamicProvisioningMaintainer(nodeRepository, defaults.dynamicProvisionerInterval, hostProvisioner, flagSource));
- capacityReportMaintainer = new CapacityReportMaintainer(nodeRepository, metric, defaults.capacityReportInterval);
+ spareCapacityMaintainer = new SpareCapacityMaintainer(deployer, nodeRepository, metric, clock, defaults.spareCapacityMaintenanceInterval);
osUpgradeActivator = new OsUpgradeActivator(nodeRepository, defaults.osUpgradeActivatorInterval);
rebalancer = new Rebalancer(deployer, nodeRepository, metric, clock, defaults.rebalancerInterval);
nodeMetricsDbMaintainer = new NodeMetricsDbMaintainer(nodeRepository, nodeMetrics, nodeMetricsDb, defaults.nodeMetricsCollectionInterval);
@@ -110,7 +110,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
failedExpirer.close();
dirtyExpirer.close();
nodeRebooter.close();
- capacityReportMaintainer.close();
+ spareCapacityMaintainer.close();
provisionedExpirer.close();
metricsReporter.close();
infrastructureProvisioner.close();
@@ -153,7 +153,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
private final Duration failedExpirerInterval;
private final Duration dirtyExpiry;
private final Duration provisionedExpiry;
- private final Duration capacityReportInterval;
+ private final Duration spareCapacityMaintenanceInterval;
private final Duration metricsInterval;
private final Duration retiredInterval;
private final Duration infrastructureProvisionInterval;
@@ -175,7 +175,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
operatorChangeRedeployInterval = Duration.ofMinutes(1);
failedExpirerInterval = Duration.ofMinutes(10);
provisionedExpiry = Duration.ofHours(4);
- capacityReportInterval = Duration.ofMinutes(10);
+ spareCapacityMaintenanceInterval = Duration.ofMinutes(10);
metricsInterval = Duration.ofMinutes(1);
infrastructureProvisionInterval = Duration.ofMinutes(1);
throttlePolicy = NodeFailer.ThrottlePolicy.hosted;
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java
index 12990447eee..2cb46a6a78e 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java
@@ -41,7 +41,7 @@ public class Rebalancer extends NodeRepositoryMaintainer {
@Override
protected void maintain() {
- if (nodeRepository().canProvisionHostsWhenRequired()) return; // All nodes will be allocated on new hosts, so rebalancing makes no sense
+ if ( ! nodeRepository().zone().getCloud().allowHostSharing()) return; // Rebalancing not necessary
if (nodeRepository().zone().environment().isTest()) return; // Test zones have short lived deployments, no need to rebalance
// Work with an unlocked snapshot as this can take a long time and full consistency is not needed
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java
new file mode 100644
index 00000000000..05bfac47fb9
--- /dev/null
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java
@@ -0,0 +1,81 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.hosted.provision.maintenance;
+
+import com.yahoo.config.provision.Deployer;
+import com.yahoo.jdisc.Metric;
+import com.yahoo.vespa.hosted.provision.Node;
+import com.yahoo.vespa.hosted.provision.NodeList;
+import com.yahoo.vespa.hosted.provision.NodeRepository;
+
+import java.time.Clock;
+import java.time.Duration;
+import java.util.List;
+import java.util.Optional;
+import java.util.logging.Level;
+import java.util.stream.Collectors;
+
+/**
+ * A maintainer which attempts to ensure there is spare capacity available in chunks which can fit
+ * all node resource configurations in use, such that the system is able to quickly replace a failed node
+ * if necessary.
+ *
+ * This also emits the following metrics:
+ * - Overcommitted hosts: Hosts whose capacity is less than the sum of their children's
+ * - Spare host capacity: How many hosts the repository can stand to lose without ending up in a situation where it's
+ * unable to find a new home for orphaned tenants.
+ *
+ * @author mgimle
+ * @author bratseth
+ */
+public class SpareCapacityMaintainer extends NodeRepositoryMaintainer {
+
+ private final Deployer deployer;
+ private final Metric metric;
+ private final Clock clock;
+
+ public SpareCapacityMaintainer(Deployer deployer,
+ NodeRepository nodeRepository,
+ Metric metric,
+ Clock clock,
+ Duration interval) {
+ super(nodeRepository, interval);
+ this.deployer = deployer;
+ this.metric = metric;
+ this.clock = clock;
+ }
+
+ @Override
+ protected void maintain() {
+ if ( ! nodeRepository().zone().getCloud().allowHostSharing()) return;
+
+ CapacityChecker capacityChecker = new CapacityChecker(nodeRepository());
+
+ List<Node> overcommittedHosts = capacityChecker.findOvercommittedHosts();
+ if (overcommittedHosts.size() != 0) {
+ log.log(Level.WARNING, String.format("%d nodes are overcommitted! [ %s ]",
+ overcommittedHosts.size(),
+ overcommittedHosts.stream().map(Node::hostname).collect(Collectors.joining(", "))));
+ }
+ metric.set("overcommittedHosts", overcommittedHosts.size(), null);
+
+ Optional<CapacityChecker.HostFailurePath> failurePath = capacityChecker.worstCaseHostLossLeadingToFailure();
+ if (failurePath.isPresent()) {
+ int worstCaseHostLoss = failurePath.get().hostsCausingFailure.size();
+ metric.set("spareHostCapacity", worstCaseHostLoss - 1, null);
+ if (worstCaseHostLoss <= 1) {
+ Optional<Node> moveCandidate = identifyMoveCandidate(failurePath.get());
+ if (moveCandidate.isPresent())
+ move(moveCandidate.get());
+ }
+ }
+ }
+
+ private Optional<Node> identifyMoveCandidate(CapacityChecker.HostFailurePath failurePath) {
+ return Optional.empty();
+ }
+
+ private void move(Node node) {
+
+ }
+
+}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java
index caecf8edf2f..d3e5f60599f 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java
@@ -58,7 +58,7 @@ public class GroupPreparer {
// active config model which is changed on activate
public List<Node> prepare(ApplicationId application, ClusterSpec cluster, NodeSpec requestedNodes,
List<Node> surplusActiveNodes, MutableInteger highestIndex, int spareCount, int wantedGroups) {
- boolean dynamicProvisioningEnabled = nodeRepository.canProvisionHostsWhenRequired() && nodeRepository.zone().getCloud().dynamicProvisioning();
+ boolean dynamicProvisioningEnabled = nodeRepository.canProvisionHosts() && nodeRepository.zone().getCloud().dynamicProvisioning();
boolean allocateFully = dynamicProvisioningEnabled && preprovisionCapacityFlag.value().isEmpty();
try (Mutex lock = nodeRepository.lock(application)) {
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/HostCapacityResponse.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/HostCapacityResponse.java
index 12a29707303..7e81a9cc002 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/HostCapacityResponse.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/HostCapacityResponse.java
@@ -20,6 +20,7 @@ import java.util.Optional;
* @author mgimle
*/
public class HostCapacityResponse extends HttpResponse {
+
private final StringBuilder text;
private final Slime slime;
private final CapacityChecker capacityChecker;
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/maintenance.json b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/maintenance.json
index e041a7b8b54..6bb30d90218 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/maintenance.json
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/maintenance.json
@@ -4,9 +4,6 @@
"name": "AutoscalingMaintainer"
},
{
- "name": "CapacityReportMaintainer"
- },
- {
"name": "DirtyExpirer"
},
{
@@ -56,6 +53,9 @@
},
{
"name":"ScalingSuggestionsMaintainer"
+ },
+ {
+ "name": "SpareCapacityMaintainer"
}
],
"inactive": [