summaryrefslogtreecommitdiffstats
path: root/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java
diff options
context:
space:
mode:
Diffstat (limited to 'node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java')
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java52
1 files changed, 46 insertions, 6 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java
index dd2f3dfd6c0..0861473a7c0 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java
@@ -1,6 +1,8 @@
// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.provision.maintenance;
+import com.yahoo.config.provision.ApplicationId;
+import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.config.provision.Deployer;
import com.yahoo.config.provision.NodeResources;
import com.yahoo.jdisc.Metric;
@@ -10,6 +12,7 @@ import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.maintenance.MaintenanceDeployment.Move;
import com.yahoo.vespa.hosted.provision.node.Agent;
import com.yahoo.vespa.hosted.provision.provisioning.HostCapacity;
+import com.yahoo.vespa.hosted.provision.provisioning.NodeResourceComparator;
import java.time.Duration;
import java.util.ArrayList;
@@ -67,15 +70,12 @@ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer {
boolean success = true;
if ( ! nodeRepository().zone().getCloud().allowHostSharing()) return success;
- CapacityChecker capacityChecker = new CapacityChecker(nodeRepository().list());
+ NodeList allNodes = nodeRepository().list();
+ CapacityChecker capacityChecker = new CapacityChecker(allNodes);
List<Node> overcommittedHosts = capacityChecker.findOvercommittedHosts();
- if (overcommittedHosts.size() != 0) {
- log.log(Level.WARNING, String.format("%d nodes are overcommitted! [ %s ]",
- overcommittedHosts.size(),
- overcommittedHosts.stream().map(Node::hostname).collect(Collectors.joining(", "))));
- }
metric.set("overcommittedHosts", overcommittedHosts.size(), null);
+ retireOvercommitedHosts(allNodes, overcommittedHosts);
Optional<CapacityChecker.HostFailurePath> failurePath = capacityChecker.worstCaseHostLossLeadingToFailure();
if (failurePath.isPresent()) {
@@ -133,6 +133,46 @@ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer {
return shortestMitigation;
}
+ private int retireOvercomittedComparator(Node n1, Node n2) {
+ ClusterSpec.Type t1 = n1.allocation().get().membership().cluster().type();
+ ClusterSpec.Type t2 = n2.allocation().get().membership().cluster().type();
+
+ // Prefer to container nodes for faster retirement
+ if (t1 == ClusterSpec.Type.container && t2 != ClusterSpec.Type.container) return -1;
+ if (t1 != ClusterSpec.Type.container && t2 == ClusterSpec.Type.container) return 1;
+
+ return NodeResourceComparator.memoryDiskCpuOrder().compare(n1.resources(), n2.resources());
+ }
+
+ private void retireOvercommitedHosts(NodeList allNodes, List<Node> overcommittedHosts) {
+ if (overcommittedHosts.isEmpty()) return;
+ log.log(Level.WARNING, String.format("%d hosts are overcommitted: %s",
+ overcommittedHosts.size(),
+ overcommittedHosts.stream().map(Node::hostname).collect(Collectors.joining(", "))));
+
+ if (!Rebalancer.zoneIsStable(allNodes)) return;
+
+ // Find an active node on a overcommited host and retire it
+ Optional<Node> nodeToRetire = overcommittedHosts.stream().flatMap(parent -> allNodes.childrenOf(parent).stream())
+ .filter(node -> node.state() == Node.State.active)
+ .min(this::retireOvercomittedComparator);
+ if (nodeToRetire.isEmpty()) return;
+
+ ApplicationId application = nodeToRetire.get().allocation().get().owner();
+ try (MaintenanceDeployment deployment = new MaintenanceDeployment(application, deployer, metric, nodeRepository())) {
+ if ( ! deployment.isValid()) return; // this will be done at another config server
+
+ Optional<Node> nodeWithWantToRetire = nodeRepository().getNode(nodeToRetire.get().hostname())
+ .map(node -> node.withWantToRetire(true, Agent.SpareCapacityMaintainer, nodeRepository().clock().instant()));
+ if (nodeWithWantToRetire.isEmpty()) return;
+
+ nodeRepository().write(nodeWithWantToRetire.get(), deployment.applicationLock().get());
+ log.log(Level.INFO, String.format("Redeploying %s to relocate %s from overcommited host",
+ application, nodeToRetire.get().hostname()));
+ deployment.activate();
+ }
+ }
+
private static class CapacitySolver {
private final HostCapacity hostCapacity;