summaryrefslogtreecommitdiffstats
path: root/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java
diff options
context:
space:
mode:
Diffstat (limited to 'node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java')
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java205
1 files changed, 179 insertions, 26 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java
index c0e39b39e94..4179d6d3f83 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java
@@ -2,15 +2,21 @@
package com.yahoo.vespa.hosted.provision.maintenance;
import com.yahoo.config.provision.Deployer;
+import com.yahoo.config.provision.NodeResources;
import com.yahoo.jdisc.Metric;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeRepository;
+import com.yahoo.vespa.hosted.provision.maintenance.MaintenanceDeployment.Move;
+import com.yahoo.vespa.hosted.provision.node.Agent;
+import com.yahoo.vespa.hosted.provision.provisioning.HostCapacity;
-import java.time.Clock;
import java.time.Duration;
+import java.util.ArrayList;
+import java.util.Iterator;
import java.util.List;
import java.util.Optional;
+import java.util.Set;
import java.util.logging.Level;
import java.util.stream.Collectors;
@@ -29,19 +35,18 @@ import java.util.stream.Collectors;
*/
public class SpareCapacityMaintainer extends NodeRepositoryMaintainer {
+ private static final int maxMoves = 5;
+
private final Deployer deployer;
private final Metric metric;
- private final Clock clock;
public SpareCapacityMaintainer(Deployer deployer,
NodeRepository nodeRepository,
Metric metric,
- Clock clock,
Duration interval) {
super(nodeRepository, interval);
this.deployer = deployer;
this.metric = metric;
- this.clock = clock;
}
@Override
@@ -60,46 +65,194 @@ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer {
Optional<CapacityChecker.HostFailurePath> failurePath = capacityChecker.worstCaseHostLossLeadingToFailure();
if (failurePath.isPresent()) {
- int worstCaseHostLoss = failurePath.get().hostsCausingFailure.size();
- metric.set("spareHostCapacity", worstCaseHostLoss - 1, null);
- if (worstCaseHostLoss == 1) { // Try to get back to needing 2 hosts to fail in the worst case
- Optional<Move> moveCandidate = identifyMoveCandidate(failurePath.get());
- if (moveCandidate.isPresent())
- move(moveCandidate.get());
+ int spareHostCapacity = failurePath.get().hostsCausingFailure.size() - 1;
+ if (spareHostCapacity == 0) {
+ Move move = findMitigatingMove(failurePath.get());
+ boolean success = move.execute(Agent.SpareCapacityMaintainer, deployer, metric, nodeRepository());
+ if (success) {
+ // This may strictly be a (noble) lie: We succeeded in taking one step to mitigate,
+ // but not necessarily *all* steps needed to justify adding 1 to spare capacity.
+ // However, we alert on this being 0 and we don't want to do that as long as we're not stuck.
+ spareHostCapacity++;
+ }
}
+ metric.set("spareHostCapacity", spareHostCapacity, null);
}
}
- private Optional<Move> identifyMoveCandidate(CapacityChecker.HostFailurePath failurePath) {
+ private Move findMitigatingMove(CapacityChecker.HostFailurePath failurePath) {
Optional<Node> nodeWhichCantMove = failurePath.failureReason.tenant;
- if (nodeWhichCantMove.isEmpty()) return Optional.empty();
- return findMoveWhichMakesRoomFor(nodeWhichCantMove.get());
+ if (nodeWhichCantMove.isEmpty()) return Move.empty();
+ return moveTowardsSpareFor(nodeWhichCantMove.get());
}
- private Optional<Move> findMoveWhichMakesRoomFor(Node node) {
- return Optional.empty();
+ private Move moveTowardsSpareFor(Node node) {
+ NodeList allNodes = nodeRepository().list();
+ // Allocation will assign the two most empty nodes as "spares", which will not be allocated on
+ // unless needed for node failing. Our goal here is to make room on these spares for the given node
+ HostCapacity hostCapacity = new HostCapacity(allNodes, nodeRepository().resourcesCalculator());
+ Set<Node> spareHosts = hostCapacity.findSpareHosts(allNodes.hosts().satisfies(node.resources()).asList(), 2);
+ List<Node> hosts = allNodes.hosts().except(spareHosts).asList();
+
+ CapacitySolver capacitySolver = new CapacitySolver(hostCapacity);
+ List<Move> shortestMitigation = null;
+ for (Node spareHost : spareHosts) {
+ List<Move> mitigation = capacitySolver.makeRoomFor(node, spareHost, hosts, List.of(), maxMoves);
+ if (mitigation == null) continue;
+ if (shortestMitigation == null || shortestMitigation.size() > mitigation.size())
+ shortestMitigation = mitigation;
+ }
+ if (shortestMitigation == null || shortestMitigation.isEmpty()) return Move.empty();
+ return shortestMitigation.get(0);
}
- private void move(Move move) {
+ private static class CapacitySolver {
+
+ private final HostCapacity hostCapacity;
+
+ CapacitySolver(HostCapacity hostCapacity) {
+ this.hostCapacity = hostCapacity;
+ }
+
+ /**
+ * Finds the shortest sequence of moves which makes room for the given node on the given host,
+ * assuming the given moves already made over the given hosts' current allocation.
+ *
+ * @param node the node to make room for
+ * @param host the target host to make room on
+ * @param hosts the hosts onto which we can move nodes
+ * @param movesMade the moves already made in this scenario
+ * @return the list of movesMade with the moves needed for this appended, in the order they should be performed,
+ * or null if no sequence could be found
+ */
+ List<Move> makeRoomFor(Node node, Node host, List<Node> hosts, List<Move> movesMade, int movesLeft) {
+ if ( ! host.resources().satisfies(node.resources())) return null;
+ NodeResources freeCapacity = freeCapacityWith(movesMade, host);
+ if (freeCapacity.satisfies(node.resources())) return List.of();
+ if (movesLeft == 0) return null;
+
+ List<Move> shortest = null;
+ for (var i = Subsets(hostCapacity.allNodes().childrenOf(host), movesLeft); i.hasNext(); ) {
+ List<Node> childrenToMove = i.next();
+ if ( ! addResourcesOf(childrenToMove, freeCapacity).satisfies(node.resources())) continue;
+ List<Move> moves = move(childrenToMove, host, hosts, movesMade, movesLeft);
+ if (moves == null) continue;
+ if (shortest == null || moves.size() < shortest.size())
+ shortest = moves;
+ }
+ if (shortest == null) return null;
+ List<Move> total = append(movesMade, shortest);
+ if (total.size() > movesLeft) return null;
+ return total;
+ }
+
+ private List<Move> move(List<Node> nodes, Node host, List<Node> hosts, List<Move> movesMade, int movesLeft) {
+ List<Move> moves = new ArrayList<>();
+ for (Node childToMove : nodes) {
+ List<Move> childMoves = move(childToMove, host, hosts, append(movesMade, moves), movesLeft - moves.size());
+ if (childMoves == null) return null;
+ moves.addAll(childMoves);
+ if (moves.size() > movesLeft) return null;
+ }
+ return moves;
+ }
+
+ private List<Move> move(Node node, Node host, List<Node> hosts, List<Move> movesMade, int movesLeft) {
+ List<Move> shortest = null;
+ for (Node target : hosts) {
+ List<Move> childMoves = makeRoomFor(node, target, hosts, movesMade, movesLeft - 1);
+ if (childMoves == null) continue;
+ if (shortest == null || shortest.size() > childMoves.size() + 1) {
+ shortest = new ArrayList<>(childMoves);
+ shortest.add(new Move(node, host, target));
+ }
+ }
+ return shortest;
+ }
+
+ private NodeResources addResourcesOf(List<Node> nodes, NodeResources resources) {
+ for (Node node : nodes)
+ resources = resources.add(node.resources());
+ return resources;
+ }
+
+ private Iterator<List<Node>> Subsets(NodeList nodes, int maxLength) {
+ return new SubsetIterator(nodes.asList(), maxLength);
+ }
+
+ private List<Move> append(List<Move> a, List<Move> b) {
+ List<Move> list = new ArrayList<>();
+ list.addAll(a);
+ list.addAll(b);
+ return list;
+ }
+
+ private NodeResources freeCapacityWith(List<Move> moves, Node host) {
+ NodeResources resources = hostCapacity.freeCapacityOf(host);
+ for (Move move : moves) {
+ if ( ! move.toHost().equals(host)) continue;
+ resources = resources.subtract(move.node().resources());
+ }
+ for (Move move : moves) {
+ if ( ! move.fromHost().equals(host)) continue;
+ resources = resources.add(move.fromHost().resources());
+ }
+ return resources;
+ }
}
- private static class Move {
+ private static class SubsetIterator implements Iterator<List<Node>> {
+
+ private final List<Node> nodes;
+ private final int maxLength;
+
+ // A number whose binary representation determines which items of list we'll include
+ private int i = 0; // first "previous" = 0 -> skip the empty set
+ private List<Node> next = null;
- static final Move none = new Move(null, null);
+ public SubsetIterator(List<Node> nodes, int maxLength) {
+ this.nodes = new ArrayList<>(nodes.subList(0, Math.min(nodes.size(), 31)));
+ this.maxLength = maxLength;
+ }
+
+ @Override
+ public boolean hasNext() {
+ if (next != null) return true;
- final Node node;
- final Node toHost;
+ // find next
+ while (++i < 1<<nodes.size()) {
+ int ones = onesIn(i);
+ if (ones > maxLength) continue;
- Move(Node node, Node toHost) {
- this.node = node;
- this.toHost = toHost;
+ next = new ArrayList<>(ones);
+ for (int position = 0; position < nodes.size(); position++) {
+ if (hasOneAtPosition(position, i))
+ next.add(nodes.get(position));
+ }
+ return true;
+ }
+ return false;
}
@Override
- public String toString() {
- return "move " +
- ( node == null ? "none" : (node.hostname() + " to " + toHost));
+ public List<Node> next() {
+ next = null;
+ if ( ! hasNext()) throw new IllegalStateException("No more elements");
+ return next;
+ }
+
+ private boolean hasOneAtPosition(int position, int number) {
+ return (number & (1 << position)) > 0;
+ }
+
+ private int onesIn(int number) {
+ int ones = 0;
+ for (int position = 0; Math.pow(2, position) <= number; position++) {
+ if (hasOneAtPosition(position, number))
+ ones++;
+ }
+ return ones;
}
}