// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.provision.maintenance; import ai.vespa.metrics.ConfigServerMetrics; import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.ClusterSpec; import com.yahoo.config.provision.Deployer; import com.yahoo.config.provision.NodeResources; import com.yahoo.jdisc.Metric; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeMutex; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.maintenance.MaintenanceDeployment.Move; import com.yahoo.vespa.hosted.provision.node.Agent; import com.yahoo.vespa.hosted.provision.provisioning.HostCapacity; import com.yahoo.vespa.hosted.provision.provisioning.NodeResourceComparator; import java.time.Duration; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Objects; import java.util.Optional; import java.util.Set; import java.util.logging.Level; import java.util.stream.Collectors; /** * A maintainer which attempts to ensure there is spare capacity available in chunks which can fit * all node resource configuration in use, such that the system is able to quickly replace a failed node * if necessary. * * This also emits the following metrics: * - Overcommitted hosts: Hosts whose capacity is less than the sum of its children's * - Spare host capacity, or how many hosts the repository can stand to lose without ending up in a situation where it's * unable to find a new home for orphaned tenants. * * @author mgimle * @author bratseth */ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer { private final int maxIterations; private final Deployer deployer; private final Metric metric; public SpareCapacityMaintainer(Deployer deployer, NodeRepository nodeRepository, Metric metric, Duration interval) { this(deployer, nodeRepository, metric, interval, 10_000 /* Should take less than a few minutes */); } public SpareCapacityMaintainer(Deployer deployer, NodeRepository nodeRepository, Metric metric, Duration interval, int maxIterations) { super(nodeRepository, interval, metric); this.deployer = deployer; this.metric = metric; this.maxIterations = maxIterations; } @Override protected double maintain() { if ( ! nodeRepository().nodes().isWorking()) return 0.0; // Don't need to maintain spare capacity in dynamically provisioned zones; can provision more on demand. if (nodeRepository().zone().cloud().dynamicProvisioning()) return 1.0; NodeList allNodes = nodeRepository().nodes().list(); CapacityChecker capacityChecker = new CapacityChecker(allNodes); List overcommittedHosts = capacityChecker.findOvercommittedHosts(); metric.set(ConfigServerMetrics.OVERCOMMITTED_HOSTS.baseName(), overcommittedHosts.size(), null); retireOvercommitedHosts(allNodes, overcommittedHosts); boolean success = true; Optional failurePath = capacityChecker.worstCaseHostLossLeadingToFailure(); if (failurePath.isPresent()) { int spareHostCapacity = failurePath.get().hostsCausingFailure.size() - 1; if (spareHostCapacity == 0) { List mitigation = findMitigation(failurePath.get()); if (execute(mitigation, failurePath.get())) { // We succeeded or are in the process of taking a step to mitigate. // Report with the assumption this will eventually succeed to avoid alerting before we're stuck spareHostCapacity++; } else { success = false; } } metric.set(ConfigServerMetrics.SPARE_HOST_CAPACITY.baseName(), spareHostCapacity, null); } return success ? 1.0 : 0.0; } private boolean execute(List mitigation, CapacityChecker.HostFailurePath failurePath) { if (mitigation.isEmpty()) { log.warning("Out of spare capacity and no mitigation possible: " + failurePath); return false; } Move firstMove = mitigation.get(0); if (firstMove.node().allocation().get().membership().retired()) return true; // Already in progress boolean success = firstMove.execute(false, Agent.SpareCapacityMaintainer, deployer, metric, nodeRepository()); log.info("Out of spare capacity. Mitigation plan: " + mitigation + ". First move successful: " + success); return success; } private List findMitigation(CapacityChecker.HostFailurePath failurePath) { Optional nodeWhichCantMove = failurePath.failureReason.tenant; if (nodeWhichCantMove.isEmpty()) return List.of(); Node node = nodeWhichCantMove.get(); NodeList allNodes = nodeRepository().nodes().list(); // Allocation will assign the spareCount most empty nodes as "spares", which will not be allocated on // unless needed for node failing. Our goal here is to make room on these spares for the given node HostCapacity hostCapacity = new HostCapacity(allNodes, nodeRepository().resourcesCalculator()); Set spareHosts = hostCapacity.findSpareHosts(allNodes.hosts().satisfies(node.resources()).asList(), nodeRepository().spareCount()); List hosts = allNodes.hosts().except(spareHosts).asList(); CapacitySolver capacitySolver = new CapacitySolver(hostCapacity, maxIterations); List shortestMitigation = null; for (Node spareHost : spareHosts) { List mitigation = capacitySolver.makeRoomFor(node, spareHost, hosts, List.of(), List.of()); if (mitigation == null) continue; if (shortestMitigation == null || shortestMitigation.size() > mitigation.size()) shortestMitigation = mitigation; } if (shortestMitigation == null || shortestMitigation.isEmpty()) return List.of(); return shortestMitigation; } private int retireOvercomittedComparator(Node n1, Node n2) { ClusterSpec.Type t1 = n1.allocation().get().membership().cluster().type(); ClusterSpec.Type t2 = n2.allocation().get().membership().cluster().type(); // Prefer to container nodes for faster retirement if (t1 == ClusterSpec.Type.container && t2 != ClusterSpec.Type.container) return -1; if (t1 != ClusterSpec.Type.container && t2 == ClusterSpec.Type.container) return 1; return NodeResourceComparator.memoryDiskCpuOrder().compare(n1.resources(), n2.resources()); } private void retireOvercommitedHosts(NodeList allNodes, List overcommittedHosts) { if (overcommittedHosts.isEmpty()) return; log.log(Level.WARNING, String.format("%d hosts are overcommitted: %s", overcommittedHosts.size(), overcommittedHosts.stream().map(Node::hostname).collect(Collectors.joining(", ")))); if (!NodeMover.zoneIsStable(allNodes)) return; // Find an active node on a overcommitted host and retire it Optional nodeToRetire = overcommittedHosts.stream().flatMap(parent -> allNodes.childrenOf(parent).stream()) .filter(node -> node.state() == Node.State.active) .min(this::retireOvercomittedComparator); if (nodeToRetire.isEmpty()) return; retire(nodeToRetire.get()); } /** Mark node for retirement and redeploy its application */ private void retire(Node node) { ApplicationId owner = node.allocation().get().owner(); try (MaintenanceDeployment deployment = new MaintenanceDeployment(owner, deployer, metric, nodeRepository())) { if (!deployment.isValid()) return; if (!markWantToRetire(node.hostname())) return; log.log(Level.INFO, String.format("Redeploying %s to move %s from over-committed host", owner, node.hostname())); deployment.activate(); } } private boolean markWantToRetire(String hostname) { Optional optionalNodeMutex = nodeRepository().nodes().lockAndGet(hostname); if (optionalNodeMutex.isEmpty()) return false; try (var nodeMutex = optionalNodeMutex.get()) { Node retiredNode = nodeMutex.node().withWantToRetire(true, Agent.SpareCapacityMaintainer, nodeRepository().clock().instant()); nodeRepository().nodes().write(retiredNode, nodeMutex); return true; } } private static class CapacitySolver { private final HostCapacity hostCapacity; private final int maxIterations; private int iterations = 0; CapacitySolver(HostCapacity hostCapacity, int maxIterations) { this.hostCapacity = hostCapacity; this.maxIterations = maxIterations; } /** The map of subproblem solutions already found. The value is null when there is no solution. */ private Map> solutions = new HashMap<>(); /** * Finds the shortest sequence of moves which makes room for the given node on the given host, * assuming the given moves already made over the given hosts' current allocation. * * @param node the node to make room for * @param host the target host to make room on * @param hosts the hosts onto which we can move nodes * @param movesConsidered the moves already being considered to add as part of this scenario * (after any moves made by this) * @param movesMade the moves already made in this scenario * @return the list of movesMade with the moves needed for this appended, in the order they should be performed, * or null if no sequence could be found */ List makeRoomFor(Node node, Node host, List hosts, List movesConsidered, List movesMade) { SolutionKey solutionKey = new SolutionKey(node, host, movesConsidered, movesMade); List solution = solutions.get(solutionKey); if (solution == null) { solution = findRoomFor(node, host, hosts, movesConsidered, movesMade); solutions.put(solutionKey, solution); } return solution; } private List findRoomFor(Node node, Node host, List hosts, List movesConsidered, List movesMade) { if (iterations++ > maxIterations) return null; if ( ! host.resources().satisfies(node.resources())) return null; NodeResources freeCapacity = availableCapacityWith(movesMade, host); if (freeCapacity.satisfies(node.resources())) return List.of(); List shortest = null; for (var i = subsets(hostCapacity.allNodes().childrenOf(host), 5); i.hasNext(); ) { List childrenToMove = i.next(); if ( ! addResourcesOf(childrenToMove, freeCapacity).satisfies(node.resources())) continue; List moves = move(childrenToMove, host, hosts, movesConsidered, movesMade); if (moves == null) continue; if (shortest == null || moves.size() < shortest.size()) shortest = moves; } if (shortest == null) return null; return append(movesMade, shortest); } private List move(List nodes, Node host, List hosts, List movesConsidered, List movesMade) { List moves = new ArrayList<>(); for (Node childToMove : nodes) { List childMoves = move(childToMove, host, hosts, movesConsidered, append(movesMade, moves)); if (childMoves == null) return null; moves.addAll(childMoves); } return moves; } private List move(Node node, Node host, List hosts, List movesConsidered, List movesMade) { if (contains(node, movesConsidered)) return null; if (contains(node, movesMade)) return null; List shortest = null; for (Node target : hosts) { if (target.equals(host)) continue; Move move = new Move(node, host, target); List childMoves = makeRoomFor(node, target, hosts, append(movesConsidered, move), movesMade); if (childMoves == null) continue; if (shortest == null || shortest.size() > childMoves.size() + 1) { shortest = new ArrayList<>(childMoves); shortest.add(move); } } return shortest; } private boolean contains(Node node, List moves) { return moves.stream().anyMatch(move -> move.node().equals(node)); } private NodeResources addResourcesOf(List nodes, NodeResources resources) { for (Node node : nodes) resources = resources.add(node.resources()); return resources; } private Iterator> subsets(NodeList nodes, int maxSize) { return new SubsetIterator(nodes.asList(), maxSize); } private List append(List a, List b) { List list = new ArrayList<>(); list.addAll(a); list.addAll(b); return list; } private List append(List moves, Move move) { List list = new ArrayList<>(moves); list.add(move); return list; } private NodeResources availableCapacityWith(List moves, Node host) { NodeResources resources = hostCapacity.availableCapacityOf(host); for (Move move : moves) { if ( ! move.toHost().equals(host)) continue; resources = resources.subtract(move.node().resources()).justNumbers(); } for (Move move : moves) { if ( ! move.fromHost().equals(host)) continue; resources = resources.add(move.node().resources()).justNumbers(); } return resources; } } private static class SolutionKey { private final Node node; private final Node host; private final List movesConsidered; private final List movesMade; private final int hash; public SolutionKey(Node node, Node host, List movesConsidered, List movesMade) { this.node = node; this.host = host; this.movesConsidered = movesConsidered; this.movesMade = movesMade; hash = Objects.hash(node, host, movesConsidered, movesMade); } @Override public int hashCode() { return hash; } @Override public boolean equals(Object o) { if (o == this) return true; if (o == null || o.getClass() != this.getClass()) return false; SolutionKey other = (SolutionKey)o; if ( ! other.node.equals(this.node)) return false; if ( ! other.host.equals(this.host)) return false; if ( ! other.movesConsidered.equals(this.movesConsidered)) return false; if ( ! other.movesMade.equals(this.movesMade)) return false; return true; } } private static class SubsetIterator implements Iterator> { private final List nodes; private final int maxLength; // A number whose binary representation determines which items of list we'll include private int i = 0; // first "previous" = 0 -> skip the empty set private List next = null; public SubsetIterator(List nodes, int maxLength) { this.nodes = new ArrayList<>(nodes.subList(0, Math.min(nodes.size(), 31))); this.maxLength = maxLength; } @Override public boolean hasNext() { if (next != null) return true; // find next while (++i < 1< maxLength) continue; next = new ArrayList<>(ones); for (int position = 0; position < nodes.size(); position++) { if (hasOneAtPosition(position, i)) next.add(nodes.get(position)); } return true; } return false; } @Override public List next() { if ( ! hasNext()) throw new IllegalStateException("No more elements"); var current = next; next = null; return current; } private boolean hasOneAtPosition(int position, int number) { return (number & (1 << position)) > 0; } } }