// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.provision.maintenance; import com.yahoo.config.provision.NodeResources; import com.yahoo.config.provision.NodeType; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.node.Allocation; import com.yahoo.vespa.hosted.provision.provisioning.NodeResourceComparator; import java.util.ArrayList; import java.util.EnumSet; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Optional; import java.util.Set; import java.util.function.Function; import java.util.stream.Collectors; /** * @author mgimle */ public class CapacityChecker { // We only care about nodes in one of these states. private static final Set relevantNodeStates = EnumSet.of( Node.State.active, Node.State.inactive, Node.State.provisioned, Node.State.ready, Node.State.reserved); private final List hosts; private final Map nodeMap; private final Map> nodeChildren; private final Map availableResources; public AllocationHistory allocationHistory = null; public CapacityChecker(NodeList allNodes) { this.hosts = allNodes.hosts().state(relevantNodeStates).asList(); List tenants = getTenants(allNodes, hosts); nodeMap = constructHostnameToNodeMap(hosts); this.nodeChildren = constructNodeChildrenMap(tenants, hosts, nodeMap); this.availableResources = constructAvailableResourcesMap(hosts, nodeChildren); } public List getHosts() { return hosts; } public Optional worstCaseHostLossLeadingToFailure() { Map timesNodeCanBeRemoved = computeMaximalRepeatedRemovals(); return greedyHeuristicFindFailurePath(timesNodeCanBeRemoved); } protected List findOvercommittedHosts() { List overcommittedNodes = new ArrayList<>(); for (var entry : availableResources.entrySet()) { var resources = entry.getValue().nodeResources; if (resources.vcpu() < 0 || resources.memoryGb() < 0 || resources.diskGb() < 0) { overcommittedNodes.add(entry.getKey()); } } return overcommittedNodes; } public List nodesFromHostnames(List hostnames) { return hostnames.stream().filter(nodeMap::containsKey) .map(nodeMap::get) .toList(); } public Optional findHostRemovalFailure(List hostsToRemove) { return findHostRemovalFailure(hostsToRemove, hosts, nodeChildren, availableResources) .map(removal -> new HostFailurePath(hostsToRemove, removal)); } private List getTenants(NodeList allNodes, List hosts) { var parentNames = hosts.stream().map(Node::hostname).collect(Collectors.toSet()); return allNodes.nodeType(NodeType.tenant).state(relevantNodeStates).stream() .filter(t -> parentNames.contains(t.parentHostname().orElse(""))) .toList(); } private Optional greedyHeuristicFindFailurePath(Map heuristic) { if (hosts.isEmpty()) return Optional.empty(); List parentRemovalPriorityList = heuristic.entrySet().stream() .sorted(this::hostMitigationOrder) .map(Map.Entry::getKey) .toList(); for (int i = 1; i <= parentRemovalPriorityList.size(); i++) { List hostsToRemove = parentRemovalPriorityList.subList(0, i); var hostRemovalFailure = findHostRemovalFailure(hostsToRemove); if (hostRemovalFailure.isPresent()) return hostRemovalFailure; } throw new IllegalStateException("No path to failure found. This should be impossible!"); } private int hostMitigationOrder(Map.Entry entry1, Map.Entry entry2) { int result = Integer.compare(entry1.getValue(), entry2.getValue()); if (result != 0) return result; // Mitigate the largest hosts first return NodeResourceComparator.defaultOrder().compare(entry2.getKey().resources(), entry1.getKey().resources()); } private Map constructHostnameToNodeMap(List nodes) { return nodes.stream().collect(Collectors.toMap(Node::hostname, n -> n)); } private Map> constructNodeChildrenMap(List tenants, List hosts, Map hostnameToNode) { Map> nodeChildren = tenants.stream() .filter(n -> n.parentHostname().isPresent()) .filter(n -> hostnameToNode.containsKey(n.parentHostname().get())) .collect(Collectors.groupingBy(n -> hostnameToNode.get(n.parentHostname().orElseThrow()))); for (var host : hosts) nodeChildren.putIfAbsent(host, List.of()); return nodeChildren; } private Map constructAvailableResourcesMap(List hosts, Map> nodeChildren) { Map availableResources = new HashMap<>(); for (var host : hosts) { NodeResources hostResources = host.flavor().resources(); int occupiedIps = 0; Set ipPool = Set.copyOf(host.ipConfig().pool().ips()); for (var child : nodeChildren.get(host)) { hostResources = hostResources.subtract(child.resources().justNumbers()); occupiedIps += (int)child.ipConfig().primary().stream().filter(ipPool::contains).count(); } availableResources.put(host, new AllocationResources(hostResources, host.ipConfig().pool().ips().size() - occupiedIps)); } return availableResources; } /** * Computes a heuristic for each host, with a lower score indicating a higher perceived likelihood that removing * the host causes an unrecoverable state */ private Map computeMaximalRepeatedRemovals() { Map timesNodeCanBeRemoved = hosts.stream().collect(Collectors.toMap(Function.identity(), __ -> Integer.MAX_VALUE)); for (Node host : hosts) { List children = nodeChildren.get(host); if (children.size() == 0) continue; Map resourceMap = new HashMap<>(availableResources); Map> containedAllocations = collateAllocations(nodeChildren); int timesHostCanBeRemoved = 0; Optional unallocatedNode; while (timesHostCanBeRemoved < 100) { // Arbitrary upper bound unallocatedNode = tryAllocateNodes(nodeChildren.get(host), hosts, resourceMap, containedAllocations); if (unallocatedNode.isEmpty()) { timesHostCanBeRemoved++; } else break; } timesNodeCanBeRemoved.put(host, timesHostCanBeRemoved); } return timesNodeCanBeRemoved; } private Map> collateAllocations(Map> nodeChildren) { return nodeChildren.entrySet().stream().collect(Collectors.toMap( Map.Entry::getKey, e -> e.getValue().stream() .map(Node::allocation).flatMap(Optional::stream) .collect(Collectors.toCollection(ArrayList::new)) )); } /** * Tests whether it's possible to remove the provided hosts. * Does not mutate any input variable. * * @return empty optional if removal is possible, information on what caused the failure otherwise */ private Optional findHostRemovalFailure(List hostsToRemove, List allHosts, Map> nodechildren, Map availableResources) { var containedAllocations = collateAllocations(nodechildren); var resourceMap = new HashMap<>(availableResources); List validAllocationTargets = allHosts.stream() .filter(h -> !hostsToRemove.contains(h)) .filter(host -> !host.status().wantToRetire() && !host.status().wantToFail()) .toList(); if (validAllocationTargets.size() == 0) return Optional.of(HostRemovalFailure.none()); allocationHistory = new AllocationHistory(); for (var host : hostsToRemove) { Optional unallocatedNode = tryAllocateNodes(nodechildren.get(host), validAllocationTargets, resourceMap, containedAllocations, true); if (unallocatedNode.isPresent()) { AllocationFailureReasonList failures = collateAllocationFailures(unallocatedNode.get(), validAllocationTargets, resourceMap, containedAllocations); return Optional.of(HostRemovalFailure.create(host, unallocatedNode.get(), failures)); } } return Optional.empty(); } /** * Attempts to allocate the listed nodes to a new host, mutating availableResources and containedAllocations, * optionally returning the first node to fail, if one does. */ private Optional tryAllocateNodes(List nodes, List hosts, Map availableResources, Map> containedAllocations) { return tryAllocateNodes(nodes, hosts, availableResources, containedAllocations, false); } private Optional tryAllocateNodes(List nodes, List hosts, Map availableResources, Map> containedAllocations, boolean withHistory) { for (var node : nodes) { var newParent = tryAllocateNode(node, hosts, availableResources, containedAllocations); if (newParent.isEmpty()) { if (withHistory) allocationHistory.addEntry(node, null, 0); return Optional.of(node); } if (withHistory) { long eligibleParents = hosts.stream().filter(h -> !violatesParentHostPolicy(node, h, containedAllocations) && availableResources.get(h).satisfies(AllocationResources.from(node.resources()))).count(); allocationHistory.addEntry(node, newParent.get(), eligibleParents + 1); } } return Optional.empty(); } /** Returns the parent to which the node was allocated, if it was successfully allocated. */ private Optional tryAllocateNode(Node node, List hosts, Map availableResources, Map> containedAllocations) { AllocationResources requiredNodeResources = AllocationResources.from(node); for (var host : hosts) { var availableHostResources = availableResources.get(host); if (violatesParentHostPolicy(node, host, containedAllocations)) { continue; } if (availableHostResources.satisfies(requiredNodeResources)) { availableResources.put(host, availableHostResources.subtract(requiredNodeResources)); if (node.allocation().isPresent()) { containedAllocations.get(host).add(node.allocation().get()); } return Optional.of(host); } } return Optional.empty(); } private static boolean violatesParentHostPolicy(Node node, Node host, Map> containedAllocations) { if (node.allocation().isEmpty()) return false; Allocation nodeAllocation = node.allocation().get(); if (host.reservedTo() .filter(tenantName -> !tenantName.equals(nodeAllocation.owner().tenant())) .isPresent()) { return true; } for (var allocation : containedAllocations.get(host)) { if (allocation.membership().cluster().satisfies(nodeAllocation.membership().cluster()) && allocation.owner().equals(nodeAllocation.owner())) { return true; } } return false; } private AllocationFailureReasonList collateAllocationFailures(Node node, List hosts, Map availableResources, Map> containedAllocations) { List allocationFailureReasons = new ArrayList<>(); for (var host : hosts) { AllocationFailureReason reason = new AllocationFailureReason(); var availableHostResources = availableResources.get(host); reason.violatesParentHostPolicy = violatesParentHostPolicy(node, host, containedAllocations); NodeResources l = availableHostResources.nodeResources; NodeResources r = node.allocation().map(Allocation::requestedResources).orElse(node.resources()); if (l.vcpu() < r.vcpu()) reason.insufficientVcpu = true; if (l.memoryGb() < r.memoryGb()) reason.insufficientMemoryGb = true; if (l.diskGb() < r.diskGb()) reason.insufficientDiskGb = true; if (r.diskSpeed() != NodeResources.DiskSpeed.any && r.diskSpeed() != l.diskSpeed()) reason.incompatibleDiskSpeed = true; if (r.storageType() != NodeResources.StorageType.any && r.storageType() != l.storageType()) reason.incompatibleStorageType = true; if (r.architecture() != NodeResources.Architecture.any && r.architecture() != l.architecture()) reason.incompatibleStorageType = true; if (availableHostResources.availableIPs < 1) reason.insufficientAvailableIPs = true; allocationFailureReasons.add(reason); } return new AllocationFailureReasonList(allocationFailureReasons); } /** * Contains the list of hosts that, upon being removed, caused an unrecoverable state, * as well as the specific host and tenant which caused it. */ public static class HostFailurePath { public final List hostsCausingFailure; public final HostRemovalFailure failureReason; public HostFailurePath(List hostsCausingFailure, HostRemovalFailure failureReason) { this.hostsCausingFailure = hostsCausingFailure; this.failureReason = failureReason; } @Override public String toString() { return "failure path: " + failureReason + " upon removing " + hostsCausingFailure; } } /** * Data class used for detailing why removing the given tenant from the given host was unsuccessful. * A failure might not be caused by failing to allocate a specific tenant, in which case the fields * will be empty. */ public static class HostRemovalFailure { public final Optional host; public final Optional tenant; public final AllocationFailureReasonList allocationFailures; public static HostRemovalFailure none() { return new HostRemovalFailure(Optional.empty(), Optional.empty(), new AllocationFailureReasonList(List.of())); } public static HostRemovalFailure create(Node host, Node tenant, AllocationFailureReasonList failureReasons) { return new HostRemovalFailure(Optional.of(host), Optional.of(tenant), failureReasons); } private HostRemovalFailure(Optional host, Optional tenant, AllocationFailureReasonList allocationFailures) { this.host = host; this.tenant = tenant; this.allocationFailures = allocationFailures; } @Override public String toString() { if (host.isEmpty() || tenant.isEmpty()) return "No removal candidates exists"; return String.format( "Failure to remove host %s" + "\n\tNo new host found for tenant %s:" + "\n\t\tSingular Reasons: %s" + "\n\t\tTotal Reasons: %s", this.host.get().hostname(), this.tenant.get().hostname(), this.allocationFailures.singularReasonFailures().toString(), this.allocationFailures.toString() ); } } /** Used to describe the resources required for a tenant, and available to a host. */ private static class AllocationResources { private final NodeResources nodeResources; private final int availableIPs; public static AllocationResources from(Node node) { if (node.allocation().isPresent()) return from(node.allocation().get().requestedResources()); else return from(node.resources()); } public static AllocationResources from(NodeResources nodeResources) { return new AllocationResources(nodeResources, 1); } public AllocationResources(NodeResources nodeResources, int availableIPs) { this.nodeResources = nodeResources; this.availableIPs = availableIPs; } public boolean satisfies(AllocationResources other) { if (!this.nodeResources.satisfies(other.nodeResources)) return false; return this.availableIPs >= other.availableIPs; } public AllocationResources subtract(AllocationResources other) { return new AllocationResources(this.nodeResources.subtract(other.nodeResources), this.availableIPs - other.availableIPs); } } /** * Keeps track of the reason why a host rejected an allocation. */ private static class AllocationFailureReason { public boolean insufficientVcpu = false; public boolean insufficientMemoryGb = false; public boolean insufficientDiskGb = false; public boolean incompatibleDiskSpeed = false; public boolean incompatibleStorageType = false; public boolean incompatibleArchitecture = false; public boolean insufficientAvailableIPs = false; public boolean violatesParentHostPolicy = false; public int numberOfReasons() { int n = 0; if (insufficientVcpu) n++; if (insufficientMemoryGb) n++; if (insufficientDiskGb) n++; if (incompatibleDiskSpeed) n++; if (incompatibleArchitecture) n++; if (insufficientAvailableIPs) n++; if (violatesParentHostPolicy) n++; return n; } @Override public String toString() { List reasons = new ArrayList<>(); if (insufficientVcpu) reasons.add("insufficientVcpu"); if (insufficientMemoryGb) reasons.add("insufficientMemoryGb"); if (insufficientDiskGb) reasons.add("insufficientDiskGb"); if (incompatibleDiskSpeed) reasons.add("incompatibleDiskSpeed"); if (incompatibleStorageType) reasons.add("incompatibleStorageType"); if (incompatibleArchitecture) reasons.add("incompatibleArchitecture"); if (insufficientAvailableIPs) reasons.add("insufficientAvailableIPs"); if (violatesParentHostPolicy) reasons.add("violatesParentHostPolicy"); return String.format("[%s]", String.join(", ", reasons)); } } /** * Provides convenient methods for tallying failures. */ public static class AllocationFailureReasonList { private final List allocationFailureReasons; public AllocationFailureReasonList(List allocationFailureReasons) { this.allocationFailureReasons = allocationFailureReasons; } public long insufficientVcpu() { return allocationFailureReasons.stream().filter(r -> r.insufficientVcpu).count(); } public long insufficientMemoryGb() { return allocationFailureReasons.stream().filter(r -> r.insufficientMemoryGb).count(); } public long insufficientDiskGb() { return allocationFailureReasons.stream().filter(r -> r.insufficientDiskGb).count(); } public long incompatibleDiskSpeed() { return allocationFailureReasons.stream().filter(r -> r.incompatibleDiskSpeed).count(); } public long incompatibleStorageType() { return allocationFailureReasons.stream().filter(r -> r.incompatibleStorageType).count(); } public long incompatibleArchitecture() { return allocationFailureReasons.stream().filter(r -> r.incompatibleArchitecture).count(); } public long insufficientAvailableIps() { return allocationFailureReasons.stream().filter(r -> r.insufficientAvailableIPs).count(); } public long violatesParentHostPolicy() { return allocationFailureReasons.stream().filter(r -> r.violatesParentHostPolicy).count(); } public AllocationFailureReasonList singularReasonFailures() { return new AllocationFailureReasonList(allocationFailureReasons.stream() .filter(reason -> reason.numberOfReasons() == 1).toList()); } public AllocationFailureReasonList multipleReasonFailures() { return new AllocationFailureReasonList(allocationFailureReasons.stream() .filter(reason -> reason.numberOfReasons() > 1).toList()); } public long size() { return allocationFailureReasons.size(); } @Override public String toString() { return String.format("CPU (%3d), Memory (%3d), Disk size (%3d), Disk speed (%3d), Storage type (%3d), Architecture (%3d), IP (%3d), Parent-Host Policy (%3d)", insufficientVcpu(), insufficientMemoryGb(), insufficientDiskGb(), incompatibleDiskSpeed(), incompatibleStorageType(), incompatibleArchitecture(), insufficientAvailableIps(), violatesParentHostPolicy()); } } public static class AllocationHistory { public static class Entry { public final Node tenant; public final Node newParent; public final long eligibleParents; public Entry(Node tenant, Node newParent, long eligibleParents) { this.tenant = tenant; this.newParent = newParent; this.eligibleParents = eligibleParents; } @Override public String toString() { return String.format("%-20s %-65s -> %15s [%3d valid]", tenant.hostname().replaceFirst("\\..+", ""), tenant.resources(), newParent == null ? "x" : newParent.hostname().replaceFirst("\\..+", ""), this.eligibleParents ); } } public final List historyEntries; public AllocationHistory() { this.historyEntries = new ArrayList<>(); } public void addEntry(Node tenant, Node newParent, long eligibleParents) { this.historyEntries.add(new Entry(tenant, newParent, eligibleParents)); } public Set oldParents() { Set oldParents = new HashSet<>(); for (var entry : historyEntries) entry.tenant.parentHostname().ifPresent(oldParents::add); return oldParents; } @Override public String toString() { StringBuilder out = new StringBuilder(); String currentParent = ""; for (var entry : historyEntries) { String parentName = entry.tenant.parentHostname().orElseThrow(); if (!parentName.equals(currentParent)) { currentParent = parentName; out.append(parentName).append("\n"); } out.append(entry.toString()).append("\n"); } return out.toString(); } } }