// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.provision.provisioning;

import com.yahoo.config.provision.ApplicationId;
import com.yahoo.config.provision.ClusterMembership;
import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.config.provision.Flavor;
import com.yahoo.config.provision.NodeResources;
import com.yahoo.config.provision.SystemName;
import com.yahoo.lang.MutableInteger;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.node.Agent;
import com.yahoo.vespa.hosted.provision.node.Allocation;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.function.Predicate;
import java.util.stream.Collectors;

/**
 * Used to manage a list of nodes during the node reservation process
 * in order to fulfill the node spec.
 *
 * @author bratseth
 */
class NodeAllocation {

    /** List of all nodes in the node repository */
    private final NodeList allNodes;

    /** The application this list is for */
    private final ApplicationId application;

    /** The cluster this list is for */
    private final ClusterSpec cluster;

    /** The requested nodes of this list */
    private final NodeSpec requestedNodes;

    /** The node candidates this has accepted so far, keyed on hostname */
    private final Map<String, NodeCandidate> nodes = new LinkedHashMap<>();

    /** The number of already allocated nodes accepted and not retired */
    private int accepted = 0;

    /** The number of already allocated nodes accepted and not retired and not needing resize */
    private int acceptedWithoutResizingRetired = 0;

    /** The number of nodes rejected because of a clashing parentHostname */
    private int rejectedDueToClashingParentHost = 0;

    /** The number of nodes rejected due to exclusivity constraints */
    private int rejectedDueToExclusivity = 0;

    private int rejectedDueToInsufficientRealResources = 0;

    /** The number of nodes that were just now changed to retired */
    private int wasRetiredJustNow = 0;

    /** The node indexes seen so far, used to verify that each member's index is unique */
    private final Set<Integer> indexes = new HashSet<>();

    /** The highest membership index assigned so far; a new node gets the index after this */
    private final MutableInteger highestIndex;

    private final NodeRepository nodeRepository;
    private final NodeResourceLimits nodeResourceLimits;

    NodeAllocation(NodeList allNodes, ApplicationId application, ClusterSpec cluster, NodeSpec requestedNodes,
                   MutableInteger highestIndex, NodeRepository nodeRepository) {
        this.allNodes = allNodes;
        this.application = application;
        this.cluster = cluster;
        this.requestedNodes = requestedNodes;
        this.highestIndex = highestIndex;
        this.nodeRepository = nodeRepository;
        nodeResourceLimits = new NodeResourceLimits(nodeRepository);
    }
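    // Typical life cycle, inferred from this class's API (the concrete caller is not shown here): offer(..) is
    // invoked one or more times with prioritized candidates, fulfilled()/fulfilledAndNoChanges() reports whether the
    // request is covered, and finalNodes() is called last to settle which accepted nodes end up retired or unretired.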
    /**
     * Offer some nodes to this. The nodes may have an allocation to a different application or cluster,
     * an allocation to this cluster, or no current allocation (in which case one is assigned).
     *
     * Note that if unallocated nodes are offered before allocated nodes, this will unnecessarily
     * reject allocated nodes due to index duplicates.
     *
     * @param nodesPrioritized the nodes which are potentially on offer. These may belong to a different application etc.
     * @return the subset of the offered nodes which was accepted, with the correct allocation assigned
     */
    List<Node> offer(List<NodeCandidate> nodesPrioritized) {
        List<Node> accepted = new ArrayList<>();
        for (NodeCandidate candidate : nodesPrioritized) {
            if (candidate.allocation().isPresent()) {
                Allocation allocation = candidate.allocation().get();
                ClusterMembership membership = allocation.membership();
                if ( ! allocation.owner().equals(application)) continue; // wrong application
                if ( ! membership.cluster().satisfies(cluster)) continue; // wrong cluster id/type
                if ((! candidate.isSurplus || saturated()) && ! membership.cluster().group().equals(cluster.group())) continue; // wrong group, and we can't or have no reason to change it
                if ( candidate.state() == Node.State.active && allocation.isRemovable()) continue; // don't accept; causes removal
                if ( indexes.contains(membership.index())) continue; // duplicate index (just to be sure)

                boolean resizeable = requestedNodes.considerRetiring() && candidate.isResizable;
                boolean acceptToRetire = acceptToRetire(candidate);

                if ((! saturated() && hasCompatibleFlavor(candidate) && requestedNodes.acceptable(candidate)) || acceptToRetire) {
                    candidate = candidate.withNode();
                    if (candidate.isValid())
                        accepted.add(acceptNode(candidate, shouldRetire(candidate), resizeable));
                }
            }
            else if ( ! saturated() && hasCompatibleFlavor(candidate)) {
                if ( ! nodeResourceLimits.isWithinRealLimits(candidate, cluster)) {
                    ++rejectedDueToInsufficientRealResources;
                    continue;
                }
                if (violatesParentHostPolicy(candidate)) {
                    ++rejectedDueToClashingParentHost;
                    continue;
                }
                if (violatesExclusivity(candidate)) {
                    ++rejectedDueToExclusivity;
                    continue;
                }
                if (candidate.wantToRetire()) {
                    continue;
                }
                candidate = candidate.allocate(application,
                                               ClusterMembership.from(cluster, highestIndex.add(1)),
                                               requestedNodes.resources().orElse(candidate.resources()),
                                               nodeRepository.clock().instant());
                if (candidate.isValid())
                    accepted.add(acceptNode(candidate, false, false));
            }
        }

        return accepted;
    }

    private boolean shouldRetire(NodeCandidate candidate) {
        if ( ! requestedNodes.considerRetiring()) // don't second-guess if already retired
            return candidate.allocation().map(a -> a.membership().retired()).orElse(false);
        if ( ! nodeResourceLimits.isWithinRealLimits(candidate, cluster)) return true;
        if (violatesParentHostPolicy(candidate)) return true;
        if ( ! hasCompatibleFlavor(candidate)) return true;
        if (candidate.wantToRetire()) return true;
        if (violatesExclusivity(candidate)) return true;
        return false;
    }

    private boolean violatesParentHostPolicy(NodeCandidate candidate) {
        return checkForClashingParentHost() && offeredNodeHasParentHostnameAlreadyAccepted(candidate);
    }

    private boolean checkForClashingParentHost() {
        return nodeRepository.zone().system() == SystemName.main &&
               nodeRepository.zone().environment().isProduction() &&
               ! application.instance().isTester();
    }

    private boolean offeredNodeHasParentHostnameAlreadyAccepted(NodeCandidate candidate) {
        for (NodeCandidate acceptedNode : nodes.values()) {
            if (acceptedNode.parentHostname().isPresent() && candidate.parentHostname().isPresent() &&
                acceptedNode.parentHostname().get().equals(candidate.parentHostname().get())) {
                return true;
            }
        }
        return false;
    }
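    // Parent host clash, illustrated with hypothetical hostnames: once a candidate whose parent is
    // "host1.example.com" has been accepted for this cluster, another candidate with the same parent hostname is
    // rejected by violatesParentHostPolicy, so the cluster is spread across separate physical hosts.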
    private boolean violatesExclusivity(NodeCandidate candidate) {
        if (candidate.parentHostname().isEmpty()) return false;

        // In dynamically provisioned zones a node requiring exclusivity must be on a host that has exclusiveTo equal to its owner
        if (nodeRepository.zone().getCloud().dynamicProvisioning())
            return requestedNodes.isExclusive() &&
                   ! candidate.parent.flatMap(Node::exclusiveTo).map(application::equals).orElse(false);

        // In zones that are not dynamically provisioned we require that if any node on the host requires exclusivity,
        // then all the nodes on the host must have the same owner
        for (Node nodeOnHost : allNodes.childrenOf(candidate.parentHostname().get())) {
            if (nodeOnHost.allocation().isEmpty()) continue;
            if (requestedNodes.isExclusive() || nodeOnHost.allocation().get().membership().cluster().isExclusive()) {
                if ( ! nodeOnHost.allocation().get().owner().equals(application)) return true;
            }
        }
        return false;
    }

    /**
     * Returns whether this node should be accepted into the cluster even if it is not currently desired
     * (already enough nodes, or wrong flavor).
     * Such nodes will be marked retired during finalization of the list of accepted nodes.
     * The conditions for this are:
     *
     * This is a content or combined node. These must always be retired before being removed, to allow the cluster to
     * migrate away data.
     *
     * This is a container node and it is not desired due to having the wrong flavor. In this case this
     * will (normally) obtain for all the current nodes in the cluster and so retiring before removing must
     * be used to avoid removing all the current nodes at once, before the newly allocated replacements are
     * initialized. (In the other case, where a container node is not desired because we have enough nodes, we
     * do want to remove it immediately to get immediate feedback on how the size reduction works out.)
     */
    private boolean acceptToRetire(NodeCandidate candidate) {
        if (candidate.state() != Node.State.active) return false;
        if ( ! candidate.allocation().get().membership().cluster().group().equals(cluster.group())) return false;
        if (candidate.allocation().get().membership().retired()) return true; // don't second-guess if already retired

        if ( ! requestedNodes.considerRetiring()) return false;

        return cluster.type().isContent() ||
               (cluster.type() == ClusterSpec.Type.container && ! hasCompatibleFlavor(candidate));
    }

    private boolean hasCompatibleFlavor(NodeCandidate candidate) {
        return requestedNodes.isCompatible(candidate.flavor(), nodeRepository.flavors()) || candidate.isResizable;
    }
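    // acceptNode(..) below does the bookkeeping for an accepted candidate: it records the currently requested
    // resources, updates the index bookkeeping (indexes, highestIndex), and either keeps the node in service
    // (unretiring and resizing it if needed) or retires it right away when wantToRetire is true.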
    private Node acceptNode(NodeCandidate candidate, boolean wantToRetire, boolean resizeable) {
        Node node = candidate.toNode();

        if (node.allocation().isPresent()) // Record the currently requested resources
            node = node.with(node.allocation().get().withRequestedResources(requestedNodes.resources().orElse(node.resources())));

        if ( ! wantToRetire) {
            accepted++;

            // We want to allocate new nodes rather than unretiring with resize, so count without those
            // for the purpose of deciding when to stop accepting nodes (saturation)
            if (node.allocation().isEmpty()
                || ! (requestedNodes.needsResize(node) && node.allocation().get().membership().retired()))
                acceptedWithoutResizingRetired++;

            if (resizeable && ! (node.allocation().isPresent() && node.allocation().get().membership().retired()))
                node = resize(node);

            if (node.state() != Node.State.active) // reactivated node - wipe state that deactivated it
                node = node.unretire().removable(false);
        } else {
            ++wasRetiredJustNow;
            node = node.retire(nodeRepository.clock().instant());
        }
        if ( ! node.allocation().get().membership().cluster().equals(cluster)) { // group may be different
            node = setCluster(cluster, node);
        }
        candidate = candidate.withNode(node);
        indexes.add(node.allocation().get().membership().index());
        highestIndex.set(Math.max(highestIndex.get(), node.allocation().get().membership().index()));
        nodes.put(node.hostname(), candidate);
        return node;
    }

    private Node resize(Node node) {
        NodeResources hostResources = allNodes.parentOf(node).get().flavor().resources();
        return node.with(new Flavor(requestedNodes.resources().get()
                                                  .with(hostResources.diskSpeed())
                                                  .with(hostResources.storageType())));
    }

    private Node setCluster(ClusterSpec cluster, Node node) {
        ClusterMembership membership = node.allocation().get().membership().with(cluster);
        return node.with(node.allocation().get().with(membership));
    }

    /** Returns true if no more nodes are needed in this list */
    private boolean saturated() {
        return requestedNodes.saturatedBy(acceptedWithoutResizingRetired);
    }

    /** Returns true if the content of this list is sufficient to meet the request */
    boolean fulfilled() {
        return requestedNodes.fulfilledBy(accepted);
    }

    /** Returns true if this allocation was already fulfilled and resulted in no new changes */
    public boolean fulfilledAndNoChanges() {
        return fulfilled() && reservableNodes().isEmpty() && newNodes().isEmpty();
    }

    /**
     * Returns a {@link FlavorCount} describing the docker node deficit for the given {@link NodeSpec}.
     *
     * @return empty if the requested spec is not count based, or the requested flavor type is not docker, or
     *         the request is already fulfilled. Otherwise returns the {@link FlavorCount} containing the required
     *         flavor and node count to cover the deficit.
     */
    Optional<FlavorCount> getFulfilledDockerDeficit() {
        return Optional.of(requestedNodes)
                       .filter(NodeSpec.CountNodeSpec.class::isInstance)
                       .map(spec -> new FlavorCount(spec.resources().get(), spec.fulfilledDeficitCount(accepted)))
                       .filter(flavorCount -> flavorCount.getCount() > 0);
    }

    /**
     * Make the number of non-retired nodes in the list equal to the requested number
     * of nodes, and retire the rest of the list. Only retire currently active nodes.
     * Prefer to retire nodes of the wrong flavor.
     * Make as few changes to the retired set as possible.
     *
     * @return the final list of nodes
     */
    List<Node> finalNodes() {
        int currentRetiredCount = (int) nodes.values().stream()
                                             .filter(node -> node.allocation().get().membership().retired())
                                             .count();
        int deltaRetiredCount = requestedNodes.idealRetiredCount(nodes.size(), currentRetiredCount) - currentRetiredCount;
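        // Worked example with hypothetical numbers: if 7 nodes are accepted, 1 of them is currently retired and the
        // ideal retired count is 3, then deltaRetiredCount is 3 - 1 = 2, so the first branch below retires the two
        // least preferred active nodes; a negative delta would instead unretire nodes in the second branch.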
        if (deltaRetiredCount > 0) { // retire until deltaRetiredCount is 0
            for (NodeCandidate candidate : byRetiringPriority(nodes.values())) {
                if ( ! candidate.allocation().get().membership().retired() && candidate.state() == Node.State.active) {
                    candidate = candidate.withNode();
                    candidate = candidate.withNode(candidate.toNode().retire(Agent.application, nodeRepository.clock().instant()));
                    nodes.put(candidate.toNode().hostname(), candidate);
                    if (--deltaRetiredCount == 0) break;
                }
            }
        }
        else if (deltaRetiredCount < 0) { // unretire until deltaRetiredCount is 0
            for (NodeCandidate candidate : byUnretiringPriority(nodes.values())) {
                if (candidate.allocation().get().membership().retired() && hasCompatibleFlavor(candidate)) {
                    candidate = candidate.withNode();
                    if (candidate.isResizable)
                        candidate = candidate.withNode(resize(candidate.toNode()));
                    candidate = candidate.withNode(candidate.toNode().unretire());
                    nodes.put(candidate.toNode().hostname(), candidate);
                    if (++deltaRetiredCount == 0) break;
                }
            }
        }

        for (NodeCandidate candidate : nodes.values()) {
            // Set whether the node is exclusive
            candidate = candidate.withNode();
            Allocation allocation = candidate.allocation().get();
            candidate = candidate.withNode(candidate.toNode().with(allocation.with(allocation.membership()
                            .with(allocation.membership().cluster().exclusive(requestedNodes.isExclusive())))));
            nodes.put(candidate.toNode().hostname(), candidate);
        }

        return nodes.values().stream().map(n -> n.toNode()).collect(Collectors.toList());
    }

    List<Node> reservableNodes() {
        // Include already reserved nodes to extend the reservation period and to potentially update their cluster spec.
        EnumSet<Node.State> reservableStates = EnumSet.of(Node.State.inactive, Node.State.ready, Node.State.reserved);
        return nodesFilter(n -> ! n.isNew && reservableStates.contains(n.state()));
    }

    List<Node> newNodes() {
        return nodesFilter(n -> n.isNew);
    }

    private List<Node> nodesFilter(Predicate<NodeCandidate> predicate) {
        return nodes.values().stream()
                    .filter(predicate)
                    .map(n -> n.toNode())
                    .collect(Collectors.toList());
    }

    /** Prefer to retire nodes we want the least */
    private List<NodeCandidate> byRetiringPriority(Collection<NodeCandidate> candidates) {
        return candidates.stream().sorted(Comparator.reverseOrder()).collect(Collectors.toList());
    }

    /** Prefer to unretire nodes we don't want to retire, and otherwise those with lower index */
    private List<NodeCandidate> byUnretiringPriority(Collection<NodeCandidate> candidates) {
        return candidates.stream()
                         .sorted(Comparator.comparing(NodeCandidate::wantToRetire)
                                           .thenComparing(n -> n.allocation().get().membership().index()))
                         .collect(Collectors.toList());
    }

    public String outOfCapacityDetails() {
        List<String> reasons = new ArrayList<>();
        if (rejectedDueToExclusivity > 0)
            reasons.add("host exclusivity constraints");
        if (rejectedDueToClashingParentHost > 0)
            reasons.add("insufficient nodes available on separate physical hosts");
        if (wasRetiredJustNow > 0)
            reasons.add("retirement of allocated nodes");
        if (rejectedDueToInsufficientRealResources > 0)
            reasons.add("insufficient real resources on hosts");
        if (reasons.isEmpty()) return "";

        return ": Not enough nodes available due to " + String.join(", ", reasons);
    }

    static class FlavorCount {

        private final NodeResources flavor;
        private final int count;

        private FlavorCount(NodeResources flavor, int count) {
            this.flavor = flavor;
            this.count = count;
        }

        NodeResources getFlavor() { return flavor; }

        int getCount() { return count; }

    }

}