author      Jon Bratseth <bratseth@oath.com>          2020-06-18 09:10:48 +0200
committer   GitHub <noreply@github.com>               2020-06-18 09:10:48 +0200
commit      c95cd618e0903ce0e5101ecb3593821bb399a9f9 (patch)
tree        667aadb9fc67ab9d8f220ef298ccc82186c13ac2 /node-repository
parent      b0bc165e64dbcbb9e8e357f92f46f8bf1c84dbb8 (diff)
parent      ed30906d441364b95b99f52355d218a085246fa6 (diff)

Merge pull request #13619 from vespa-engine/bratseth/spare-capacity-maintainer

Bratseth/spare capacity maintainer

Diffstat (limited to 'node-repository')

39 files changed, 1057 insertions, 301 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java
index a9861497ca3..b6237886dc7 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java
@@ -128,6 +128,8 @@ public final class Node {
         return parentHostname.isPresent() && parentHostname.get().equals(hostname);
     }
 
+    public NodeResources resources() { return flavor.resources(); }
+
     /** Returns the flavor of this node */
     public Flavor flavor() { return flavor; }
 
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java
index 1b2f73a2f5f..cbc5a44ae94 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java
@@ -32,7 +32,7 @@ public class NodeList extends AbstractFilteringList<Node, NodeList> {
 
     /** Returns the subset of nodes which are retired */
     public NodeList retired() {
-        return matching(node -> node.allocation().get().membership().retired());
+        return matching(node -> node.allocation().isPresent() && node.allocation().get().membership().retired());
     }
 
     /** Returns the subset of nodes that are being deprovisioned */
@@ -42,17 +42,25 @@ public class NodeList extends AbstractFilteringList<Node, NodeList> {
 
     /** Returns the subset of nodes which are removable */
     public NodeList removable() {
-        return matching(node -> node.allocation().get().isRemovable());
+        return matching(node -> node.allocation().isPresent() && node.allocation().get().isRemovable());
     }
 
     /** Returns the subset of nodes having exactly the given resources */
-    public NodeList resources(NodeResources resources) { return matching(node -> node.flavor().resources().equals(resources)); }
+    public NodeList resources(NodeResources resources) { return matching(node -> node.resources().equals(resources)); }
+
+    /** Returns the subset of nodes which satisfy the given resources */
+    public NodeList satisfies(NodeResources resources) { return matching(node -> node.resources().satisfies(resources)); }
 
     /** Returns the subset of nodes of the given flavor */
     public NodeList flavor(String flavor) { return matching(node -> node.flavor().name().equals(flavor)); }
 
+    /** Returns the subset of nodes not in the given collection */
+    public NodeList except(Collection<Node> nodes) {
+        return matching(node -> ! nodes.contains(node));
+    }
+
     /** Returns the subset of nodes assigned to the given cluster type */
     public NodeList type(ClusterSpec.Type type) {
         return matching(node -> node.allocation().isPresent() && node.allocation().get().membership().cluster().type().equals(type));
@@ -109,6 +117,11 @@ public class NodeList extends AbstractFilteringList<Node, NodeList> {
         return matching(node -> nodeTypes.contains(node.type()));
     }
 
+    /** Returns the subset of nodes of the host type */
+    public NodeList hosts() {
+        return matching(node -> node.type() == NodeType.host);
+    }
+
     /** Returns the subset of nodes that are parents */
     public NodeList parents() {
         return matching(n -> n.parentHostname().isEmpty());
@@ -133,6 +146,11 @@ public class NodeList extends AbstractFilteringList<Node, NodeList> {
         return matching(node -> nodeStates.contains(node.state()));
     }
 
+    /** Returns the subset of nodes which wantToRetire set true */
+    public NodeList wantToRetire() {
+        return matching((node -> node.status().wantToRetire()));
+    }
+
     /** Returns the parent nodes of the given child nodes */
     public NodeList parentsOf(Collection<Node> children) {
         return children.stream()
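
Note: the three new filters (hosts, satisfies, except) are introduced for the SpareCapacityMaintainer added later in this diff, whose findMitigatingMove chains them. A fragment of that usage, assuming allNodes, node and spareHosts are in scope:

    // Hosts only, with enough resources for the node, minus the reserved spare hosts:
    List<Node> candidateHosts = allNodes.hosts()                     // type() == NodeType.host
                                        .satisfies(node.resources()) // resources().satisfies(...)
                                        .except(spareHosts)          // set difference
                                        .asList();
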
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java
index b41820a461b..bec35e7ee4f 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java
@@ -102,7 +102,7 @@ public class NodeRepository extends AbstractComponent {
     private final DockerImages dockerImages;
     private final JobControl jobControl;
     private final Applications applications;
-    private final boolean canProvisionHostsWhenRequired;
+    private final boolean canProvisionHosts;
 
     /**
      * Creates a node repository from a zookeeper provider.
@@ -136,7 +136,7 @@ public class NodeRepository extends AbstractComponent {
                           NameResolver nameResolver,
                           DockerImage dockerImage,
                           boolean useCuratorClientCache,
-                          boolean canProvisionHostsWhenRequired) {
+                          boolean canProvisionHosts) {
         this.db = new CuratorDatabaseClient(flavors, curator, clock, zone, useCuratorClientCache);
         this.zone = zone;
         this.clock = clock;
@@ -149,7 +149,7 @@ public class NodeRepository extends AbstractComponent {
         this.dockerImages = new DockerImages(db, dockerImage);
         this.jobControl = new JobControl(db);
         this.applications = new Applications(db);
-        this.canProvisionHostsWhenRequired = canProvisionHostsWhenRequired;
+        this.canProvisionHosts = canProvisionHosts;
 
         // read and write all nodes to make sure they are stored in the latest version of the serialized format
         for (State state : State.values())
@@ -800,16 +800,14 @@ public class NodeRepository extends AbstractComponent {
         if (host.status().wantToRetire()) return false;
         if (host.allocation().map(alloc -> alloc.membership().retired()).orElse(false)) return false;
 
-        if ( canProvisionHostsWhenRequired())
+        if ( canProvisionHosts())
             return EnumSet.of(State.active, State.ready, State.provisioned).contains(host.state());
         else
             return host.state() == State.active;
     }
 
-    /** Returns whether this has the ability to conjure hosts when required */
-    public boolean canProvisionHostsWhenRequired() {
-        return canProvisionHostsWhenRequired;
-    }
+    /** Returns whether this repository can provision hosts on demand */
+    public boolean canProvisionHosts() { return canProvisionHosts; }
 
     /** Returns the time keeper of this system */
     public Clock clock() { return clock; }
 
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java
index e6cbddf96f2..267bfefa332 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java
@@ -47,7 +47,7 @@ public class AllocatableClusterResources {
         this.nodes = nodes.size();
         this.groups = (int)nodes.stream().map(node -> node.allocation().get().membership().cluster().group()).distinct().count();
         this.realResources = averageRealResourcesOf(nodes, nodeRepository); // Average since we average metrics over nodes
-        this.advertisedResources = nodes.get(0).flavor().resources();
+        this.advertisedResources = nodes.get(0).resources();
         this.clusterType = nodes.get(0).allocation().get().membership().cluster().type();
         this.fulfilment = 1;
     }
@@ -125,11 +125,11 @@ public class AllocatableClusterResources {
         NodeResources sum = new NodeResources(0, 0, 0, 0);
         for (Node node : nodes)
             sum = sum.add(nodeRepository.resourcesCalculator().realResourcesOf(node, nodeRepository).justNumbers());
-        return nodes.get(0).flavor().resources().justNonNumbers()
-                    .withVcpu(sum.vcpu() / nodes.size())
-                    .withMemoryGb(sum.memoryGb() / nodes.size())
-                    .withDiskGb(sum.diskGb() / nodes.size())
-                    .withBandwidthGbps(sum.bandwidthGbps() / nodes.size());
+        return nodes.get(0).resources().justNonNumbers()
+                    .withVcpu(sum.vcpu() / nodes.size())
+                    .withMemoryGb(sum.memoryGb() / nodes.size())
+                    .withDiskGb(sum.diskGb() / nodes.size())
+                    .withBandwidthGbps(sum.bandwidthGbps() / nodes.size());
     }
 
     public static Optional<AllocatableClusterResources> from(ClusterResources wantedResources,
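
Note: averageRealResourcesOf keeps the non-numeric dimensions (e.g. disk speed) of the first node and averages each numeric dimension over all nodes. A self-contained sketch of the same arithmetic, with a stand-in record rather than Vespa's NodeResources:

    import java.util.List;

    public class AverageResourcesDemo {
        record Resources(double vcpu, double memoryGb, double diskGb, double bandwidthGbps) {
            Resources add(Resources o) {
                return new Resources(vcpu + o.vcpu, memoryGb + o.memoryGb,
                                     diskGb + o.diskGb, bandwidthGbps + o.bandwidthGbps);
            }
        }

        public static void main(String[] args) {
            List<Resources> real = List.of(new Resources(8, 32, 100, 1),
                                           new Resources(8, 32, 100, 1),
                                           new Resources(4, 16, 50, 1));
            Resources sum = real.stream().reduce(new Resources(0, 0, 0, 0), Resources::add);
            int n = real.size();
            System.out.printf("average: %.2f vcpu, %.2f memGb, %.2f diskGb, %.2f Gbps%n",
                              sum.vcpu() / n, sum.memoryGb() / n, sum.diskGb() / n, sum.bandwidthGbps() / n);
            // average: 6.67 vcpu, 26.67 memGb, 83.33 diskGb, 1.00 Gbps
        }
    }
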
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java
index fa8e8375e23..c32b7854d4e 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java
@@ -85,7 +85,7 @@ public class AutoscalingMaintainer extends NodeRepositoryMaintainer {
         int currentGroups = (int)clusterNodes.stream().map(node -> node.allocation().get().membership().cluster().group()).distinct().count();
         ClusterSpec.Type clusterType = clusterNodes.get(0).allocation().get().membership().cluster().type();
         log.info("Autoscaling " + application + " " + clusterType + " " + clusterId + ":" +
-                 "\nfrom " + toString(clusterNodes.size(), currentGroups, clusterNodes.get(0).flavor().resources()) +
+                 "\nfrom " + toString(clusterNodes.size(), currentGroups, clusterNodes.get(0).resources()) +
                  "\nto " + toString(target.nodes(), target.groups(), target.nodeResources()));
     }
 
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityChecker.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityChecker.java
index ca8399da629..f583728f9b8 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityChecker.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityChecker.java
@@ -6,11 +6,15 @@ import com.yahoo.config.provision.NodeType;
 import com.yahoo.vespa.hosted.provision.Node;
 import com.yahoo.vespa.hosted.provision.NodeRepository;
 import com.yahoo.vespa.hosted.provision.node.Allocation;
+import com.yahoo.vespa.hosted.provision.provisioning.NodeResourceComparator;
 
 import java.util.*;
 import java.util.function.Function;
 import java.util.stream.Collectors;
 
+/**
+ * @author mgimle
+ */
 public class CapacityChecker {
 
     private List<Node> hosts;
@@ -42,15 +46,15 @@ public class CapacityChecker {
     }
 
     public List<Node> nodesFromHostnames(List<String> hostnames) {
-        List<Node> nodes = hostnames.stream()
-                .filter(h -> nodeMap.containsKey(h))
-                .map(h -> nodeMap.get(h))
-                .collect(Collectors.toList());
+        List<Node> nodes = hostnames.stream().filter(h -> nodeMap.containsKey(h))
+                                             .map(h -> nodeMap.get(h))
+                                             .collect(Collectors.toList());
+
         if (nodes.size() != hostnames.size()) {
             Set<String> notFoundNodes = new HashSet<>(hostnames);
             notFoundNodes.removeAll(nodes.stream().map(Node::hostname).collect(Collectors.toList()));
             throw new IllegalArgumentException(String.format("Host(s) not found: [ %s ]",
-                    String.join(", ", notFoundNodes)));
+                                                             String.join(", ", notFoundNodes)));
         }
 
         return nodes;
@@ -92,9 +96,9 @@ public class CapacityChecker {
         if (hosts.size() == 0) return Optional.empty();
 
         List<Node> parentRemovalPriorityList = heuristic.entrySet().stream()
-                .sorted(Comparator.comparingInt(Map.Entry::getValue))
-                .map(Map.Entry::getKey)
-                .collect(Collectors.toList());
+                                                        .sorted(this::hostMitigationOrder)
+                                                        .map(Map.Entry::getKey)
+                                                        .collect(Collectors.toList());
 
         for (int i = 1; i <= parentRemovalPriorityList.size(); i++) {
             List<Node> hostsToRemove = parentRemovalPriorityList.subList(0, i);
@@ -110,18 +114,25 @@ public class CapacityChecker {
         throw new IllegalStateException("No path to failure found. This should be impossible!");
     }
 
+    private int hostMitigationOrder(Map.Entry<Node, Integer> entry1, Map.Entry<Node, Integer> entry2) {
+        int result = Integer.compare(entry1.getValue(), entry2.getValue());
+        if (result != 0) return result;
+        // Mitigate the largest hosts first
+        return NodeResourceComparator.defaultOrder().compare(entry2.getKey().resources(), entry1.getKey().resources());
+    }
+
     private Map<String, Node> constructHostnameToNodeMap(List<Node> nodes) {
         return nodes.stream().collect(Collectors.toMap(Node::hostname, n -> n));
     }
 
     private Map<Node, List<Node>> constructNodeChildrenMap(List<Node> tenants, List<Node> hosts, Map<String, Node> hostnameToNode) {
         Map<Node, List<Node>> nodeChildren = tenants.stream()
-                .filter(n -> n.parentHostname().isPresent())
-                .filter(n -> hostnameToNode.containsKey(n.parentHostname().get()))
-                .collect(Collectors.groupingBy(
-                        n -> hostnameToNode.get(n.parentHostname().orElseThrow())));
+                                                    .filter(n -> n.parentHostname().isPresent())
+                                                    .filter(n -> hostnameToNode.containsKey(n.parentHostname().get()))
+                                                    .collect(Collectors.groupingBy(n -> hostnameToNode.get(n.parentHostname().orElseThrow())));
 
-        for (var host : hosts) nodeChildren.putIfAbsent(host, List.of());
+        for (var host : hosts)
+            nodeChildren.putIfAbsent(host, List.of());
 
         return nodeChildren;
     }
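
Note: hostMitigationOrder above sorts candidates ascending by the removal heuristic, then breaks ties by mitigating the largest hosts first. A standalone sketch of the same two-level comparator, using a single integer per host as a stand-in for NodeResourceComparator's ordering:

    import java.util.Comparator;
    import java.util.List;
    import java.util.Map;
    import java.util.stream.Collectors;

    public class MitigationOrderDemo {
        public static void main(String[] args) {
            Map<String, Integer> heuristic = Map.of("small", 1, "big", 1, "other", 2);
            Map<String, Integer> size = Map.of("small", 4, "big", 16, "other", 8); // stand-in for resources
            List<String> order = heuristic.entrySet().stream()
                    .sorted(Comparator.<Map.Entry<String, Integer>>comparingInt(Map.Entry::getValue)
                                      .thenComparing(e -> size.get(e.getKey()), Comparator.<Integer>reverseOrder()))
                    .map(Map.Entry::getKey)
                    .collect(Collectors.toList());
            System.out.println(order); // [big, small, other]: equal counts, largest host first
        }
    }
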
@@ -133,7 +144,7 @@ public class CapacityChecker {
             int occupiedIps = 0;
             Set<String> ipPool = host.ipAddressPool().asSet();
             for (var child : nodeChildren.get(host)) {
-                hostResources = hostResources.subtract(child.flavor().resources().justNumbers());
+                hostResources = hostResources.subtract(child.resources().justNumbers());
                 occupiedIps += child.ipAddresses().stream().filter(ipPool::contains).count();
             }
             availableResources.put(host, new AllocationResources(hostResources, host.ipAddressPool().asSet().size() - occupiedIps));
@@ -149,10 +160,8 @@ public class CapacityChecker {
     private Map<Node, Integer> computeMaximalRepeatedRemovals(List<Node> hosts,
                                                               Map<Node, List<Node>> nodeChildren,
                                                               Map<Node, AllocationResources> availableResources) {
-        Map<Node, Integer> timesNodeCanBeRemoved = hosts.stream().collect(Collectors.toMap(
-                Function.identity(),
-                __ -> Integer.MAX_VALUE
-        ));
+        Map<Node, Integer> timesNodeCanBeRemoved = hosts.stream().collect(Collectors.toMap(Function.identity(),
+                                                                                           __ -> Integer.MAX_VALUE));
         for (Node host : hosts) {
             List<Node> children = nodeChildren.get(host);
             if (children.size() == 0) continue;
@@ -196,7 +205,8 @@ public class CapacityChecker {
     /**
     * Tests whether it's possible to remove the provided hosts.
     * Does not mutate any input variable.
-     * @return Empty optional if removal is possible, information on what caused the failure otherwise
+     *
+     * @return empty optional if removal is possible, information on what caused the failure otherwise
     */
     private Optional<HostRemovalFailure> findHostRemovalFailure(List<Node> hostsToRemove, List<Node> allHosts,
                                                                 Map<Node, List<Node>> nodechildren,
@@ -204,20 +214,24 @@ public class CapacityChecker {
         var containedAllocations = collateAllocations(nodechildren);
         var resourceMap = new HashMap<>(availableResources);
         List<Node> validAllocationTargets = allHosts.stream()
-                .filter(h -> !hostsToRemove.contains(h))
-                .collect(Collectors.toList());
-        if (validAllocationTargets.size() == 0) {
+                                                    .filter(h -> !hostsToRemove.contains(h))
+                                                    .collect(Collectors.toList());
+        if (validAllocationTargets.size() == 0)
             return Optional.of(HostRemovalFailure.none());
-        }
 
         allocationHistory = new AllocationHistory();
         for (var host : hostsToRemove) {
             Optional<Node> unallocatedNode = tryAllocateNodes(nodechildren.get(host),
-                    validAllocationTargets, resourceMap, containedAllocations, true);
+                                                              validAllocationTargets,
+                                                              resourceMap,
+                                                              containedAllocations,
+                                                              true);
 
             if (unallocatedNode.isPresent()) {
                 AllocationFailureReasonList failures = collateAllocationFailures(unallocatedNode.get(),
-                        validAllocationTargets, resourceMap, containedAllocations);
+                                                                                 validAllocationTargets,
+                                                                                 resourceMap,
+                                                                                 containedAllocations);
                 return Optional.of(HostRemovalFailure.create(host, unallocatedNode.get(), failures));
             }
         }
@@ -248,7 +262,7 @@ public class CapacityChecker {
                 long eligibleParents =
                         hosts.stream().filter(h ->
                                 !violatesParentHostPolicy(node, h, containedAllocations)
-                                && availableResources.get(h).satisfies(AllocationResources.from(node.flavor().resources()))).count();
+                                && availableResources.get(h).satisfies(AllocationResources.from(node.resources()))).count();
                 allocationHistory.addEntry(node, newParent.get(), eligibleParents + 1);
             }
         }
@@ -300,7 +314,7 @@ public class CapacityChecker {
         reason.violatesParentHostPolicy = violatesParentHostPolicy(node, host, containedAllocations);
 
         NodeResources l = availableHostResources.nodeResources;
-        NodeResources r = node.allocation().map(Allocation::requestedResources).orElse(node.flavor().resources());
+        NodeResources r = node.allocation().map(Allocation::requestedResources).orElse(node.resources());
 
         if (l.vcpu() < r.vcpu()) reason.insufficientVcpu = true;
@@ -326,8 +340,15 @@ public class CapacityChecker {
     * as well as the specific host and tenant which caused it.
     */
     public static class HostFailurePath {
+
         public List<Node> hostsCausingFailure;
         public HostRemovalFailure failureReason;
+
+        @Override
+        public String toString() {
+            return "failure path: " + failureReason + " upon removing " + hostsCausingFailure;
+        }
+
     }
 
     /**
@@ -336,22 +357,21 @@ public class CapacityChecker {
     * will be empty.
     */
     public static class HostRemovalFailure {
+
         public Optional<Node> host;
         public Optional<Node> tenant;
         public AllocationFailureReasonList allocationFailures;
 
         public static HostRemovalFailure none() {
-            return new HostRemovalFailure(
-                    Optional.empty(),
-                    Optional.empty(),
-                    new AllocationFailureReasonList(List.of()));
+            return new HostRemovalFailure(Optional.empty(),
+                                          Optional.empty(),
+                                          new AllocationFailureReasonList(List.of()));
         }
 
         public static HostRemovalFailure create(Node host, Node tenant, AllocationFailureReasonList failureReasons) {
-            return new HostRemovalFailure(
-                    Optional.of(host),
-                    Optional.of(tenant),
-                    failureReasons);
+            return new HostRemovalFailure(Optional.of(host),
+                                          Optional.of(tenant),
+                                          failureReasons);
         }
 
         private HostRemovalFailure(Optional<Node> host, Optional<Node> tenant, AllocationFailureReasonList allocationFailures) {
@@ -362,7 +382,7 @@ public class CapacityChecker {
 
         @Override
         public String toString() {
-            if (host.isEmpty() || tenant.isEmpty()) return "No removal candidates exists.";
+            if (host.isEmpty() || tenant.isEmpty()) return "No removal candidates exists";
             return String.format(
                     "Failure to remove host %s" +
                     "\n\tNo new host found for tenant %s:" +
@@ -386,7 +406,7 @@ public class CapacityChecker {
         if (node.allocation().isPresent())
             return from(node.allocation().get().requestedResources());
         else
-            return from(node.flavor().resources());
+            return from(node.resources());
     }
 
     public static AllocationResources from(NodeResources nodeResources) {
@@ -406,6 +426,7 @@ public class CapacityChecker {
         public AllocationResources subtract(AllocationResources other) {
             return new AllocationResources(this.nodeResources.subtract(other.nodeResources), this.availableIPs - other.availableIPs);
         }
+
     }
 
     /**
@@ -449,6 +470,7 @@ public class CapacityChecker {
 
             return String.format("[%s]", String.join(", ", reasons));
         }
+
     }
 
     /**
@@ -487,6 +509,7 @@ public class CapacityChecker {
                     insufficientVcpu(), insufficientMemoryGb(), insufficientDiskGb(),
                     incompatibleDiskSpeed(), incompatibleStorageType(), insufficientAvailableIps(), violatesParentHostPolicy());
         }
+
     }
 
     public static class AllocationHistory {
@@ -506,7 +529,7 @@ public class CapacityChecker {
         public String toString() {
             return String.format("%-20s %-65s -> %15s [%3d valid]",
                     tenant.hostname().replaceFirst("\\..+", ""),
-                    tenant.flavor().resources(),
+                    tenant.resources(),
                     newParent == null ? "x" : newParent.hostname().replaceFirst("\\..+", ""),
                     this.eligibleParents
             );
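
Note: the checker's two entry points drive both metrics the new SpareCapacityMaintainer (added later in this diff) reports. A condensed fragment of that consumption, assuming a NodeRepository in scope:

    CapacityChecker capacityChecker = new CapacityChecker(nodeRepository);
    List<Node> overcommittedHosts = capacityChecker.findOvercommittedHosts();
    Optional<CapacityChecker.HostFailurePath> failurePath =
            capacityChecker.worstCaseHostLossLeadingToFailure();
    // Losing (hostsCausingFailure.size() - 1) hosts is always survivable:
    failurePath.ifPresent(path ->
            System.out.println("spareHostCapacity = " + (path.hostsCausingFailure.size() - 1)));
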
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityReportMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityReportMaintainer.java
deleted file mode 100644
index f6cadabec54..00000000000
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityReportMaintainer.java
+++ /dev/null
@@ -1,58 +0,0 @@
-// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.hosted.provision.maintenance;
-
-import com.yahoo.jdisc.Metric;
-import com.yahoo.vespa.hosted.provision.Node;
-import com.yahoo.vespa.hosted.provision.NodeRepository;
-
-import java.time.Duration;
-import java.util.List;
-import java.util.Objects;
-import java.util.Optional;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-import java.util.stream.Collectors;
-
-/**
- * Performs analysis on the node repository to produce metrics that pertain to the capacity of the node repository.
- * These metrics include:
- * Spare host capacity, or how many hosts the repository can stand to lose without ending up in a situation where it's
- * unable to find a new home for orphaned tenants.
- * Overcommitted hosts, which tracks if there are any hosts whose capacity is less than the sum of its children's.
- *
- * @author mgimle
- */
-public class CapacityReportMaintainer extends NodeRepositoryMaintainer {
-
-    private final Metric metric;
-    private final NodeRepository nodeRepository;
-    private static final Logger log = Logger.getLogger(CapacityReportMaintainer.class.getName());
-
-    CapacityReportMaintainer(NodeRepository nodeRepository,
-                             Metric metric,
-                             Duration interval) {
-        super(nodeRepository, interval);
-        this.nodeRepository = nodeRepository;
-        this.metric = Objects.requireNonNull(metric);
-    }
-
-    @Override
-    protected void maintain() {
-        if (nodeRepository.zone().getCloud().dynamicProvisioning()) return; // Hosts and nodes are 1-1
-
-        CapacityChecker capacityChecker = new CapacityChecker(this.nodeRepository);
-        List<Node> overcommittedHosts = capacityChecker.findOvercommittedHosts();
-        if (overcommittedHosts.size() != 0) {
-            log.log(Level.WARNING, String.format("%d nodes are overcommitted! [ %s ]", overcommittedHosts.size(),
-                    overcommittedHosts.stream().map(Node::hostname).collect(Collectors.joining(", "))));
-        }
-        metric.set("overcommittedHosts", overcommittedHosts.size(), null);
-
-        Optional<CapacityChecker.HostFailurePath> failurePath = capacityChecker.worstCaseHostLossLeadingToFailure();
-        if (failurePath.isPresent()) {
-            int worstCaseHostLoss = failurePath.get().hostsCausingFailure.size();
-            metric.set("spareHostCapacity", worstCaseHostLoss - 1, null);
-        }
-    }
-
-}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java
index 18471637da7..4e1be9c486c 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java
@@ -7,10 +7,13 @@ import com.yahoo.config.provision.Deployer;
 import com.yahoo.config.provision.Deployment;
 import com.yahoo.config.provision.TransientException;
 import com.yahoo.jdisc.Metric;
+
+import java.util.Objects;
 import java.util.logging.Level;
 import com.yahoo.transaction.Mutex;
 import com.yahoo.vespa.hosted.provision.Node;
 import com.yahoo.vespa.hosted.provision.NodeRepository;
+import com.yahoo.vespa.hosted.provision.node.Agent;
 import com.yahoo.yolean.Exceptions;
 
 import java.io.Closeable;
@@ -128,4 +131,106 @@ class MaintenanceDeployment implements Closeable {
         return "deployment of " + application;
     }
 
+    public static class Move {
+
+        private final Node node;
+        private final Node fromHost, toHost;
+
+        Move(Node node, Node fromHost, Node toHost) {
+            this.node = node;
+            this.fromHost = fromHost;
+            this.toHost = toHost;
+        }
+
+        public Node node() { return node; }
+        public Node fromHost() { return fromHost; }
+        public Node toHost() { return toHost; }
+
+        /**
+         * Try to deploy to make this move.
+         *
+         * @param verifyTarget true to only make this move if the node ends up at the expected target host,
+         *                     false if we should perform it as long as it moves from the source host
+         * @return true if the move was done, false if it couldn't be
+         */
+        public boolean execute(boolean verifyTarget,
+                               Agent agent, Deployer deployer, Metric metric, NodeRepository nodeRepository) {
+            if (isEmpty()) return false;
+            ApplicationId application = node.allocation().get().owner();
+            try (MaintenanceDeployment deployment = new MaintenanceDeployment(application, deployer, metric, nodeRepository)) {
+                if ( ! deployment.isValid()) return false;
+
+                boolean couldMarkRetiredNow = markWantToRetire(node, true, agent, nodeRepository);
+                if ( ! couldMarkRetiredNow) return false;
+
+                Optional<Node> expectedNewNode = Optional.empty();
+                try {
+                    if ( ! deployment.prepare()) return false;
+                    if (verifyTarget) {
+                        expectedNewNode =
+                                nodeRepository.getNodes(application, Node.State.reserved).stream()
+                                              .filter(n -> !n.hostname().equals(node.hostname()))
+                                              .filter(n -> n.allocation().get().membership().cluster().id().equals(node.allocation().get().membership().cluster().id()))
+                                              .findAny();
+                        if (expectedNewNode.isEmpty()) return false;
+                        if (!expectedNewNode.get().hasParent(toHost.hostname())) return false;
+                    }
+                    if ( ! deployment.activate()) return false;
+
+                    log.info(agent + " redeployed " + application + " to " +
+                             ( verifyTarget ? this : "move " + (node.hostname() + " from " + fromHost)));
+                    return true;
+                }
+                finally {
+                    markWantToRetire(node, false, agent, nodeRepository); // Necessary if this failed, no-op otherwise
+
+                    // Immediately clean up if we reserved the node but could not activate or reserved a node on the wrong host
+                    expectedNewNode.flatMap(node -> nodeRepository.getNode(node.hostname(), Node.State.reserved))
+                                   .ifPresent(node -> nodeRepository.setDirty(node, agent, "Expired by " + agent));
+                }
+            }
+        }
+
+        /** Returns true only if this operation changes the state of the wantToRetire flag */
+        private boolean markWantToRetire(Node node, boolean wantToRetire, Agent agent, NodeRepository nodeRepository) {
+            try (Mutex lock = nodeRepository.lock(node)) {
+                Optional<Node> nodeToMove = nodeRepository.getNode(node.hostname());
+                if (nodeToMove.isEmpty()) return false;
+                if (nodeToMove.get().state() != Node.State.active) return false;
+
+                if (nodeToMove.get().status().wantToRetire() == wantToRetire) return false;
+
+                nodeRepository.write(nodeToMove.get().withWantToRetire(wantToRetire, agent, nodeRepository.clock().instant()), lock);
+                return true;
+            }
+        }
+
+        public boolean isEmpty() { return node == null; }
+
+        @Override
+        public int hashCode() {
+            return Objects.hash(node, fromHost, toHost);
+        }
+
+        public boolean equals(Object o) {
+            if (o == this) return true;
+            if (o == null || o.getClass() != this.getClass()) return false;
+
+            Move other = (Move)o;
+            if ( ! Objects.equals(other.node, this.node)) return false;
+            if ( ! Objects.equals(other.fromHost, this.fromHost)) return false;
+            if ( ! Objects.equals(other.toHost, this.toHost)) return false;
+            return true;
+        }
+
+        @Override
+        public String toString() {
+            return "move " +
+                   ( isEmpty() ? "none" : (node.hostname() + " from " + fromHost + " to " + toHost));
+        }
+
+        public static Move empty() { return new Move(null, null, null); }
+
+    }
+
 }
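
Note: Move is extracted here so that Rebalancer and the new SpareCapacityMaintainer can share the prepare-verify-activate-cleanup dance. Rebalancer executes with verifyTarget = true (only count the move if the replacement lands on the intended host); SpareCapacityMaintainer passes false (any host but the source will do). A fragment, valid from within this package, assuming the other names are in scope:

    // Try the move; a false return just means "not now", and a later run retries.
    MaintenanceDeployment.Move move = new MaintenanceDeployment.Move(node, fromHost, toHost);
    boolean moved = move.execute(true, Agent.Rebalancer, deployer, metric, nodeRepository);
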
"none" : (node.hostname() + " from " + fromHost + " to " + toHost)); + } + + public static Move empty() { return new Move(null, null, null); } + + } + } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java index afd9ad3ffa3..4323622df8b 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java @@ -47,7 +47,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent { private final InfrastructureProvisioner infrastructureProvisioner; private final Optional<LoadBalancerExpirer> loadBalancerExpirer; private final Optional<DynamicProvisioningMaintainer> dynamicProvisioningMaintainer; - private final CapacityReportMaintainer capacityReportMaintainer; + private final SpareCapacityMaintainer spareCapacityMaintainer; private final OsUpgradeActivator osUpgradeActivator; private final Rebalancer rebalancer; private final NodeMetricsDbMaintainer nodeMetricsDbMaintainer; @@ -88,7 +88,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent { new LoadBalancerExpirer(nodeRepository, defaults.loadBalancerExpirerInterval, lbService)); dynamicProvisioningMaintainer = provisionServiceProvider.getHostProvisioner().map(hostProvisioner -> new DynamicProvisioningMaintainer(nodeRepository, defaults.dynamicProvisionerInterval, hostProvisioner, flagSource)); - capacityReportMaintainer = new CapacityReportMaintainer(nodeRepository, metric, defaults.capacityReportInterval); + spareCapacityMaintainer = new SpareCapacityMaintainer(deployer, nodeRepository, metric, defaults.spareCapacityMaintenanceInterval); osUpgradeActivator = new OsUpgradeActivator(nodeRepository, defaults.osUpgradeActivatorInterval); rebalancer = new Rebalancer(deployer, nodeRepository, metric, clock, defaults.rebalancerInterval); nodeMetricsDbMaintainer = new NodeMetricsDbMaintainer(nodeRepository, nodeMetrics, nodeMetricsDb, defaults.nodeMetricsCollectionInterval); @@ -110,7 +110,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent { failedExpirer.close(); dirtyExpirer.close(); nodeRebooter.close(); - capacityReportMaintainer.close(); + spareCapacityMaintainer.close(); provisionedExpirer.close(); metricsReporter.close(); infrastructureProvisioner.close(); @@ -153,7 +153,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent { private final Duration failedExpirerInterval; private final Duration dirtyExpiry; private final Duration provisionedExpiry; - private final Duration capacityReportInterval; + private final Duration spareCapacityMaintenanceInterval; private final Duration metricsInterval; private final Duration retiredInterval; private final Duration infrastructureProvisionInterval; @@ -168,25 +168,24 @@ public class NodeRepositoryMaintenance extends AbstractComponent { private final NodeFailer.ThrottlePolicy throttlePolicy; DefaultTimes(Zone zone) { - failGrace = Duration.ofMinutes(30); - periodicRedeployInterval = Duration.ofMinutes(30); - // Don't redeploy in test environments - redeployMaintainerInterval = Duration.ofMinutes(1); - operatorChangeRedeployInterval = Duration.ofMinutes(1); + autoscalingInterval = Duration.ofMinutes(5); + dynamicProvisionerInterval = Duration.ofMinutes(5); failedExpirerInterval = Duration.ofMinutes(10); - 
provisionedExpiry = Duration.ofHours(4); - capacityReportInterval = Duration.ofMinutes(10); - metricsInterval = Duration.ofMinutes(1); + failGrace = Duration.ofMinutes(30); infrastructureProvisionInterval = Duration.ofMinutes(1); - throttlePolicy = NodeFailer.ThrottlePolicy.hosted; loadBalancerExpirerInterval = Duration.ofMinutes(5); - reservationExpiry = Duration.ofMinutes(15); // Need to be long enough for deployment to be finished for all config model versions - dynamicProvisionerInterval = Duration.ofMinutes(5); + metricsInterval = Duration.ofMinutes(1); + nodeMetricsCollectionInterval = Duration.ofMinutes(1); + operatorChangeRedeployInterval = Duration.ofMinutes(1); osUpgradeActivatorInterval = zone.system().isCd() ? Duration.ofSeconds(30) : Duration.ofMinutes(5); + periodicRedeployInterval = Duration.ofMinutes(30); + provisionedExpiry = Duration.ofHours(4); rebalancerInterval = Duration.ofMinutes(40); - nodeMetricsCollectionInterval = Duration.ofMinutes(1); - autoscalingInterval = Duration.ofMinutes(5); + redeployMaintainerInterval = Duration.ofMinutes(1); + reservationExpiry = Duration.ofMinutes(15); // Need to be long enough for deployment to be finished for all config model versions scalingSuggestionsInterval = Duration.ofMinutes(31); + spareCapacityMaintenanceInterval = Duration.ofMinutes(10); + throttlePolicy = NodeFailer.ThrottlePolicy.hosted; if (zone.environment().equals(Environment.prod) && ! zone.system().isCd()) { inactiveExpiry = Duration.ofHours(4); // enough time for the application owner to discover and redeploy diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java index 12990447eee..3df20fa9d08 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java @@ -6,16 +6,14 @@ import com.yahoo.config.provision.Deployer; import com.yahoo.config.provision.NodeResources; import com.yahoo.config.provision.NodeType; import com.yahoo.jdisc.Metric; -import com.yahoo.transaction.Mutex; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.Agent; -import com.yahoo.vespa.hosted.provision.provisioning.DockerHostCapacity; +import com.yahoo.vespa.hosted.provision.provisioning.HostCapacity; import java.time.Clock; import java.time.Duration; -import java.util.Optional; /** * @author bratseth @@ -41,24 +39,19 @@ public class Rebalancer extends NodeRepositoryMaintainer { @Override protected void maintain() { - if (nodeRepository().canProvisionHostsWhenRequired()) return; // All nodes will be allocated on new hosts, so rebalancing makes no sense - if (nodeRepository().zone().environment().isTest()) return; // Test zones have short lived deployments, no need to rebalance + if ( ! nodeRepository().zone().getCloud().allowHostSharing()) return; // Rebalancing not necessary + if (nodeRepository().zone().environment().isTest()) return; // Short lived deployments; no need to rebalance // Work with an unlocked snapshot as this can take a long time and full consistency is not needed NodeList allNodes = nodeRepository().list(); - updateSkewMetric(allNodes); - if ( ! 
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java
index 12990447eee..3df20fa9d08 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java
@@ -6,16 +6,14 @@ import com.yahoo.config.provision.Deployer;
 import com.yahoo.config.provision.NodeResources;
 import com.yahoo.config.provision.NodeType;
 import com.yahoo.jdisc.Metric;
-import com.yahoo.transaction.Mutex;
 import com.yahoo.vespa.hosted.provision.Node;
 import com.yahoo.vespa.hosted.provision.NodeList;
 import com.yahoo.vespa.hosted.provision.NodeRepository;
 import com.yahoo.vespa.hosted.provision.node.Agent;
-import com.yahoo.vespa.hosted.provision.provisioning.DockerHostCapacity;
+import com.yahoo.vespa.hosted.provision.provisioning.HostCapacity;
 
 import java.time.Clock;
 import java.time.Duration;
-import java.util.Optional;
 
 /**
  * @author bratseth
@@ -41,24 +39,19 @@ public class Rebalancer extends NodeRepositoryMaintainer {
 
     @Override
     protected void maintain() {
-        if (nodeRepository().canProvisionHostsWhenRequired()) return; // All nodes will be allocated on new hosts, so rebalancing makes no sense
-        if (nodeRepository().zone().environment().isTest()) return; // Test zones have short lived deployments, no need to rebalance
+        if ( ! nodeRepository().zone().getCloud().allowHostSharing()) return; // Rebalancing not necessary
+        if (nodeRepository().zone().environment().isTest()) return; // Short lived deployments; no need to rebalance
 
         // Work with an unlocked snapshot as this can take a long time and full consistency is not needed
         NodeList allNodes = nodeRepository().list();
-
         updateSkewMetric(allNodes);
-
         if ( ! zoneIsStable(allNodes)) return;
-
-        Move bestMove = findBestMove(allNodes);
-        if (bestMove == Move.none) return;
-        deployTo(bestMove);
+        findBestMove(allNodes).execute(true, Agent.Rebalancer, deployer, metric, nodeRepository());
     }
 
     /** We do this here rather than in MetricsReporter because it is expensive and frequent updates are unnecessary */
     private void updateSkewMetric(NodeList allNodes) {
-        DockerHostCapacity capacity = new DockerHostCapacity(allNodes, nodeRepository().resourcesCalculator());
+        HostCapacity capacity = new HostCapacity(allNodes, nodeRepository().resourcesCalculator());
         double totalSkew = 0;
         int hostCount = 0;
         for (Node host : allNodes.nodeType((NodeType.host)).state(Node.State.active)) {
@@ -80,8 +73,8 @@ public class Rebalancer extends NodeRepositoryMaintainer {
      * Returns Move.none if no moves can be made to reduce skew.
      */
     private Move findBestMove(NodeList allNodes) {
-        DockerHostCapacity capacity = new DockerHostCapacity(allNodes, nodeRepository().resourcesCalculator());
-        Move bestMove = Move.none;
+        HostCapacity capacity = new HostCapacity(allNodes, nodeRepository().resourcesCalculator());
+        Move bestMove = Move.empty();
         for (Node node : allNodes.nodeType(NodeType.tenant).state(Node.State.active)) {
             if (node.parentHostname().isEmpty()) continue;
             ApplicationId applicationId = node.allocation().get().owner();
@@ -89,82 +82,29 @@ public class Rebalancer extends NodeRepositoryMaintainer {
             if (deployedRecently(applicationId)) continue;
             for (Node toHost : allNodes.matching(nodeRepository()::canAllocateTenantNodeTo)) {
                 if (toHost.hostname().equals(node.parentHostname().get())) continue;
-                if ( ! capacity.freeCapacityOf(toHost).satisfies(node.flavor().resources())) continue;
+                if ( ! capacity.freeCapacityOf(toHost).satisfies(node.resources())) continue;
 
                 double skewReductionAtFromHost = skewReductionByRemoving(node, allNodes.parentOf(node).get(), capacity);
                 double skewReductionAtToHost = skewReductionByAdding(node, toHost, capacity);
                 double netSkewReduction = skewReductionAtFromHost + skewReductionAtToHost;
                 if (netSkewReduction > bestMove.netSkewReduction)
-                    bestMove = new Move(node, toHost, netSkewReduction);
+                    bestMove = new Move(node, nodeRepository().getNode(node.parentHostname().get()).get(), toHost, netSkewReduction);
             }
         }
         return bestMove;
     }
 
-    /** Returns true only if this operation changes the state of the wantToRetire flag */
-    private boolean markWantToRetire(Node node, boolean wantToRetire) {
-        try (Mutex lock = nodeRepository().lock(node)) {
-            Optional<Node> nodeToMove = nodeRepository().getNode(node.hostname());
-            if (nodeToMove.isEmpty()) return false;
-            if (nodeToMove.get().state() != Node.State.active) return false;
-
-            if (nodeToMove.get().status().wantToRetire() == wantToRetire) return false;
-
-            nodeRepository().write(nodeToMove.get().withWantToRetire(wantToRetire, Agent.Rebalancer, clock.instant()), lock);
-            return true;
-        }
-    }
-
-    /**
-     * Try a redeployment to effect the chosen move.
-     * If it can be done, that's ok; we'll try this or another move later.
-     *
-     * @return true if the move was done, false if it couldn't be
-     */
-    private boolean deployTo(Move move) {
-        ApplicationId application = move.node.allocation().get().owner();
-        try (MaintenanceDeployment deployment = new MaintenanceDeployment(application, deployer, metric, nodeRepository())) {
-            if ( ! deployment.isValid()) return false;
-
-            boolean couldMarkRetiredNow = markWantToRetire(move.node, true);
-            if ( ! couldMarkRetiredNow) return false;
-
-            Optional<Node> expectedNewNode = Optional.empty();
-            try {
-                if ( ! deployment.prepare()) return false;
-                expectedNewNode =
-                        nodeRepository().getNodes(application, Node.State.reserved).stream()
-                                        .filter(node -> !node.hostname().equals(move.node.hostname()))
-                                        .filter(node -> node.allocation().get().membership().cluster().id().equals(move.node.allocation().get().membership().cluster().id()))
-                                        .findAny();
-                if (expectedNewNode.isEmpty()) return false;
-                if ( ! expectedNewNode.get().hasParent(move.toHost.hostname())) return false;
-                if ( ! deployment.activate()) return false;
-
-                log.info("Rebalancer redeployed " + application + " to " + move);
-                return true;
-            }
-            finally {
-                markWantToRetire(move.node, false); // Necessary if this failed, no-op otherwise
-
-                // Immediately clean up if we reserved the node but could not activate or reserved a node on the wrong host
-                expectedNewNode.flatMap(node -> nodeRepository().getNode(node.hostname(), Node.State.reserved))
-                               .ifPresent(node -> nodeRepository().setDirty(node, Agent.Rebalancer, "Expired by Rebalancer"));
-            }
-        }
-    }
-
-    private double skewReductionByRemoving(Node node, Node fromHost, DockerHostCapacity capacity) {
+    private double skewReductionByRemoving(Node node, Node fromHost, HostCapacity capacity) {
         NodeResources freeHostCapacity = capacity.freeCapacityOf(fromHost);
         double skewBefore = Node.skew(fromHost.flavor().resources(), freeHostCapacity);
         double skewAfter = Node.skew(fromHost.flavor().resources(), freeHostCapacity.add(node.flavor().resources().justNumbers()));
         return skewBefore - skewAfter;
     }
 
-    private double skewReductionByAdding(Node node, Node toHost, DockerHostCapacity capacity) {
+    private double skewReductionByAdding(Node node, Node toHost, HostCapacity capacity) {
         NodeResources freeHostCapacity = capacity.freeCapacityOf(toHost);
         double skewBefore = Node.skew(toHost.flavor().resources(), freeHostCapacity);
-        double skewAfter = Node.skew(toHost.flavor().resources(), freeHostCapacity.subtract(node.flavor().resources().justNumbers()));
+        double skewAfter = Node.skew(toHost.flavor().resources(), freeHostCapacity.subtract(node.resources().justNumbers()));
         return skewBefore - skewAfter;
     }
 
@@ -176,25 +116,23 @@ public class Rebalancer extends NodeRepositoryMaintainer {
                        .orElse(true);
     }
 
-    private static class Move {
-
-        static final Move none = new Move(null, null, 0);
+    private static class Move extends MaintenanceDeployment.Move {
 
-        final Node node;
-        final Node toHost;
         final double netSkewReduction;
 
-        Move(Node node, Node toHost, double netSkewReduction) {
-            this.node = node;
-            this.toHost = toHost;
+        Move(Node node, Node fromHost, Node toHost, double netSkewReduction) {
+            super(node, fromHost, toHost);
             this.netSkewReduction = netSkewReduction;
         }
 
         @Override
         public String toString() {
-            return "move " +
-                   ( node == null ? "none" :
-                                    (node.hostname() + " to " + toHost + " [skew reduction " + netSkewReduction + "]"));
+            if (isEmpty()) return "move none";
+            return super.toString() + " [skew reduction " + netSkewReduction + "]";
+        }
+
+        public static Move empty() {
+            return new Move(null, null, null, 0);
         }
 
     }
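
Note: Move.empty() replaces the old Move.none sentinel. Since execute() begins with an isEmpty() check and returns false for an empty move, maintain() can now call the result of findBestMove unconditionally (a null-object pattern). Illustrative fragment:

    Move best = findBestMove(allNodes);   // may be Move.empty(), i.e. node == null
    best.execute(true, Agent.Rebalancer, deployer, metric, nodeRepository());
    // the empty move is a safe no-op: no deployment is attempted
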
"none" : - (node.hostname() + " to " + toHost + " [skew reduction " + netSkewReduction + "]")); + if (isEmpty()) return "move none"; + return super.toString() + " [skew reduction " + netSkewReduction + "]"; + } + + public static Move empty() { + return new Move(null, null, null, 0); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java new file mode 100644 index 00000000000..54899372397 --- /dev/null +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java @@ -0,0 +1,337 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.hosted.provision.maintenance; + +import com.yahoo.config.provision.Deployer; +import com.yahoo.config.provision.NodeResources; +import com.yahoo.jdisc.Metric; +import com.yahoo.vespa.hosted.provision.Node; +import com.yahoo.vespa.hosted.provision.NodeList; +import com.yahoo.vespa.hosted.provision.NodeRepository; +import com.yahoo.vespa.hosted.provision.maintenance.MaintenanceDeployment.Move; +import com.yahoo.vespa.hosted.provision.node.Agent; +import com.yahoo.vespa.hosted.provision.provisioning.HostCapacity; + +import java.time.Duration; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.Set; +import java.util.logging.Level; +import java.util.stream.Collectors; + +/** + * A maintainer which attempts to ensure there is spare capacity available in chunks which can fit + * all node resource configuration in use, such that the system is able to quickly replace a failed node + * if necessary. + * + * This also emits the following metrics: + * - Overcommitted hosts: Hosts whose capacity is less than the sum of its children's + * - Spare host capacity, or how many hosts the repository can stand to lose without ending up in a situation where it's + * unable to find a new home for orphaned tenants. + * + * @author mgimle + * @author bratseth + */ +public class SpareCapacityMaintainer extends NodeRepositoryMaintainer { + + private final int maxIterations; + private final Deployer deployer; + private final Metric metric; + + public SpareCapacityMaintainer(Deployer deployer, + NodeRepository nodeRepository, + Metric metric, + Duration interval) { + this(deployer, nodeRepository, metric, interval, + 10_000 // Should take less than a few minutes + ); + } + + public SpareCapacityMaintainer(Deployer deployer, + NodeRepository nodeRepository, + Metric metric, + Duration interval, + int maxIterations) { + super(nodeRepository, interval); + this.deployer = deployer; + this.metric = metric; + this.maxIterations = maxIterations; + } + + @Override + protected void maintain() { + if ( ! nodeRepository().zone().getCloud().allowHostSharing()) return; + + CapacityChecker capacityChecker = new CapacityChecker(nodeRepository()); + + List<Node> overcommittedHosts = capacityChecker.findOvercommittedHosts(); + if (overcommittedHosts.size() != 0) { + log.log(Level.WARNING, String.format("%d nodes are overcommitted! 
[ %s ]", + overcommittedHosts.size(), + overcommittedHosts.stream().map(Node::hostname).collect(Collectors.joining(", ")))); + } + metric.set("overcommittedHosts", overcommittedHosts.size(), null); + + Optional<CapacityChecker.HostFailurePath> failurePath = capacityChecker.worstCaseHostLossLeadingToFailure(); + if (failurePath.isPresent()) { + int spareHostCapacity = failurePath.get().hostsCausingFailure.size() - 1; + if (spareHostCapacity == 0) { + Move move = findMitigatingMove(failurePath.get()); + if (moving(move)) { + // We succeeded or are in the process of taking a step to mitigate. + // Report with the assumption this will eventually succeed to avoid alerting before we're stuck + spareHostCapacity++; + } + } + metric.set("spareHostCapacity", spareHostCapacity, null); + } + } + + private boolean moving(Move move) { + if (move.isEmpty()) return false; + if (move.node().allocation().get().membership().retired()) return true; // Move already in progress + return move.execute(false, Agent.SpareCapacityMaintainer, deployer, metric, nodeRepository()); + } + + private Move findMitigatingMove(CapacityChecker.HostFailurePath failurePath) { + Optional<Node> nodeWhichCantMove = failurePath.failureReason.tenant; + if (nodeWhichCantMove.isEmpty()) return Move.empty(); + + Node node = nodeWhichCantMove.get(); + NodeList allNodes = nodeRepository().list(); + // Allocation will assign the two most empty nodes as "spares", which will not be allocated on + // unless needed for node failing. Our goal here is to make room on these spares for the given node + HostCapacity hostCapacity = new HostCapacity(allNodes, nodeRepository().resourcesCalculator()); + Set<Node> spareHosts = hostCapacity.findSpareHosts(allNodes.hosts().satisfies(node.resources()).asList(), 2); + List<Node> hosts = allNodes.hosts().except(spareHosts).asList(); + + CapacitySolver capacitySolver = new CapacitySolver(hostCapacity, maxIterations); + List<Move> shortestMitigation = null; + for (Node spareHost : spareHosts) { + List<Move> mitigation = capacitySolver.makeRoomFor(node, spareHost, hosts, List.of(), List.of()); + if (mitigation == null) continue; + if (shortestMitigation == null || shortestMitigation.size() > mitigation.size()) + shortestMitigation = mitigation; + } + if (shortestMitigation == null || shortestMitigation.isEmpty()) return Move.empty(); + return shortestMitigation.get(0); + } + + private static class CapacitySolver { + + private final HostCapacity hostCapacity; + private final int maxIterations; + + private int iterations = 0; + + CapacitySolver(HostCapacity hostCapacity, int maxIterations) { + this.hostCapacity = hostCapacity; + this.maxIterations = maxIterations; + } + + /** The map of subproblem solutions already found. The value is null when there is no solution. */ + private Map<SolutionKey, List<Move>> solutions = new HashMap<>(); + + /** + * Finds the shortest sequence of moves which makes room for the given node on the given host, + * assuming the given moves already made over the given hosts' current allocation. 
+ * + * @param node the node to make room for + * @param host the target host to make room on + * @param hosts the hosts onto which we can move nodes + * @param movesConsidered the moves already being considered to add as part of this scenario + * (after any moves made by this) + * @param movesMade the moves already made in this scenario + * @return the list of movesMade with the moves needed for this appended, in the order they should be performed, + * or null if no sequence could be found + */ + List<Move> makeRoomFor(Node node, Node host, List<Node> hosts, List<Move> movesConsidered, List<Move> movesMade) { + SolutionKey solutionKey = new SolutionKey(node, host, movesConsidered, movesMade); + List<Move> solution = solutions.get(solutionKey); + if (solution == null) { + solution = findRoomFor(node, host, hosts, movesConsidered, movesMade); + solutions.put(solutionKey, solution); + } + return solution; + } + + private List<Move> findRoomFor(Node node, Node host, List<Node> hosts, + List<Move> movesConsidered, List<Move> movesMade) { + if (iterations++ > maxIterations) + return null; + + if ( ! host.resources().satisfies(node.resources())) return null; + NodeResources freeCapacity = freeCapacityWith(movesMade, host); + if (freeCapacity.satisfies(node.resources())) return List.of(); + + List<Move> shortest = null; + for (var i = subsets(hostCapacity.allNodes().childrenOf(host), 5); i.hasNext(); ) { + List<Node> childrenToMove = i.next(); + if ( ! addResourcesOf(childrenToMove, freeCapacity).satisfies(node.resources())) continue; + List<Move> moves = move(childrenToMove, host, hosts, movesConsidered, movesMade); + if (moves == null) continue; + + if (shortest == null || moves.size() < shortest.size()) + shortest = moves; + } + if (shortest == null) return null; + return append(movesMade, shortest); + } + + private List<Move> move(List<Node> nodes, Node host, List<Node> hosts, List<Move> movesConsidered, List<Move> movesMade) { + List<Move> moves = new ArrayList<>(); + for (Node childToMove : nodes) { + List<Move> childMoves = move(childToMove, host, hosts, movesConsidered, append(movesMade, moves)); + if (childMoves == null) return null; + moves.addAll(childMoves); + } + return moves; + } + + private List<Move> move(Node node, Node host, List<Node> hosts, List<Move> movesConsidered, List<Move> movesMade) { + if (contains(node, movesConsidered)) return null; + if (contains(node, movesMade)) return null; + List<Move> shortest = null; + for (Node target : hosts) { + if (target.equals(host)) continue; + Move move = new Move(node, host, target); + List<Move> childMoves = makeRoomFor(node, target, hosts, append(movesConsidered, move), movesMade); + if (childMoves == null) continue; + if (shortest == null || shortest.size() > childMoves.size() + 1) { + shortest = new ArrayList<>(childMoves); + shortest.add(move); + } + } + return shortest; + } + + private boolean contains(Node node, List<Move> moves) { + return moves.stream().anyMatch(move -> move.node().equals(node)); + } + + private NodeResources addResourcesOf(List<Node> nodes, NodeResources resources) { + for (Node node : nodes) + resources = resources.add(node.resources()); + return resources; + } + + private Iterator<List<Node>> subsets(NodeList nodes, int maxSize) { + return new SubsetIterator(nodes.asList(), maxSize); + } + + private List<Move> append(List<Move> a, List<Move> b) { + List<Move> list = new ArrayList<>(); + list.addAll(a); + list.addAll(b); + return list; + } + + private List<Move> append(List<Move> moves, Move move) { + 
List<Move> list = new ArrayList<>(moves); + list.add(move); + return list; + } + + private NodeResources freeCapacityWith(List<Move> moves, Node host) { + NodeResources resources = hostCapacity.freeCapacityOf(host); + for (Move move : moves) { + if ( ! move.toHost().equals(host)) continue; + resources = resources.subtract(move.node().resources()); + } + for (Move move : moves) { + if ( ! move.fromHost().equals(host)) continue; + resources = resources.add(move.node().resources()); + } + return resources; + } + + } + + private static class SolutionKey { + + private final Node node; + private final Node host; + private final List<Move> movesConsidered; + private final List<Move> movesMade; + + private final int hash; + + public SolutionKey(Node node, Node host, List<Move> movesConsidered, List<Move> movesMade) { + this.node = node; + this.host = host; + this.movesConsidered = movesConsidered; + this.movesMade = movesMade; + + hash = Objects.hash(node, host, movesConsidered, movesMade); + } + + @Override + public int hashCode() { return hash; } + + @Override + public boolean equals(Object o) { + if (o == this) return true; + if (o == null || o.getClass() != this.getClass()) return false; + + SolutionKey other = (SolutionKey)o; + if ( ! other.node.equals(this.node)) return false; + if ( ! other.host.equals(this.host)) return false; + if ( ! other.movesConsidered.equals(this.movesConsidered)) return false; + if ( ! other.movesMade.equals(this.movesMade)) return false; + return true; + } + + } + + private static class SubsetIterator implements Iterator<List<Node>> { + + private final List<Node> nodes; + private final int maxLength; + + // A number whose binary representation determines which items of list we'll include + private int i = 0; // first "previous" = 0 -> skip the empty set + private List<Node> next = null; + + public SubsetIterator(List<Node> nodes, int maxLength) { + this.nodes = new ArrayList<>(nodes.subList(0, Math.min(nodes.size(), 31))); + this.maxLength = maxLength; + } + + @Override + public boolean hasNext() { + if (next != null) return true; + + // find next + while (++i < 1<<nodes.size()) { + int ones = Integer.bitCount(i); + if (ones > maxLength) continue; + + next = new ArrayList<>(ones); + for (int position = 0; position < nodes.size(); position++) { + if (hasOneAtPosition(position, i)) + next.add(nodes.get(position)); + } + return true; + } + return false; + } + + @Override + public List<Node> next() { + if ( ! 
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Agent.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Agent.java
index 31b7181a58a..eba9e4a1ac9 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Agent.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Agent.java
@@ -21,6 +21,7 @@ public enum Agent {
     ProvisionedExpirer,
     ReservationExpirer,
     DynamicProvisioningMaintainer,
-    RetiringUpgrader;
+    RetiringUpgrader,
+    SpareCapacityMaintainer
 
 }
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java
index 15be7796187..37842115949 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java
@@ -386,15 +386,16 @@ public class NodeSerializer {
             case "operator" : return Agent.operator;
             case "application" : return Agent.application;
             case "system" : return Agent.system;
-            case "NodeFailer" : return Agent.NodeFailer;
-            case "Rebalancer" : return Agent.Rebalancer;
             case "DirtyExpirer" : return Agent.DirtyExpirer;
+            case "DynamicProvisioningMaintainer" : return Agent.DynamicProvisioningMaintainer;
             case "FailedExpirer" : return Agent.FailedExpirer;
             case "InactiveExpirer" : return Agent.InactiveExpirer;
+            case "NodeFailer" : return Agent.NodeFailer;
             case "ProvisionedExpirer" : return Agent.ProvisionedExpirer;
+            case "Rebalancer" : return Agent.Rebalancer;
             case "ReservationExpirer" : return Agent.ReservationExpirer;
-            case "DynamicProvisioningMaintainer" : return Agent.DynamicProvisioningMaintainer;
             case "RetiringUpgrader" : return Agent.RetiringUpgrader;
+            case "SpareCapacityMaintainer": return Agent.SpareCapacityMaintainer;
         }
         throw new IllegalArgumentException("Unknown node event agent '" + eventAgentField.asString() + "'");
     }
@@ -403,15 +404,16 @@ public class NodeSerializer {
             case operator : return "operator";
             case application : return "application";
             case system : return "system";
-            case NodeFailer : return "NodeFailer";
-            case Rebalancer : return "Rebalancer";
             case DirtyExpirer : return "DirtyExpirer";
+            case DynamicProvisioningMaintainer : return "DynamicProvisioningMaintainer";
             case FailedExpirer : return "FailedExpirer";
             case InactiveExpirer : return "InactiveExpirer";
+            case NodeFailer : return "NodeFailer";
             case ProvisionedExpirer : return "ProvisionedExpirer";
+            case Rebalancer : return "Rebalancer";
             case ReservationExpirer : return "ReservationExpirer";
-            case DynamicProvisioningMaintainer : return "DynamicProvisioningMaintainer";
             case RetiringUpgrader: return "RetiringUpgrader";
+            case SpareCapacityMaintainer: return "SpareCapacityMaintainer";
         }
         throw new IllegalArgumentException("Serialized form of '" + agent + "' not defined");
     }
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Activator.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Activator.java
index a61032af276..7158ccc57e3 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Activator.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Activator.java
@@ -184,12 +184,12 @@ class Activator {
         for (Node node : nodes) {
             HostSpec hostSpec = getHost(node.hostname(), hosts);
             node = hostSpec.membership().get().retired() ? node.retire(nodeRepository.clock().instant()) : node.unretire();
-            if (! hostSpec.advertisedResources().equals(node.flavor().resources())) // A resized node
+            if (! hostSpec.advertisedResources().equals(node.resources())) // A resized node
                 node = node.with(new Flavor(hostSpec.advertisedResources()));
             Allocation allocation = node.allocation().get()
                                         .with(hostSpec.membership().get())
                                         .withRequestedResources(hostSpec.requestedResources()
-                                                                        .orElse(node.flavor().resources()));
+                                                                        .orElse(node.resources()));
             if (hostSpec.networkPorts().isPresent())
                 allocation = allocation.withNetworkPorts(hostSpec.networkPorts().get());
             node = node.with(allocation);
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/EmptyProvisionServiceProvider.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/EmptyProvisionServiceProvider.java
index 5402e4bf3e8..38dd9f29873 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/EmptyProvisionServiceProvider.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/EmptyProvisionServiceProvider.java
@@ -34,7 +34,7 @@ public class EmptyProvisionServiceProvider implements ProvisionServiceProvider {
     private static class IdentityHostResourcesCalculator implements HostResourcesCalculator {
 
         @Override
-        public NodeResources realResourcesOf(Node node, NodeRepository repository) { return node.flavor().resources(); }
+        public NodeResources realResourcesOf(Node node, NodeRepository repository) { return node.resources(); }
 
         @Override
         public NodeResources advertisedResourcesOf(Flavor flavor) { return flavor.resources(); }
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java
index caecf8edf2f..d3e5f60599f 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java
@@ -58,7 +58,7 @@ public class GroupPreparer {
     // active config model which is changed on activate
     public List<Node> prepare(ApplicationId application, ClusterSpec cluster, NodeSpec requestedNodes,
                               List<Node> surplusActiveNodes, MutableInteger highestIndex, int spareCount, int wantedGroups) {
-        boolean dynamicProvisioningEnabled = nodeRepository.canProvisionHostsWhenRequired() && nodeRepository.zone().getCloud().dynamicProvisioning();
+        boolean dynamicProvisioningEnabled = nodeRepository.canProvisionHosts() && nodeRepository.zone().getCloud().dynamicProvisioning();
         boolean allocateFully = dynamicProvisioningEnabled && preprovisionCapacityFlag.value().isEmpty();
 
         try (Mutex lock = nodeRepository.lock(application)) {
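
Note: the two switches in NodeSerializer must remain mutually inverse, and a missed case (such as the SpareCapacityMaintainer entry added on both sides here) only surfaces at runtime in the throwing fallthrough. A self-contained sketch of the round-trip property such a pair should satisfy, with a stand-in enum subset:

    public class AgentSerializationDemo {
        enum Agent { operator, Rebalancer, SpareCapacityMaintainer } // stand-in subset

        static String toString(Agent agent) {
            switch (agent) {
                case operator : return "operator";
                case Rebalancer : return "Rebalancer";
                case SpareCapacityMaintainer : return "SpareCapacityMaintainer";
            }
            throw new IllegalArgumentException("Serialized form of '" + agent + "' not defined");
        }

        static Agent fromString(String name) {
            switch (name) {
                case "operator" : return Agent.operator;
                case "Rebalancer" : return Agent.Rebalancer;
                case "SpareCapacityMaintainer" : return Agent.SpareCapacityMaintainer;
            }
            throw new IllegalArgumentException("Unknown node event agent '" + name + "'");
        }

        public static void main(String[] args) {
            for (Agent agent : Agent.values())
                if (fromString(toString(agent)) != agent) throw new AssertionError(agent.name());
            System.out.println("round trip ok");
        }
    }
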
a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/DockerHostCapacity.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/HostCapacity.java @@ -3,10 +3,14 @@ package com.yahoo.vespa.hosted.provision.provisioning; import com.yahoo.config.provision.NodeResources; import com.yahoo.config.provision.NodeType; +import com.yahoo.vespa.hosted.provision.LockedNodeList; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeList; +import java.util.List; import java.util.Objects; +import java.util.Set; +import java.util.stream.Collectors; /** * Capacity calculation for docker hosts. @@ -16,17 +20,38 @@ import java.util.Objects; * * @author smorgrav */ -public class DockerHostCapacity { +public class HostCapacity { private final NodeList allNodes; private final HostResourcesCalculator hostResourcesCalculator; - public DockerHostCapacity(NodeList allNodes, HostResourcesCalculator hostResourcesCalculator) { + public HostCapacity(NodeList allNodes, HostResourcesCalculator hostResourcesCalculator) { this.allNodes = Objects.requireNonNull(allNodes, "allNodes must be non-null"); this.hostResourcesCalculator = Objects.requireNonNull(hostResourcesCalculator, "hostResourcesCalculator must be non-null"); } - int compareWithoutInactive(Node hostA, Node hostB) { + public NodeList allNodes() { return allNodes; } + + /** + * Spare hosts are the hosts in the system with the most free capacity. + * + * We do not count retired or inactive nodes as used capacity (as they could have been + * moved to create space for the spare node in the first place). + * + * @param candidates the candidates to consider. This list may contain all kinds of nodes. + * @param count the max number of spare hosts to return + */ + public Set<Node> findSpareHosts(List<Node> candidates, int count) { + return candidates.stream() + .filter(node -> node.type() == NodeType.host) + .filter(dockerHost -> dockerHost.state() == Node.State.active) + .filter(dockerHost -> freeIPs(dockerHost) > 0) + .sorted(this::compareWithoutInactive) + .limit(count) + .collect(Collectors.toSet()); + } + + private int compareWithoutInactive(Node hostA, Node hostB) { int result = compare(freeCapacityOf(hostB, true), freeCapacityOf(hostA, true)); if (result != 0) return result; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java index 47d1b30a8e7..df8a7e45917 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java @@ -148,7 +148,7 @@ class NodeAllocation { } node.node = offered.allocate(application, ClusterMembership.from(cluster, highestIndex.add(1)), - requestedNodes.resources().orElse(node.node.flavor().resources()), + requestedNodes.resources().orElse(node.node.resources()), nodeRepository.clock().instant()); accepted.add(acceptNode(node, false, false)); } @@ -242,7 +242,7 @@ class NodeAllocation { Node node = prioritizableNode.node; if (node.allocation().isPresent()) // Record the currently requested resources - node = node.with(node.allocation().get().withRequestedResources(requestedNodes.resources().orElse(node.flavor().resources()))); + node = node.with(node.allocation().get().withRequestedResources(requestedNodes.resources().orElse(node.resources()))); if (! 
wantToRetire) { accepted++; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java index 8a15c058ff4..8560dd424e7 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java @@ -13,7 +13,6 @@ import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.IP; -import com.yahoo.vespa.hosted.provision.persistence.NameResolver; import java.util.EnumSet; import java.util.HashMap; @@ -21,7 +20,6 @@ import java.util.List; import java.util.Map; import java.util.Optional; import java.util.Set; -import java.util.function.Predicate; import java.util.logging.Logger; import java.util.stream.Collectors; @@ -39,7 +37,7 @@ public class NodePrioritizer { private final Map<Node, PrioritizableNode> nodes = new HashMap<>(); private final LockedNodeList allNodes; - private final DockerHostCapacity capacity; + private final HostCapacity capacity; private final NodeSpec requestedNodes; private final ApplicationId application; private final ClusterSpec clusterSpec; @@ -55,11 +53,11 @@ public class NodePrioritizer { NodePrioritizer(LockedNodeList allNodes, ApplicationId application, ClusterSpec clusterSpec, NodeSpec nodeSpec, int spares, int wantedGroups, boolean allocateFully, NodeRepository nodeRepository) { this.allNodes = allNodes; - this.capacity = new DockerHostCapacity(allNodes, nodeRepository.resourcesCalculator()); + this.capacity = new HostCapacity(allNodes, nodeRepository.resourcesCalculator()); this.requestedNodes = nodeSpec; this.clusterSpec = clusterSpec; this.application = application; - this.spareHosts = findSpareHosts(allNodes, capacity, spares); + this.spareHosts = capacity.findSpareHosts(allNodes.asList(), spares); this.allocateFully = allocateFully; this.nodeRepository = nodeRepository; @@ -83,22 +81,6 @@ public class NodePrioritizer { this.isDocker = resources(requestedNodes) != null; } - /** - * Spare hosts are the two hosts in the system with the most free capacity. - * - * We do not count retired or inactive nodes as used capacity (as they could have been - * moved to create space for the spare node in the first place). - */ - private static Set<Node> findSpareHosts(LockedNodeList nodes, DockerHostCapacity capacity, int spares) { - return nodes.asList().stream() - .filter(node -> node.type() == NodeType.host) - .filter(dockerHost -> dockerHost.state() == Node.State.active) - .filter(dockerHost -> capacity.freeIPs(dockerHost) > 0) - .sorted(capacity::compareWithoutInactive) - .limit(spares) - .collect(Collectors.toSet()); - } - /** Returns the list of nodes sorted by PrioritizableNode::compare */ List<PrioritizableNode> prioritize() { return nodes.values().stream().sorted().collect(Collectors.toList()); @@ -206,8 +188,8 @@ public class NodePrioritizer { builder.parent(parent).freeParentCapacity(parentCapacity); if (!isNewNode) - builder.resizable(!allocateFully - && requestedNodes.canResize(node.flavor().resources(), parentCapacity, isTopologyChange, currentClusterSize)); + builder.resizable(! 
allocateFully + && requestedNodes.canResize(node.resources(), parentCapacity, isTopologyChange, currentClusterSize)); if (spareHosts.contains(parent)) builder.violatesSpares(true); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeSpec.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeSpec.java index a8abdc3f38a..9971aae1714 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeSpec.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeSpec.java @@ -139,7 +139,7 @@ public interface NodeSpec { @Override public boolean needsResize(Node node) { - return ! node.flavor().resources().compatibleWith(requestedNodeResources); + return ! node.resources().compatibleWith(requestedNodeResources); } @Override diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/PrioritizableNode.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/PrioritizableNode.java index 3fc60c1192d..0c1b396c40c 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/PrioritizableNode.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/PrioritizableNode.java @@ -126,7 +126,7 @@ class PrioritizableNode implements Comparable<PrioritizableNode> { double skewWithoutThis() { return skewWith(zeroResources); } /** Returns the allocation skew of the parent of this after adding this node to it */ - double skewWithThis() { return skewWith(node.flavor().resources()); } + double skewWithThis() { return skewWith(node.resources()); } private double skewWith(NodeResources resources) { if (parent.isEmpty()) return 0; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java index aa81aae84fe..a4161a318ab 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java @@ -43,7 +43,7 @@ public class ApplicationSerializer { if (nodes.isEmpty()) return; int groups = (int)nodes.stream().map(node -> node.allocation().get().membership().cluster().group()).distinct().count(); - ClusterResources currentResources = new ClusterResources(nodes.size(), groups, nodes.get(0).flavor().resources()); + ClusterResources currentResources = new ClusterResources(nodes.size(), groups, nodes.get(0).resources()); toSlime(cluster.minResources(), clusterObject.setObject("min")); toSlime(cluster.maxResources(), clusterObject.setObject("max")); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/HostCapacityResponse.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/HostCapacityResponse.java index 12a29707303..e28b03d7517 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/HostCapacityResponse.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/HostCapacityResponse.java @@ -20,6 +20,7 @@ import java.util.Optional; * @author mgimle */ public class HostCapacityResponse extends HttpResponse { + private final StringBuilder text; private final Slime slime; private final CapacityChecker capacityChecker; @@ -128,7 +129,7 @@ public class HostCapacityResponse extends HttpResponse { 
); failurePath.failureReason.tenant.ifPresent(tenant -> { object.setString("failedTenant", tenant.hostname()); - object.setString("failedTenantResources", tenant.flavor().resources().toString()); + object.setString("failedTenantResources", tenant.resources().toString()); tenant.allocation().ifPresent(allocation -> object.setString("failedTenantAllocation", allocation.toString()) ); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java index 5ec5c2c08e8..ae3d6ebf815 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java @@ -10,6 +10,9 @@ import com.yahoo.config.provision.Deployment; import com.yahoo.config.provision.HostFilter; import com.yahoo.config.provision.HostSpec; import com.yahoo.transaction.NestedTransaction; +import com.yahoo.vespa.hosted.provision.Node; +import com.yahoo.vespa.hosted.provision.NodeRepository; +import com.yahoo.vespa.hosted.provision.node.Agent; import com.yahoo.vespa.hosted.provision.provisioning.NodeRepositoryProvisioner; import java.time.Clock; @@ -27,18 +30,22 @@ import java.util.stream.Collectors; */ public class MockDeployer implements Deployer { + // For actual deploy mode private final NodeRepositoryProvisioner provisioner; private final Map<ApplicationId, ApplicationContext> applications; - private final Map<ApplicationId, Instant> lastDeployTimes = new HashMap<>(); + // For mock deployments, which only change the application's wantToRetire nodes to retired + private final NodeRepository nodeRepository; /** The number of redeployments done to this */ public int redeployments = 0; + private final Map<ApplicationId, Instant> lastDeployTimes = new HashMap<>(); private final Clock clock; private final ReentrantLock lock = new ReentrantLock(); private boolean failActivate = false; + /** Create a mock deployer which returns empty on every deploy request. */ @Inject @SuppressWarnings("unused") public MockDeployer() { @@ -46,15 +53,30 @@ public class MockDeployer implements Deployer { } /** - * Create a mock deployer which contains a substitute for an application repository, fullfilled to + * Create a mock deployer which returns a deployment on every request, + * and fulfills it by not actually deploying, but only changing any wantToRetire nodes + * for the application to retired. + */ + public MockDeployer(NodeRepository nodeRepository) { + this.provisioner = null; + this.applications = Map.of(); + this.nodeRepository = nodeRepository; + + this.clock = nodeRepository.clock(); + } + + /** + * Create a mock deployer which contains a substitute for an application repository, filled to + * be able to call provision with the right parameters.
*/ public MockDeployer(NodeRepositoryProvisioner provisioner, Clock clock, Map<ApplicationId, ApplicationContext> applications) { this.provisioner = provisioner; - this.clock = clock; this.applications = new HashMap<>(applications); + this.nodeRepository = null; + + this.clock = clock; } public ReentrantLock lock() { return lock; } @@ -74,9 +96,13 @@ public class MockDeployer implements Deployer { throw new RuntimeException(e); } try { - return Optional.ofNullable(applications.get(id)) - .map(application -> new MockDeployment(provisioner, application)); - } finally { + if (provisioner != null) + return Optional.ofNullable(applications.get(id)) + .map(application -> new MockDeployment(provisioner, application)); + else + return Optional.of(new RetiringOnlyMockDeployment(nodeRepository, id)); + } + finally { lock.unlock(); } } @@ -135,6 +161,33 @@ public class MockDeployer implements Deployer { } + public class RetiringOnlyMockDeployment implements Deployment { + + private final NodeRepository nodeRepository; + private final ApplicationId applicationId; + + private RetiringOnlyMockDeployment(NodeRepository nodeRepository, ApplicationId applicationId) { + this.nodeRepository = nodeRepository; + this.applicationId = applicationId; + } + + @Override + public void prepare() { } + + @Override + public void activate() { + redeployments++; + lastDeployTimes.put(applicationId, clock.instant()); + + for (Node node : nodeRepository.list().owner(applicationId).state(Node.State.active).wantToRetire().asList()) + nodeRepository.write(node.retire(nodeRepository.clock().instant()), nodeRepository.lock(node)); + } + + @Override + public void restart(HostFilter filter) {} + + } + /** An application context which substitutes for an application repository */ public static class ApplicationContext { diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java index cc5c6851a92..a0a44e4f342 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java @@ -383,7 +383,7 @@ public class AutoscalingTest { @Override public NodeResources realResourcesOf(Node node, NodeRepository nodeRepository) { - return node.flavor().resources(); + return node.resources(); } @Override diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java index b0e394c93d3..1137ae5ce2c 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java @@ -208,9 +208,9 @@ class AutoscalingTester { @Override public NodeResources realResourcesOf(Node node, NodeRepository nodeRepository) { if (zone.getCloud().dynamicProvisioning()) - return node.flavor().resources().withMemoryGb(node.flavor().resources().memoryGb() - 3); + return node.resources().withMemoryGb(node.resources().memoryGb() - 3); else - return node.flavor().resources(); + return node.resources(); } @Override diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityCheckerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityCheckerTest.java 
index 5813585554d..5e72cfc53ac 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityCheckerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityCheckerTest.java @@ -29,6 +29,7 @@ public class CapacityCheckerTest { var failurePath = tester.capacityChecker.worstCaseHostLossLeadingToFailure(); assertTrue(failurePath.isPresent()); assertTrue(tester.nodeRepository.getNodes(NodeType.host).containsAll(failurePath.get().hostsCausingFailure)); + assertEquals(5, failurePath.get().hostsCausingFailure.size()); } @Test diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityCheckerTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityCheckerTester.java index a6b2f6b15ea..62e9a227109 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityCheckerTester.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityCheckerTester.java @@ -45,6 +45,7 @@ import java.util.stream.IntStream; * @author mgimle */ public class CapacityCheckerTester { + public static final Zone zone = new Zone(Environment.prod, RegionName.from("us-east")); // Components with state @@ -129,7 +130,7 @@ public class CapacityCheckerTester { childModel.parentHostname = Optional.of(hostname); Node childNode = createNodeFromModel(childModel); - childResources.add(childNode.flavor().resources()); + childResources.add(childNode.resources()); hosts.add(childNode); } @@ -138,8 +139,7 @@ public class CapacityCheckerTester { .mapToObj(n -> String.format("%04X::%04X", hostindex, n)) .collect(Collectors.toSet()); - NodeResources nr = containingNodeResources(childResources, - excessCapacity); + NodeResources nr = containingNodeResources(childResources, excessCapacity); Node node = nodeRepository.createNode(hostname, hostname, new IP.Config(Set.of("::"), availableIps), Optional.empty(), new Flavor(nr), Optional.empty(), NodeType.host); @@ -159,7 +159,8 @@ public class CapacityCheckerTester { Set<String> availableIps = IntStream.range(0, ips) .mapToObj(n -> String.format("%04X::%04X", hostid, n)) .collect(Collectors.toSet()); - Node node = nodeRepository.createNode(hostname, hostname, + Node node = nodeRepository.createNode(hostname, + hostname, new IP.Config(Set.of("::"), availableIps), Optional.empty(), new Flavor(capacity), Optional.empty(), NodeType.host); hosts.add(node); @@ -175,8 +176,8 @@ public class CapacityCheckerTester { ); createNodes(childrenPerHost, numDistinctChildren, childResources, - numHosts, hostExcessCapacity, hostExcessIps, - numEmptyHosts, emptyHostExcessCapacity, emptyHostExcessIps); + numHosts, hostExcessCapacity, hostExcessIps, + numEmptyHosts, emptyHostExcessCapacity, emptyHostExcessIps); } void createNodes(int childrenPerHost, int numDistinctChildren, List<NodeResources> childResources, int numHosts, NodeResources hostExcessCapacity, int hostExcessIps, @@ -264,10 +265,11 @@ public class CapacityCheckerTester { owner = ApplicationId.from(nodeModel.owner.tenant, nodeModel.owner.application, nodeModel.owner.instance); } - NodeResources.DiskSpeed diskSpeed; - NodeResources nr = new NodeResources(nodeModel.minCpuCores, nodeModel.minMainMemoryAvailableGb, - nodeModel.minDiskAvailableGb, nodeModel.bandwidth * 1000, - nodeModel.fastDisk ? 
NodeResources.DiskSpeed.fast : NodeResources.DiskSpeed.slow); + NodeResources nr = new NodeResources(nodeModel.minCpuCores, + nodeModel.minMainMemoryAvailableGb, + nodeModel.minDiskAvailableGb, + nodeModel.bandwidth * 1000, + nodeModel.fastDisk ? NodeResources.DiskSpeed.fast : NodeResources.DiskSpeed.slow); Flavor f = new Flavor(nr); Node node = nodeRepository.createNode(nodeModel.id, nodeModel.hostname, @@ -275,7 +277,7 @@ public class CapacityCheckerTester { nodeModel.parentHostname, f, Optional.empty(), nodeModel.type); if (membership != null) { - return node.allocate(owner, membership, node.flavor().resources(), Instant.now()); + return node.allocate(owner, membership, node.resources(), Instant.now()); } else { return node; } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java index 9fc2f666d27..727232e5c7c 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java @@ -223,7 +223,7 @@ public class MetricsReporterTest { if (tenant.isPresent()) { Allocation allocation = new Allocation(app(tenant.get()), ClusterMembership.from("container/id1/0/3", new Version(), Optional.empty()), - owner.flavor().resources(), + owner.resources(), Generation.initial(), false); return Optional.of(allocation); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java index e2fd8a8721c..51f70e8b640 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java @@ -301,9 +301,9 @@ public class NodeFailerTest { // Two ready nodes and a ready docker node die, but only 2 of those are failed out tester.clock.advance(Duration.ofMinutes(180)); - Node dockerNode = ready.stream().filter(node -> node.flavor().resources().equals(newNodeResources)).findFirst().get(); + Node dockerNode = ready.stream().filter(node -> node.resources().equals(newNodeResources)).findFirst().get(); List<Node> otherNodes = ready.stream() - .filter(node -> ! node.flavor().resources().equals(newNodeResources)) + .filter(node -> ! node.resources().equals(newNodeResources)) .collect(Collectors.toList()); tester.allNodesMakeAConfigRequestExcept(otherNodes.get(0), otherNodes.get(2), dockerNode); tester.failer.run(); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainerTest.java new file mode 100644 index 00000000000..fb84dc0a32a --- /dev/null +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainerTest.java @@ -0,0 +1,327 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+package com.yahoo.vespa.hosted.provision.maintenance; + +import com.yahoo.config.provision.ApplicationId; +import com.yahoo.config.provision.ClusterMembership; +import com.yahoo.config.provision.ClusterSpec; +import com.yahoo.config.provision.DockerImage; +import com.yahoo.config.provision.Environment; +import com.yahoo.config.provision.Flavor; +import com.yahoo.config.provision.NodeFlavors; +import com.yahoo.config.provision.NodeResources; +import com.yahoo.config.provision.NodeType; +import com.yahoo.config.provision.RegionName; +import com.yahoo.config.provision.Zone; +import com.yahoo.test.ManualClock; +import com.yahoo.transaction.NestedTransaction; +import com.yahoo.vespa.curator.mock.MockCurator; +import com.yahoo.vespa.hosted.provision.Node; +import com.yahoo.vespa.hosted.provision.NodeRepository; +import com.yahoo.vespa.hosted.provision.node.Agent; +import com.yahoo.vespa.hosted.provision.node.IP; +import com.yahoo.vespa.hosted.provision.provisioning.EmptyProvisionServiceProvider; +import com.yahoo.vespa.hosted.provision.provisioning.FlavorConfigBuilder; +import com.yahoo.vespa.hosted.provision.testutils.MockDeployer; +import com.yahoo.vespa.hosted.provision.testutils.MockNameResolver; +import org.junit.Ignore; +import org.junit.Test; + +import java.time.Duration; +import java.time.Instant; +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +import static org.junit.Assert.assertEquals; + +/** + * @author bratseth + */ +public class SpareCapacityMaintainerTest { + + @Test + public void testEmpty() { + var tester = new SpareCapacityMaintainerTester(); + tester.maintainer.maintain(); + assertEquals(0, tester.deployer.redeployments); + assertEquals(0, tester.nodeRepository.list().retired().size()); + } + + @Test + public void testOneSpare() { + var tester = new SpareCapacityMaintainerTester(); + tester.addHosts(2, new NodeResources(10, 100, 1000, 1)); + tester.addNodes(0, 1, new NodeResources(10, 100, 1000, 1), 0); + tester.maintainer.maintain(); + assertEquals(0, tester.deployer.redeployments); + assertEquals(0, tester.nodeRepository.list().retired().size()); + assertEquals(1, tester.metric.values.get("spareHostCapacity")); + } + + @Test + public void testTwoSpares() { + var tester = new SpareCapacityMaintainerTester(); + tester.addHosts(3, new NodeResources(10, 100, 1000, 1)); + tester.addNodes(0, 1, new NodeResources(10, 100, 1000, 1), 0); + tester.maintainer.maintain(); + assertEquals(0, tester.deployer.redeployments); + assertEquals(0, tester.nodeRepository.list().retired().size()); + assertEquals(2, tester.metric.values.get("spareHostCapacity")); + } + + @Test + public void testNoSpares() { + var tester = new SpareCapacityMaintainerTester(); + tester.addHosts(2, new NodeResources(10, 100, 1000, 1)); + tester.addNodes(0, 2, new NodeResources(10, 100, 1000, 1), 0); + tester.maintainer.maintain(); + assertEquals(0, tester.deployer.redeployments); + assertEquals(0, tester.nodeRepository.list().retired().size()); + assertEquals(0, tester.metric.values.get("spareHostCapacity")); + } + + @Test + public void testAllWorksAsSpares() { + var tester = new SpareCapacityMaintainerTester(); + tester.addHosts(4, new NodeResources(10, 100, 1000, 1)); + tester.addNodes(0, 2, new NodeResources(5, 50, 500, 0.5), 0); + tester.addNodes(1, 2, new NodeResources(5, 50, 500, 0.5), 2); + tester.maintainer.maintain(); + assertEquals(0, tester.deployer.redeployments); + 
assertEquals(0, tester.nodeRepository.list().retired().size()); + assertEquals(2, tester.metric.values.get("spareHostCapacity")); + } + + @Test + public void testMoveIsNeeded() { + // Moving the nodes of applications 1 and 2 onto the same hosts frees up spares for application 0 + var tester = new SpareCapacityMaintainerTester(); + tester.addHosts(6, new NodeResources(10, 100, 1000, 1)); + tester.addNodes(0, 2, new NodeResources(10, 100, 1000, 1), 0); + tester.addNodes(1, 2, new NodeResources(5, 50, 500, 0.5), 2); + tester.addNodes(2, 2, new NodeResources(5, 50, 500, 0.5), 4); + tester.maintainer.maintain(); + assertEquals(1, tester.deployer.redeployments); + assertEquals(1, tester.nodeRepository.list().retired().size()); + assertEquals(1, tester.metric.values.get("spareHostCapacity")); + + // Maintaining again is a no-op since the node to move is already retired + tester.maintainer.maintain(); + assertEquals(1, tester.deployer.redeployments); + assertEquals(1, tester.nodeRepository.list().retired().size()); + assertEquals(1, tester.metric.values.get("spareHostCapacity")); + } + + @Test + public void testMultipleMovesAreNeeded() { + // Moving the nodes of applications 2 and 3 onto the same hosts frees up room for application 0, + // so that it can be moved from size 12 to size 10 hosts, freeing up spare room for the size 12 application + var tester = new SpareCapacityMaintainerTester(); + tester.addHosts(4, new NodeResources(12, 120, 1200, 1.2)); + tester.addHosts(4, new NodeResources(10, 100, 1000, 1)); + tester.addNodes(0, 2, new NodeResources(10, 100, 1000, 1.0), 0); + tester.addNodes(1, 2, new NodeResources(12, 120, 1200, 1.2), 2); + tester.addNodes(2, 2, new NodeResources(5, 50, 500, 0.5), 4); + tester.addNodes(3, 2, new NodeResources(5, 50, 500, 0.5), 6); + tester.maintainer.maintain(); + assertEquals(1, tester.deployer.redeployments); + assertEquals(1, tester.nodeRepository.list().retired().size()); + assertEquals(1, tester.metric.values.get("spareHostCapacity")); + } + + @Test + public void testMultipleNodesMustMoveFromOneHost() { + // By moving the 4 small nodes from host 2 we free up sufficient space on the third host to act as a spare for + // application 0 + var tester = new SpareCapacityMaintainerTester(); + setupMultipleHosts(tester, 5); + + tester.maintainer.maintain(); + assertEquals(1, tester.deployer.redeployments); + assertEquals(1, tester.nodeRepository.list().retired().size()); + assertEquals(1, tester.metric.values.get("spareHostCapacity")); + } + + @Test + public void testMultipleNodesMustMoveFromOneHostButInsufficientCapacity() { + var tester = new SpareCapacityMaintainerTester(); + setupMultipleHosts(tester, 4); + + tester.maintainer.maintain(); + assertEquals(0, tester.deployer.redeployments); + assertEquals(0, tester.nodeRepository.list().retired().size()); + assertEquals(0, tester.metric.values.get("spareHostCapacity")); + } + + private void setupMultipleHosts(SpareCapacityMaintainerTester tester, int smallNodeCount) { + tester.addHosts(2, new NodeResources(10, 100, 1000, 1)); + tester.addNodes(0, 2, new NodeResources(10, 100, 1000, 1.0), 0); + + tester.addHosts(1, new NodeResources(16, 160, 1600, 1.6)); + tester.addNodes(1, 1, new NodeResources(1, 10, 100, 0.1), 2); + tester.addNodes(2, 1, new NodeResources(1, 10, 100, 0.1), 2); + tester.addNodes(3, 1, new NodeResources(1, 10, 100, 0.1), 2); + tester.addNodes(4, 1, new NodeResources(1, 10, 100, 0.1), 2); + tester.addNodes(5, 1, new NodeResources(2, 20, 200, 2.0), 2); + tester.addNodes(6, 1, new NodeResources(2, 20, 200, 2.0), 2); +
tester.addNodes(7, 1, new NodeResources(2, 20, 200, 2.0), 2); + + tester.addHosts(smallNodeCount, new NodeResources(2, 20, 200, 2.0)); + } + + @Test + public void testTooManyIterationsAreNeeded() { + // 6 nodes must move to the next host, which is more than the max limit + var tester = new SpareCapacityMaintainerTester(5); + + tester.addHosts(2, new NodeResources(10, 100, 1000, 1)); + tester.addHosts(1, new NodeResources(9, 90, 900, 0.9)); + tester.addHosts(1, new NodeResources(8, 80, 800, 0.8)); + tester.addHosts(1, new NodeResources(7, 70, 700, 0.7)); + tester.addHosts(1, new NodeResources(6, 60, 600, 0.6)); + tester.addHosts(1, new NodeResources(5, 50, 500, 0.5)); + tester.addHosts(1, new NodeResources(4, 40, 400, 0.4)); + + tester.addNodes(0, 1, new NodeResources(10, 100, 1000, 1.0), 0); + tester.addNodes(1, 1, new NodeResources( 9, 90, 900, 0.9), 1); + tester.addNodes(2, 1, new NodeResources( 8, 80, 800, 0.8), 2); + tester.addNodes(3, 1, new NodeResources( 7, 70, 700, 0.7), 3); + tester.addNodes(4, 1, new NodeResources( 6, 60, 600, 0.6), 4); + tester.addNodes(5, 1, new NodeResources( 5, 50, 500, 0.5), 5); + tester.addNodes(6, 1, new NodeResources( 4, 40, 400, 0.4), 6); + + tester.maintainer.maintain(); + assertEquals(0, tester.deployer.redeployments); + assertEquals(0, tester.nodeRepository.list().retired().size()); + assertEquals(0, tester.metric.values.get("spareHostCapacity")); + } + + /** Microbenchmark */ + @Test + @Ignore + public void testLargeNodeRepo() { + // Completely fill 200 hosts with 2000 nodes + int hosts = 200; + var tester = new SpareCapacityMaintainerTester(); + tester.addHosts(hosts, new NodeResources(100, 1000, 10000, 10)); + int hostOffset = 0; + for (int i = 0; i < 200; i++) { + int applicationSize = 10; + int resourceSize = 10; + tester.addNodes(i, applicationSize, new NodeResources(resourceSize, resourceSize * 10, resourceSize * 100, 0.1), hostOffset); + hostOffset = (hostOffset + applicationSize) % hosts; + } + long startTime = System.currentTimeMillis(); + tester.maintainer.maintain(); + long totalTime = System.currentTimeMillis() - startTime; + System.out.println("Complete in " + ( totalTime / 1000) + " seconds"); + assertEquals(0, tester.deployer.redeployments); + assertEquals(0, tester.nodeRepository.list().retired().size()); + assertEquals(0, tester.metric.values.get("spareHostCapacity")); + } + + private static class SpareCapacityMaintainerTester { + + NodeRepository nodeRepository; + MockDeployer deployer; + TestMetric metric = new TestMetric(); + SpareCapacityMaintainer maintainer; + private int hostIndex = 0; + private int nodeIndex = 0; + + private SpareCapacityMaintainerTester() { + this(1000); + } + + private SpareCapacityMaintainerTester(int maxIterations) { + NodeFlavors flavors = new NodeFlavors(new FlavorConfigBuilder().build()); + nodeRepository = new NodeRepository(flavors, + new EmptyProvisionServiceProvider().getHostResourcesCalculator(), + new MockCurator(), + new ManualClock(), + new Zone(Environment.prod, RegionName.from("us-east-3")), + new MockNameResolver().mockAnyLookup(), + DockerImage.fromString("docker-registry.domain.tld:8080/dist/vespa"), true, false); + deployer = new MockDeployer(nodeRepository); + maintainer = new SpareCapacityMaintainer(deployer, nodeRepository, metric, Duration.ofDays(1), maxIterations); + } + + private void addHosts(int count, NodeResources resources) { + List<Node> hosts = new ArrayList<>(); + for (int i = 0; i < count; i++) { + Node host = nodeRepository.createNode("host" + hostIndex, + "host" + 
hostIndex + ".yahoo.com", + ipConfig(hostIndex + nodeIndex, true), + Optional.empty(), + new Flavor(resources), + Optional.empty(), + NodeType.host); + hosts.add(host); + hostIndex++; + } + hosts = nodeRepository.addNodes(hosts, Agent.system); + hosts = nodeRepository.setReady(hosts, Agent.system, "Test"); + var transaction = new NestedTransaction(); + nodeRepository.activate(hosts, transaction); + transaction.commit(); + } + + private void addNodes(int id, int count, NodeResources resources, int hostOffset) { + List<Node> nodes = new ArrayList<>(); + ApplicationId application = ApplicationId.from("tenant" + id, "application" + id, "default"); + for (int i = 0; i < count; i++) { + ClusterMembership membership = ClusterMembership.from(ClusterSpec.specification(ClusterSpec.Type.content, ClusterSpec.Id.from("cluster" + id)) + .group(ClusterSpec.Group.from(0)) + .vespaVersion("7") + .build(), + i); + Node node = nodeRepository.createNode("node" + nodeIndex, + "node" + nodeIndex + ".yahoo.com", + ipConfig(hostIndex + nodeIndex, false), + Optional.of("host" + ( hostOffset + i) + ".yahoo.com"), + new Flavor(resources), + Optional.empty(), + NodeType.tenant); + node = node.allocate(application, membership, node.resources(), Instant.now()); + nodes.add(node); + nodeIndex++; + } + nodes = nodeRepository.addNodes(nodes, Agent.system); + for (int i = 0; i < count; i++) { + Node node = nodes.get(i); + ClusterMembership membership = ClusterMembership.from(ClusterSpec.specification(ClusterSpec.Type.content, ClusterSpec.Id.from("cluster" + id)) + .group(ClusterSpec.Group.from(0)) + .vespaVersion("7") + .build(), + i); + node = node.allocate(application, membership, node.resources(), Instant.now()); + nodes.set(i, node); + } + nodes = nodeRepository.reserve(nodes); + var transaction = new NestedTransaction(); + nodes = nodeRepository.activate(nodes, transaction); + transaction.commit(); + } + + private IP.Config ipConfig(int id, boolean host) { + return new IP.Config(Set.of(String.format("%04X::%04X", id, 0)), + host ? 
IntStream.range(0, 10) + .mapToObj(n -> String.format("%04X::%04X", id, n)) + .collect(Collectors.toSet()) + : Set.of()); + } + + private void dumpState() { + for (Node host : nodeRepository.list().hosts().asList()) { + System.out.println("Host " + host.hostname() + " " + host.resources()); + for (Node node : nodeRepository.list().childrenOf(host).asList()) + System.out.println(" Node " + node.hostname() + " " + node.resources() + " allocation " +node.allocation()); + } + } + + } + +} diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/AllocationVisualizer.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/AllocationVisualizer.java index ea4386f2fd5..644b2338a5a 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/AllocationVisualizer.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/AllocationVisualizer.java @@ -102,13 +102,13 @@ public class AllocationVisualizer extends JPanel { if (isHost) { g.setColor(Color.GRAY); - for (int i = 0; i < node.flavor().resources().memoryGb(); i++) { + for (int i = 0; i < node.resources().memoryGb(); i++) { g.fillRect(x, y - nodeHeight, nodeWidth, nodeHeight); y = y - (nodeHeight + 2); } } else { g.setColor(Color.YELLOW); - int multi = (int) node.flavor().resources().memoryGb(); + int multi = (int) node.resources().memoryGb(); int height = multi * nodeHeight + ((multi - 1) * 2); g.fillRect(x, y - height, nodeWidth, height); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DockerProvisioningTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DockerProvisioningTest.java index 3ffb0dc34f0..0c5a682c3c5 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DockerProvisioningTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DockerProvisioningTest.java @@ -60,7 +60,7 @@ public class DockerProvisioningTest { NodeList nodes = tester.getNodes(application1, Node.State.active); assertEquals(nodeCount, nodes.size()); - assertEquals(dockerResources, nodes.asList().get(0).flavor().resources()); + assertEquals(dockerResources, nodes.asList().get(0).resources()); // Upgrade Vespa version on nodes Version upgradedWantedVespaVersion = Version.fromString("6.40"); @@ -70,7 +70,7 @@ public class DockerProvisioningTest { tester.activate(application1, new HashSet<>(upgradedHosts)); NodeList upgradedNodes = tester.getNodes(application1, Node.State.active); assertEquals(nodeCount, upgradedNodes.size()); - assertEquals(dockerResources, upgradedNodes.asList().get(0).flavor().resources()); + assertEquals(dockerResources, upgradedNodes.asList().get(0).resources()); assertEquals(hosts, upgradedHosts); } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicDockerAllocationTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicDockerAllocationTest.java index 7350df40718..98ec01e8e95 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicDockerAllocationTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicDockerAllocationTest.java @@ -425,7 +425,7 @@ public class DynamicDockerAllocationTest { ); ClusterMembership clusterMembership1 = ClusterMembership.from( clusterSpec.with(Optional.of(ClusterSpec.Group.from(0))), index); // Need to 
add group here so that group is serialized in node allocation - Node node1aAllocation = node1a.allocate(id, clusterMembership1, node1a.flavor().resources(), Instant.now()); + Node node1aAllocation = node1a.allocate(id, clusterMembership1, node1a.resources(), Instant.now()); tester.nodeRepository().addNodes(Collections.singletonList(node1aAllocation), Agent.system); NestedTransaction transaction = new NestedTransaction().add(new CuratorTransaction(tester.getCurator())); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DockerHostCapacityTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/HostCapacityTest.java index aef25daa659..da78aff493e 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DockerHostCapacityTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/HostCapacityTest.java @@ -27,10 +27,10 @@ import static org.mockito.Mockito.mock; /** * @author smorgrav */ -public class DockerHostCapacityTest { +public class HostCapacityTest { private final HostResourcesCalculator hostResourcesCalculator = mock(HostResourcesCalculator.class); - private DockerHostCapacity capacity; + private HostCapacity capacity; private List<Node> nodes; private Node host1, host2, host3; private final NodeResources resources1 = new NodeResources(1, 30, 20, 1.5); @@ -61,7 +61,7 @@ public class DockerHostCapacityTest { // init docker host capacity nodes = new ArrayList<>(List.of(host1, host2, host3, nodeA, nodeB, nodeC, nodeD, nodeE)); - capacity = new DockerHostCapacity(new LockedNodeList(nodes, () -> {}), hostResourcesCalculator); + capacity = new HostCapacity(new LockedNodeList(nodes, () -> {}), hostResourcesCalculator); } @Test @@ -76,7 +76,7 @@ public class DockerHostCapacityTest { // Add a new node to host1 to deplete the memory resource Node nodeF = Node.createDockerNode(Set.of("::6"), "nodeF", "host1", resources1, NodeType.tenant); nodes.add(nodeF); - capacity = new DockerHostCapacity(new LockedNodeList(nodes, () -> {}), hostResourcesCalculator); + capacity = new HostCapacity(new LockedNodeList(nodes, () -> {}), hostResourcesCalculator); assertFalse(capacity.hasCapacity(host1, resources1)); assertFalse(capacity.hasCapacity(host1, resources2)); } @@ -116,12 +116,12 @@ public class DockerHostCapacityTest { var cfg = Node.createDockerNode(Set.of("::2"), "cfg", "devhost", resources1, NodeType.config); var nodes = new ArrayList<>(List.of(cfg)); - var capacity = new DockerHostCapacity(new LockedNodeList(nodes, () -> {}), hostResourcesCalculator); + var capacity = new HostCapacity(new LockedNodeList(nodes, () -> {}), hostResourcesCalculator); assertTrue(capacity.hasCapacity(devHost, resources1)); var container1 = Node.createDockerNode(Set.of("::3"), "container1", "devhost", resources1, NodeType.tenant); nodes = new ArrayList<>(List.of(cfg, container1)); - capacity = new DockerHostCapacity(new LockedNodeList(nodes, () -> {}), hostResourcesCalculator); + capacity = new HostCapacity(new LockedNodeList(nodes, () -> {}), hostResourcesCalculator); assertFalse(capacity.hasCapacity(devHost, resources1)); } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/InPlaceResizeProvisionTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/InPlaceResizeProvisionTest.java index b2ee298c19d..71c3ec37d65 100644 --- 
a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/InPlaceResizeProvisionTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/InPlaceResizeProvisionTest.java @@ -150,9 +150,9 @@ public class InPlaceResizeProvisionTest { assertEquals(6, appNodes.size()); // 4 nodes with large resources + 2 retired nodes with medium resources appNodes.forEach(node -> { if (node.allocation().get().membership().retired()) - assertEquals(new NodeResources(4, 8, 160, 1, fast, local), node.flavor().resources()); + assertEquals(new NodeResources(4, 8, 160, 1, fast, local), node.resources()); else - assertEquals(new NodeResources(8, 16, 320, 1, fast, local), node.flavor().resources()); + assertEquals(new NodeResources(8, 16, 320, 1, fast, local), node.resources()); initialHostnames.remove(node.hostname()); }); assertTrue("All initial nodes should still be allocated to the application", initialHostnames.isEmpty()); @@ -254,7 +254,7 @@ public class InPlaceResizeProvisionTest { private void assertSizeAndResources(NodeList nodes, int size, NodeResources resources) { assertEquals(size, nodes.size()); - nodes.forEach(n -> assertEquals(resources, n.flavor().resources())); + nodes.forEach(n -> assertEquals(resources, n.resources())); } private NodeList listCluster(ClusterSpec cluster) { diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/InfraDeployerImplTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/InfraDeployerImplTest.java index 48bd091011e..e45ea09d372 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/InfraDeployerImplTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/InfraDeployerImplTest.java @@ -140,7 +140,7 @@ public class InfraDeployerImplTest { Optional<Node> nodeWithAllocation = wantedVespaVersion.map(version -> { ClusterSpec clusterSpec = application.getClusterSpecWithVersion(version).with(Optional.of(ClusterSpec.Group.from(0))); ClusterMembership membership = ClusterMembership.from(clusterSpec, 1); - Allocation allocation = new Allocation(application.getApplicationId(), membership, node.flavor().resources(), Generation.initial(), false); + Allocation allocation = new Allocation(application.getApplicationId(), membership, node.resources(), Generation.initial(), false); return node.with(allocation); }); return nodeRepository.database().writeTo(state, nodeWithAllocation.orElse(node), Agent.system, Optional.empty()); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java index 2427c0303c6..e73aeb05ce3 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java @@ -262,8 +262,8 @@ public class ProvisioningTester { nodeList.stream().map(n -> n.allocation().get().membership().cluster().group().get()).distinct().count()); for (Node node : nodeList) { var expected = new NodeResources(vcpu, memory, disk, bandwidth, diskSpeed, storageType); - assertTrue(explanation + ": Resources: Expected " + expected + " but was " + node.flavor().resources(), - expected.compatibleWith(node.flavor().resources())); + assertTrue(explanation + ": Resources: Expected " + expected + " but was " + 
node.resources(), + expected.compatibleWith(node.resources())); } } @@ -658,7 +658,7 @@ public class ProvisioningTester { @Override public NodeResources realResourcesOf(Node node, NodeRepository nodeRepository) { - NodeResources resources = node.flavor().resources(); + NodeResources resources = node.resources(); if (node.type() == NodeType.host) return resources; return resources.withMemoryGb(resources.memoryGb() - memoryTaxGb) .withDiskGb(resources.diskGb() - ( resources.storageType() == local ? localDiskTax : 0)); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/maintenance.json b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/maintenance.json index e041a7b8b54..6bb30d90218 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/maintenance.json +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/maintenance.json @@ -4,9 +4,6 @@ "name": "AutoscalingMaintainer" }, { - "name": "CapacityReportMaintainer" - }, - { "name": "DirtyExpirer" }, { @@ -56,6 +53,9 @@ }, { "name":"ScalingSuggestionsMaintainer" + }, + { + "name": "SpareCapacityMaintainer" } ], "inactive": [
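
The spare-host selection this merge centralizes in HostCapacity.findSpareHosts is the heart of the change: take any list of nodes, keep only active hosts which still have free IPs, order them by free capacity (not counting retired or inactive children as used, per the javadoc in the diff above), and return the top N. Below is a minimal, self-contained sketch of that shape for reading alongside the diff. SimpleHost and its accessors are hypothetical stand-ins invented for illustration (requires Java 16+ for records); the real code operates on Node, freeIPs and freeCapacityOf as shown above.

import java.util.Comparator;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

// Illustrative stand-in for a host; not a real node-repository type.
record SimpleHost(String hostname, boolean active, int freeIps, double freeCapacity) { }

class SpareHostSketch {

    /** Returns up to count active hosts with free IPs, preferring those with the most free capacity. */
    static Set<SimpleHost> findSpareHosts(List<SimpleHost> candidates, int count) {
        return candidates.stream()
                         .filter(SimpleHost::active)          // only active hosts can serve as spares
                         .filter(host -> host.freeIps() > 0)  // a spare must be able to take on new children
                         .sorted(Comparator.comparingDouble(SimpleHost::freeCapacity).reversed())
                         .limit(count)
                         .collect(Collectors.toSet());
    }

    public static void main(String[] args) {
        var hosts = List.of(new SimpleHost("host1", true, 4, 10.0),
                            new SimpleHost("host2", true, 0, 50.0),   // no free IPs, so never a spare
                            new SimpleHost("host3", true, 8, 30.0),
                            new SimpleHost("host4", false, 8, 40.0)); // not active, so never a spare
        System.out.println(findSpareHosts(hosts, 2));                 // host3 and host1 (set order unspecified)
    }

}

Treating retired and inactive nodes as free capacity (the boolean argument to freeCapacityOf in the diff) is deliberate: as the javadoc notes, such nodes may have been retired precisely to create space for the spare, so charging them as used would hide the host's value as a spare.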