diff options
author | Jon Bratseth <bratseth@oath.com> | 2020-09-11 14:35:09 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-09-11 14:35:09 +0200 |
commit | b9d08a9fe934e1d94a7402ad5e9f9b7cfd990633 (patch) | |
tree | 4866f351f2a09e01ec2f40ffa1845d1dbd60e5af | |
parent | ad371f1e4930d3a51d4b3f23e26095e2f0ada6a5 (diff) | |
parent | d74a7a32ec7eed3c65b27a723f9a60f773d53e8e (diff) |
Merge pull request #14380 from vespa-engine/freva/overcommit
[VESPA-18713] Retire nodes on overcommited hosts
6 files changed, 142 insertions, 100 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityChecker.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityChecker.java index 92abae46f9a..bbd763eea45 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityChecker.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityChecker.java @@ -4,11 +4,18 @@ package com.yahoo.vespa.hosted.provision.maintenance; import com.yahoo.config.provision.NodeResources; import com.yahoo.config.provision.NodeType; import com.yahoo.vespa.hosted.provision.Node; -import com.yahoo.vespa.hosted.provision.NodeRepository; +import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.node.Allocation; import com.yahoo.vespa.hosted.provision.provisioning.NodeResourceComparator; -import java.util.*; +import java.util.ArrayList; +import java.util.EnumSet; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; import java.util.function.Function; import java.util.stream.Collectors; @@ -17,16 +24,20 @@ import java.util.stream.Collectors; */ public class CapacityChecker { - private List<Node> hosts; - Map<String, Node> nodeMap; - private Map<Node, List<Node>> nodeChildren; - private Map<Node, AllocationResources> availableResources; + // We only care about nodes in one of these states. + private static final Set<Node.State> relevantNodeStates = EnumSet.of( + Node.State.active, Node.State.inactive, Node.State.dirty, Node.State.provisioned, Node.State.ready, Node.State.reserved); + + private final List<Node> hosts; + private final Map<String, Node> nodeMap; + private final Map<Node, List<Node>> nodeChildren; + private final Map<Node, AllocationResources> availableResources; public AllocationHistory allocationHistory = null; - public CapacityChecker(NodeRepository nodeRepository) { - this.hosts = getHosts(nodeRepository); - List<Node> tenants = getTenants(nodeRepository, hosts); + public CapacityChecker(NodeList allNodes) { + this.hosts = allNodes.hosts().state(relevantNodeStates).asList(); + List<Node> tenants = getTenants(allNodes, hosts); nodeMap = constructHostnameToNodeMap(hosts); this.nodeChildren = constructNodeChildrenMap(tenants, hosts, nodeMap); this.availableResources = constructAvailableResourcesMap(hosts, nodeChildren); @@ -37,17 +48,24 @@ public class CapacityChecker { } public Optional<HostFailurePath> worstCaseHostLossLeadingToFailure() { - Map<Node, Integer> timesNodeCanBeRemoved = computeMaximalRepeatedRemovals(hosts, nodeChildren, availableResources); - return greedyHeuristicFindFailurePath(timesNodeCanBeRemoved, hosts, nodeChildren, availableResources); + Map<Node, Integer> timesNodeCanBeRemoved = computeMaximalRepeatedRemovals(); + return greedyHeuristicFindFailurePath(timesNodeCanBeRemoved); } protected List<Node> findOvercommittedHosts() { - return findOvercommittedNodes(availableResources); + List<Node> overcommittedNodes = new ArrayList<>(); + for (var entry : availableResources.entrySet()) { + var resources = entry.getValue().nodeResources; + if (resources.vcpu() < 0 || resources.memoryGb() < 0 || resources.diskGb() < 0) { + overcommittedNodes.add(entry.getKey()); + } + } + return overcommittedNodes; } public List<Node> nodesFromHostnames(List<String> hostnames) { - List<Node> nodes = hostnames.stream().filter(h -> nodeMap.containsKey(h)) - .map(h -> nodeMap.get(h)) + List<Node> nodes = hostnames.stream().filter(nodeMap::containsKey) + .map(nodeMap::get) .collect(Collectors.toList()); if (nodes.size() != hostnames.size()) { @@ -61,38 +79,18 @@ public class CapacityChecker { } public Optional<HostFailurePath> findHostRemovalFailure(List<Node> hostsToRemove) { - var removal = findHostRemovalFailure(hostsToRemove, hosts, nodeChildren, availableResources); - if (removal.isEmpty()) return Optional.empty(); - HostFailurePath failurePath = new HostFailurePath(); - failurePath.hostsCausingFailure = hostsToRemove; - failurePath.failureReason = removal.get(); - return Optional.of(failurePath); - } - - // We only care about nodes in one of these states. - private static Node.State[] relevantNodeStates = { - Node.State.active, - Node.State.inactive, - Node.State.dirty, - Node.State.provisioned, - Node.State.ready, - Node.State.reserved - }; - - private List<Node> getHosts(NodeRepository nodeRepository) { - return nodeRepository.getNodes(NodeType.host, relevantNodeStates); + return findHostRemovalFailure(hostsToRemove, hosts, nodeChildren, availableResources) + .map(removal -> new HostFailurePath(hostsToRemove, removal)); } - private List<Node> getTenants(NodeRepository nodeRepository, List<Node> hosts) { + private List<Node> getTenants(NodeList allNodes, List<Node> hosts) { var parentNames = hosts.stream().map(Node::hostname).collect(Collectors.toSet()); - return nodeRepository.getNodes(NodeType.tenant, relevantNodeStates).stream() + return allNodes.nodeType(NodeType.tenant).state(relevantNodeStates).stream() .filter(t -> parentNames.contains(t.parentHostname().orElse(""))) .collect(Collectors.toList()); } - private Optional<HostFailurePath> greedyHeuristicFindFailurePath(Map<Node, Integer> heuristic, List<Node> hosts, - Map<Node, List<Node>> nodeChildren, - Map<Node, AllocationResources> availableResources) { + private Optional<HostFailurePath> greedyHeuristicFindFailurePath(Map<Node, Integer> heuristic) { if (hosts.size() == 0) return Optional.empty(); List<Node> parentRemovalPriorityList = heuristic.entrySet().stream() @@ -102,13 +100,8 @@ public class CapacityChecker { for (int i = 1; i <= parentRemovalPriorityList.size(); i++) { List<Node> hostsToRemove = parentRemovalPriorityList.subList(0, i); - var hostRemovalFailure = findHostRemovalFailure(hostsToRemove, hosts, nodeChildren, availableResources); - if (hostRemovalFailure.isPresent()) { - HostFailurePath failurePath = new HostFailurePath(); - failurePath.hostsCausingFailure = hostsToRemove; - failurePath.failureReason = hostRemovalFailure.get(); - return Optional.of(failurePath); - } + var hostRemovalFailure = findHostRemovalFailure(hostsToRemove); + if (hostRemovalFailure.isPresent()) return hostRemovalFailure; } throw new IllegalStateException("No path to failure found. This should be impossible!"); @@ -157,9 +150,7 @@ public class CapacityChecker { * Computes a heuristic for each host, with a lower score indicating a higher perceived likelihood that removing * the host causes an unrecoverable state */ - private Map<Node, Integer> computeMaximalRepeatedRemovals(List<Node> hosts, - Map<Node, List<Node>> nodeChildren, - Map<Node, AllocationResources> availableResources) { + private Map<Node, Integer> computeMaximalRepeatedRemovals() { Map<Node, Integer> timesNodeCanBeRemoved = hosts.stream().collect(Collectors.toMap(Function.identity(), __ -> Integer.MAX_VALUE)); for (Node host : hosts) { @@ -182,17 +173,6 @@ public class CapacityChecker { return timesNodeCanBeRemoved; } - private List<Node> findOvercommittedNodes(Map<Node, AllocationResources> availableResources) { - List<Node> overcommittedNodes = new ArrayList<>(); - for (var entry : availableResources.entrySet()) { - var resources = entry.getValue().nodeResources; - if (resources.vcpu() < 0 || resources.memoryGb() < 0 || resources.diskGb() < 0) { - overcommittedNodes.add(entry.getKey()); - } - } - return overcommittedNodes; - } - private Map<Node, List<Allocation>> collateAllocations(Map<Node, List<Node>> nodeChildren) { return nodeChildren.entrySet().stream().collect(Collectors.toMap( Map.Entry::getKey, @@ -341,8 +321,13 @@ public class CapacityChecker { */ public static class HostFailurePath { - public List<Node> hostsCausingFailure; - public HostRemovalFailure failureReason; + public final List<Node> hostsCausingFailure; + public final HostRemovalFailure failureReason; + + public HostFailurePath(List<Node> hostsCausingFailure, HostRemovalFailure failureReason) { + this.hostsCausingFailure = hostsCausingFailure; + this.failureReason = failureReason; + } @Override public String toString() { @@ -358,9 +343,9 @@ public class CapacityChecker { */ public static class HostRemovalFailure { - public Optional<Node> host; - public Optional<Node> tenant; - public AllocationFailureReasonList allocationFailures; + public final Optional<Node> host; + public final Optional<Node> tenant; + public final AllocationFailureReasonList allocationFailures; public static HostRemovalFailure none() { return new HostRemovalFailure(Optional.empty(), @@ -399,8 +384,8 @@ public class CapacityChecker { /** Used to describe the resources required for a tenant, and available to a host. */ private static class AllocationResources { - NodeResources nodeResources; - int availableIPs; + private final NodeResources nodeResources; + private final int availableIPs; public static AllocationResources from(Node node) { if (node.allocation().isPresent()) @@ -434,7 +419,7 @@ public class CapacityChecker { */ private static class AllocationFailureReason { - Node host; + private final Node host; public AllocationFailureReason (Node host) { this.host = host; } @@ -478,7 +463,7 @@ public class CapacityChecker { */ public static class AllocationFailureReasonList { - private List<AllocationFailureReason> allocationFailureReasons; + private final List<AllocationFailureReason> allocationFailureReasons; public AllocationFailureReasonList(List<AllocationFailureReason> allocationFailureReasons) { this.allocationFailureReasons = allocationFailureReasons; @@ -515,9 +500,9 @@ public class CapacityChecker { public static class AllocationHistory { public static class Entry { - public Node tenant; - public Node newParent; - public long eligibleParents; + public final Node tenant; + public final Node newParent; + public final long eligibleParents; public Entry(Node tenant, Node newParent, long eligibleParents) { this.tenant = tenant; @@ -536,7 +521,7 @@ public class CapacityChecker { } } - public List<Entry> historyEntries; + public final List<Entry> historyEntries; public AllocationHistory() { this.historyEntries = new ArrayList<>(); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java index 9b9c7df5d0d..b1d3551c1b6 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java @@ -63,7 +63,7 @@ public class Rebalancer extends NodeRepositoryMaintainer { metric.set("hostedVespa.docker.skew", totalSkew/hostCount, null); } - private boolean zoneIsStable(NodeList allNodes) { + static boolean zoneIsStable(NodeList allNodes) { NodeList active = allNodes.state(Node.State.active); if (active.stream().anyMatch(node -> node.allocation().get().membership().retired())) return false; if (active.stream().anyMatch(node -> node.status().wantToRetire())) return false; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java index ff9015863e9..0861473a7c0 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java @@ -1,6 +1,8 @@ // Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.provision.maintenance; +import com.yahoo.config.provision.ApplicationId; +import com.yahoo.config.provision.ClusterSpec; import com.yahoo.config.provision.Deployer; import com.yahoo.config.provision.NodeResources; import com.yahoo.jdisc.Metric; @@ -10,6 +12,7 @@ import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.maintenance.MaintenanceDeployment.Move; import com.yahoo.vespa.hosted.provision.node.Agent; import com.yahoo.vespa.hosted.provision.provisioning.HostCapacity; +import com.yahoo.vespa.hosted.provision.provisioning.NodeResourceComparator; import java.time.Duration; import java.util.ArrayList; @@ -67,15 +70,12 @@ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer { boolean success = true; if ( ! nodeRepository().zone().getCloud().allowHostSharing()) return success; - CapacityChecker capacityChecker = new CapacityChecker(nodeRepository()); + NodeList allNodes = nodeRepository().list(); + CapacityChecker capacityChecker = new CapacityChecker(allNodes); List<Node> overcommittedHosts = capacityChecker.findOvercommittedHosts(); - if (overcommittedHosts.size() != 0) { - log.log(Level.WARNING, String.format("%d nodes are overcommitted! [ %s ]", - overcommittedHosts.size(), - overcommittedHosts.stream().map(Node::hostname).collect(Collectors.joining(", ")))); - } metric.set("overcommittedHosts", overcommittedHosts.size(), null); + retireOvercommitedHosts(allNodes, overcommittedHosts); Optional<CapacityChecker.HostFailurePath> failurePath = capacityChecker.worstCaseHostLossLeadingToFailure(); if (failurePath.isPresent()) { @@ -133,6 +133,46 @@ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer { return shortestMitigation; } + private int retireOvercomittedComparator(Node n1, Node n2) { + ClusterSpec.Type t1 = n1.allocation().get().membership().cluster().type(); + ClusterSpec.Type t2 = n2.allocation().get().membership().cluster().type(); + + // Prefer to container nodes for faster retirement + if (t1 == ClusterSpec.Type.container && t2 != ClusterSpec.Type.container) return -1; + if (t1 != ClusterSpec.Type.container && t2 == ClusterSpec.Type.container) return 1; + + return NodeResourceComparator.memoryDiskCpuOrder().compare(n1.resources(), n2.resources()); + } + + private void retireOvercommitedHosts(NodeList allNodes, List<Node> overcommittedHosts) { + if (overcommittedHosts.isEmpty()) return; + log.log(Level.WARNING, String.format("%d hosts are overcommitted: %s", + overcommittedHosts.size(), + overcommittedHosts.stream().map(Node::hostname).collect(Collectors.joining(", ")))); + + if (!Rebalancer.zoneIsStable(allNodes)) return; + + // Find an active node on a overcommited host and retire it + Optional<Node> nodeToRetire = overcommittedHosts.stream().flatMap(parent -> allNodes.childrenOf(parent).stream()) + .filter(node -> node.state() == Node.State.active) + .min(this::retireOvercomittedComparator); + if (nodeToRetire.isEmpty()) return; + + ApplicationId application = nodeToRetire.get().allocation().get().owner(); + try (MaintenanceDeployment deployment = new MaintenanceDeployment(application, deployer, metric, nodeRepository())) { + if ( ! deployment.isValid()) return; // this will be done at another config server + + Optional<Node> nodeWithWantToRetire = nodeRepository().getNode(nodeToRetire.get().hostname()) + .map(node -> node.withWantToRetire(true, Agent.SpareCapacityMaintainer, nodeRepository().clock().instant())); + if (nodeWithWantToRetire.isEmpty()) return; + + nodeRepository().write(nodeWithWantToRetire.get(), deployment.applicationLock().get()); + log.log(Level.INFO, String.format("Redeploying %s to relocate %s from overcommited host", + application, nodeToRetire.get().hostname())); + deployment.activate(); + } + } + private static class CapacitySolver { private final HostCapacity hostCapacity; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/HostCapacityResponse.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/HostCapacityResponse.java index e28b03d7517..219fdaf9663 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/HostCapacityResponse.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/HostCapacityResponse.java @@ -28,7 +28,7 @@ public class HostCapacityResponse extends HttpResponse { public HostCapacityResponse(NodeRepository nodeRepository, HttpRequest request) { super(200); - capacityChecker = new CapacityChecker(nodeRepository); + capacityChecker = new CapacityChecker(nodeRepository.list()); json = request.getBooleanProperty("json"); String hostsJson = request.getProperty("hosts"); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityCheckerTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityCheckerTester.java index 59e0dad9720..ad3f620d9bd 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityCheckerTester.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityCheckerTester.java @@ -71,7 +71,7 @@ public class CapacityCheckerTester { } private void updateCapacityChecker() { - this.capacityChecker = new CapacityChecker(this.nodeRepository); + this.capacityChecker = new CapacityChecker(nodeRepository.list()); } List<NodeModel> createDistinctChildren(int amount, List<NodeResources> childResources) { diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainerTest.java index 6cd206cd5b9..5766a5bed01 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainerTest.java @@ -200,6 +200,22 @@ public class SpareCapacityMaintainerTest { assertEquals(0, tester.metric.values.get("spareHostCapacity")); } + @Test + public void retireFromOvercommitedHosts() { + var tester = new SpareCapacityMaintainerTester(5); + + tester.addHosts(7, new NodeResources(10, 100, 1000, 1)); + + tester.addNodes(0, 5, new NodeResources( 7, 70, 700, 0.7), 0); + tester.addNodes(1, 4, new NodeResources( 2, 20, 200, 0.2), 0); + tester.addNodes(2, 2, new NodeResources( 1.1, 10, 100, 0.1), 1); + + tester.maintainer.maintain(); + assertEquals(2, tester.metric.values.get("overcommittedHosts")); + assertEquals(1, tester.deployer.redeployments); + assertEquals(List.of(new NodeResources( 1.1, 10, 100, 0.1)), tester.nodeRepository.list().retired().mapToList(Node::resources)); + } + /** Microbenchmark */ @Test @Ignore @@ -267,22 +283,18 @@ public class SpareCapacityMaintainerTest { hosts.add(host); hostIndex++; } - hosts = nodeRepository.addNodes(hosts, Agent.system); - hosts = nodeRepository.setReady(hosts, Agent.system, "Test"); - var transaction = new NestedTransaction(); - nodeRepository.activate(hosts, transaction); - transaction.commit(); + + ApplicationId application = ApplicationId.from("vespa", "tenant-host", "default"); + ClusterSpec clusterSpec = ClusterSpec.specification(ClusterSpec.Type.content, ClusterSpec.Id.from("tenant-host")) + .group(ClusterSpec.Group.from(0)) + .vespaVersion("7") + .build(); + allocate(application, clusterSpec, hosts); } private void addNodes(int id, int count, NodeResources resources, int hostOffset) { List<Node> nodes = new ArrayList<>(); - ApplicationId application = ApplicationId.from("tenant" + id, "application" + id, "default"); for (int i = 0; i < count; i++) { - ClusterMembership membership = ClusterMembership.from(ClusterSpec.specification(ClusterSpec.Type.content, ClusterSpec.Id.from("cluster" + id)) - .group(ClusterSpec.Group.from(0)) - .vespaVersion("7") - .build(), - i); Node node = nodeRepository.createNode("node" + nodeIndex, "node" + nodeIndex + ".yahoo.com", ipConfig(hostIndex + nodeIndex, false), @@ -290,18 +302,23 @@ public class SpareCapacityMaintainerTest { new Flavor(resources), Optional.empty(), NodeType.tenant); - node = node.allocate(application, membership, node.resources(), Instant.now()); nodes.add(node); nodeIndex++; } + + ApplicationId application = ApplicationId.from("tenant" + id, "application" + id, "default"); + ClusterSpec clusterSpec = ClusterSpec.specification(ClusterSpec.Type.content, ClusterSpec.Id.from("cluster" + id)) + .group(ClusterSpec.Group.from(0)) + .vespaVersion("7") + .build(); + allocate(application, clusterSpec, nodes); + } + + private void allocate(ApplicationId application, ClusterSpec clusterSpec, List<Node> nodes) { nodes = nodeRepository.addNodes(nodes, Agent.system); - for (int i = 0; i < count; i++) { + for (int i = 0; i < nodes.size(); i++) { Node node = nodes.get(i); - ClusterMembership membership = ClusterMembership.from(ClusterSpec.specification(ClusterSpec.Type.content, ClusterSpec.Id.from("cluster" + id)) - .group(ClusterSpec.Group.from(0)) - .vespaVersion("7") - .build(), - i); + ClusterMembership membership = ClusterMembership.from(clusterSpec, i); node = node.allocate(application, membership, node.resources(), Instant.now()); nodes.set(i, node); } |