From f8400289c573467829db1d9fffa370d727b0487b Mon Sep 17 00:00:00 2001 From: Martin Polden Date: Mon, 25 May 2020 16:14:35 +0200 Subject: Split provisioning logic --- .../maintenance/DynamicProvisioningMaintainer.java | 108 +++++++++++++-------- 1 file changed, 65 insertions(+), 43 deletions(-) (limited to 'node-repository/src/main/java') diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java index 9e1c5b1d630..040ab99389d 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java @@ -22,7 +22,7 @@ import com.yahoo.vespa.hosted.provision.provisioning.ProvisionedHost; import com.yahoo.yolean.Exceptions; import java.time.Duration; -import java.util.Collection; +import java.util.ArrayList; import java.util.Comparator; import java.util.Iterator; import java.util.List; @@ -60,13 +60,13 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer { try (Mutex lock = nodeRepository().lockUnallocated()) { NodeList nodes = nodeRepository().list(); - - updateProvisioningNodes(nodes, lock); + resumeProvisioning(nodes, lock); convergeToCapacity(nodes); } } - private void updateProvisioningNodes(NodeList nodes, Mutex lock) { + /** Resume provisioning of already provisioned hosts and their children */ + private void resumeProvisioning(NodeList nodes, Mutex lock) { Map> nodesByProvisionedParentHostname = nodes.nodeType(NodeType.tenant).asList().stream() .filter(node -> node.parentHostname().isPresent()) .collect(Collectors.groupingBy( @@ -80,10 +80,10 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer { nodeRepository().write(updatedNodes, lock); } catch (IllegalArgumentException | IllegalStateException e) { log.log(Level.INFO, "Failed to provision " + host.hostname() + " with " + children.size() + " children: " + - Exceptions.toMessageString(e)); + Exceptions.toMessageString(e)); } catch (FatalProvisioningException e) { log.log(Level.SEVERE, "Failed to provision " + host.hostname() + " with " + children.size() + - " children, failing out the host recursively", e); + " children, failing out the host recursively", e); // Fail out as operator to force a quick redeployment nodeRepository().failRecursively( host.hostname(), Agent.operator, "Failed by HostProvisioner due to provisioning failure"); @@ -93,31 +93,47 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer { }); } + /** Converge zone to wanted capacity */ private void convergeToCapacity(NodeList nodes) { - Collection removableHosts = getRemovableHosts(nodes); - List preProvisionCapacity = preprovisionCapacityFlag.value().stream() - .flatMap(cap -> { - NodeResources resources = new NodeResources(cap.getVcpu(), cap.getMemoryGb(), cap.getDiskGb(), 1); - return IntStream.range(0, cap.getCount()).mapToObj(i -> resources); - }) - .sorted(NodeResourceComparator.memoryDiskCpuOrder().reversed()) - .collect(Collectors.toList()); + List removableHosts = removableHostsOf(nodes); + List excessHosts = preprovisionCapacity(removableHosts); + + excessHosts.forEach(host -> { + try { + hostProvisioner.deprovision(host); + nodeRepository().removeRecursively(host, true); + } catch (RuntimeException e) { + log.log(Level.WARNING, "Failed to deprovision " + host.hostname() + ", will retry in " + interval(), e); + } + }); + } - for (Iterator it = preProvisionCapacity.iterator(); it.hasNext() && !removableHosts.isEmpty();) { + /** + * Provision spare capacity according to removable hosts. + * + * @return Excess hosts eligible for deprovisioning. + */ + private List preprovisionCapacity(List removableHosts) { + List excessHosts = new ArrayList<>(removableHosts); + List spareCapacity = preprovisionedCapacity(); + + // Keep one removable host, if that host satisfies the capacity requirement. This results in one host being + // empty most of the time. + for (Iterator it = spareCapacity.iterator(); it.hasNext() && !excessHosts.isEmpty(); ) { NodeResources resources = it.next(); - removableHosts.stream() - .filter(nodeRepository()::canAllocateTenantNodeTo) - .filter(host -> nodeRepository().resourcesCalculator().advertisedResourcesOf(host.flavor()).satisfies(resources)) - .min(Comparator.comparingInt(n -> n.flavor().cost())) - .ifPresent(host -> { - removableHosts.remove(host); - it.remove(); - }); + excessHosts.stream() + .filter(nodeRepository()::canAllocateTenantNodeTo) + .filter(host -> nodeRepository().resourcesCalculator().advertisedResourcesOf(host.flavor()).satisfies(resources)) + .min(Comparator.comparingInt(n -> n.flavor().cost())) + .ifPresent(host -> { + excessHosts.remove(host); + it.remove(); + }); } - // pre-provisioning is best effort, do one host at a time - preProvisionCapacity.forEach(resources -> { + // Pre-provisioning is best effort, do one host at a time + spareCapacity.forEach(resources -> { try { Version osVersion = nodeRepository().osVersions().targetFor(NodeType.host).orElse(Version.emptyVersion); List hosts = hostProvisioner.provisionHosts(nodeRepository().database().getProvisionIndexes(1), @@ -133,30 +149,36 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer { } }); - // Finally, deprovision excess hosts. - removableHosts.forEach(host -> { - try { - hostProvisioner.deprovision(host); - nodeRepository().removeRecursively(host, true); - } catch (RuntimeException e) { - log.log(Level.WARNING, "Failed to deprovision " + host.hostname() + ", will retry in " + interval(), e); - } - }); + return excessHosts; + } + + /** Returns the preprovisioned capacity that should be available in this zone, if any */ + private List preprovisionedCapacity() { + return preprovisionCapacityFlag.value().stream() + .flatMap(cap -> { + NodeResources resources = new NodeResources(cap.getVcpu(), cap.getMemoryGb(), + cap.getDiskGb(), 1); + return IntStream.range(0, cap.getCount()).mapToObj(i -> resources); + }) + .sorted(NodeResourceComparator.memoryDiskCpuOrder().reversed()) + .collect(Collectors.toList()); } - private static Collection getRemovableHosts(NodeList nodes) { + /** Returns hosts that are candidates for removal, e.g. hosts that have no containers or are failed */ + private static List removableHostsOf(NodeList nodes) { Map hostsByHostname = nodes.nodeType(NodeType.host) - .asList().stream() - .filter(host -> host.state() != Node.State.parked || host.status().wantToDeprovision()) - .collect(Collectors.toMap(Node::hostname, Function.identity())); + .matching(host -> host.state() != Node.State.parked || + host.status().wantToDeprovision()) + .stream() + .collect(Collectors.toMap(Node::hostname, Function.identity())); nodes.asList().stream() - .filter(node -> node.allocation().isPresent()) - .flatMap(node -> node.parentHostname().stream()) - .distinct() - .forEach(hostsByHostname::remove); + .filter(node -> node.allocation().isPresent()) + .flatMap(node -> node.parentHostname().stream()) + .distinct() + .forEach(hostsByHostname::remove); - return hostsByHostname.values(); + return List.copyOf(hostsByHostname.values()); } } -- cgit v1.2.3