diff options
Diffstat (limited to 'node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java')
-rw-r--r-- | node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java | 58 |
1 files changed, 44 insertions, 14 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java index 67dae48cff7..270fda8314e 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.provision.provisioning; import com.yahoo.component.Version; @@ -8,9 +8,13 @@ import com.yahoo.config.provision.ClusterSpec; import com.yahoo.config.provision.NodeAllocationException; import com.yahoo.config.provision.NodeResources; import com.yahoo.config.provision.NodeType; +import com.yahoo.config.provision.ApplicationMutex; import com.yahoo.jdisc.Metric; import com.yahoo.text.internal.SnippetGenerator; import com.yahoo.transaction.Mutex; +import com.yahoo.vespa.applicationmodel.InfrastructureApplication; +import com.yahoo.vespa.flags.BooleanFlag; +import com.yahoo.vespa.flags.Flags; import com.yahoo.vespa.hosted.provision.LockedNodeList; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeList; @@ -18,6 +22,7 @@ import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.Agent; import com.yahoo.vespa.hosted.provision.node.IP; import com.yahoo.vespa.hosted.provision.provisioning.HostProvisioner.HostSharing; +import com.yahoo.yolean.Exceptions; import java.util.LinkedHashSet; import java.util.List; @@ -44,12 +49,14 @@ public class Preparer { private final Optional<HostProvisioner> hostProvisioner; private final Optional<LoadBalancerProvisioner> loadBalancerProvisioner; private final ProvisioningThrottler throttler; + private final BooleanFlag makeExclusiveFlag; public Preparer(NodeRepository nodeRepository, Optional<HostProvisioner> hostProvisioner, Optional<LoadBalancerProvisioner> loadBalancerProvisioner, Metric metric) { this.nodeRepository = nodeRepository; this.hostProvisioner = hostProvisioner; this.loadBalancerProvisioner = loadBalancerProvisioner; this.throttler = new ProvisioningThrottler(nodeRepository, metric); + this.makeExclusiveFlag = Flags.MAKE_EXCLUSIVE.bindTo(nodeRepository.flagSource()); } /** @@ -69,11 +76,12 @@ public class Preparer { loadBalancerProvisioner.ifPresent(provisioner -> provisioner.prepare(application, cluster, requested)); + boolean makeExclusive = makeExclusiveFlag.value(); // Try preparing in memory without global unallocated lock. Most of the time there should be no changes, // and we can return nodes previously allocated. LockedNodeList allNodes = nodeRepository.nodes().list(PROBE_LOCK); NodeIndices indices = new NodeIndices(cluster.id(), allNodes); - NodeAllocation probeAllocation = prepareAllocation(application, cluster, requested, indices::probeNext, allNodes); + NodeAllocation probeAllocation = prepareAllocation(application, cluster, requested, indices::probeNext, allNodes, makeExclusive); if (probeAllocation.fulfilledAndNoChanges()) { List<Node> acceptedNodes = probeAllocation.finalNodes(); indices.commitProbe(); @@ -81,16 +89,25 @@ public class Preparer { } else { // There were some changes, so re-do the allocation with locks indices.resetProbe(); - return prepareWithLocks(application, cluster, requested, indices); + return prepareWithLocks(application, cluster, requested, indices, makeExclusive); } } + private ApplicationMutex parentLockOrNull(boolean makeExclusive, NodeType type) { + return NodeCandidate.canMakeHostExclusive(makeExclusive, type, nodeRepository.zone().cloud().allowHostSharing()) ? + nodeRepository.applications().lock(InfrastructureApplication.withNodeType(type.parentNodeType()).id()) : + null; + } + /// Note that this will write to the node repo. - private List<Node> prepareWithLocks(ApplicationId application, ClusterSpec cluster, NodeSpec requested, NodeIndices indices) { + private List<Node> prepareWithLocks(ApplicationId application, ClusterSpec cluster, NodeSpec requested, NodeIndices indices, boolean makeExclusive) { + Runnable waiter = null; + List<Node> acceptedNodes; try (Mutex lock = nodeRepository.applications().lock(application); + ApplicationMutex parentLockOrNull = parentLockOrNull(makeExclusive, requested.type()); Mutex allocationLock = nodeRepository.nodes().lockUnallocated()) { LockedNodeList allNodes = nodeRepository.nodes().list(allocationLock); - NodeAllocation allocation = prepareAllocation(application, cluster, requested, indices::next, allNodes); + NodeAllocation allocation = prepareAllocation(application, cluster, requested, indices::next, allNodes, makeExclusive); NodeType hostType = allocation.nodeType().hostType(); if (canProvisionDynamically(hostType) && allocation.hostDeficit().isPresent()) { HostSharing sharing = hostSharing(cluster, hostType); @@ -127,12 +144,13 @@ public class Preparer { requested.cloudAccount(), deficit.dueToFlavorUpgrade()); Predicate<NodeResources> realHostResourcesWithinLimits = resources -> nodeRepository.nodeResourceLimits().isWithinRealLimits(resources, application, cluster); - hostProvisioner.get().provisionHosts(request, realHostResourcesWithinLimits, whenProvisioned); + waiter = hostProvisioner.get().provisionHosts(request, realHostResourcesWithinLimits, whenProvisioned); } catch (NodeAllocationException e) { // Mark the nodes that were written to ZK in the consumer for deprovisioning. While these hosts do // not exist, we cannot remove them from ZK here because other nodes may already have been // allocated on them, so let HostDeprovisioner deal with it - hosts.forEach(host -> nodeRepository.nodes().deprovision(host.hostname(), Agent.system, nodeRepository.clock().instant())); + hosts.forEach(host -> nodeRepository.nodes().parkRecursively(host.hostname(), Agent.system, true, + "Failed to provision: " + Exceptions.toMessageString(e))); throw e; } } else if (allocation.hostDeficit().isPresent() && requested.canFail() && @@ -140,7 +158,7 @@ public class Preparer { // Non-dynamically provisioned zone with a deficit because we just now retired some nodes. // Try again, but without retiring indices.resetProbe(); - List<Node> accepted = prepareWithLocks(application, cluster, cns.withoutRetiring(), indices); + List<Node> accepted = prepareWithLocks(application, cluster, cns.withoutRetiring(), indices, makeExclusive); log.warning("Prepared " + application + " " + cluster.id() + " without retirement due to lack of capacity"); return accepted; } @@ -150,7 +168,12 @@ public class Preparer { allocation.allocationFailureDetails(), true); // Carry out and return allocation - List<Node> acceptedNodes = allocation.finalNodes(); + if (parentLockOrNull != null) { + List<Node> exclusiveParents = allocation.parentsRequiredToBeExclusive(); + nodeRepository.nodes().setExclusiveToApplicationId(exclusiveParents, parentLockOrNull); + // TODO: also update tags + } + acceptedNodes = allocation.finalNodes(); nodeRepository.nodes().reserve(allocation.reservableNodes()); nodeRepository.nodes().addReservedNodes(new LockedNodeList(allocation.newNodes(), allocationLock)); @@ -160,14 +183,16 @@ public class Preparer { .filter(node -> node.parentHostname().isEmpty() || activeHosts.parentOf(node).isPresent()) .toList(); } - return acceptedNodes; } + + if (waiter != null) waiter.run(); + return acceptedNodes; } private NodeAllocation prepareAllocation(ApplicationId application, ClusterSpec cluster, NodeSpec requested, - Supplier<Integer> nextIndex, LockedNodeList allNodes) { + Supplier<Integer> nextIndex, LockedNodeList allNodes, boolean makeExclusive) { validateAccount(requested.cloudAccount(), application, allNodes); - NodeAllocation allocation = new NodeAllocation(allNodes, application, cluster, requested, nextIndex, nodeRepository); + NodeAllocation allocation = new NodeAllocation(allNodes, application, cluster, requested, nextIndex, nodeRepository, makeExclusive); var allocationContext = IP.Allocation.Context.from(nodeRepository.zone().cloud().name(), requested.cloudAccount().isExclave(nodeRepository.zone()), nodeRepository.nameResolver()); @@ -176,10 +201,13 @@ public class Preparer { cluster, requested, nodeRepository.zone().cloud().dynamicProvisioning(), + nodeRepository.zone().cloud().allowHostSharing(), allocationContext, nodeRepository.nodes(), nodeRepository.resourcesCalculator(), - nodeRepository.spareCount()); + nodeRepository.spareCount(), + nodeRepository.exclusiveAllocation(cluster), + makeExclusive); allocation.offer(prioritizer.collect()); return allocation; } @@ -208,7 +236,9 @@ public class Preparer { private HostSharing hostSharing(ClusterSpec cluster, NodeType hostType) { if ( hostType.isSharable()) - return nodeRepository.exclusiveAllocation(cluster) ? HostSharing.exclusive : HostSharing.any; + return nodeRepository.exclusiveProvisioning(cluster) ? HostSharing.provision : + nodeRepository.exclusiveAllocation(cluster) ? HostSharing.exclusive : + HostSharing.any; else return HostSharing.any; } |