diff options
author | HÃ¥kon Hallingstad <hakon.hallingstad@gmail.com> | 2023-10-26 09:55:49 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-10-26 09:55:49 +0200 |
commit | 8079a8769915d740474493a8c23623adfecdfdc9 (patch) | |
tree | 337c0f310e800c2a166d4a5a39f2800166383ce6 | |
parent | 01431e4953faa89955978113c67887fa19ce8a4e (diff) | |
parent | 17d7a900c9273fe41d1d5d4d5f112adeb49bb590 (diff) |
Merge pull request #29105 from vespa-engine/freva/optimize
Dynamic provisioning improvements
9 files changed, 46 insertions, 27 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainer.java index 7155a49ef58..be6c420c63b 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainer.java @@ -229,15 +229,19 @@ public class HostCapacityMaintainer extends NodeRepositoryMaintainer { sharingMode, clusterType.map(ClusterSpec.Type::valueOf), Optional.empty(), nodeRepository().zone().cloud().account(), false); List<Node> hosts = new ArrayList<>(); - hostProvisioner.provisionHosts(request, - resources -> true, - provisionedHosts -> { - hosts.addAll(provisionedHosts.stream() - .map(host -> host.generateHost(Duration.ZERO)) - .map(host -> host.withExclusiveToApplicationId(null)) - .toList()); - nodeRepository().nodes().addNodes(hosts, Agent.HostCapacityMaintainer); - }); + Runnable waiter; + try (var lock = nodeRepository().nodes().lockUnallocated()) { + waiter = hostProvisioner.provisionHosts(request, + resources -> true, + provisionedHosts -> { + hosts.addAll(provisionedHosts.stream() + .map(host -> host.generateHost(Duration.ZERO)) + .map(host -> host.withExclusiveToApplicationId(null)) + .toList()); + nodeRepository().nodes().addNodes(hosts, Agent.HostCapacityMaintainer); + }); + } + waiter.run(); return hosts; } catch (NodeAllocationException | IllegalArgumentException | IllegalStateException e) { throw new NodeAllocationException("Failed to provision " + count + " " + nodeResources + ": " + e.getMessage(), diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisioner.java index 1d459f7dd14..2e3c6d1755a 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisioner.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisioner.java @@ -61,10 +61,13 @@ public class HostResumeProvisioner extends NodeRepositoryMaintainer { log.log(Level.INFO, "Could not provision " + host.hostname() + ", will retry in " + interval() + ": " + Exceptions.toMessageString(e)); } catch (FatalProvisioningException e) { + // FatalProvisioningException is thrown if node is not found in the cloud, allow for + // some time for the state to propagate + if (host.history().age(clock().instant()).getSeconds() < 30) continue; failures++; log.log(Level.SEVERE, "Failed to provision " + host.hostname() + ", failing out the host recursively", e); - nodeRepository().nodes().failOrMarkRecursively( - host.hostname(), Agent.HostResumeProvisioner, "Failed by HostResumeProvisioner due to provisioning failure"); + nodeRepository().nodes().parkRecursively( + host.hostname(), Agent.HostResumeProvisioner, true, "Failed by HostResumeProvisioner due to provisioning failure"); } catch (RuntimeException e) { if (e.getCause() instanceof NamingException) log.log(Level.INFO, "Could not provision " + host.hostname() + ", will retry in " + interval() + ": " + Exceptions.toMessageString(e)); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ProvisionedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ProvisionedExpirer.java index 2d513390cf5..24901cb10a9 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ProvisionedExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ProvisionedExpirer.java @@ -38,10 +38,9 @@ public class ProvisionedExpirer extends Expirer { for (Node expiredNode : expired) { if (expiredNode.type() != NodeType.host) continue; - nodeRepository().nodes().parkRecursively(expiredNode.hostname(), Agent.ProvisionedExpirer, "Node is stuck in provisioned"); - if (MAXIMUM_ALLOWED_EXPIRED_HOSTS < ++previouslyExpired) { - nodeRepository.nodes().deprovision(expiredNode.hostname(), Agent.ProvisionedExpirer, nodeRepository.clock().instant()); - } + boolean forceDeprovision = MAXIMUM_ALLOWED_EXPIRED_HOSTS < ++previouslyExpired; + nodeRepository().nodes().parkRecursively(expiredNode.hostname(), Agent.ProvisionedExpirer, + forceDeprovision, "Node is stuck in provisioned"); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java index c352dca1656..5c0614dd8d6 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java @@ -403,8 +403,8 @@ public class Nodes { * * @return List of all the parked nodes in their new state */ - public List<Node> parkRecursively(String hostname, Agent agent, String reason) { - return moveRecursively(hostname, Node.State.parked, agent, Optional.of(reason)); + public List<Node> parkRecursively(String hostname, Agent agent, boolean forceDeprovision, String reason) { + return moveRecursively(hostname, Node.State.parked, agent, forceDeprovision, Optional.of(reason)); } /** @@ -433,12 +433,12 @@ public class Nodes { } } - private List<Node> moveRecursively(String hostname, Node.State toState, Agent agent, Optional<String> reason) { + private List<Node> moveRecursively(String hostname, Node.State toState, Agent agent, boolean forceDeprovision, Optional<String> reason) { try (RecursiveNodeMutexes locked = lockAndGetRecursively(hostname, Optional.empty())) { List<Node> moved = new ArrayList<>(); NestedTransaction transaction = new NestedTransaction(); for (NodeMutex node : locked.nodes().nodes()) - moved.add(move(node.node().hostname(), toState, agent, false, reason, transaction)); + moved.add(move(node.node().hostname(), toState, agent, forceDeprovision, reason, transaction)); transaction.commit(); return moved; } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/HostProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/HostProvisioner.java index a876999e80b..38cbfa7fe5f 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/HostProvisioner.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/HostProvisioner.java @@ -50,8 +50,10 @@ public interface HostProvisioner { * written to ZK immediately in case the config server goes down while waiting * for the provisioning to finish. * @throws NodeAllocationException if the cloud provider cannot satisfy the request + * @return a runnable that waits for the provisioning request to finish. It can be run without holding any locks, + * but may fail with an exception that should be propagated to the user initiating prepare() */ - void provisionHosts(HostProvisionRequest request, Predicate<NodeResources> realHostResourcesWithinLimits, Consumer<List<ProvisionedHost>> whenProvisioned) throws NodeAllocationException; + Runnable provisionHosts(HostProvisionRequest request, Predicate<NodeResources> realHostResourcesWithinLimits, Consumer<List<ProvisionedHost>> whenProvisioned) throws NodeAllocationException; /** * Continue provisioning of given list of Nodes. diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java index fad42449285..17ff3b99e0c 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java @@ -22,6 +22,7 @@ import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.Agent; import com.yahoo.vespa.hosted.provision.node.IP; import com.yahoo.vespa.hosted.provision.provisioning.HostProvisioner.HostSharing; +import com.yahoo.yolean.Exceptions; import java.util.LinkedHashSet; import java.util.List; @@ -108,6 +109,8 @@ public class Preparer { /// Note that this will write to the node repo. private List<Node> prepareWithLocks(ApplicationId application, ClusterSpec cluster, NodeSpec requested, NodeIndices indices, boolean makeExclusive) { + Runnable waiter = null; + List<Node> acceptedNodes; try (Mutex lock = nodeRepository.applications().lock(application); ApplicationMutex parentLockOrNull = parentLockOrNull(makeExclusive, requested.type()); Mutex allocationLock = nodeRepository.nodes().lockUnallocated()) { @@ -149,12 +152,13 @@ public class Preparer { requested.cloudAccount(), deficit.dueToFlavorUpgrade()); Predicate<NodeResources> realHostResourcesWithinLimits = resources -> nodeRepository.nodeResourceLimits().isWithinRealLimits(resources, application, cluster); - hostProvisioner.get().provisionHosts(request, realHostResourcesWithinLimits, whenProvisioned); + waiter = hostProvisioner.get().provisionHosts(request, realHostResourcesWithinLimits, whenProvisioned); } catch (NodeAllocationException e) { // Mark the nodes that were written to ZK in the consumer for deprovisioning. While these hosts do // not exist, we cannot remove them from ZK here because other nodes may already have been // allocated on them, so let HostDeprovisioner deal with it - hosts.forEach(host -> nodeRepository.nodes().deprovision(host.hostname(), Agent.system, nodeRepository.clock().instant())); + hosts.forEach(host -> nodeRepository.nodes().parkRecursively(host.hostname(), Agent.system, true, + "Failed to provision: " + Exceptions.toMessageString(e))); throw e; } } else if (allocation.hostDeficit().isPresent() && requested.canFail() && @@ -177,7 +181,7 @@ public class Preparer { nodeRepository.nodes().setExclusiveToApplicationId(exclusiveParents, parentLockOrNull); // TODO: also update tags } - List<Node> acceptedNodes = allocation.finalNodes(); + acceptedNodes = allocation.finalNodes(); nodeRepository.nodes().reserve(allocation.reservableNodes()); nodeRepository.nodes().addReservedNodes(new LockedNodeList(allocation.newNodes(), allocationLock)); @@ -187,8 +191,10 @@ public class Preparer { .filter(node -> node.parentHostname().isEmpty() || activeHosts.parentOf(node).isPresent()) .toList(); } - return acceptedNodes; } + + if (waiter != null) waiter.run(); + return acceptedNodes; } private NodeAllocation prepareAllocation(ApplicationId application, ClusterSpec cluster, NodeSpec requested, diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java index 0d9666e4f26..9080030f026 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java @@ -153,7 +153,7 @@ public class NodesV2ApiHandler extends ThreadedHttpRequestHandler { " and marked " + hostnamesAsString(failedOrMarkedNodes.failing().asList()) + " as wantToFail"); } else if (path.matches("/nodes/v2/state/parked/{hostname}")) { - List<Node> parkedNodes = nodeRepository.nodes().parkRecursively(path.get("hostname"), agent(request), "Parked through the nodes/v2 API"); + List<Node> parkedNodes = nodeRepository.nodes().parkRecursively(path.get("hostname"), agent(request), false, "Parked through the nodes/v2 API"); return new MessageResponse("Moved " + hostnamesAsString(parkedNodes) + " to " + Node.State.parked); } else if (path.matches("/nodes/v2/state/dirty/{hostname}")) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java index f7710ca7019..b5bb91af71a 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java @@ -73,7 +73,7 @@ public class MockHostProvisioner implements HostProvisioner { } @Override - public void provisionHosts(HostProvisionRequest request, Predicate<NodeResources> realHostResourcesWithinLimits, Consumer<List<ProvisionedHost>> whenProvisioned) throws NodeAllocationException { + public Runnable provisionHosts(HostProvisionRequest request, Predicate<NodeResources> realHostResourcesWithinLimits, Consumer<List<ProvisionedHost>> whenProvisioned) throws NodeAllocationException { if (behaviour(Behaviour.failProvisionRequest)) throw new NodeAllocationException("No capacity for provision request", true); Flavor hostFlavor = hostFlavors.get(request.clusterType().orElse(ClusterSpec.Type.content)); if (hostFlavor == null) @@ -101,6 +101,7 @@ public class MockHostProvisioner implements HostProvisioner { } provisionedHosts.addAll(hosts); whenProvisioned.accept(hosts); + return () -> {}; } @Override diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisionerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisionerTest.java index 77986d03da2..74db071b060 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisionerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisionerTest.java @@ -98,7 +98,11 @@ public class HostResumeProvisionerTest { Stream.of(host, node).map(n -> n.ipConfig().primary()).allMatch(List::isEmpty)); hostResumeProvisioner.maintain(); - assertEquals(Set.of("host100", "host100-1"), tester.nodeRepository().nodes().list(Node.State.failed).hostnames()); + assertEquals(Set.of(), tester.nodeRepository().nodes().list(Node.State.parked).deprovisioning().hostnames()); + tester.clock().advance(Duration.ofSeconds(60)); + + hostResumeProvisioner.maintain(); + assertEquals(Set.of("host100", "host100-1"), tester.nodeRepository().nodes().list(Node.State.parked).deprovisioning().hostnames()); } private void deployApplication() { |