From 96f8d1dd19764c3851a7e98d38e3015347ec0364 Mon Sep 17 00:00:00 2001 From: Valerij Fredriksen Date: Fri, 14 Apr 2023 17:23:22 +0200 Subject: Do not fail if no capacity due to retirement --- .../hosted/provision/provisioning/GroupPreparer.java | 9 ++++++++- .../provision/provisioning/NodeAllocation.java | 9 +++++++-- .../hosted/provision/provisioning/NodeSpec.java | 20 +++++++++++++------- 3 files changed, 28 insertions(+), 10 deletions(-) (limited to 'node-repository/src/main/java/com/yahoo') diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java index eff39dddf42..06c1916dd4f 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java @@ -21,7 +21,6 @@ import java.util.function.Consumer; import java.util.function.Supplier; import java.util.logging.Level; import java.util.logging.Logger; -import java.util.stream.Collectors; /** * Performs preparation of node activation changes for a single host group in an application. @@ -124,6 +123,14 @@ public class GroupPreparer { hosts.forEach(host -> nodeRepository.nodes().deprovision(host.hostname(), Agent.system, nodeRepository.clock().instant())); throw e; } + } else if (allocation.hostDeficit().isPresent() && requestedNodes.canFail() && + allocation.hasRetiredJustNow() && requestedNodes instanceof NodeSpec.CountNodeSpec cns) { + // Non-dynamically provisioned zone with a deficit because we just now retired some nodes. + // Try again, but without retiring + indices.resetProbe(); + List accepted = prepareWithLocks(application, cluster, cns.withoutRetiring(), surplusActiveNodes, indices, wantedGroups); + log.warning("Prepared " + application + " " + cluster.id() + " without retirement due to lack of capacity"); + return accepted; } if (! allocation.fulfilled() && requestedNodes.canFail()) diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java index c6971f0fe02..3af63125474 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java @@ -307,10 +307,15 @@ class NodeAllocation { } /** Returns true if this allocation was already fulfilled and resulted in no new changes */ - public boolean fulfilledAndNoChanges() { + boolean fulfilledAndNoChanges() { return fulfilled() && reservableNodes().isEmpty() && newNodes().isEmpty(); } + /** Returns true if this allocation has retired nodes */ + boolean hasRetiredJustNow() { + return wasRetiredJustNow > 0; + } + /** * Returns {@link HostDeficit} describing the host deficit for the given {@link NodeSpec}. * @@ -451,7 +456,7 @@ class NodeAllocation { .toList(); } - public String allocationFailureDetails() { + String allocationFailureDetails() { List reasons = new ArrayList<>(); if (rejectedDueToExclusivity > 0) reasons.add("host exclusivity constraints"); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeSpec.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeSpec.java index 9edfa221abf..28d1e7c1c68 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeSpec.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeSpec.java @@ -3,8 +3,6 @@ package com.yahoo.vespa.hosted.provision.provisioning; import com.yahoo.config.provision.CloudAccount; import com.yahoo.config.provision.ClusterSpec; -import com.yahoo.config.provision.Flavor; -import com.yahoo.config.provision.NodeFlavors; import com.yahoo.config.provision.NodeResources; import com.yahoo.config.provision.NodeType; import com.yahoo.vespa.hosted.provision.Node; @@ -79,7 +77,7 @@ public interface NodeSpec { } static NodeSpec from(int nodeCount, NodeResources resources, boolean exclusive, boolean canFail, CloudAccount cloudAccount) { - return new CountNodeSpec(nodeCount, resources, exclusive, canFail, cloudAccount); + return new CountNodeSpec(nodeCount, resources, exclusive, canFail, canFail, cloudAccount); } static NodeSpec from(NodeType type, CloudAccount cloudAccount) { @@ -93,14 +91,19 @@ public interface NodeSpec { private final NodeResources requestedNodeResources; private final boolean exclusive; private final boolean canFail; + private final boolean considerRetiring; private final CloudAccount cloudAccount; - private CountNodeSpec(int count, NodeResources resources, boolean exclusive, boolean canFail, CloudAccount cloudAccount) { + private CountNodeSpec(int count, NodeResources resources, boolean exclusive, boolean canFail, boolean considerRetiring, CloudAccount cloudAccount) { this.count = count; this.requestedNodeResources = Objects.requireNonNull(resources, "Resources must be specified"); this.exclusive = exclusive; this.canFail = canFail; + this.considerRetiring = considerRetiring; this.cloudAccount = Objects.requireNonNull(cloudAccount); + + if (!canFail && considerRetiring) + throw new IllegalArgumentException("Cannot consider retiring nodes if we cannot fail"); } @Override @@ -127,8 +130,7 @@ public interface NodeSpec { @Override public boolean considerRetiring() { - // If we cannot fail we cannot retire as we may end up without sufficient replacement capacity - return canFail(); + return considerRetiring; } @Override @@ -143,7 +145,11 @@ public interface NodeSpec { @Override public NodeSpec fraction(int divisor) { - return new CountNodeSpec(count/divisor, requestedNodeResources, exclusive, canFail, cloudAccount); + return new CountNodeSpec(count/divisor, requestedNodeResources, exclusive, canFail, considerRetiring, cloudAccount); + } + + public NodeSpec withoutRetiring() { + return new CountNodeSpec(count, requestedNodeResources, exclusive, canFail, false, cloudAccount); } @Override -- cgit v1.2.3