From 96f8d1dd19764c3851a7e98d38e3015347ec0364 Mon Sep 17 00:00:00 2001 From: Valerij Fredriksen Date: Fri, 14 Apr 2023 17:23:22 +0200 Subject: Do not fail if no capacity due to retirement --- .../hosted/provision/provisioning/GroupPreparer.java | 9 ++++++++- .../provision/provisioning/NodeAllocation.java | 9 +++++++-- .../hosted/provision/provisioning/NodeSpec.java | 20 +++++++++++++------- .../provision/provisioning/ProvisioningTest.java | 17 +++++++++++++++++ .../provisioning/VirtualNodeProvisioningTest.java | 14 +++++--------- 5 files changed, 50 insertions(+), 19 deletions(-) diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java index eff39dddf42..06c1916dd4f 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java @@ -21,7 +21,6 @@ import java.util.function.Consumer; import java.util.function.Supplier; import java.util.logging.Level; import java.util.logging.Logger; -import java.util.stream.Collectors; /** * Performs preparation of node activation changes for a single host group in an application. @@ -124,6 +123,14 @@ public class GroupPreparer { hosts.forEach(host -> nodeRepository.nodes().deprovision(host.hostname(), Agent.system, nodeRepository.clock().instant())); throw e; } + } else if (allocation.hostDeficit().isPresent() && requestedNodes.canFail() && + allocation.hasRetiredJustNow() && requestedNodes instanceof NodeSpec.CountNodeSpec cns) { + // Non-dynamically provisioned zone with a deficit because we just now retired some nodes. + // Try again, but without retiring + indices.resetProbe(); + List accepted = prepareWithLocks(application, cluster, cns.withoutRetiring(), surplusActiveNodes, indices, wantedGroups); + log.warning("Prepared " + application + " " + cluster.id() + " without retirement due to lack of capacity"); + return accepted; } if (! allocation.fulfilled() && requestedNodes.canFail()) diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java index c6971f0fe02..3af63125474 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java @@ -307,10 +307,15 @@ class NodeAllocation { } /** Returns true if this allocation was already fulfilled and resulted in no new changes */ - public boolean fulfilledAndNoChanges() { + boolean fulfilledAndNoChanges() { return fulfilled() && reservableNodes().isEmpty() && newNodes().isEmpty(); } + /** Returns true if this allocation has retired nodes */ + boolean hasRetiredJustNow() { + return wasRetiredJustNow > 0; + } + /** * Returns {@link HostDeficit} describing the host deficit for the given {@link NodeSpec}. * @@ -451,7 +456,7 @@ class NodeAllocation { .toList(); } - public String allocationFailureDetails() { + String allocationFailureDetails() { List reasons = new ArrayList<>(); if (rejectedDueToExclusivity > 0) reasons.add("host exclusivity constraints"); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeSpec.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeSpec.java index 9edfa221abf..28d1e7c1c68 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeSpec.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeSpec.java @@ -3,8 +3,6 @@ package com.yahoo.vespa.hosted.provision.provisioning; import com.yahoo.config.provision.CloudAccount; import com.yahoo.config.provision.ClusterSpec; -import com.yahoo.config.provision.Flavor; -import com.yahoo.config.provision.NodeFlavors; import com.yahoo.config.provision.NodeResources; import com.yahoo.config.provision.NodeType; import com.yahoo.vespa.hosted.provision.Node; @@ -79,7 +77,7 @@ public interface NodeSpec { } static NodeSpec from(int nodeCount, NodeResources resources, boolean exclusive, boolean canFail, CloudAccount cloudAccount) { - return new CountNodeSpec(nodeCount, resources, exclusive, canFail, cloudAccount); + return new CountNodeSpec(nodeCount, resources, exclusive, canFail, canFail, cloudAccount); } static NodeSpec from(NodeType type, CloudAccount cloudAccount) { @@ -93,14 +91,19 @@ public interface NodeSpec { private final NodeResources requestedNodeResources; private final boolean exclusive; private final boolean canFail; + private final boolean considerRetiring; private final CloudAccount cloudAccount; - private CountNodeSpec(int count, NodeResources resources, boolean exclusive, boolean canFail, CloudAccount cloudAccount) { + private CountNodeSpec(int count, NodeResources resources, boolean exclusive, boolean canFail, boolean considerRetiring, CloudAccount cloudAccount) { this.count = count; this.requestedNodeResources = Objects.requireNonNull(resources, "Resources must be specified"); this.exclusive = exclusive; this.canFail = canFail; + this.considerRetiring = considerRetiring; this.cloudAccount = Objects.requireNonNull(cloudAccount); + + if (!canFail && considerRetiring) + throw new IllegalArgumentException("Cannot consider retiring nodes if we cannot fail"); } @Override @@ -127,8 +130,7 @@ public interface NodeSpec { @Override public boolean considerRetiring() { - // If we cannot fail we cannot retire as we may end up without sufficient replacement capacity - return canFail(); + return considerRetiring; } @Override @@ -143,7 +145,11 @@ public interface NodeSpec { @Override public NodeSpec fraction(int divisor) { - return new CountNodeSpec(count/divisor, requestedNodeResources, exclusive, canFail, cloudAccount); + return new CountNodeSpec(count/divisor, requestedNodeResources, exclusive, canFail, considerRetiring, cloudAccount); + } + + public NodeSpec withoutRetiring() { + return new CountNodeSpec(count, requestedNodeResources, exclusive, canFail, false, cloudAccount); } @Override diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTest.java index 978edf3f7e4..2cd0e84c356 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTest.java @@ -756,6 +756,23 @@ public class ProvisioningTest { assertEquals(6, tester.nodeRepository().nodes().list(Node.State.active).owner(application).size()); } + @Test + public void ignore_retirement_if_no_capacity() { + ProvisioningTester tester = new ProvisioningTester.Builder().zone(new Zone(Environment.prod, RegionName.from("us-east"))).build(); + tester.makeReadyHosts(3, defaultResources).activateTenantHosts(); + + ApplicationId application = ProvisioningTester.applicationId(); + ClusterSpec cluster = ClusterSpec.request(ClusterSpec.Type.content, ClusterSpec.Id.from("music")).vespaVersion("4.5.6").build(); + tester.activate(application, tester.prepare(application, cluster, 3, 1, defaultResources)); + + // Mark the nodes as want to retire + NodeList nodes = tester.getNodes(application); + tester.patchNodes(nodes.asList(), node -> node.withWantToRetire(true, Agent.system, tester.clock().instant())); + + tester.activate(application, tester.prepare(application, cluster, 3, 1, defaultResources)); + assertEquals(3, tester.getNodes(application).state(Node.State.active).not().retired().size()); + } + @Test public void highest_node_indexes_are_retired_first() { ProvisioningTester tester = new ProvisioningTester.Builder().zone(new Zone(Environment.prod, RegionName.from("us-east"))).build(); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/VirtualNodeProvisioningTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/VirtualNodeProvisioningTest.java index fb773f19b8a..0744d82c85b 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/VirtualNodeProvisioningTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/VirtualNodeProvisioningTest.java @@ -34,7 +34,6 @@ import java.util.stream.Stream; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; @@ -171,7 +170,7 @@ public class VirtualNodeProvisioningTest { @Test public void indistinct_distribution_with_known_ready_nodes() { ProvisioningTester tester = new ProvisioningTester.Builder().build(); - tester.makeReadyChildren(3, resources1); + tester.makeReadyChildren(4, resources1); int contentNodeCount = 3; int groups = 1; @@ -191,13 +190,10 @@ public class VirtualNodeProvisioningTest { tester.makeReadyChildren(1, resources1, "parentHost1"); tester.makeReadyChildren(2, resources1, "parentHost2"); - NodeAllocationException expectedException = null; - try { - tester.prepare(applicationId, contentClusterSpec, contentNodeCount, groups, resources1); - } catch (NodeAllocationException e) { - expectedException = e; - } - assertNotNull(expectedException); + tester.activate(applicationId, tester.prepare(applicationId, contentClusterSpec, contentNodeCount, groups, resources1)); + nodes = tester.getNodes(applicationId, Node.State.active); + assertEquals(4, nodes.size()); + assertEquals(1, nodes.retired().size()); } @Test -- cgit v1.2.3