diff options
author | Jon Bratseth <bratseth@vespa.ai> | 2023-06-06 14:02:32 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@vespa.ai> | 2023-06-06 14:02:32 +0200 |
commit | 94cbae74bca37d7fb288af3d132f66b35d776b0f (patch) | |
tree | 726d85550ddee6d21021f13b2dc0be8edc703d36 /node-repository | |
parent | 5fa0497e8f56de637c8d5f3b14aa7245050a0072 (diff) |
Don't hard remove active nodes
Keep existing active nodes even if they are incompatible and not resizable
if we are not allowed to retire them.
Diffstat (limited to 'node-repository')
-rw-r--r-- | node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java | 47 |
1 files changed, 28 insertions, 19 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java index 4ef315b9322..e24b6fcf7f5 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java @@ -60,8 +60,11 @@ class NodeAllocation { /** The number of already allocated nodes accepted and not retired */ private int accepted = 0; - /** The number of already allocated nodes accepted and not retired and not needing resize */ - private int acceptedWithoutResizingRetired = 0; + /** The number of already allocated nodes of compatible size */ + private int acceptedAndCompatible = 0; + + /** The number of already allocated nodes which can be made compatible*/ + private int acceptedAndCompatibleOrResizable = 0; /** The number of nodes rejected because of clashing parentHostname */ private int rejectedDueToClashingParentHost = 0; @@ -127,9 +130,8 @@ class NodeAllocation { if (nodeRepository.zone().cloud().allowEnclave() && candidate.parent.isPresent() && ! candidate.parent.get().cloudAccount().equals(requestedNodes.cloudAccount())) continue; // wrong account boolean resizeable = requestedNodes.considerRetiring() && candidate.isResizable; - boolean acceptToRetire = acceptToRetire(candidate); - if ((! saturated() && hasCompatibleResources(candidate) && requestedNodes.acceptable(candidate)) || acceptToRetire) { + if ((! saturated() && hasCompatibleResources(candidate) && requestedNodes.acceptable(candidate)) || acceptIncompatible(candidate)) { candidate = candidate.withNode(); if (candidate.isValid()) acceptNode(candidate, shouldRetire(candidate, candidates), resizeable); @@ -225,23 +227,27 @@ class NodeAllocation { /** * Returns whether this node should be accepted into the cluster even if it is not currently desired * (already enough nodes, or wrong resources, etc.). - * Such nodes will be marked retired during finalization of the list of accepted nodes. + * Such nodes will be marked retired during finalization of the list of accepted nodes when allowed. * The conditions for this are: * - * This is a stateful node. These must always be retired before being removed to allow the cluster to + * - We are forced to accept since we cannot remove gracefully (bootstrap). + * + * - This is a stateful node. These must always be retired before being removed to allow the cluster to * migrate away data. * - * This is a container node and it is not desired due to having the wrong flavor. In this case this + * - This is a container node and it is not desired due to having the wrong flavor. In this case this * will (normally) obtain for all the current nodes in the cluster and so retiring before removing must * be used to avoid removing all the current nodes at once, before the newly allocated replacements are * initialized. (In the other case, where a container node is not desired because we have enough nodes we * do want to remove it immediately to get immediate feedback on how the size reduction works out.) */ - private boolean acceptToRetire(NodeCandidate candidate) { + private boolean acceptIncompatible(NodeCandidate candidate) { if (candidate.state() != Node.State.active) return false; if (! candidate.allocation().get().membership().cluster().group().equals(cluster.group())) return false; if (candidate.allocation().get().membership().retired()) return true; // don't second-guess if already retired - if (! requestedNodes.considerRetiring()) return false; + + if ( ! requestedNodes.considerRetiring()) // the node is active and we are not allowed to remove gracefully, so keep + return true; return cluster.isStateful() || (cluster.type() == ClusterSpec.Type.container && !hasCompatibleResources(candidate)); @@ -263,12 +269,15 @@ class NodeAllocation { // We want to allocate new nodes rather than unretiring with resize, so count without those // for the purpose of deciding when to stop accepting nodes (saturation) if (node.allocation().isEmpty() - || ! ( requestedNodes.needsResize(node) && node.allocation().get().membership().retired())) - acceptedWithoutResizingRetired++; + || ! ( requestedNodes.needsResize(node) && + (node.allocation().get().membership().retired() || ! requestedNodes.considerRetiring()))) { + acceptedAndCompatible++; + } + if (hasCompatibleResources(candidate)) + acceptedAndCompatibleOrResizable++; - if (resizeable && ! ( node.allocation().isPresent() && node.allocation().get().membership().retired())) { + if (resizeable && ! ( node.allocation().isPresent() && node.allocation().get().membership().retired())) node = resize(node); - } if (node.state() != Node.State.active) // reactivated node - wipe state that deactivated it node = node.unretire().removable(false); @@ -305,13 +314,13 @@ class NodeAllocation { } /** Returns true if no more nodes are needed in this list */ - private boolean saturated() { - return requestedNodes.saturatedBy(acceptedWithoutResizingRetired); + public boolean saturated() { + return requestedNodes.saturatedBy(acceptedAndCompatible); } /** Returns true if the content of this list is sufficient to meet the request */ boolean fulfilled() { - return requestedNodes.fulfilledBy(accepted()); + return requestedNodes.fulfilledBy(acceptedAndCompatibleOrResizable()); } /** Returns true if this allocation was already fulfilled and resulted in no new changes */ @@ -334,7 +343,7 @@ class NodeAllocation { if (nodeType().isHost()) { return Optional.empty(); // Hosts are provisioned as required by the child application } - int deficit = requestedNodes.fulfilledDeficitCount(accepted()); + int deficit = requestedNodes.fulfilledDeficitCount(acceptedAndCompatibleOrResizable()); // We can only require flavor upgrade if the entire deficit is caused by upgrades boolean dueToFlavorUpgrade = deficit == wasRetiredDueToFlavorUpgrade; return Optional.of(new HostDeficit(requestedNodes.resources().orElseGet(NodeResources::unspecified), @@ -445,8 +454,8 @@ class NodeAllocation { } /** Returns the number of nodes accepted this far */ - private int accepted() { - if (nodeType() == NodeType.tenant) return accepted; + private int acceptedAndCompatibleOrResizable() { + if (nodeType() == NodeType.tenant) return acceptedAndCompatibleOrResizable; // Infrastructure nodes are always allocated by type. Count all nodes as accepted so that we never exceed // the wanted number of nodes for the type. return allNodes.nodeType(nodeType()).size(); |