summaryrefslogtreecommitdiffstats
path: root/node-repository/src/main/java/com/yahoo
diff options
context:
space:
mode:
authorMartin Polden <mpolden@mpolden.no>2022-06-28 14:51:24 +0200
committerMartin Polden <mpolden@mpolden.no>2022-06-28 15:10:18 +0200
commit0552e8d348a5b5e6e3866afb101592b526084ade (patch)
tree104d247e0c94ede72c13afd48046cd116655900c /node-repository/src/main/java/com/yahoo
parentf6754a825beb5c9e7655d71b726de4428d10689d (diff)
Reuse fully retired nodes faster
Diffstat (limited to 'node-repository/src/main/java/com/yahoo')
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java7
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java8
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java2
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java66
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Allocation.java44
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java14
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java5
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Activator.java10
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java2
9 files changed, 98 insertions, 60 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java
index e91963ab621..cde7f300f2b 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java
@@ -304,7 +304,7 @@ public final class Node implements Nodelike {
/** Returns a copy of this with removable set to the given value */
public Node removable(boolean removable) {
- return with(requireAllocation("Cannot set removable").removable(removable));
+ return with(requireAllocation("Cannot set removable").removable(removable, false));
}
/** Returns a copy of this with the restart generation set to generation */
@@ -385,9 +385,8 @@ public final class Node implements Nodelike {
/** Returns a copy of this with allocation set as specified. <code>node.state</code> is *not* changed. */
public Node allocate(ApplicationId owner, ClusterMembership membership, NodeResources requestedResources, Instant at) {
- return this
- .with(new Allocation(owner, membership, requestedResources, new Generation(0, 0), false))
- .with(history.with(new History.Event(History.Event.Type.reserved, Agent.application, at)));
+ return this.with(new Allocation(owner, membership, requestedResources, new Generation(0, 0), false))
+ .with(history.with(new History.Event(History.Event.Type.reserved, Agent.application, at)));
}
/**
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java
index 9a110427223..dd4d5aa213f 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java
@@ -9,7 +9,6 @@ import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.config.provision.NodeResources;
import com.yahoo.config.provision.NodeType;
import com.yahoo.vespa.hosted.provision.node.ClusterId;
-import com.yahoo.vespa.hosted.provision.node.Report;
import java.util.Comparator;
import java.util.EnumSet;
@@ -57,7 +56,12 @@ public class NodeList extends AbstractFilteringList<Node, NodeList> {
/** Returns the subset of nodes which are removable */
public NodeList removable() {
- return matching(node -> node.allocation().isPresent() && node.allocation().get().isRemovable());
+ return matching(node -> node.allocation().isPresent() && node.allocation().get().removable());
+ }
+
+ /** Returns the subset of nodes which are reusable immediately after removal */
+ public NodeList reusable() {
+ return matching(node -> node.allocation().isPresent() && node.allocation().get().reusable());
}
/** Returns the subset of nodes having exactly the given resources */
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
index 9623406767a..c2e66d39861 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
@@ -115,7 +115,7 @@ public class Autoscaler {
// The cluster is processing recent changes
if (clusterNodes.stream().anyMatch(node -> node.status().wantToRetire() ||
node.allocation().get().membership().retired() ||
- node.allocation().get().isRemovable()))
+ node.allocation().get().removable()))
return false;
// A deployment is ongoing
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java
index 73c9a1ab55a..3143fb5bcfe 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java
@@ -14,9 +14,7 @@ import com.yahoo.vespa.orchestrator.OrchestrationException;
import com.yahoo.yolean.Exceptions;
import java.time.Duration;
-import java.util.List;
import java.util.Map;
-import java.util.stream.Collectors;
/**
* Maintenance job which deactivates retired nodes, if given permission by orchestrator, or
@@ -53,42 +51,48 @@ public class RetiredExpirer extends NodeRepositoryMaintainer {
for (Map.Entry<ApplicationId, NodeList> entry : retiredNodesByApplication.entrySet()) {
ApplicationId application = entry.getKey();
NodeList retiredNodes = entry.getValue();
- List<Node> nodesToRemove = retiredNodes.stream().filter(n -> canRemove(n, activeNodes)).collect(Collectors.toList());
- if (nodesToRemove.isEmpty()) continue;
-
- attempts++;
- try (MaintenanceDeployment deployment = new MaintenanceDeployment(application, deployer, metric, nodeRepository())) {
- if ( ! deployment.isValid()) continue;
-
- nodeRepository().nodes().setRemovable(application, nodesToRemove);
- boolean success = deployment.activate().isPresent();
- if ( ! success) continue;
- String nodeList = nodesToRemove.stream().map(Node::hostname).collect(Collectors.joining(", "));
- log.info("Redeployed " + application + " to deactivate retired nodes: " + nodeList);
- successes++;
+ Map<Removal, NodeList> nodesByRemovalReason = retiredNodes.groupingBy(node -> removalOf(node, activeNodes));
+ if (nodesByRemovalReason.isEmpty()) continue;
+
+ for (var kv : nodesByRemovalReason.entrySet()) {
+ Removal removal = kv.getKey();
+ if (removal.equals(Removal.none())) continue;
+
+ NodeList nodes = kv.getValue();
+ attempts++;
+ try (MaintenanceDeployment deployment = new MaintenanceDeployment(application, deployer, metric, nodeRepository())) {
+ if (!deployment.isValid()) continue;
+
+ nodeRepository().nodes().setRemovable(application, nodes.asList(), removal.isReusable());
+ boolean success = deployment.activate().isPresent();
+ if (!success) continue;
+ String nodeList = String.join(", ", nodes.mapToList(Node::hostname));
+ log.info("Redeployed " + application + " to deactivate retired nodes: " + nodeList);
+ successes++;
+ }
}
}
return attempts == 0 ? 1.0 : ((double)successes / attempts);
}
/**
- * Checks if the node can be removed:
- * if the node is a host, it will only be removed if it has no children,
- * or all its children are parked or failed.
+ * Returns the removal action for given node.
+ *
+ * If the node is a host, it will only be removed if it has no children, or all its children are parked or failed.
+ *
* Otherwise, a removal is allowed if either of these are true:
* - The node has been in state {@link History.Event.Type#retired} for longer than {@link #retiredExpiry}
* - Orchestrator allows it
*/
- private boolean canRemove(Node node, NodeList activeNodes) {
+ private Removal removalOf(Node node, NodeList activeNodes) {
if (node.type().isHost()) {
if (nodeRepository().nodes().list().childrenOf(node).asList().stream()
.allMatch(child -> child.state() == Node.State.parked ||
child.state() == Node.State.failed)) {
log.info("Allowing removal of " + node + ": host has no non-parked/failed children");
- return true;
+ return Removal.reusable(); // Hosts have no state that needs to be recoverable
}
-
- return false;
+ return Removal.none();
}
if (node.type().isConfigServerLike()) {
@@ -114,24 +118,32 @@ public class RetiredExpirer extends NodeRepositoryMaintainer {
// with node states across all config servers. As this would require some work,
// we will instead verify here that there are 3 active config servers before
// allowing the removal of any config server.
- return false;
+ return Removal.none();
}
} else if (node.history().hasEventBefore(History.Event.Type.retired, clock().instant().minus(retiredExpiry))) {
log.warning("Node " + node + " has been retired longer than " + retiredExpiry + ": Allowing removal. This may cause data loss");
- return true;
+ return Removal.recoverable();
}
try {
nodeRepository().orchestrator().acquirePermissionToRemove(new HostName(node.hostname()));
log.info("Node " + node + " has been granted permission to be removed");
- return true;
+ return Removal.reusable(); // Node is fully retired
} catch (UncheckedTimeoutException e) {
log.warning("Timed out trying to acquire permission to remove " + node.hostname() + ": " + Exceptions.toMessageString(e));
- return false;
+ return Removal.none();
} catch (OrchestrationException e) {
log.info("Did not get permission to remove retired " + node + ": " + Exceptions.toMessageString(e));
- return false;
+ return Removal.none();
}
}
+ private record Removal(boolean isRemovable, boolean isReusable) {
+
+ private static Removal recoverable() { return new Removal(true, false); }
+ private static Removal reusable() { return new Removal(true, true); }
+ private static Removal none() { return new Removal(false, false); }
+
+ }
+
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Allocation.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Allocation.java
index 9c71d621c69..ec1c6cd4b1a 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Allocation.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Allocation.java
@@ -27,25 +27,33 @@ public class Allocation {
*/
private final Generation restartGeneration;
- /** This node can (and should) be removed from the cluster on the next deployment */
+ /** This node should be removed from the cluster on the next deployment */
private final boolean removable;
+ /** This node can be reused (dirtied) immediately after being removed from the cluster */
+ private final boolean reusable;
+
private final Optional<NetworkPorts> networkPorts;
public Allocation(ApplicationId owner, ClusterMembership clusterMembership, NodeResources requestedResources,
Generation restartGeneration, boolean removable) {
- this(owner, clusterMembership, requestedResources, restartGeneration, removable, Optional.empty());
+ this(owner, clusterMembership, requestedResources, restartGeneration, removable, false, Optional.empty());
}
public Allocation(ApplicationId owner, ClusterMembership clusterMembership, NodeResources requestedResources,
- Generation restartGeneration, boolean removable, Optional<NetworkPorts> networkPorts) {
+ Generation restartGeneration, boolean removable, boolean reusable,
+ Optional<NetworkPorts> networkPorts) {
this.owner = owner;
this.clusterMembership = clusterMembership;
this.requestedResources = requestedResources;
this.restartGeneration = restartGeneration;
this.removable = removable;
+ this.reusable = reusable;
this.networkPorts = networkPorts;
+ if (!removable && reusable) {
+ throw new IllegalArgumentException("Allocation must be removable in order to be reusable");
+ }
}
/** Returns the id of the application this is allocated to */
@@ -65,37 +73,45 @@ public class Allocation {
/** Returns a copy of this which is retired */
public Allocation retire() {
- return new Allocation(owner, clusterMembership.retire(), requestedResources, restartGeneration, removable, networkPorts);
+ return new Allocation(owner, clusterMembership.retire(), requestedResources, restartGeneration, removable, reusable, networkPorts);
}
/** Returns a copy of this which is not retired */
public Allocation unretire() {
- return new Allocation(owner, clusterMembership.unretire(), requestedResources, restartGeneration, removable, networkPorts);
+ return new Allocation(owner, clusterMembership.unretire(), requestedResources, restartGeneration, removable, reusable, networkPorts);
}
- /** Return whether this node is ready to be removed from the application */
- public boolean isRemovable() { return removable; }
+ /** Returns whether this node is ready to be removed from the application */
+ public boolean removable() { return removable; }
+
+ /** Returns whether this node has fully retired and can be reused immediately */
+ public boolean reusable() {
+ return reusable;
+ }
public Allocation withRequestedResources(NodeResources resources) {
- return new Allocation(owner, clusterMembership, resources, restartGeneration, removable, networkPorts);
+ return new Allocation(owner, clusterMembership, resources, restartGeneration, removable, reusable, networkPorts);
}
/** Returns a copy of this with the current restart generation set to generation */
public Allocation withRestart(Generation generation) {
- return new Allocation(owner, clusterMembership, requestedResources, generation, removable, networkPorts);
+ return new Allocation(owner, clusterMembership, requestedResources, generation, removable, reusable, networkPorts);
}
- /** Returns a copy of this allocation where removable is set to the given value */
- public Allocation removable(boolean removable) {
- return new Allocation(owner, clusterMembership, requestedResources, restartGeneration, removable, networkPorts);
+ /**
+ * Returns a copy of this allocation where removable and reusable are set to the given values. A node which is
+ * reusable may be moved directly to {@link com.yahoo.vespa.hosted.provision.Node.State#dirty} after removal.
+ */
+ public Allocation removable(boolean removable, boolean reusable) {
+ return new Allocation(owner, clusterMembership, requestedResources, restartGeneration, removable, reusable, networkPorts);
}
public Allocation with(ClusterMembership newMembership) {
- return new Allocation(owner, newMembership, requestedResources, restartGeneration, removable, networkPorts);
+ return new Allocation(owner, newMembership, requestedResources, restartGeneration, removable, reusable, networkPorts);
}
public Allocation withNetworkPorts(NetworkPorts ports) {
- return new Allocation(owner, clusterMembership, requestedResources, restartGeneration, removable, Optional.of(ports));
+ return new Allocation(owner, clusterMembership, requestedResources, restartGeneration, removable, reusable, Optional.of(ports));
}
@Override
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java
index 11d3e03e494..d750a3ef737 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java
@@ -226,13 +226,14 @@ public class Nodes {
* Sets a list of nodes to have their allocation removable (active to inactive) in the node repository.
*
* @param application the application the nodes belong to
- * @param nodes the nodes to make removable. These nodes MUST be in the active state.
+ * @param nodes the nodes to make removable. These nodes MUST be in the active state
+ * @param reusable move the node directly to {@link Node.State#dirty} after removal
*/
- public void setRemovable(ApplicationId application, List<Node> nodes) {
+ public void setRemovable(ApplicationId application, List<Node> nodes, boolean reusable) {
try (Mutex lock = lock(application)) {
List<Node> removableNodes = nodes.stream()
- .map(node -> node.with(node.allocation().get().removable(true)))
- .collect(Collectors.toList());
+ .map(node -> node.with(node.allocation().get().removable(true, reusable)))
+ .toList();
write(removableNodes, lock);
}
}
@@ -247,9 +248,12 @@ public class Nodes {
var stateless = NodeList.copyOf(nodes).stateless();
var stateful = NodeList.copyOf(nodes).stateful();
+ var statefulToInactive = stateful.not().reusable();
+ var statefulToDirty = stateful.reusable();
List<Node> written = new ArrayList<>();
written.addAll(deallocate(stateless.asList(), Agent.application, "Deactivated by application", transaction.nested()));
- written.addAll(db.writeTo(Node.State.inactive, stateful.asList(), Agent.application, Optional.empty(), transaction.nested()));
+ written.addAll(deallocate(statefulToDirty.asList(), Agent.application, "Deactivated by application (recycled)", transaction.nested()));
+ written.addAll(db.writeTo(Node.State.inactive, statefulToInactive.asList(), Agent.application, Optional.empty(), transaction.nested()));
return written;
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java
index 083b707b3c5..24c52f7b2e0 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java
@@ -115,6 +115,7 @@ public class NodeSerializer {
private static final String restartGenerationKey = "restartGeneration";
private static final String currentRestartGenerationKey = "currentRestartGeneration";
private static final String removableKey = "removable";
+ private static final String reusableKey = "reusable";
// Saved as part of allocation instead of serviceId, since serviceId serialized form is not easily extendable.
private static final String wantedVespaVersionKey = "wantedVespaVersion";
private static final String wantedContainerImageRepoKey = "wantedDockerImageRepo";
@@ -217,7 +218,8 @@ public class NodeSerializer {
object.setString(serviceIdKey, allocation.membership().stringValue());
object.setLong(restartGenerationKey, allocation.restartGeneration().wanted());
object.setLong(currentRestartGenerationKey, allocation.restartGeneration().current());
- object.setBool(removableKey, allocation.isRemovable());
+ object.setBool(removableKey, allocation.removable());
+ object.setBool(reusableKey, allocation.reusable());
object.setString(wantedVespaVersionKey, allocation.membership().cluster().vespaVersion().toString());
allocation.membership().cluster().dockerImageRepo().ifPresent(repo -> object.setString(wantedContainerImageRepoKey, repo.untagged()));
allocation.networkPorts().ifPresent(ports -> NetworkPortsSerializer.toSlime(ports, object.setArray(networkPortsKey)));
@@ -330,6 +332,7 @@ public class NodeSerializer {
.orElse(assignedResources),
generationFromSlime(object, restartGenerationKey, currentRestartGenerationKey),
object.field(removableKey).asBool(),
+ object.field(reusableKey).asBool(),
NetworkPortsSerializer.fromSlime(object.field(networkPortsKey))));
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Activator.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Activator.java
index 3da0506f2e1..5d73c4929af 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Activator.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Activator.java
@@ -88,8 +88,7 @@ class Activator {
validateParentHosts(application, allNodes, reserved);
NodeList activeToRemove = oldActive.matching(node -> ! hostnames.contains(node.hostname()));
- activeToRemove = NodeList.copyOf(activeToRemove.mapToList(Node::unretire)); // only active nodes can be retired. TODO: Move this line to deactivate
- deactivate(activeToRemove, transaction); // TODO: Pass activation time in this call and next line
+ remove(activeToRemove, transaction); // TODO: Pass activation time in this call and next line
nodeRepository.nodes().activate(newActive.asList(), transaction.nested()); // activate also continued active to update node state
rememberResourceChange(transaction, generation, activationTime,
@@ -99,9 +98,10 @@ class Activator {
return newActive;
}
- private void deactivate(NodeList toDeactivate, ApplicationTransaction transaction) {
- nodeRepository.nodes().deactivate(toDeactivate.not().failing().asList(), transaction);
- nodeRepository.nodes().fail(toDeactivate.failing().asList(), transaction);
+ private void remove(NodeList nodes, ApplicationTransaction transaction) {
+ nodes = NodeList.copyOf(nodes.mapToList(Node::unretire)); // clear retire flag when moving to non-active state
+ nodeRepository.nodes().deactivate(nodes.not().failing().asList(), transaction);
+ nodeRepository.nodes().fail(nodes.failing().asList(), transaction);
}
private void rememberResourceChange(ApplicationTransaction transaction, long generation, Instant at,
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java
index a26df62be27..9557354725b 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java
@@ -112,7 +112,7 @@ class NodeAllocation {
if ( ! allocation.owner().equals(application)) continue; // wrong application
if ( ! membership.cluster().satisfies(cluster)) continue; // wrong cluster id/type
if ((! candidate.isSurplus || saturated()) && ! membership.cluster().group().equals(cluster.group())) continue; // wrong group and we can't or have no reason to change it
- if ( candidate.state() == Node.State.active && allocation.isRemovable()) continue; // don't accept; causes removal
+ if ( candidate.state() == Node.State.active && allocation.removable()) continue; // don't accept; causes removal
if ( candidate.state() == Node.State.active && candidate.wantToFail()) continue; // don't accept; causes failing
if ( indexes.contains(membership.index())) continue; // duplicate index (just to be sure)