summary refs log tree commit diff stats
path: root/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java
diff options
context:
space:
mode:
Diffstat (limited to 'node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java')
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java 66
1 files changed, 39 insertions, 27 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java
index 73c9a1ab55a..3143fb5bcfe 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java
@@ -14,9 +14,7 @@ import com.yahoo.vespa.orchestrator.OrchestrationException;
import com.yahoo.yolean.Exceptions;
import java.time.Duration;
-import java.util.List;
import java.util.Map;
-import java.util.stream.Collectors;
/**
* Maintenance job which deactivates retired nodes, if given permission by orchestrator, or
@@ -53,42 +51,48 @@ public class RetiredExpirer extends NodeRepositoryMaintainer {
for (Map.Entry<ApplicationId, NodeList> entry : retiredNodesByApplication.entrySet()) {
ApplicationId application = entry.getKey();
NodeList retiredNodes = entry.getValue();
- List<Node> nodesToRemove = retiredNodes.stream().filter(n -> canRemove(n, activeNodes)).collect(Collectors.toList());
- if (nodesToRemove.isEmpty()) continue;
-
- attempts++;
- try (MaintenanceDeployment deployment = new MaintenanceDeployment(application, deployer, metric, nodeRepository())) {
- if ( ! deployment.isValid()) continue;
-
- nodeRepository().nodes().setRemovable(application, nodesToRemove);
- boolean success = deployment.activate().isPresent();
- if ( ! success) continue;
- String nodeList = nodesToRemove.stream().map(Node::hostname).collect(Collectors.joining(", "));
- log.info("Redeployed " + application + " to deactivate retired nodes: " + nodeList);
- successes++;
+ Map<Removal, NodeList> nodesByRemovalReason = retiredNodes.groupingBy(node -> removalOf(node, activeNodes));
+ if (nodesByRemovalReason.isEmpty()) continue;
+
+ for (var kv : nodesByRemovalReason.entrySet()) {
+ Removal removal = kv.getKey();
+ if (removal.equals(Removal.none())) continue;
+
+ NodeList nodes = kv.getValue();
+ attempts++;
+ try (MaintenanceDeployment deployment = new MaintenanceDeployment(application, deployer, metric, nodeRepository())) {
+ if (!deployment.isValid()) continue;
+
+ nodeRepository().nodes().setRemovable(application, nodes.asList(), removal.isReusable());
+ boolean success = deployment.activate().isPresent();
+ if (!success) continue;
+ String nodeList = String.join(", ", nodes.mapToList(Node::hostname));
+ log.info("Redeployed " + application + " to deactivate retired nodes: " + nodeList);
+ successes++;
+ }
}
}
return attempts == 0 ? 1.0 : ((double)successes / attempts);
}
/**
- * Checks if the node can be removed:
- * if the node is a host, it will only be removed if it has no children,
- * or all its children are parked or failed.
+ * Returns the removal action for given node.
+ *
+ * If the node is a host, it will only be removed if it has no children, or all its children are parked or failed.
+ *
* Otherwise, a removal is allowed if either of these are true:
* - The node has been in state {@link History.Event.Type#retired} for longer than {@link #retiredExpiry}
* - Orchestrator allows it
*/
- private boolean canRemove(Node node, NodeList activeNodes) {
+ private Removal removalOf(Node node, NodeList activeNodes) {
if (node.type().isHost()) {
if (nodeRepository().nodes().list().childrenOf(node).asList().stream()
.allMatch(child -> child.state() == Node.State.parked ||
child.state() == Node.State.failed)) {
log.info("Allowing removal of " + node + ": host has no non-parked/failed children");
- return true;
+ return Removal.reusable(); // Hosts have no state that needs to be recoverable
}
-
- return false;
+ return Removal.none();
}
if (node.type().isConfigServerLike()) {
@@ -114,24 +118,32 @@ public class RetiredExpirer extends NodeRepositoryMaintainer {
// with node states across all config servers. As this would require some work,
// we will instead verify here that there are 3 active config servers before
// allowing the removal of any config server.
- return false;
+ return Removal.none();
}
} else if (node.history().hasEventBefore(History.Event.Type.retired, clock().instant().minus(retiredExpiry))) {
log.warning("Node " + node + " has been retired longer than " + retiredExpiry + ": Allowing removal. This may cause data loss");
- return true;
+ return Removal.recoverable();
}
try {
nodeRepository().orchestrator().acquirePermissionToRemove(new HostName(node.hostname()));
log.info("Node " + node + " has been granted permission to be removed");
- return true;
+ return Removal.reusable(); // Node is fully retired
} catch (UncheckedTimeoutException e) {
log.warning("Timed out trying to acquire permission to remove " + node.hostname() + ": " + Exceptions.toMessageString(e));
- return false;
+ return Removal.none();
} catch (OrchestrationException e) {
log.info("Did not get permission to remove retired " + node + ": " + Exceptions.toMessageString(e));
- return false;
+ return Removal.none();
}
}
+ private record Removal(boolean isRemovable, boolean isReusable) { // Result of removalOf(): may the node be removed, and the reusable flag handed to setRemovable()
+
+ private static Removal recoverable() { return new Removal(true, false); } // Removable but not reusable; returned when the retirement expiry has passed
+ private static Removal reusable() { return new Removal(true, true); } // Removable and reusable; returned for hosts and orchestrator-approved nodes
+ private static Removal none() { return new Removal(false, false); } // Not removable; such nodes are skipped by the caller
+
+ }
+
}