diff options
Diffstat (limited to 'node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java')
-rw-r--r-- | node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java | 66 |
1 files changed, 39 insertions, 27 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java index 73c9a1ab55a..3143fb5bcfe 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java @@ -14,9 +14,7 @@ import com.yahoo.vespa.orchestrator.OrchestrationException; import com.yahoo.yolean.Exceptions; import java.time.Duration; -import java.util.List; import java.util.Map; -import java.util.stream.Collectors; /** * Maintenance job which deactivates retired nodes, if given permission by orchestrator, or @@ -53,42 +51,48 @@ public class RetiredExpirer extends NodeRepositoryMaintainer { for (Map.Entry<ApplicationId, NodeList> entry : retiredNodesByApplication.entrySet()) { ApplicationId application = entry.getKey(); NodeList retiredNodes = entry.getValue(); - List<Node> nodesToRemove = retiredNodes.stream().filter(n -> canRemove(n, activeNodes)).collect(Collectors.toList()); - if (nodesToRemove.isEmpty()) continue; - - attempts++; - try (MaintenanceDeployment deployment = new MaintenanceDeployment(application, deployer, metric, nodeRepository())) { - if ( ! deployment.isValid()) continue; - - nodeRepository().nodes().setRemovable(application, nodesToRemove); - boolean success = deployment.activate().isPresent(); - if ( ! success) continue; - String nodeList = nodesToRemove.stream().map(Node::hostname).collect(Collectors.joining(", ")); - log.info("Redeployed " + application + " to deactivate retired nodes: " + nodeList); - successes++; + Map<Removal, NodeList> nodesByRemovalReason = retiredNodes.groupingBy(node -> removalOf(node, activeNodes)); + if (nodesByRemovalReason.isEmpty()) continue; + + for (var kv : nodesByRemovalReason.entrySet()) { + Removal removal = kv.getKey(); + if (removal.equals(Removal.none())) continue; + + NodeList nodes = kv.getValue(); + attempts++; + try (MaintenanceDeployment deployment = new MaintenanceDeployment(application, deployer, metric, nodeRepository())) { + if (!deployment.isValid()) continue; + + nodeRepository().nodes().setRemovable(application, nodes.asList(), removal.isReusable()); + boolean success = deployment.activate().isPresent(); + if (!success) continue; + String nodeList = String.join(", ", nodes.mapToList(Node::hostname)); + log.info("Redeployed " + application + " to deactivate retired nodes: " + nodeList); + successes++; + } } } return attempts == 0 ? 1.0 : ((double)successes / attempts); } /** - * Checks if the node can be removed: - * if the node is a host, it will only be removed if it has no children, - * or all its children are parked or failed. + * Returns the removal action for given node. + * + * If the node is a host, it will only be removed if it has no children, or all its children are parked or failed. + * * Otherwise, a removal is allowed if either of these are true: * - The node has been in state {@link History.Event.Type#retired} for longer than {@link #retiredExpiry} * - Orchestrator allows it */ - private boolean canRemove(Node node, NodeList activeNodes) { + private Removal removalOf(Node node, NodeList activeNodes) { if (node.type().isHost()) { if (nodeRepository().nodes().list().childrenOf(node).asList().stream() .allMatch(child -> child.state() == Node.State.parked || child.state() == Node.State.failed)) { log.info("Allowing removal of " + node + ": host has no non-parked/failed children"); - return true; + return Removal.reusable(); // Hosts have no state that needs to be recoverable } - - return false; + return Removal.none(); } if (node.type().isConfigServerLike()) { @@ -114,24 +118,32 @@ public class RetiredExpirer extends NodeRepositoryMaintainer { // with node states across all config servers. As this would require some work, // we will instead verify here that there are 3 active config servers before // allowing the removal of any config server. - return false; + return Removal.none(); } } else if (node.history().hasEventBefore(History.Event.Type.retired, clock().instant().minus(retiredExpiry))) { log.warning("Node " + node + " has been retired longer than " + retiredExpiry + ": Allowing removal. This may cause data loss"); - return true; + return Removal.recoverable(); } try { nodeRepository().orchestrator().acquirePermissionToRemove(new HostName(node.hostname())); log.info("Node " + node + " has been granted permission to be removed"); - return true; + return Removal.reusable(); // Node is fully retired } catch (UncheckedTimeoutException e) { log.warning("Timed out trying to acquire permission to remove " + node.hostname() + ": " + Exceptions.toMessageString(e)); - return false; + return Removal.none(); } catch (OrchestrationException e) { log.info("Did not get permission to remove retired " + node + ": " + Exceptions.toMessageString(e)); - return false; + return Removal.none(); } } + private record Removal(boolean isRemovable, boolean isReusable) { + + private static Removal recoverable() { return new Removal(true, false); } + private static Removal reusable() { return new Removal(true, true); } + private static Removal none() { return new Removal(false, false); } + + } + } |