diff options
author | Håkon Hallingstad <hakon@yahooinc.com> | 2023-01-11 17:32:21 +0100 |
---|---|---|
committer | Håkon Hallingstad <hakon@yahooinc.com> | 2023-01-11 17:32:21 +0100 |
commit | f65f9bb46d80d312683824c6811546c6d1b7161b (patch) | |
tree | ff984615aa12aff931be23055582ca2205395505 /node-repository | |
parent | e420f57dc74c53fb64bb6bf60b5180676b070b50 (diff) |
Also park wantToRetire/wantToDeprovision in FailedExpirer
Diffstat (limited to 'node-repository')
-rw-r--r-- | node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java | 33 |
1 files changed, 21 insertions, 12 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java index d506eb6e3d3..fa3f9435c70 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java @@ -14,9 +14,9 @@ import com.yahoo.vespa.hosted.provision.node.History; import java.time.Duration; import java.util.ArrayList; import java.util.List; +import java.util.Optional; import java.util.function.Predicate; import java.util.logging.Logger; -import java.util.stream.Collectors; /** * This moves expired failed nodes: @@ -95,18 +95,20 @@ public class FailedExpirer extends NodeRepositoryMaintainer { private void recycle(List<Node> nodes, NodeList allNodes) { List<Node> nodesToRecycle = new ArrayList<>(); for (Node candidate : nodes) { - if (broken(candidate, allNodes)) { - List<String> unparkedChildren = !candidate.type().isHost() ? List.of() : + Optional<String> reason = shouldPark(candidate, allNodes); + if (reason.isPresent()) { + List<String> unparkedChildren = candidate.type().isHost() ? allNodes.childrenOf(candidate) - .not().state(Node.State.parked) - .mapToList(Node::hostname); + .not() + .state(Node.State.parked) + .mapToList(Node::hostname) : + List.of(); if (unparkedChildren.isEmpty()) { nodeRepository.nodes().park(candidate.hostname(), true, Agent.FailedExpirer, - "Parked by FailedExpirer due to hardware issue or high fail count"); + "Parked by FailedExpirer due to " + reason.get()); } else { - log.info(String.format("Expired failed node %s with hardware issue was not parked because of " + - "unparked children: %s", + log.info(String.format("Expired failed node %s was not parked because of unparked children: %s", candidate.hostname(), String.join(", ", unparkedChildren))); } } else { @@ -116,10 +118,17 @@ public class FailedExpirer extends NodeRepositoryMaintainer { nodeRepository.nodes().deallocate(nodesToRecycle, Agent.FailedExpirer, "Expired by FailedExpirer"); } - /** Returns whether node is broken and cannot be recycled */ - private boolean broken(Node node, NodeList allNodes) { - return NodeFailer.hasHardwareIssue(node, allNodes) || - (node.type().isHost() && node.status().failCount() >= maxAllowedFailures); + /** Returns whether the node should be parked instead of recycled */ + private Optional<String> shouldPark(Node node, NodeList allNodes) { + if (NodeFailer.hasHardwareIssue(node, allNodes)) + return Optional.of("has hardware issues"); + if (node.type().isHost() && node.status().failCount() >= maxAllowedFailures) + return Optional.of("has failed too many times"); + if (node.status().wantToDeprovision()) + return Optional.of("want to deprovision"); + if (node.status().wantToRetire()) + return Optional.of("want to retire"); + return Optional.empty(); } } |