summary | refs | log | tree | commit | diff | stats
path: root/node-repository
diff options
context:
space:
mode:
author: Valerij Fredriksen <freva@users.noreply.github.com> 2023-01-12 00:09:15 +0100
committer: GitHub <noreply@github.com> 2023-01-12 00:09:15 +0100
commit: 844eeeeebfd8cdffb28ee7d64e05a803aa2f0e5a (patch)
tree: e1a8316c52b9bcf417340b341d342487f8d0f39c /node-repository
parent: 65f655bac61810a39c964fb3eb72f35f0c0154fb (diff)
parent: f65f9bb46d80d312683824c6811546c6d1b7161b (diff)
Merge pull request #25520 from vespa-engine/hakonhall/also-park-wanttoretirewanttodeprovision-in-failedexpirer
Also park wantToRetire/wantToDeprovision in FailedExpirer
Diffstat (limited to 'node-repository')
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java | 33
1 files changed, 21 insertions, 12 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
index d506eb6e3d3..fa3f9435c70 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
@@ -14,9 +14,9 @@ import com.yahoo.vespa.hosted.provision.node.History;
import java.time.Duration;
import java.util.ArrayList;
import java.util.List;
+import java.util.Optional;
import java.util.function.Predicate;
import java.util.logging.Logger;
-import java.util.stream.Collectors;
/**
* This moves expired failed nodes:
@@ -95,18 +95,20 @@ public class FailedExpirer extends NodeRepositoryMaintainer {
private void recycle(List<Node> nodes, NodeList allNodes) {
List<Node> nodesToRecycle = new ArrayList<>();
for (Node candidate : nodes) {
- if (broken(candidate, allNodes)) {
- List<String> unparkedChildren = !candidate.type().isHost() ? List.of() :
+ Optional<String> reason = shouldPark(candidate, allNodes);
+ if (reason.isPresent()) {
+ List<String> unparkedChildren = candidate.type().isHost() ?
allNodes.childrenOf(candidate)
- .not().state(Node.State.parked)
- .mapToList(Node::hostname);
+ .not()
+ .state(Node.State.parked)
+ .mapToList(Node::hostname) :
+ List.of();
if (unparkedChildren.isEmpty()) {
nodeRepository.nodes().park(candidate.hostname(), true, Agent.FailedExpirer,
- "Parked by FailedExpirer due to hardware issue or high fail count");
+ "Parked by FailedExpirer due to " + reason.get());
} else {
- log.info(String.format("Expired failed node %s with hardware issue was not parked because of " +
- "unparked children: %s",
+ log.info(String.format("Expired failed node %s was not parked because of unparked children: %s",
candidate.hostname(), String.join(", ", unparkedChildren)));
}
} else {
@@ -116,10 +118,17 @@ public class FailedExpirer extends NodeRepositoryMaintainer {
nodeRepository.nodes().deallocate(nodesToRecycle, Agent.FailedExpirer, "Expired by FailedExpirer");
}
- /** Returns whether node is broken and cannot be recycled */
- private boolean broken(Node node, NodeList allNodes) {
- return NodeFailer.hasHardwareIssue(node, allNodes) ||
- (node.type().isHost() && node.status().failCount() >= maxAllowedFailures);
+ /** Returns whether the node should be parked instead of recycled */
+ private Optional<String> shouldPark(Node node, NodeList allNodes) {
+ if (NodeFailer.hasHardwareIssue(node, allNodes))
+ return Optional.of("has hardware issues");
+ if (node.type().isHost() && node.status().failCount() >= maxAllowedFailures)
+ return Optional.of("has failed too many times");
+ if (node.status().wantToDeprovision())
+ return Optional.of("want to deprovision");
+ if (node.status().wantToRetire())
+ return Optional.of("want to retire");
+ return Optional.empty();
}
}