summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Martin Polden <mpolden@mpolden.no> 2022-02-11 09:17:15 +0100
committer GitHub <noreply@github.com> 2022-02-11 09:17:15 +0100
commit 5e849a507070560565d4fed6646f1b49943b81bf (patch)
tree 803a3803dbc7fffc1be6f64dae2edad7bc01b6d1
parent d7a92cc0fc9bca05b7a028cfa971183a4fddea8f (diff)
parent 56c5faa50bb12686ef44164bbe6cc23d6504bcda (diff)
Merge pull request #21140 from vespa-engine/mpolden/park-eventually
Park hosts with high fail count
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java 14
1 files changed, 11 insertions, 3 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
index 6133705ed59..3274f12dbc6 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
@@ -42,6 +42,8 @@ import java.util.stream.Collectors;
public class FailedExpirer extends NodeRepositoryMaintainer {
private static final Logger log = Logger.getLogger(FailedExpirer.class.getName());
+ // Fail count at which a host is treated as broken (see broken()) and is no longer recycled
+ private static final int maxAllowedFailures = 50;
private final NodeRepository nodeRepository;
private final Duration statefulExpiry; // Stateful nodes: Grace period to allow recovery of data
@@ -85,11 +87,11 @@ public class FailedExpirer extends NodeRepositoryMaintainer {
recycle(nodesToRecycle);
}
- /** Move eligible nodes to dirty. This may be a subset of the given nodes */
+ /** Move eligible nodes to dirty or parked. This may be a subset of the given nodes */
private void recycle(List<Node> nodes) {
List<Node> nodesToRecycle = new ArrayList<>();
for (Node candidate : nodes) {
- if (NodeFailer.hasHardwareIssue(candidate, nodeRepository)) {
+ if (broken(candidate)) {
List<String> unparkedChildren = !candidate.type().isHost() ? List.of() :
nodeRepository.nodes().list()
.childrenOf(candidate)
@@ -98,7 +100,7 @@ public class FailedExpirer extends NodeRepositoryMaintainer {
if (unparkedChildren.isEmpty()) {
nodeRepository.nodes().park(candidate.hostname(), false, Agent.FailedExpirer,
- "Parked by FailedExpirer due to hardware issue");
+ "Parked by FailedExpirer due to hardware issue or high fail count");
} else {
log.info(String.format("Expired failed node %s with hardware issue was not parked because of " +
"unparked children: %s", candidate.hostname(),
@@ -111,4 +113,10 @@ public class FailedExpirer extends NodeRepositoryMaintainer {
nodeRepository.nodes().deallocate(nodesToRecycle, Agent.FailedExpirer, "Expired by FailedExpirer");
}
+ /** Returns whether node cannot be recycled: it has a hardware issue, or is a host that has failed at least maxAllowedFailures times */
+ private boolean broken(Node node) {
+ return NodeFailer.hasHardwareIssue(node, nodeRepository) ||
+ (node.type().isHost() && node.status().failCount() >= maxAllowedFailures); // fail-count limit applies to hosts only
+ }
+
}