diff options
author | Martin Polden <mpolden@mpolden.no> | 2022-02-11 09:17:15 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-02-11 09:17:15 +0100 |
commit | 5e849a507070560565d4fed6646f1b49943b81bf (patch) | |
tree | 803a3803dbc7fffc1be6f64dae2edad7bc01b6d1 | |
parent | d7a92cc0fc9bca05b7a028cfa971183a4fddea8f (diff) | |
parent | 56c5faa50bb12686ef44164bbe6cc23d6504bcda (diff) |
Merge pull request #21140 from vespa-engine/mpolden/park-eventually
Park hosts with high fail count
-rw-r--r-- | node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java | 14 |
1 file changed, 11 insertions, 3 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java index 6133705ed59..3274f12dbc6 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java @@ -42,6 +42,8 @@ import java.util.stream.Collectors; public class FailedExpirer extends NodeRepositoryMaintainer { private static final Logger log = Logger.getLogger(FailedExpirer.class.getName()); + // Try recycling nodes until reaching this many failures + private static final int maxAllowedFailures = 50; private final NodeRepository nodeRepository; private final Duration statefulExpiry; // Stateful nodes: Grace period to allow recovery of data @@ -85,11 +87,11 @@ public class FailedExpirer extends NodeRepositoryMaintainer { recycle(nodesToRecycle); } - /** Move eligible nodes to dirty. This may be a subset of the given nodes */ + /** Move eligible nodes to dirty or parked. This may be a subset of the given nodes */ private void recycle(List<Node> nodes) { List<Node> nodesToRecycle = new ArrayList<>(); for (Node candidate : nodes) { - if (NodeFailer.hasHardwareIssue(candidate, nodeRepository)) { + if (broken(candidate)) { List<String> unparkedChildren = !candidate.type().isHost() ? 
List.of() : nodeRepository.nodes().list() .childrenOf(candidate) @@ -98,7 +100,7 @@ public class FailedExpirer extends NodeRepositoryMaintainer { if (unparkedChildren.isEmpty()) { nodeRepository.nodes().park(candidate.hostname(), false, Agent.FailedExpirer, - "Parked by FailedExpirer due to hardware issue"); + "Parked by FailedExpirer due to hardware issue or high fail count"); } else { log.info(String.format("Expired failed node %s with hardware issue was not parked because of " + "unparked children: %s", candidate.hostname(), @@ -111,4 +113,10 @@ public class FailedExpirer extends NodeRepositoryMaintainer { nodeRepository.nodes().deallocate(nodesToRecycle, Agent.FailedExpirer, "Expired by FailedExpirer"); } + /** Returns whether node is broken and cannot be recycled */ + private boolean broken(Node node) { + return NodeFailer.hasHardwareIssue(node, nodeRepository) || + (node.type().isHost() && node.status().failCount() >= maxAllowedFailures); + } + } |