diff options
author | Martin Polden <mpolden@mpolden.no> | 2020-07-17 15:53:12 +0200 |
---|---|---|
committer | Martin Polden <mpolden@mpolden.no> | 2020-07-17 15:55:47 +0200 |
commit | 0e6357cf602c7db4be4709fc33f66708d61906bb (patch) | |
tree | dfd0b533159fdc54d2d4474b5da54725d2318177 /node-repository | |
parent | 81656283891fb3a661e866bd6f534e5efeb76cfd (diff) |
Always fail out hosts when exceeding acceptable fail count
Diffstat (limited to 'node-repository')
-rw-r--r-- | node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java | 27 |
1 files changed, 11 insertions, 16 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java index 3f8cc58540d..2afbb0e6476 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java @@ -3,7 +3,6 @@ package com.yahoo.vespa.hosted.provision.maintenance; import com.yahoo.config.provision.ClusterSpec; import com.yahoo.config.provision.Environment; -import com.yahoo.config.provision.Flavor; import com.yahoo.config.provision.NodeType; import com.yahoo.config.provision.Zone; import com.yahoo.jdisc.Metric; @@ -23,24 +22,21 @@ import java.util.stream.Collectors; /** * This moves expired failed nodes: - * <ul> - * <li>To parked: If the node has known hardware failure, Docker hosts are moved to parked only when all of their - * children are already in parked - * <li>To dirty: If the node has failed less than 5 times OR the environment is dev, test or perf. - * Those environments have no protection against users running bogus applications, so - * we cannot use the node failure count to conclude the node has a failure. - * <li>Otherwise the node will remain in failed - * </ul> + * + * - To parked: If the node has known hardware failure, Docker hosts are moved to parked only when all of their + * children are already in parked. + * - To dirty: If the node is a host and has failed less than 5 times, or always if the node is a child. + * - Otherwise the node will remain in failed. + * * Failed content nodes are given a long expiry time to enable us to manually move them back to * active to recover data in cases where the node was failed accidentally. - * <p> + * * Failed container (Vespa, not Docker) nodes are expired early as there's no data to potentially recover. 
- * </p> - * <p> + * * The purpose of the automatic recycling to dirty + fail count is that nodes which were moved * to failed due to some undetected hardware failure will end up being failed again. * When that has happened enough they will not be recycled. - * <p> + * * Nodes with detected hardware issues will not be recycled. * * @author bratseth @@ -125,8 +121,7 @@ public class FailedExpirer extends NodeRepositoryMaintainer { /** Returns whether the current node fail count should be used as an indicator of hardware issue */ private boolean failCountIndicatesHardwareIssue(Node node) { - if (node.flavor().getType() == Flavor.Type.DOCKER_CONTAINER) return false; - return (zone.environment() == Environment.prod || zone.environment() == Environment.staging) && - node.status().failCount() >= maxAllowedFailures; + return node.type().isHost() && node.status().failCount() >= maxAllowedFailures; } + } |