diff options
author | Harald Musum <musum@verizonmedia.com> | 2020-07-17 19:40:06 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-07-17 19:40:06 +0200 |
commit | 07a8078345c852b5b558a329afaee2c50ae843c3 (patch) | |
tree | 15b774e0a1fa55f5e269b0201310d1a2405bd29c | |
parent | ed8fa0954399f92b38d6332ca48b24105e83b833 (diff) | |
parent | 0e6357cf602c7db4be4709fc33f66708d61906bb (diff) |
Merge pull request #13919 from vespa-engine/mpolden/host-fail-count
Always fail out hosts when exceeding acceptable fail count
-rw-r--r-- | node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java | 27 |
1 files changed, 11 insertions, 16 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java index 3f8cc58540d..2afbb0e6476 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java @@ -3,7 +3,6 @@ package com.yahoo.vespa.hosted.provision.maintenance; import com.yahoo.config.provision.ClusterSpec; import com.yahoo.config.provision.Environment; -import com.yahoo.config.provision.Flavor; import com.yahoo.config.provision.NodeType; import com.yahoo.config.provision.Zone; import com.yahoo.jdisc.Metric; @@ -23,24 +22,21 @@ import java.util.stream.Collectors; /** * This moves expired failed nodes: - * <ul> - * <li>To parked: If the node has known hardware failure, Docker hosts are moved to parked only when all of their - * children are already in parked - * <li>To dirty: If the node has failed less than 5 times OR the environment is dev, test or perf. - * Those environments have no protection against users running bogus applications, so - * we cannot use the node failure count to conclude the node has a failure. - * <li>Otherwise the node will remain in failed - * </ul> + * + * - To parked: If the node has known hardware failure, Docker hosts are moved to parked only when all of their + * children are already in parked. + * - To dirty: If the node is a host and has failed less than 5 times, or always if the node is a child. + * - Otherwise the node will remain in failed. + * * Failed content nodes are given a long expiry time to enable us to manually moved them back to * active to recover data in cases where the node was failed accidentally. - * <p> + * * Failed container (Vespa, not Docker) nodes are expired early as there's no data to potentially recover. - * </p> - * <p> + * * The purpose of the automatic recycling to dirty + fail count is that nodes which were moved * to failed due to some undetected hardware failure will end up being failed again. * When that has happened enough they will not be recycled. - * <p> + * * Nodes with detected hardware issues will not be recycled. * * @author bratseth @@ -125,8 +121,7 @@ public class FailedExpirer extends NodeRepositoryMaintainer { /** Returns whether the current node fail count should be used as an indicator of hardware issue */ private boolean failCountIndicatesHardwareIssue(Node node) { - if (node.flavor().getType() == Flavor.Type.DOCKER_CONTAINER) return false; - return (zone.environment() == Environment.prod || zone.environment() == Environment.staging) && - node.status().failCount() >= maxAllowedFailures; + return node.type().isHost() && node.status().failCount() >= maxAllowedFailures; } + } |