Always fail out hosts when exceeding acceptable fail count

author: Martin Polden <mpolden@mpolden.no> 2020-07-17 15:53:12 +0200
committer: Martin Polden <mpolden@mpolden.no> 2020-07-17 15:55:47 +0200
commit: 0e6357cf602c7db4be4709fc33f66708d61906bb (patch)
tree: dfd0b533159fdc54d2d4474b5da54725d2318177 /node-repository
parent: 81656283891fb3a661e866bd6f534e5efeb76cfd (diff)
1 files changed, 11 insertions, 16 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
index 3f8cc58540d..2afbb0e6476 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
@@ -3,7 +3,6 @@ package com.yahoo.vespa.hosted.provision.maintenance;
 
 import com.yahoo.config.provision.ClusterSpec;
 import com.yahoo.config.provision.Environment;
-import com.yahoo.config.provision.Flavor;
 import com.yahoo.config.provision.NodeType;
 import com.yahoo.config.provision.Zone;
 import com.yahoo.jdisc.Metric;
@@ -23,24 +22,21 @@ import java.util.stream.Collectors;
 
 /**
  * This moves expired failed nodes:
- * <ul>
- *     <li>To parked: If the node has known hardware failure, Docker hosts are moved to parked only when all of their
- *     children are already in parked
- *     <li>To dirty: If the node has failed less than 5 times OR the environment is dev, test or perf.
- *     Those environments have no protection against users running bogus applications, so
- *     we cannot use the node failure count to conclude the node has a failure.
- *     <li>Otherwise the node will remain in failed
- * </ul>
+ *
+ * - To parked: If the node has known hardware failure, Docker hosts are moved to parked only when all of their
+ *              children are already in parked.
+ * - To dirty: If the node is a host and has failed fewer than 5 times, or always if the node is a child.
+ * - Otherwise the node will remain in failed.
+ *
  * Failed content nodes are given a long expiry time to enable us to manually moved them back to
  * active to recover data in cases where the node was failed accidentally.
- * <p>
+ *
  * Failed container (Vespa, not Docker) nodes are expired early as there's no data to potentially recover.
- * </p>
- * <p>
+ *
  * The purpose of the automatic recycling to dirty + fail count is that nodes which were moved
  * to failed due to some undetected hardware failure will end up being failed again.
  * When that has happened enough they will not be recycled.
- * <p>
+ *
  * Nodes with detected hardware issues will not be recycled.
  *
  * @author bratseth
@@ -125,8 +121,7 @@ public class FailedExpirer extends NodeRepositoryMaintainer {
 
     /** Returns whether the current node fail count should be used as an indicator of hardware issue */
     private boolean failCountIndicatesHardwareIssue(Node node) {
-        if (node.flavor().getType() == Flavor.Type.DOCKER_CONTAINER) return false;
-        return (zone.environment() == Environment.prod || zone.environment() == Environment.staging) &&
-               node.status().failCount() >= maxAllowedFailures;
+        return node.type().isHost() && node.status().failCount() >= maxAllowedFailures;
     }
+
 }
author	Martin Polden <mpolden@mpolden.no>	2020-07-17 15:53:12 +0200
committer	Martin Polden <mpolden@mpolden.no>	2020-07-17 15:55:47 +0200
commit	0e6357cf602c7db4be4709fc33f66708d61906bb (patch)
tree	dfd0b533159fdc54d2d4474b5da54725d2318177 /node-repository
parent	81656283891fb3a661e866bd6f534e5efeb76cfd (diff)