summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHarald Musum <musum@verizonmedia.com>2020-07-17 19:40:06 +0200
committerGitHub <noreply@github.com>2020-07-17 19:40:06 +0200
commit07a8078345c852b5b558a329afaee2c50ae843c3 (patch)
tree15b774e0a1fa55f5e269b0201310d1a2405bd29c
parented8fa0954399f92b38d6332ca48b24105e83b833 (diff)
parent0e6357cf602c7db4be4709fc33f66708d61906bb (diff)
Merge pull request #13919 from vespa-engine/mpolden/host-fail-count
Always fail out hosts when exceeding acceptable fail count
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java27
1 files changed, 11 insertions, 16 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
index 3f8cc58540d..2afbb0e6476 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
@@ -3,7 +3,6 @@ package com.yahoo.vespa.hosted.provision.maintenance;
import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.config.provision.Environment;
-import com.yahoo.config.provision.Flavor;
import com.yahoo.config.provision.NodeType;
import com.yahoo.config.provision.Zone;
import com.yahoo.jdisc.Metric;
@@ -23,24 +22,21 @@ import java.util.stream.Collectors;
/**
* This moves expired failed nodes:
- * <ul>
- * <li>To parked: If the node has known hardware failure, Docker hosts are moved to parked only when all of their
- * children are already in parked
- * <li>To dirty: If the node has failed less than 5 times OR the environment is dev, test or perf.
- * Those environments have no protection against users running bogus applications, so
- * we cannot use the node failure count to conclude the node has a failure.
- * <li>Otherwise the node will remain in failed
- * </ul>
+ *
+ * - To parked: If the node has known hardware failure, Docker hosts are moved to parked only when all of their
+ * children are already in parked.
+ * - To dirty: If the node is a host and has failed less than 5 times, or always if the node is a child.
+ * - Otherwise the node will remain in failed.
+ *
* Failed content nodes are given a long expiry time to enable us to manually move them back to
* active to recover data in cases where the node was failed accidentally.
- * <p>
+ *
* Failed container (Vespa, not Docker) nodes are expired early as there's no data to potentially recover.
- * </p>
- * <p>
+ *
* The purpose of the automatic recycling to dirty + fail count is that nodes which were moved
* to failed due to some undetected hardware failure will end up being failed again.
* When that has happened enough they will not be recycled.
- * <p>
+ *
* Nodes with detected hardware issues will not be recycled.
*
* @author bratseth
@@ -125,8 +121,7 @@ public class FailedExpirer extends NodeRepositoryMaintainer {
/** Returns whether the current node fail count should be used as an indicator of a hardware issue */
private boolean failCountIndicatesHardwareIssue(Node node) {
- if (node.flavor().getType() == Flavor.Type.DOCKER_CONTAINER) return false;
- return (zone.environment() == Environment.prod || zone.environment() == Environment.staging) &&
- node.status().failCount() >= maxAllowedFailures;
+ return node.type().isHost() && node.status().failCount() >= maxAllowedFailures;
}
+
}