summaryrefslogtreecommitdiffstats
path: root/node-repository
diff options
context:
space:
mode:
author: Valerij Fredriksen <valerijf@oath.com> 2018-08-21 16:27:53 +0200
committer: Valerij Fredriksen <valerijf@oath.com> 2018-08-21 16:27:53 +0200
commit: 08c599b4286ffe5c19c23b192c36a8b24f919352 (patch)
tree: e2502e89e5e66505ca36143bdc75d52e15626093 /node-repository
parent: 6a73630a6b104254ad29915cad4c5aab822f806c (diff)
Fail nodes because of hardware failure
Diffstat (limited to 'node-repository')
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java 14
1 files changed, 14 insertions, 0 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index 7ea292757d5..497a511dc71 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -8,6 +8,7 @@ import com.yahoo.config.provision.HostLivenessTracker;
import com.yahoo.config.provision.NodeType;
import com.yahoo.jdisc.Metric;
import com.yahoo.transaction.Mutex;
+import com.yahoo.vespa.applicationmodel.HostName;
import com.yahoo.vespa.applicationmodel.ServiceInstance;
import com.yahoo.vespa.applicationmodel.ServiceStatus;
import com.yahoo.vespa.hosted.provision.Node;
@@ -15,8 +16,10 @@ import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.node.Agent;
import com.yahoo.vespa.hosted.provision.node.History;
import com.yahoo.vespa.orchestrator.ApplicationIdNotFoundException;
+import com.yahoo.vespa.orchestrator.HostNameNotFoundException;
import com.yahoo.vespa.orchestrator.Orchestrator;
import com.yahoo.vespa.orchestrator.status.ApplicationInstanceStatus;
+import com.yahoo.vespa.orchestrator.status.HostStatus;
import com.yahoo.vespa.service.monitor.ServiceMonitor;
import java.time.Clock;
@@ -166,6 +169,8 @@ public class NodeFailer extends Maintainer {
for (Node node : nodeRepository().getNodes(Node.State.active)) {
if (node.history().hasEventBefore(History.Event.Type.down, graceTimeEnd) && ! applicationSuspended(node)) {
nodesByFailureReason.put(node, "Node has been down longer than " + downTimeLimit);
+ } else if (node.status().hardwareFailureDescription().isPresent() && nodeSuspended(node)) {
+ nodesByFailureReason.put(node, "Node has hardware failure");
}
}
return nodesByFailureReason;
@@ -197,6 +202,15 @@ public class NodeFailer extends Maintainer {
}
}
+ private boolean nodeSuspended(Node node) {
+ try {
+ return orchestrator.getNodeStatus(new HostName(node.hostname())) == HostStatus.ALLOWED_TO_BE_DOWN;
+ } catch (HostNameNotFoundException e) {
+ // The orchestrator lookup threw HostNameNotFoundException for this hostname: treat the node as not suspended
+ return false;
+ }
+ }
+
/**
* We can attempt to fail any number of *tenant* and *host* nodes because the operation will not be effected
* unless the node is replaced.