diff options
author | Valerij Fredriksen <valerijf@oath.com> | 2018-08-21 16:27:53 +0200 |
---|---|---|
committer | Valerij Fredriksen <valerijf@oath.com> | 2018-08-21 16:27:53 +0200 |
commit | 08c599b4286ffe5c19c23b192c36a8b24f919352 (patch) | |
tree | e2502e89e5e66505ca36143bdc75d52e15626093 /node-repository | |
parent | 6a73630a6b104254ad29915cad4c5aab822f806c (diff) |
Fail nodes because of hardware failure
Diffstat (limited to 'node-repository')
-rw-r--r-- | node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java | 14 |
1 file changed, 14 insertions, 0 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index 7ea292757d5..497a511dc71 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -8,6 +8,7 @@ import com.yahoo.config.provision.HostLivenessTracker; import com.yahoo.config.provision.NodeType; import com.yahoo.jdisc.Metric; import com.yahoo.transaction.Mutex; +import com.yahoo.vespa.applicationmodel.HostName; import com.yahoo.vespa.applicationmodel.ServiceInstance; import com.yahoo.vespa.applicationmodel.ServiceStatus; import com.yahoo.vespa.hosted.provision.Node; @@ -15,8 +16,10 @@ import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.Agent; import com.yahoo.vespa.hosted.provision.node.History; import com.yahoo.vespa.orchestrator.ApplicationIdNotFoundException; +import com.yahoo.vespa.orchestrator.HostNameNotFoundException; import com.yahoo.vespa.orchestrator.Orchestrator; import com.yahoo.vespa.orchestrator.status.ApplicationInstanceStatus; +import com.yahoo.vespa.orchestrator.status.HostStatus; import com.yahoo.vespa.service.monitor.ServiceMonitor; import java.time.Clock; @@ -166,6 +169,8 @@ public class NodeFailer extends Maintainer { for (Node node : nodeRepository().getNodes(Node.State.active)) { if (node.history().hasEventBefore(History.Event.Type.down, graceTimeEnd) && ! 
applicationSuspended(node)) { nodesByFailureReason.put(node, "Node has been down longer than " + downTimeLimit); + } else if (node.status().hardwareFailureDescription().isPresent() && nodeSuspended(node)) { + nodesByFailureReason.put(node, "Node has hardware failure"); } } return nodesByFailureReason; @@ -197,6 +202,15 @@ public class NodeFailer extends Maintainer { } } + private boolean nodeSuspended(Node node) { + try { + return orchestrator.getNodeStatus(new HostName(node.hostname())) == HostStatus.ALLOWED_TO_BE_DOWN; + } catch (HostNameNotFoundException e) { + // Treat it as not suspended + return false; + } + } + /** * We can attempt to fail any number of *tenant* and *host* nodes because the operation will not be effected * unless the node is replaced. |