summary refs log tree commit diff stats
path: root/node-repository
diff options
context:
space:
mode:
author Valerij Fredriksen <valerijf@oath.com> 2018-08-21 16:08:38 +0200
committer Valerij Fredriksen <valerijf@oath.com> 2018-08-21 16:27:29 +0200
commit 6a73630a6b104254ad29915cad4c5aab822f806c (patch)
tree ff00fe77d70022a9aecca68a3708df5532847cbf /node-repository
parent 8bf5edc01249b341afbd028e4d40b27780b564cb (diff)
Return active nodes that should be failed with reason
Diffstat (limited to 'node-repository')
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java 69
1 files changed, 38 insertions, 31 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index 5631899c68b..7ea292757d5 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -22,7 +22,6 @@ import com.yahoo.vespa.service.monitor.ServiceMonitor;
import java.time.Clock;
import java.time.Duration;
import java.time.Instant;
-import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@@ -94,11 +93,12 @@ public class NodeFailer extends Maintainer {
}
// Active nodes
- for (Node node : determineActiveNodeDownStatus()) {
- Instant graceTimeEnd = clock.instant().minus(downTimeLimit);
- if (node.history().hasEventBefore(History.Event.Type.down, graceTimeEnd) && ! applicationSuspended(node) && failAllowedFor(node.type()))
- if (!throttle(node)) failActive(node, "Node has been down longer than " + downTimeLimit);
- }
+ updateNodeDownState();
+ getActiveNodesByFailureReason().forEach((node, reason) -> {
+ if (failAllowedFor(node.type()) && !throttle(node)) {
+ failActive(node, reason);
+ }
+ });
}
private void updateNodeLivenessEventsForReadyNodes() {
@@ -139,6 +139,38 @@ public class NodeFailer extends Maintainer {
return nodesByFailureReason;
}
+ /**
+ * If the node is down (see {@link #badNode}), and there is no "down" history record, we add it.
+ * Otherwise we remove any "down" history record.
+ */
+ private void updateNodeDownState() {
+ Map<String, Node> activeNodesByHostname = nodeRepository().getNodes(Node.State.active).stream()
+ .collect(Collectors.toMap(Node::hostname, node -> node));
+
+ serviceMonitor.getServiceModelSnapshot().getServiceInstancesByHostName()
+ .forEach((hostName, serviceInstances) -> {
+ Node node = activeNodesByHostname.get(hostName.s());
+ if (node == null) return;
+
+ if (badNode(serviceInstances)) {
+ recordAsDown(node);
+ } else {
+ clearDownRecord(node);
+ }
+ });
+ }
+
+ private Map<Node, String> getActiveNodesByFailureReason() {
+ Instant graceTimeEnd = clock.instant().minus(downTimeLimit);
+ Map<Node, String> nodesByFailureReason = new HashMap<>();
+ for (Node node : nodeRepository().getNodes(Node.State.active)) {
+ if (node.history().hasEventBefore(History.Event.Type.down, graceTimeEnd) && ! applicationSuspended(node)) {
+ nodesByFailureReason.put(node, "Node has been down longer than " + downTimeLimit);
+ }
+ }
+ return nodesByFailureReason;
+ }
+
private boolean expectConfigRequests(Node node) {
return !node.type().isDockerHost() || configserverConfig.nodeAdminInContainer();
}
@@ -189,31 +221,6 @@ public class NodeFailer extends Maintainer {
}
/**
- * If the node is down (see badNode()), and there is no "down" history record, we add it.
- * Otherwise we remove any "down" history record.
- *
- * @return a list of all nodes that should be considered as down
- */
- private List<Node> determineActiveNodeDownStatus() {
- List<Node> downNodes = new ArrayList<>();
- serviceMonitor.getServiceModelSnapshot().getServiceInstancesByHostName()
- .entrySet().stream().forEach(
- entry -> {
- Optional<Node> node = nodeRepository().getNode(entry.getKey().s(), Node.State.active);
- if (node.isPresent()) {
- if (badNode(entry.getValue())) {
- downNodes.add(recordAsDown(node.get()));
- } else {
- clearDownRecord(node.get());
- }
- }
- }
- );
-
- return downNodes;
- }
-
- /**
* Record a node as down if not already recorded and returns the node in the new state.
* This assumes the node is found in the node
* repo and that the node is allocated. If we get here otherwise something is truly odd.