diff options
Diffstat (limited to 'node-repository')
-rw-r--r-- | node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java | 69 |
1 files changed, 38 insertions, 31 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index 5631899c68b..7ea292757d5 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -22,7 +22,6 @@ import com.yahoo.vespa.service.monitor.ServiceMonitor;
 import java.time.Clock;
 import java.time.Duration;
 import java.time.Instant;
-import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -94,11 +93,12 @@ public class NodeFailer extends Maintainer {
         }
 
         // Active nodes
-        for (Node node : determineActiveNodeDownStatus()) {
-            Instant graceTimeEnd = clock.instant().minus(downTimeLimit);
-            if (node.history().hasEventBefore(History.Event.Type.down, graceTimeEnd) && ! applicationSuspended(node) && failAllowedFor(node.type()))
-                if (!throttle(node)) failActive(node, "Node has been down longer than " + downTimeLimit);
-        }
+        updateNodeDownState();
+        getActiveNodesByFailureReason().forEach((node, reason) -> {
+            if (failAllowedFor(node.type()) && !throttle(node)) {
+                failActive(node, reason);
+            }
+        });
     }
 
     private void updateNodeLivenessEventsForReadyNodes() {
@@ -139,6 +139,38 @@ public class NodeFailer extends Maintainer {
         return nodesByFailureReason;
     }
 
+    /**
+     * If the node is down (see {@link #badNode}), and there is no "down" history record, we add it.
+     * Otherwise we remove any "down" history record.
+     */
+    private void updateNodeDownState() {
+        Map<String, Node> activeNodesByHostname = nodeRepository().getNodes(Node.State.active).stream()
+                .collect(Collectors.toMap(Node::hostname, node -> node));
+
+        serviceMonitor.getServiceModelSnapshot().getServiceInstancesByHostName()
+                .forEach((hostName, serviceInstances) -> {
+                    Node node = activeNodesByHostname.get(hostName.s());
+                    if (node == null) return;
+
+                    if (badNode(serviceInstances)) {
+                        recordAsDown(node);
+                    } else {
+                        clearDownRecord(node);
+                    }
+                });
+    }
+
+    private Map<Node, String> getActiveNodesByFailureReason() {
+        Instant graceTimeEnd = clock.instant().minus(downTimeLimit);
+        Map<Node, String> nodesByFailureReason = new HashMap<>();
+        for (Node node : nodeRepository().getNodes(Node.State.active)) {
+            if (node.history().hasEventBefore(History.Event.Type.down, graceTimeEnd) && ! applicationSuspended(node)) {
+                nodesByFailureReason.put(node, "Node has been down longer than " + downTimeLimit);
+            }
+        }
+        return nodesByFailureReason;
+    }
+
     private boolean expectConfigRequests(Node node) {
         return !node.type().isDockerHost() || configserverConfig.nodeAdminInContainer();
     }
@@ -189,31 +221,6 @@ public class NodeFailer extends Maintainer {
     }
 
     /**
-     * If the node is down (see badNode()), and there is no "down" history record, we add it.
-     * Otherwise we remove any "down" history record.
-     *
-     * @return a list of all nodes that should be considered as down
-     */
-    private List<Node> determineActiveNodeDownStatus() {
-        List<Node> downNodes = new ArrayList<>();
-        serviceMonitor.getServiceModelSnapshot().getServiceInstancesByHostName()
-                .entrySet().stream().forEach(
-                        entry -> {
-                            Optional<Node> node = nodeRepository().getNode(entry.getKey().s(), Node.State.active);
-                            if (node.isPresent()) {
-                                if (badNode(entry.getValue())) {
-                                    downNodes.add(recordAsDown(node.get()));
-                                } else {
-                                    clearDownRecord(node.get());
-                                }
-                            }
-                        }
-                );
-
-        return downNodes;
-    }
-
-    /**
      * Record a node as down if not already recorded and returns the node in the new state.
      * This assumes the node is found in the node
      * repo and that the node is allocated. If we get here otherwise something is truly odd.