diff options
author | Håkon Hallingstad <hakon@yahooinc.com> | 2024-01-05 10:25:14 +0100 |
---|---|---|
committer | Håkon Hallingstad <hakon@yahooinc.com> | 2024-01-05 10:25:14 +0100 |
commit | 86fa1eb5342633d971413f5be79095e1653f27f9 (patch) | |
tree | b71d3e135a614a06471764cd862bd7de40fdd51b /node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java | |
parent | 71b2c35d7904a9cec4357f43ce1d06af3ee6127d (diff) |
Reset downtime at resume
Diffstat (limited to 'node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java')
-rw-r--r-- | node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java | 29 |
1 files changed, 23 insertions, 6 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index 6c4be09c489..27301c9bf2a 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -25,6 +25,7 @@ import java.time.Duration; import java.time.Instant; import java.util.ArrayList; import java.util.Collection; +import java.util.Comparator; import java.util.HashSet; import java.util.List; import java.util.Objects; @@ -33,6 +34,7 @@ import java.util.Set; import java.util.logging.Level; import java.util.logging.Logger; import java.util.stream.Collectors; +import java.util.stream.Stream; /** * Maintains information in the node repo about when this node last responded to ping @@ -110,12 +112,7 @@ public class NodeFailer extends NodeRepositoryMaintainer { failingNodes.add(new FailingNode(host, "Host should be failed and have no tenant nodes")); for (Node node : activeNodes) { - Instant graceTimeStart = clock().instant().minus(nodeRepository().nodes().suspended(node) ? suspendedDownTimeLimit : downTimeLimit); - if (node.isDown() && node.history().hasEventBefore(History.Event.Type.down, graceTimeStart) && !applicationSuspended(node) && !affectedByMaintenance(node)) { - // Allow a grace period after node re-activation - if (!node.history().hasEventAfter(History.Event.Type.activated, graceTimeStart)) - failingNodes.add(new FailingNode(node, "Node has been down longer than " + downTimeLimit)); - } + downSince(node).ifPresent(instant -> failingNodes.add(new FailingNode(node, "Node has been down since " + instant))); } for (Node node : activeNodes) { @@ -149,6 +146,26 @@ public class NodeFailer extends NodeRepositoryMaintainer { return !reasonsToFailHost(host).isEmpty(); } + private Optional<Instant> downSince(Node node) { + Optional<Instant> downInstant = node.history().downSince(); + if (downInstant.isEmpty()) return Optional.empty(); + + Instant downSince = Stream.of(downInstant, + node.history().resumedSince(), + node.history().event(History.Event.Type.activated).map(History.Event::at)) + .filter(Optional::isPresent) + .map(Optional::get) + .max(Comparator.naturalOrder()) + .orElseThrow(); + Duration graceDuration = node.history().isSuspended() ? suspendedDownTimeLimit : downTimeLimit; + if (clock().instant().isBefore(downSince.plus(graceDuration))) return Optional.empty(); + + if (applicationSuspended(node)) return Optional.empty(); + if (affectedByMaintenance(node)) return Optional.empty(); + + return Optional.of(downSince); + } + private boolean applicationSuspended(Node node) { try { return nodeRepository().orchestrator().getApplicationInstanceStatus(node.allocation().get().owner()) |