diff options
author | Harald Musum <musum@vespa.ai> | 2024-01-06 00:56:32 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-01-06 00:56:32 +0100 |
commit | 93203c07633be5c148f4c2b23746f4dac83561b2 (patch) | |
tree | 8f1239d3816625b4519324d8c9736f9d6ae64f6b /node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java | |
parent | e4da75db4556a3cd72b034c4406027f9bba73918 (diff) |
Revert "Reset downtime at resume"
Diffstat (limited to 'node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java')
-rw-r--r-- | node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java | 29 |
1 files changed, 6 insertions, 23 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index 27301c9bf2a..6c4be09c489 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -25,7 +25,6 @@ import java.time.Duration; import java.time.Instant; import java.util.ArrayList; import java.util.Collection; -import java.util.Comparator; import java.util.HashSet; import java.util.List; import java.util.Objects; @@ -34,7 +33,6 @@ import java.util.Set; import java.util.logging.Level; import java.util.logging.Logger; import java.util.stream.Collectors; -import java.util.stream.Stream; /** * Maintains information in the node repo about when this node last responded to ping @@ -112,7 +110,12 @@ public class NodeFailer extends NodeRepositoryMaintainer { failingNodes.add(new FailingNode(host, "Host should be failed and have no tenant nodes")); for (Node node : activeNodes) { - downSince(node).ifPresent(instant -> failingNodes.add(new FailingNode(node, "Node has been down since " + instant))); + Instant graceTimeStart = clock().instant().minus(nodeRepository().nodes().suspended(node) ? suspendedDownTimeLimit : downTimeLimit); + if (node.isDown() && node.history().hasEventBefore(History.Event.Type.down, graceTimeStart) && !applicationSuspended(node) && !affectedByMaintenance(node)) { + // Allow a grace period after node re-activation + if (!node.history().hasEventAfter(History.Event.Type.activated, graceTimeStart)) + failingNodes.add(new FailingNode(node, "Node has been down longer than " + downTimeLimit)); + } } for (Node node : activeNodes) { @@ -146,26 +149,6 @@ public class NodeFailer extends NodeRepositoryMaintainer { return !reasonsToFailHost(host).isEmpty(); } - private Optional<Instant> downSince(Node node) { - Optional<Instant> downInstant = node.history().downSince(); - if (downInstant.isEmpty()) return Optional.empty(); - - Instant downSince = Stream.of(downInstant, - node.history().resumedSince(), - node.history().event(History.Event.Type.activated).map(History.Event::at)) - .filter(Optional::isPresent) - .map(Optional::get) - .max(Comparator.naturalOrder()) - .orElseThrow(); - Duration graceDuration = node.history().isSuspended() ? suspendedDownTimeLimit : downTimeLimit; - if (clock().instant().isBefore(downSince.plus(graceDuration))) return Optional.empty(); - - if (applicationSuspended(node)) return Optional.empty(); - if (affectedByMaintenance(node)) return Optional.empty(); - - return Optional.of(downSince); - } - private boolean applicationSuspended(Node node) { try { return nodeRepository().orchestrator().getApplicationInstanceStatus(node.allocation().get().owner()) |