about summary refs log tree commit diff stats
path: root/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
diff options
context:
space:
mode:
author Harald Musum <musum@vespa.ai> 2024-01-06 00:56:32 +0100
committer GitHub <noreply@github.com> 2024-01-06 00:56:32 +0100
commit 93203c07633be5c148f4c2b23746f4dac83561b2 (patch)
tree 8f1239d3816625b4519324d8c9736f9d6ae64f6b /node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
parent e4da75db4556a3cd72b034c4406027f9bba73918 (diff)
Revert "Reset downtime at resume"
Diffstat (limited to 'node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java')
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java 29
1 files changed, 6 insertions, 23 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index 27301c9bf2a..6c4be09c489 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -25,7 +25,6 @@ import java.time.Duration;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Collection;
-import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
@@ -34,7 +33,6 @@ import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;
-import java.util.stream.Stream;
/**
* Maintains information in the node repo about when this node last responded to ping
@@ -112,7 +110,12 @@ public class NodeFailer extends NodeRepositoryMaintainer {
failingNodes.add(new FailingNode(host, "Host should be failed and have no tenant nodes"));
for (Node node : activeNodes) {
- downSince(node).ifPresent(instant -> failingNodes.add(new FailingNode(node, "Node has been down since " + instant)));
+ Instant graceTimeStart = clock().instant().minus(nodeRepository().nodes().suspended(node) ? suspendedDownTimeLimit : downTimeLimit);
+ if (node.isDown() && node.history().hasEventBefore(History.Event.Type.down, graceTimeStart) && !applicationSuspended(node) && !affectedByMaintenance(node)) {
+ // Allow a grace period after node re-activation
+ if (!node.history().hasEventAfter(History.Event.Type.activated, graceTimeStart))
+ failingNodes.add(new FailingNode(node, "Node has been down longer than " + downTimeLimit));
+ }
}
for (Node node : activeNodes) {
@@ -146,26 +149,6 @@ public class NodeFailer extends NodeRepositoryMaintainer {
return !reasonsToFailHost(host).isEmpty();
}
- private Optional<Instant> downSince(Node node) {
- Optional<Instant> downInstant = node.history().downSince();
- if (downInstant.isEmpty()) return Optional.empty();
-
- Instant downSince = Stream.of(downInstant,
- node.history().resumedSince(),
- node.history().event(History.Event.Type.activated).map(History.Event::at))
- .filter(Optional::isPresent)
- .map(Optional::get)
- .max(Comparator.naturalOrder())
- .orElseThrow();
- Duration graceDuration = node.history().isSuspended() ? suspendedDownTimeLimit : downTimeLimit;
- if (clock().instant().isBefore(downSince.plus(graceDuration))) return Optional.empty();
-
- if (applicationSuspended(node)) return Optional.empty();
- if (affectedByMaintenance(node)) return Optional.empty();
-
- return Optional.of(downSince);
- }
-
private boolean applicationSuspended(Node node) {
try {
return nodeRepository().orchestrator().getApplicationInstanceStatus(node.allocation().get().owner())