diff options
author | Ola Aunrønning <olaa@yahooinc.com> | 2023-04-14 16:56:41 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-04-14 16:56:41 +0200 |
commit | db5302e19949e6a3b5989a47631a44d8392ea017 (patch) | |
tree | f9d3ded90d7f086a2785e92dbfdb3a47422f06a5 /node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java | |
parent | d025a93015e66efc0027d81a64e70530d6cb240e (diff) |
Don't fail nodes undergoing CMR (#26743)
Diffstat (limited to 'node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java')
-rw-r--r-- | node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java | 16 |
1 files changed, 15 insertions, 1 deletions
```diff
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index f9ff2f08375..766bc688c62 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -7,6 +7,7 @@ import com.yahoo.config.provision.Deployment;
 import com.yahoo.config.provision.NodeType;
 import com.yahoo.config.provision.TransientException;
 import com.yahoo.jdisc.Metric;
+import com.yahoo.slime.SlimeUtils;
 import com.yahoo.transaction.Mutex;
 import com.yahoo.vespa.hosted.provision.Node;
 import com.yahoo.vespa.hosted.provision.NodeList;
@@ -109,7 +110,7 @@ public class NodeFailer extends NodeRepositoryMaintainer {
 
         for (Node node : activeNodes) {
             Instant graceTimeStart = clock().instant().minus(nodeRepository().nodes().suspended(node) ? suspendedDownTimeLimit : downTimeLimit);
-            if (node.isDown() && node.history().hasEventBefore(History.Event.Type.down, graceTimeStart) && !applicationSuspended(node)) {
+            if (node.isDown() && node.history().hasEventBefore(History.Event.Type.down, graceTimeStart) && !applicationSuspended(node) && !undergoingCmr(node)) {
                 // Allow a grace period after node re-activation
                 if (!node.history().hasEventAfter(History.Event.Type.activated, graceTimeStart))
                     failingNodes.add(new FailingNode(node, "Node has been down longer than " + downTimeLimit));
@@ -157,6 +158,19 @@ public class NodeFailer extends NodeRepositoryMaintainer {
         }
     }
 
+    private boolean undergoingCmr(Node node) {
+        return node.reports().getReport("vcmr")
+                .map(report ->
+                        SlimeUtils.entriesStream(report.getInspector().field("upcoming"))
+                                .anyMatch(cmr -> {
+                                    var startTime = cmr.field("plannedStartTime").asLong();
+                                    var endTime = cmr.field("plannedEndTime").asLong();
+                                    var now = clock().instant().getEpochSecond();
+                                    return now > startTime && now < endTime;
+                                })
+                ).orElse(false);
+    }
+
     /** Is the node and all active children suspended? */
     private boolean allSuspended(Node node, NodeList activeNodes) {
         if (!nodeRepository().nodes().suspended(node)) return false;
```