From 0db0da43c891e1869247f6c2738bbbd4fcee2835 Mon Sep 17 00:00:00 2001 From: Jon Marius Venstad Date: Wed, 19 Jan 2022 22:44:36 +0100 Subject: Let deployment run for some time before giving up nodes which are down --- .../hosted/controller/deployment/InternalStepRunner.java | 14 ++++++-------- .../com/yahoo/vespa/hosted/controller/deployment/Run.java | 2 +- 2 files changed, 7 insertions(+), 9 deletions(-) (limited to 'controller-server/src') diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java index 225634634b2..a6d1800bf71 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java @@ -349,14 +349,12 @@ public class InternalStepRunner implements StepRunner { String failureReason = null; - NodeList suspendedTooLong = nodeList - .isStateful() - .suspendedSince(controller.clock().instant().minus(timeouts.statefulNodesDown())) - .and(nodeList - .not().isStateful() - .suspendedSince(controller.clock().instant().minus(timeouts.statelessNodesDown())) - ); - if ( ! suspendedTooLong.isEmpty()) { + NodeList suspendedTooLong = nodeList.isStateful() + .suspendedSince(controller.clock().instant().minus(timeouts.statefulNodesDown())) + .and(nodeList.not().isStateful() + .suspendedSince(controller.clock().instant().minus(timeouts.statelessNodesDown())) + ); + if ( ! 
suspendedTooLong.isEmpty() && deployment.get().at().plus(timeouts.statelessNodesDown()).isBefore(controller.clock().instant())) { failureReason = "Some nodes have been suspended for more than the allowed threshold:\n" + suspendedTooLong.asList().stream().map(node -> node.node().hostname().value()).collect(joining("\n")); } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java index 2b9e3dd0733..4b38306f905 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java @@ -210,7 +210,7 @@ public class Run { return lastVespaLogTimestamp; } - /** Returns the timestamp of the last time no nodes were allowed to be down. */ + /** Returns since when no nodes have been allowed to be down. */ public Optional<Instant> noNodesDownSince() { return noNodesDownSince; } -- cgit v1.2.3