diff options
author | Ola Aunrønning <olaa@verizonmedia.com> | 2021-11-24 16:17:15 +0100 |
---|---|---|
committer | Ola Aunrønning <olaa@verizonmedia.com> | 2021-11-24 16:17:15 +0100 |
commit | 498152c8e39c14e2d746df2dfe642491fc5e18dc (patch) | |
tree | fd50711b42e42cd20bde2c959466206a41fe522d /controller-server | |
parent | 9b9f6956221465b6c1e3e0678fe639977cc14771 (diff) |
Increase allowed suspension time for content nodes
Diffstat (limited to 'controller-server')
4 files changed, 19 insertions, 4 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java index 09eac53f218..e28273870d7 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java @@ -349,9 +349,15 @@ public class InternalStepRunner implements StepRunner { String failureReason = null; - NodeList suspendedTooLong = nodeList.suspendedSince(controller.clock().instant().minus(timeouts.nodesDown())); + NodeList suspendedTooLong = nodeList + .isStateful() + .suspendedSince(controller.clock().instant().minus(timeouts.statefulNodesDown())) + .and(nodeList + .not().isStateful() + .suspendedSince(controller.clock().instant().minus(timeouts.statelessNodesDown())) + ); if ( ! suspendedTooLong.isEmpty()) { - failureReason = "Some nodes have been suspended for more than " + timeouts.nodesDown().toMinutes() + " minutes:\n" + + failureReason = "Some nodes have been suspended for more than the allowed threshold:\n" + suspendedTooLong.asList().stream().map(node -> node.node().hostname().value()).collect(joining("\n")); } @@ -1042,7 +1048,8 @@ public class InternalStepRunner implements StepRunner { Duration endpoint() { return Duration.ofMinutes(15); } Duration endpointCertificate() { return Duration.ofMinutes(20); } Duration tester() { return Duration.ofMinutes(30); } - Duration nodesDown() { return Duration.ofMinutes(system.isCd() ? 30 : 60); } + Duration statelessNodesDown() { return Duration.ofMinutes(system.isCd() ? 30 : 60); } + Duration statefulNodesDown() { return Duration.ofMinutes(system.isCd() ? 30 : 720); } Duration noNodesDown() { return Duration.ofMinutes(system.isCd() ? 30 : 240); } Duration testerCertificate() { return Duration.ofMinutes(300); } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeList.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeList.java index 12c226241e1..cb0ff0644fa 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeList.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeList.java @@ -87,6 +87,10 @@ public class NodeList extends AbstractFilteringList<NodeWithServices, NodeList> return matching(NodeWithServices::needsNewConfig); } + public NodeList isStateful() { + return matching(NodeWithServices::isStateful); + } + /** The nodes that are retiring. */ public NodeList retiring() { return matching(node -> node.node().retired()); diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeWithServices.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeWithServices.java index bd589af190e..d8f88d31759 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeWithServices.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeWithServices.java @@ -82,6 +82,10 @@ public class NodeWithServices { return services.stream().anyMatch(service -> wantedConfigGeneration > service.currentGeneration()); } + public boolean isStateful() { + return node.clusterType() == Node.ClusterType.content || node.clusterType() == Node.ClusterType.combined; + } + @Override public boolean equals(Object o) { if (this == o) return true; diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java index 5cf554f2c01..ae92fd46f26 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java @@ -210,7 +210,7 @@ public class InternalStepRunnerTest { assertEquals(unfinished, tester.jobs().last(app.instanceId(), JobType.systemTest).get().stepStatuses().get(Step.installReal)); assertEquals(failed, tester.jobs().last(app.instanceId(), JobType.stagingTest).get().stepStatuses().get(Step.installInitialReal)); - tester.clock().advance(InternalStepRunner.Timeouts.of(system()).nodesDown().minus(Duration.ofSeconds(3))); + tester.clock().advance(InternalStepRunner.Timeouts.of(system()).statelessNodesDown().minus(Duration.ofSeconds(3))); tester.runner().run(); assertEquals(unfinished, tester.jobs().last(app.instanceId(), JobType.systemTest).get().stepStatuses().get(Step.installReal)); |