diff options
author | Ola Aunrønning <olaa@verizonmedia.com> | 2021-11-24 17:18:34 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-11-24 17:18:34 +0100 |
commit | f4e89674916bec5ca1eede86c0947a078f5494d8 (patch) | |
tree | f700d51f53a74c12773b54b38c3fc48c27eb15af /controller-server | |
parent | 9e1e838ba0891d1c2a5ec10006a511c59d4efde0 (diff) | |
parent | 498152c8e39c14e2d746df2dfe642491fc5e18dc (diff) |
Merge pull request #20193 from vespa-engine/olaa/increase-content-node-timeout
Increase allowed suspension time for content nodes
Diffstat (limited to 'controller-server')
4 files changed, 19 insertions, 4 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java
index 09eac53f218..e28273870d7 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java
@@ -349,9 +349,15 @@ public class InternalStepRunner implements StepRunner {
  String failureReason = null;
- NodeList suspendedTooLong = nodeList.suspendedSince(controller.clock().instant().minus(timeouts.nodesDown()));
+ NodeList suspendedTooLong = nodeList
+ .isStateful()
+ .suspendedSince(controller.clock().instant().minus(timeouts.statefulNodesDown()))
+ .and(nodeList
+ .not().isStateful()
+ .suspendedSince(controller.clock().instant().minus(timeouts.statelessNodesDown()))
+ );
  if ( ! suspendedTooLong.isEmpty()) {
- failureReason = "Some nodes have been suspended for more than " + timeouts.nodesDown().toMinutes() + " minutes:\n" +
+ failureReason = "Some nodes have been suspended for more than the allowed threshold:\n" +
  suspendedTooLong.asList().stream().map(node -> node.node().hostname().value()).collect(joining("\n"));
  }
@@ -1042,7 +1048,8 @@ public class InternalStepRunner implements StepRunner {
  Duration endpoint() { return Duration.ofMinutes(15); }
  Duration endpointCertificate() { return Duration.ofMinutes(20); }
  Duration tester() { return Duration.ofMinutes(30); }
- Duration nodesDown() { return Duration.ofMinutes(system.isCd() ? 30 : 60); }
+ Duration statelessNodesDown() { return Duration.ofMinutes(system.isCd() ? 30 : 60); }
+ Duration statefulNodesDown() { return Duration.ofMinutes(system.isCd() ? 30 : 720); }
  Duration noNodesDown() { return Duration.ofMinutes(system.isCd() ? 30 : 240); }
  Duration testerCertificate() { return Duration.ofMinutes(300); }
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeList.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeList.java
index 12c226241e1..cb0ff0644fa 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeList.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeList.java
@@ -87,6 +87,10 @@ public class NodeList extends AbstractFilteringList<NodeWithServices, NodeList>
  return matching(NodeWithServices::needsNewConfig);
  }
+ public NodeList isStateful() {
+ return matching(NodeWithServices::isStateful);
+ }
+
  /** The nodes that are retiring. */
  public NodeList retiring() {
  return matching(node -> node.node().retired());
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeWithServices.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeWithServices.java
index bd589af190e..d8f88d31759 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeWithServices.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeWithServices.java
@@ -82,6 +82,10 @@ public class NodeWithServices {
  return services.stream().anyMatch(service -> wantedConfigGeneration > service.currentGeneration());
  }
+ public boolean isStateful() {
+ return node.clusterType() == Node.ClusterType.content || node.clusterType() == Node.ClusterType.combined;
+ }
+
  @Override
  public boolean equals(Object o) {
  if (this == o) return true;
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java
index 5cf554f2c01..ae92fd46f26 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java
@@ -210,7 +210,7 @@ public class InternalStepRunnerTest {
  assertEquals(unfinished, tester.jobs().last(app.instanceId(), JobType.systemTest).get().stepStatuses().get(Step.installReal));
  assertEquals(failed, tester.jobs().last(app.instanceId(), JobType.stagingTest).get().stepStatuses().get(Step.installInitialReal));
- tester.clock().advance(InternalStepRunner.Timeouts.of(system()).nodesDown().minus(Duration.ofSeconds(3)));
+ tester.clock().advance(InternalStepRunner.Timeouts.of(system()).statelessNodesDown().minus(Duration.ofSeconds(3)));
  tester.runner().run();
  assertEquals(unfinished, tester.jobs().last(app.instanceId(), JobType.systemTest).get().stepStatuses().get(Step.installReal)); |