aboutsummaryrefslogtreecommitdiffstats
path: root/controller-server/src
diff options
context:
space:
mode:
author Ola Aunrønning <olaa@verizonmedia.com> 2021-11-24 16:17:15 +0100
committer Ola Aunrønning <olaa@verizonmedia.com> 2021-11-24 16:17:15 +0100
commit 498152c8e39c14e2d746df2dfe642491fc5e18dc (patch)
tree fd50711b42e42cd20bde2c959466206a41fe522d /controller-server/src
parent 9b9f6956221465b6c1e3e0678fe639977cc14771 (diff)
Increase allowed suspension time for content nodes
Diffstat (limited to 'controller-server/src')
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java | 13
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeList.java | 4
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeWithServices.java | 4
-rw-r--r-- controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java | 2
4 files changed, 19 insertions, 4 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java
index 09eac53f218..e28273870d7 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java
@@ -349,9 +349,15 @@ public class InternalStepRunner implements StepRunner {
String failureReason = null;
- NodeList suspendedTooLong = nodeList.suspendedSince(controller.clock().instant().minus(timeouts.nodesDown()));
+ NodeList suspendedTooLong = nodeList
+ .isStateful()
+ .suspendedSince(controller.clock().instant().minus(timeouts.statefulNodesDown()))
+ .and(nodeList
+ .not().isStateful()
+ .suspendedSince(controller.clock().instant().minus(timeouts.statelessNodesDown()))
+ );
if ( ! suspendedTooLong.isEmpty()) {
- failureReason = "Some nodes have been suspended for more than " + timeouts.nodesDown().toMinutes() + " minutes:\n" +
+ failureReason = "Some nodes have been suspended for more than the allowed threshold:\n" +
suspendedTooLong.asList().stream().map(node -> node.node().hostname().value()).collect(joining("\n"));
}
@@ -1042,7 +1048,8 @@ public class InternalStepRunner implements StepRunner {
Duration endpoint() { return Duration.ofMinutes(15); }
Duration endpointCertificate() { return Duration.ofMinutes(20); }
Duration tester() { return Duration.ofMinutes(30); }
- Duration nodesDown() { return Duration.ofMinutes(system.isCd() ? 30 : 60); }
+ Duration statelessNodesDown() { return Duration.ofMinutes(system.isCd() ? 30 : 60); }
+ Duration statefulNodesDown() { return Duration.ofMinutes(system.isCd() ? 30 : 720); }
Duration noNodesDown() { return Duration.ofMinutes(system.isCd() ? 30 : 240); }
Duration testerCertificate() { return Duration.ofMinutes(300); }
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeList.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeList.java
index 12c226241e1..cb0ff0644fa 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeList.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeList.java
@@ -87,6 +87,10 @@ public class NodeList extends AbstractFilteringList<NodeWithServices, NodeList>
return matching(NodeWithServices::needsNewConfig);
}
+ public NodeList isStateful() {
+ return matching(NodeWithServices::isStateful);
+ }
+
/** The nodes that are retiring. */
public NodeList retiring() {
return matching(node -> node.node().retired());
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeWithServices.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeWithServices.java
index bd589af190e..d8f88d31759 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeWithServices.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeWithServices.java
@@ -82,6 +82,10 @@ public class NodeWithServices {
return services.stream().anyMatch(service -> wantedConfigGeneration > service.currentGeneration());
}
+ public boolean isStateful() {
+ return node.clusterType() == Node.ClusterType.content || node.clusterType() == Node.ClusterType.combined;
+ }
+
@Override
public boolean equals(Object o) {
if (this == o) return true;
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java
index 5cf554f2c01..ae92fd46f26 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java
@@ -210,7 +210,7 @@ public class InternalStepRunnerTest {
assertEquals(unfinished, tester.jobs().last(app.instanceId(), JobType.systemTest).get().stepStatuses().get(Step.installReal));
assertEquals(failed, tester.jobs().last(app.instanceId(), JobType.stagingTest).get().stepStatuses().get(Step.installInitialReal));
- tester.clock().advance(InternalStepRunner.Timeouts.of(system()).nodesDown().minus(Duration.ofSeconds(3)));
+ tester.clock().advance(InternalStepRunner.Timeouts.of(system()).statelessNodesDown().minus(Duration.ofSeconds(3)));
tester.runner().run();
assertEquals(unfinished, tester.jobs().last(app.instanceId(), JobType.systemTest).get().stepStatuses().get(Step.installReal));