summary | refs | log | tree | commit | diff | stats
path: root/controller-server
diff options
context:
space:
mode:
author Martin Polden <mpolden@mpolden.no> 2020-03-19 14:12:54 +0100
committer Martin Polden <mpolden@mpolden.no> 2020-03-19 14:14:46 +0100
commit 3ba839216b0770086160c8fa27cb807e40b62a13 (patch)
tree f07b2cef4f8cfef53443fb4a31f3544cd3e82e70 /controller-server
parent 7d4d3cdcd2065da4ea25bd66159b54e541422343 (diff)
Wait longer before counting node as failing to upgrade in metric
Some zones spend more than 1 hour upgrading.
Diffstat (limited to 'controller-server')
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java 7
-rw-r--r-- controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java 2
2 files changed, 7 insertions, 2 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java
index 4c1dd56ee64..9c414ce8348 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java
@@ -43,7 +43,12 @@ public class MetricsReporter extends Maintainer {
public static final String REMAINING_ROTATIONS = "remaining_rotations";
public static final String NAME_SERVICE_REQUESTS_QUEUED = "dns.queuedRequests";
- private static final Duration NODE_UPGRADE_TIMEOUT = Duration.ofHours(1);
+ // The time a node belonging to a system application can spend from being told to upgrade until the upgrade is
+ // completed. Nodes exceeding this time are counted as failures.
+ private static final Duration NODE_UPGRADE_TIMEOUT = Duration.ofMinutes(90);
+
+ // The time a single node can spend performing an OS upgrade after being told to upgrade. Nodes exceeding this time
+ // multiplied by the number of nodes upgrading are counted as failures.
private static final Duration OS_UPGRADE_TIME_ALLOWANCE_PER_NODE = Duration.ofMinutes(30);
private final Metric metric;
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java
index 188f6a0848e..c00705149e9 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java
@@ -256,7 +256,7 @@ public class MetricsReporterTest {
tester.configServer().nodeRepository().list(zone1.getId(), SystemApplication.configServer.id()).stream()
.map(Node::wantedVersion).min(Comparator.naturalOrder()).get());
tester.configServer().setVersion(SystemApplication.configServer.id(), zone1.getId(), version, 1);
- tester.clock().advance(Duration.ofMinutes(30).plus(Duration.ofSeconds(1)));
+ tester.clock().advance(Duration.ofMinutes(60).plus(Duration.ofSeconds(1)));
tester.computeVersionStatus();
reporter.maintain();
assertEquals(2, getNodesFailingUpgrade());