diff options
author | Martin Polden <mpolden@mpolden.no> | 2020-03-19 14:12:54 +0100 |
---|---|---|
committer | Martin Polden <mpolden@mpolden.no> | 2020-03-19 14:14:46 +0100 |
commit | 3ba839216b0770086160c8fa27cb807e40b62a13 (patch) | |
tree | f07b2cef4f8cfef53443fb4a31f3544cd3e82e70 /controller-server | |
parent | 7d4d3cdcd2065da4ea25bd66159b54e541422343 (diff) |
Wait longer before counting node as failing to upgrade in metric
Some zones spend more than 1 hour upgrading.
Diffstat (limited to 'controller-server')
2 files changed, 7 insertions, 2 deletions
```diff
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java
index 4c1dd56ee64..9c414ce8348 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java
@@ -43,7 +43,12 @@ public class MetricsReporter extends Maintainer {
     public static final String REMAINING_ROTATIONS = "remaining_rotations";
     public static final String NAME_SERVICE_REQUESTS_QUEUED = "dns.queuedRequests";
 
-    private static final Duration NODE_UPGRADE_TIMEOUT = Duration.ofHours(1);
+    // The time a node belonging to a system application can spend from being told to upgrade until the upgrade is
+    // completed. Nodes exceeding this time are counted as failures.
+    private static final Duration NODE_UPGRADE_TIMEOUT = Duration.ofMinutes(90);
+
+    // The time a single node can spend performing an OS upgrade after being told to upgrade. Nodes exceeding this time
+    // multiplied by the number of nodes upgrading are counted as failures.
     private static final Duration OS_UPGRADE_TIME_ALLOWANCE_PER_NODE = Duration.ofMinutes(30);
 
     private final Metric metric;
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java
index 188f6a0848e..c00705149e9 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java
@@ -256,7 +256,7 @@ public class MetricsReporterTest {
                      tester.configServer().nodeRepository().list(zone1.getId(), SystemApplication.configServer.id()).stream()
                            .map(Node::wantedVersion).min(Comparator.naturalOrder()).get());
         tester.configServer().setVersion(SystemApplication.configServer.id(), zone1.getId(), version, 1);
-        tester.clock().advance(Duration.ofMinutes(30).plus(Duration.ofSeconds(1)));
+        tester.clock().advance(Duration.ofMinutes(60).plus(Duration.ofSeconds(1)));
         tester.computeVersionStatus();
         reporter.maintain();
         assertEquals(2, getNodesFailingUpgrade());
```