summary | refs | log | tree | commit | diff | stats
path: root/controller-server
diff options
context:
space:
mode:
author: Martin Polden <mpolden@mpolden.no> 2019-10-10 11:32:05 +0200
committer: Martin Polden <mpolden@mpolden.no> 2019-10-10 15:36:38 +0200
commit: d01885d1971b43b67c2941ee5cd81f0b24c1d030 (patch)
tree: a838451e50cf51bfa6e37f5338c884b2acb6b676 /controller-server
parent: afba8e4ecd0237eed6f670b0de65d68cf9a6047d (diff)
Make OS upgrade timeout dynamic
Diffstat (limited to 'controller-server')
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java | 30
-rw-r--r-- controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java | 20
2 files changed, 29 insertions, 21 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java
index 3fcc63113be..568366d7f2a 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java
@@ -7,7 +7,6 @@ import com.yahoo.vespa.hosted.controller.Application;
import com.yahoo.vespa.hosted.controller.Controller;
import com.yahoo.vespa.hosted.controller.Instance;
import com.yahoo.vespa.hosted.controller.api.integration.deployment.ApplicationVersion;
-import com.yahoo.vespa.hosted.controller.api.integration.deployment.JobType;
import com.yahoo.vespa.hosted.controller.application.ApplicationList;
import com.yahoo.vespa.hosted.controller.application.Deployment;
import com.yahoo.vespa.hosted.controller.application.DeploymentMetrics;
@@ -45,6 +44,7 @@ public class MetricsReporter extends Maintainer {
public static final String NAME_SERVICE_REQUESTS_QUEUED = "dns.queuedRequests";
private static final Duration NODE_UPGRADE_TIMEOUT = Duration.ofHours(1);
+ private static final Duration OS_UPGRADE_TIME_ALLOWANCE_PER_NODE = Duration.ofMinutes(30);
private final Metric metric;
private final Clock clock;
@@ -60,8 +60,7 @@ public class MetricsReporter extends Maintainer {
reportDeploymentMetrics();
reportRemainingRotations();
reportQueuedNameServiceRequests();
- reportNodesFailingSystemUpgrade();
- reportNodesFailingOsUpgrade();
+ reportNodesFailingUpgrade();
}
private void reportRemainingRotations() {
@@ -106,11 +105,8 @@ public class MetricsReporter extends Maintainer {
metric.createContext(Map.of()));
}
- private void reportNodesFailingSystemUpgrade() {
+ private void reportNodesFailingUpgrade() {
metric.set(NODES_FAILING_SYSTEM_UPGRADE, nodesFailingSystemUpgrade(), metric.createContext(Map.of()));
- }
-
- private void reportNodesFailingOsUpgrade() {
metric.set(NODES_FAILING_OS_UPGRADE, nodesFailingOsUpgrade(), metric.createContext(Map.of()));
}
@@ -119,20 +115,24 @@ public class MetricsReporter extends Maintainer {
return nodesFailingUpgrade(controller().versionStatus().versions(), (vespaVersion) -> {
if (vespaVersion.confidence() == VespaVersion.Confidence.broken) return NodeVersions.EMPTY;
return vespaVersion.nodeVersions();
- });
+ }, NODE_UPGRADE_TIMEOUT);
}
private int nodesFailingOsUpgrade() {
- return nodesFailingUpgrade(controller().osVersionStatus().versions().entrySet(), (kv) -> {
- var osVersion = kv.getKey();
- if (osVersion.version().isEmpty()) return NodeVersions.EMPTY;
- return kv.getValue();
- });
+ var allNodeVersions = controller().osVersionStatus().versions().values();
+ var totalTimeout = 0L;
+ for (var nodeVersions : allNodeVersions) {
+ for (var nodeVersion : nodeVersions.asMap().values()) {
+ if (!nodeVersion.changing()) continue;
+ totalTimeout += OS_UPGRADE_TIME_ALLOWANCE_PER_NODE.toMillis();
+ }
+ }
+ return nodesFailingUpgrade(allNodeVersions, Function.identity(), Duration.ofMillis(totalTimeout));
}
- private <V> int nodesFailingUpgrade(Collection<V> collection, Function<V, NodeVersions> nodeVersionsFunction) {
+ private <V> int nodesFailingUpgrade(Collection<V> collection, Function<V, NodeVersions> nodeVersionsFunction, Duration timeout) {
var nodesFailingUpgrade = 0;
- var acceptableInstant = clock.instant().minus(NODE_UPGRADE_TIMEOUT);
+ var acceptableInstant = clock.instant().minus(timeout);
for (var object : collection) {
for (var nodeVersion : nodeVersionsFunction.apply(object).asMap().values()) {
if (!nodeVersion.changing()) continue;
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java
index 0ea4abc2203..81e33b490f3 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java
@@ -275,13 +275,14 @@ public class MetricsReporterTest {
new JobControl(tester.controllerTester().curator()), CloudName.defaultName());;
var statusUpdater = new OsVersionStatusUpdater(tester.controller(), Duration.ofDays(1),
new JobControl(tester.controller().curator()));
- tester.configServer().bootstrap(List.of(zone.getId()), SystemApplication.tenantHost);
+ tester.configServer().bootstrap(List.of(zone.getId()), SystemApplication.configServerHost, SystemApplication.tenantHost);
// All nodes upgrade to initial OS version
var version0 = Version.fromString("8.0");
tester.controller().upgradeOsIn(cloud, version0, false);
osUpgrader.maintain();
tester.configServer().setOsVersion(SystemApplication.tenantHost.id(), zone.getId(), version0);
+ tester.configServer().setOsVersion(SystemApplication.configServerHost.id(), zone.getId(), version0);
statusUpdater.maintain();
reporter.maintain();
assertEquals(0, getNodesFailingOsUpgrade());
@@ -300,15 +301,22 @@ public class MetricsReporterTest {
reporter.maintain();
assertEquals(0, getNodesFailingOsUpgrade());
- // 1/3 nodes upgrade within timeout
- tester.configServer().setOsVersion(SystemApplication.tenantHost.id(), zone.getId(), version, 1);
- tester.clock().advance(Duration.ofMinutes(30).plus(Duration.ofSeconds(1)));
+ // 2/6 nodes upgrade within timeout
+ tester.configServer().setOsVersion(SystemApplication.tenantHost.id(), zone.getId(), version, 2);
+ tester.clock().advance(Duration.ofMinutes(30 * 3 /* time allowance * node count */).plus(Duration.ofSeconds(1)));
statusUpdater.maintain();
reporter.maintain();
- assertEquals(2, getNodesFailingOsUpgrade());
+ assertEquals(4, getNodesFailingOsUpgrade());
- // 3/3 nodes upgrade
+ // 5/6 nodes upgrade
tester.configServer().setOsVersion(SystemApplication.tenantHost.id(), zone.getId(), version);
+ tester.configServer().setOsVersion(SystemApplication.configServerHost.id(), zone.getId(), version, 2);
+ statusUpdater.maintain();
+ reporter.maintain();
+ assertEquals(1, getNodesFailingOsUpgrade());
+
+ // Final node upgrades
+ tester.configServer().setOsVersion(SystemApplication.configServerHost.id(), zone.getId(), version);
statusUpdater.maintain();
reporter.maintain();
assertEquals(0, getNodesFailingOsUpgrade());