diff options
author | Martin Polden <mpolden@mpolden.no> | 2019-10-11 08:52:38 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-10-11 08:52:38 +0200 |
commit | 28913b4f7dc9597966c6e93f1a77af923549eea2 (patch) | |
tree | 2af6cd29d8fdacc39e4652139c82248d04e7d2ff /controller-server | |
parent | de7a927242b1a326eecbc30e96af7ebc5440ca52 (diff) | |
parent | d01885d1971b43b67c2941ee5cd81f0b24c1d030 (diff) |
Merge pull request #10943 from vespa-engine/mpolden/os-upgrade-dynamic-timeout
Make OS upgrade timeout dynamic
Diffstat (limited to 'controller-server')
2 files changed, 29 insertions, 21 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java index 3fcc63113be..568366d7f2a 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java @@ -7,7 +7,6 @@ import com.yahoo.vespa.hosted.controller.Application; import com.yahoo.vespa.hosted.controller.Controller; import com.yahoo.vespa.hosted.controller.Instance; import com.yahoo.vespa.hosted.controller.api.integration.deployment.ApplicationVersion; -import com.yahoo.vespa.hosted.controller.api.integration.deployment.JobType; import com.yahoo.vespa.hosted.controller.application.ApplicationList; import com.yahoo.vespa.hosted.controller.application.Deployment; import com.yahoo.vespa.hosted.controller.application.DeploymentMetrics; @@ -45,6 +44,7 @@ public class MetricsReporter extends Maintainer { public static final String NAME_SERVICE_REQUESTS_QUEUED = "dns.queuedRequests"; private static final Duration NODE_UPGRADE_TIMEOUT = Duration.ofHours(1); + private static final Duration OS_UPGRADE_TIME_ALLOWANCE_PER_NODE = Duration.ofMinutes(30); private final Metric metric; private final Clock clock; @@ -60,8 +60,7 @@ public class MetricsReporter extends Maintainer { reportDeploymentMetrics(); reportRemainingRotations(); reportQueuedNameServiceRequests(); - reportNodesFailingSystemUpgrade(); - reportNodesFailingOsUpgrade(); + reportNodesFailingUpgrade(); } private void reportRemainingRotations() { @@ -106,11 +105,8 @@ public class MetricsReporter extends Maintainer { metric.createContext(Map.of())); } - private void reportNodesFailingSystemUpgrade() { + private void reportNodesFailingUpgrade() { metric.set(NODES_FAILING_SYSTEM_UPGRADE, nodesFailingSystemUpgrade(), metric.createContext(Map.of())); - } - - private void reportNodesFailingOsUpgrade() { metric.set(NODES_FAILING_OS_UPGRADE, nodesFailingOsUpgrade(), metric.createContext(Map.of())); } @@ -119,20 +115,24 @@ public class MetricsReporter extends Maintainer { return nodesFailingUpgrade(controller().versionStatus().versions(), (vespaVersion) -> { if (vespaVersion.confidence() == VespaVersion.Confidence.broken) return NodeVersions.EMPTY; return vespaVersion.nodeVersions(); - }); + }, NODE_UPGRADE_TIMEOUT); } private int nodesFailingOsUpgrade() { - return nodesFailingUpgrade(controller().osVersionStatus().versions().entrySet(), (kv) -> { - var osVersion = kv.getKey(); - if (osVersion.version().isEmpty()) return NodeVersions.EMPTY; - return kv.getValue(); - }); + var allNodeVersions = controller().osVersionStatus().versions().values(); + var totalTimeout = 0L; + for (var nodeVersions : allNodeVersions) { + for (var nodeVersion : nodeVersions.asMap().values()) { + if (!nodeVersion.changing()) continue; + totalTimeout += OS_UPGRADE_TIME_ALLOWANCE_PER_NODE.toMillis(); + } + } + return nodesFailingUpgrade(allNodeVersions, Function.identity(), Duration.ofMillis(totalTimeout)); } - private <V> int nodesFailingUpgrade(Collection<V> collection, Function<V, NodeVersions> nodeVersionsFunction) { + private <V> int nodesFailingUpgrade(Collection<V> collection, Function<V, NodeVersions> nodeVersionsFunction, Duration timeout) { var nodesFailingUpgrade = 0; - var acceptableInstant = clock.instant().minus(NODE_UPGRADE_TIMEOUT); + var acceptableInstant = clock.instant().minus(timeout); for (var object : collection) { for (var nodeVersion : nodeVersionsFunction.apply(object).asMap().values()) { if (!nodeVersion.changing()) continue; diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java index 0ea4abc2203..81e33b490f3 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java @@ -275,13 +275,14 @@ public class MetricsReporterTest { new JobControl(tester.controllerTester().curator()), CloudName.defaultName());; var statusUpdater = new OsVersionStatusUpdater(tester.controller(), Duration.ofDays(1), new JobControl(tester.controller().curator())); - tester.configServer().bootstrap(List.of(zone.getId()), SystemApplication.tenantHost); + tester.configServer().bootstrap(List.of(zone.getId()), SystemApplication.configServerHost, SystemApplication.tenantHost); // All nodes upgrade to initial OS version var version0 = Version.fromString("8.0"); tester.controller().upgradeOsIn(cloud, version0, false); osUpgrader.maintain(); tester.configServer().setOsVersion(SystemApplication.tenantHost.id(), zone.getId(), version0); + tester.configServer().setOsVersion(SystemApplication.configServerHost.id(), zone.getId(), version0); statusUpdater.maintain(); reporter.maintain(); assertEquals(0, getNodesFailingOsUpgrade()); @@ -300,15 +301,22 @@ public class MetricsReporterTest { reporter.maintain(); assertEquals(0, getNodesFailingOsUpgrade()); - // 1/3 nodes upgrade within timeout - tester.configServer().setOsVersion(SystemApplication.tenantHost.id(), zone.getId(), version, 1); - tester.clock().advance(Duration.ofMinutes(30).plus(Duration.ofSeconds(1))); + // 2/6 nodes upgrade within timeout + tester.configServer().setOsVersion(SystemApplication.tenantHost.id(), zone.getId(), version, 2); + tester.clock().advance(Duration.ofMinutes(30 * 3 /* time allowance * node count */).plus(Duration.ofSeconds(1))); statusUpdater.maintain(); reporter.maintain(); - assertEquals(2, getNodesFailingOsUpgrade()); + assertEquals(4, getNodesFailingOsUpgrade()); - // 3/3 nodes upgrade + // 5/6 nodes upgrade tester.configServer().setOsVersion(SystemApplication.tenantHost.id(), zone.getId(), version); + tester.configServer().setOsVersion(SystemApplication.configServerHost.id(), zone.getId(), version, 2); + statusUpdater.maintain(); + reporter.maintain(); + assertEquals(1, getNodesFailingOsUpgrade()); + + // Final node upgrades + tester.configServer().setOsVersion(SystemApplication.configServerHost.id(), zone.getId(), version); statusUpdater.maintain(); reporter.maintain(); assertEquals(0, getNodesFailingOsUpgrade()); |