diff options
author | Martin Polden <mpolden@mpolden.no> | 2019-10-07 13:02:27 +0200 |
---|---|---|
committer | Martin Polden <mpolden@mpolden.no> | 2019-10-07 13:59:40 +0200 |
commit | 374b324b2f5fdbcc99761e064bb4a182e0f6aa07 (patch) | |
tree | 074cc669ec6ded569d81aeba6c041c108e8b5bb0 | |
parent | dd2739ec3abce6ca0ab3341b0ef6968f48f26492 (diff) |
Report metric for nodes failing OS upgrade
3 files changed, 113 insertions, 6 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java index 9253e249765..361cc43da50 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java @@ -12,6 +12,7 @@ import com.yahoo.vespa.hosted.controller.application.DeploymentMetrics; import com.yahoo.vespa.hosted.controller.application.JobList; import com.yahoo.vespa.hosted.controller.application.JobStatus; import com.yahoo.vespa.hosted.controller.rotation.RotationLock; +import com.yahoo.vespa.hosted.controller.versions.NodeVersions; import com.yahoo.vespa.hosted.controller.versions.VespaVersion; import java.time.Clock; @@ -20,6 +21,7 @@ import java.time.Instant; import java.util.Collection; import java.util.List; import java.util.Map; +import java.util.function.Function; import java.util.stream.Collectors; /** @@ -36,6 +38,7 @@ public class MetricsReporter extends Maintainer { public static final String DEPLOYMENT_BUILD_AGE_SECONDS = "deployment.buildAgeSeconds"; public static final String DEPLOYMENT_WARNINGS = "deployment.warnings"; public static final String NODES_FAILING_SYSTEM_UPGRADE = "deployment.nodesFailingSystemUpgrade"; + public static final String NODES_FAILING_OS_UPGRADE = "deployment.nodesFailingOsUpgrade"; public static final String REMAINING_ROTATIONS = "remaining_rotations"; public static final String NAME_SERVICE_REQUESTS_QUEUED = "dns.queuedRequests"; @@ -56,6 +59,7 @@ public class MetricsReporter extends Maintainer { reportRemainingRotations(); reportQueuedNameServiceRequests(); reportNodesFailingSystemUpgrade(); + reportNodesFailingOsUpgrade(); } private void reportRemainingRotations() { @@ -103,13 +107,31 @@ public class MetricsReporter extends Maintainer { metric.set(NODES_FAILING_SYSTEM_UPGRADE, nodesFailingSystemUpgrade(), metric.createContext(Map.of())); } + private void reportNodesFailingOsUpgrade() { + metric.set(NODES_FAILING_OS_UPGRADE, nodesFailingOsUpgrade(), metric.createContext(Map.of())); + } + private int nodesFailingSystemUpgrade() { if (!controller().versionStatus().isUpgrading()) return 0; + return nodesFailingUpgrade(controller().versionStatus().versions(), (vespaVersion) -> { + if (vespaVersion.confidence() == VespaVersion.Confidence.broken) return NodeVersions.EMPTY; + return vespaVersion.nodeVersions(); + }); + } + + private int nodesFailingOsUpgrade() { + return nodesFailingUpgrade(controller().osVersionStatus().versions().entrySet(), (kv) -> { + var osVersion = kv.getKey(); + if (osVersion.version().isEmpty()) return NodeVersions.EMPTY; + return kv.getValue(); + }); + } + + private <V> int nodesFailingUpgrade(Collection<V> collection, Function<V, NodeVersions> nodeVersionsFunction) { var nodesFailingUpgrade = 0; var acceptableInstant = clock.instant().minus(NODE_UPGRADE_TIMEOUT); - for (var vespaVersion : controller().versionStatus().versions()) { - if (vespaVersion.confidence() == VespaVersion.Confidence.broken) continue; - for (var nodeVersion : vespaVersion.nodeVersions().asMap().values()) { + for (var object : collection) { + for (var nodeVersion : nodeVersionsFunction.apply(object).asMap().values()) { if (!nodeVersion.changing()) continue; if (nodeVersion.changedAt().isBefore(acceptableInstant)) nodesFailingUpgrade++; } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/integration/ConfigServerMock.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/integration/ConfigServerMock.java index 6da77a967f1..6e7a50b5f81 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/integration/ConfigServerMock.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/integration/ConfigServerMock.java @@ -151,15 +151,44 @@ public class ConfigServerMock extends AbstractComponent implements ConfigServer /** Set version for an application in a given zone */ public void setVersion(ApplicationId application, ZoneId zone, Version version) { - setVersion(application, zone, version, -1); + setVersion(application, zone, version, -1, false); } /** Set version for nodeCount number of nodes in application in a given zone */ public void setVersion(ApplicationId application, ZoneId zone, Version version, int nodeCount) { + setVersion(application, zone, version, nodeCount, false); + } + + /** Set OS version for an application in a given zone */ + public void setOsVersion(ApplicationId application, ZoneId zone, Version version) { + setOsVersion(application, zone, version, -1); + } + + /** Set OS version for an application in a given zone */ + public void setOsVersion(ApplicationId application, ZoneId zone, Version version, int nodeCount) { + setVersion(application, zone, version, nodeCount, true); + } + + private void setVersion(ApplicationId application, ZoneId zone, Version version, int nodeCount, boolean osVersion) { int n = 0; for (Node node : nodeRepository().list(zone, application)) { - nodeRepository().putByHostname(zone, new Node(node.hostname(), node.state(), node.type(), node.owner(), - version, version)); + Node newNode; + if (osVersion) { + newNode = new Node(node.hostname(), node.state(), node.type(), node.owner(), node.currentVersion(), + node.wantedVersion(), version, version, node.serviceState(), + node.restartGeneration(), node.wantedRestartGeneration(), node.rebootGeneration(), + node.wantedRebootGeneration(), node.vcpu(), node.memoryGb(), node.diskGb(), + node.bandwidthGbps(), node.fastDisk(), node.cost(), node.canonicalFlavor(), + node.clusterId(), node.clusterType()); + } else { + newNode = new Node(node.hostname(), node.state(), node.type(), node.owner(), version, + version, node.currentOsVersion(), node.wantedOsVersion(), node.serviceState(), + node.restartGeneration(), node.wantedRestartGeneration(), node.rebootGeneration(), + node.wantedRebootGeneration(), node.vcpu(), node.memoryGb(), node.diskGb(), + node.bandwidthGbps(), node.fastDisk(), node.cost(), node.canonicalFlavor(), + node.clusterId(), node.clusterType()); + } + nodeRepository().putByHostname(zone, newNode); if (++n == nodeCount) break; } } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java index 9cb40d60677..44785407874 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java @@ -3,6 +3,7 @@ package com.yahoo.vespa.hosted.controller.maintenance; import com.yahoo.component.Version; import com.yahoo.config.provision.ApplicationId; +import com.yahoo.config.provision.CloudName; import com.yahoo.config.provision.Environment; import com.yahoo.config.provision.zone.UpgradePolicy; import com.yahoo.config.provision.zone.ZoneId; @@ -262,6 +263,57 @@ public class MetricsReporterTest { } } + @Test + public void test_nodes_failing_os_upgrade() { + var tester = new DeploymentTester(); + var reporter = createReporter(tester.controller()); + var zone = ZoneApiMock.fromId("prod.eu-west-1"); + var cloud = CloudName.defaultName(); + tester.controllerTester().zoneRegistry().setOsUpgradePolicy(cloud, UpgradePolicy.create().upgrade(zone)); + var osUpgrader = new OsUpgrader(tester.controller(), Duration.ofDays(1), + new JobControl(tester.controllerTester().curator()), CloudName.defaultName());; + var statusUpdater = new OsVersionStatusUpdater(tester.controller(), Duration.ofDays(1), + new JobControl(tester.controller().curator())); + tester.configServer().bootstrap(List.of(zone.getId()), SystemApplication.tenantHost); + + // All nodes upgrade to initial OS version + var version0 = Version.fromString("8.0"); + tester.controller().upgradeOsIn(cloud, version0, false); + osUpgrader.maintain(); + tester.configServer().setOsVersion(SystemApplication.tenantHost.id(), zone.getId(), version0); + statusUpdater.maintain(); + reporter.maintain(); + assertEquals(0, getNodesFailingOsUpgrade()); + + for (var version : List.of(Version.fromString("8.1"), Version.fromString("8.2"))) { + // System starts upgrading to next OS version + tester.controller().upgradeOsIn(cloud, version, false); + osUpgrader.maintain(); + statusUpdater.maintain(); + reporter.maintain(); + assertEquals(0, getNodesFailingOsUpgrade()); + + // 30 minutes pass and nothing happens + tester.clock().advance(Duration.ofMinutes(30)); + statusUpdater.maintain(); + reporter.maintain(); + assertEquals(0, getNodesFailingOsUpgrade()); + + // 1/3 nodes upgrade within timeout + tester.configServer().setOsVersion(SystemApplication.tenantHost.id(), zone.getId(), version, 1); + tester.clock().advance(Duration.ofMinutes(30).plus(Duration.ofSeconds(1))); + statusUpdater.maintain(); + reporter.maintain(); + assertEquals(2, getNodesFailingOsUpgrade()); + + // 3/3 nodes upgrade + tester.configServer().setOsVersion(SystemApplication.tenantHost.id(), zone.getId(), version); + statusUpdater.maintain(); + reporter.maintain(); + assertEquals(0, getNodesFailingOsUpgrade()); + } + } + private Duration getAverageDeploymentDuration(ApplicationId id) { return Duration.ofSeconds(getMetric(MetricsReporter.DEPLOYMENT_AVERAGE_DURATION, id).longValue()); } @@ -278,6 +330,10 @@ public class MetricsReporterTest { return metrics.getMetric(MetricsReporter.NODES_FAILING_SYSTEM_UPGRADE).intValue(); } + private int getNodesFailingOsUpgrade() { + return metrics.getMetric(MetricsReporter.NODES_FAILING_OS_UPGRADE).intValue(); + } + private Number getMetric(String name, ApplicationId id) { return metrics.getMetric((dimensions) -> id.tenant().value().equals(dimensions.get("tenant")) && appDimension(id).equals(dimensions.get("app")), |