summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Polden <mpolden@mpolden.no>2019-10-07 13:02:27 +0200
committerMartin Polden <mpolden@mpolden.no>2019-10-07 13:59:40 +0200
commit374b324b2f5fdbcc99761e064bb4a182e0f6aa07 (patch)
tree074cc669ec6ded569d81aeba6c041c108e8b5bb0
parentdd2739ec3abce6ca0ab3341b0ef6968f48f26492 (diff)
Report metric for nodes failing OS upgrade
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java28
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/integration/ConfigServerMock.java35
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java56
3 files changed, 113 insertions, 6 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java
index 9253e249765..361cc43da50 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java
@@ -12,6 +12,7 @@ import com.yahoo.vespa.hosted.controller.application.DeploymentMetrics;
import com.yahoo.vespa.hosted.controller.application.JobList;
import com.yahoo.vespa.hosted.controller.application.JobStatus;
import com.yahoo.vespa.hosted.controller.rotation.RotationLock;
+import com.yahoo.vespa.hosted.controller.versions.NodeVersions;
import com.yahoo.vespa.hosted.controller.versions.VespaVersion;
import java.time.Clock;
@@ -20,6 +21,7 @@ import java.time.Instant;
import java.util.Collection;
import java.util.List;
import java.util.Map;
+import java.util.function.Function;
import java.util.stream.Collectors;
/**
@@ -36,6 +38,7 @@ public class MetricsReporter extends Maintainer {
public static final String DEPLOYMENT_BUILD_AGE_SECONDS = "deployment.buildAgeSeconds";
public static final String DEPLOYMENT_WARNINGS = "deployment.warnings";
public static final String NODES_FAILING_SYSTEM_UPGRADE = "deployment.nodesFailingSystemUpgrade";
+ public static final String NODES_FAILING_OS_UPGRADE = "deployment.nodesFailingOsUpgrade";
public static final String REMAINING_ROTATIONS = "remaining_rotations";
public static final String NAME_SERVICE_REQUESTS_QUEUED = "dns.queuedRequests";
@@ -56,6 +59,7 @@ public class MetricsReporter extends Maintainer {
reportRemainingRotations();
reportQueuedNameServiceRequests();
reportNodesFailingSystemUpgrade();
+ reportNodesFailingOsUpgrade();
}
private void reportRemainingRotations() {
@@ -103,13 +107,31 @@ public class MetricsReporter extends Maintainer {
metric.set(NODES_FAILING_SYSTEM_UPGRADE, nodesFailingSystemUpgrade(), metric.createContext(Map.of()));
}
+ private void reportNodesFailingOsUpgrade() {
+ metric.set(NODES_FAILING_OS_UPGRADE, nodesFailingOsUpgrade(), metric.createContext(Map.of()));
+ }
+
private int nodesFailingSystemUpgrade() {
if (!controller().versionStatus().isUpgrading()) return 0;
+ return nodesFailingUpgrade(controller().versionStatus().versions(), (vespaVersion) -> {
+ if (vespaVersion.confidence() == VespaVersion.Confidence.broken) return NodeVersions.EMPTY;
+ return vespaVersion.nodeVersions();
+ });
+ }
+
+ private int nodesFailingOsUpgrade() {
+ return nodesFailingUpgrade(controller().osVersionStatus().versions().entrySet(), (kv) -> {
+ var osVersion = kv.getKey();
+ if (osVersion.version().isEmpty()) return NodeVersions.EMPTY;
+ return kv.getValue();
+ });
+ }
+
+ private <V> int nodesFailingUpgrade(Collection<V> collection, Function<V, NodeVersions> nodeVersionsFunction) {
var nodesFailingUpgrade = 0;
var acceptableInstant = clock.instant().minus(NODE_UPGRADE_TIMEOUT);
- for (var vespaVersion : controller().versionStatus().versions()) {
- if (vespaVersion.confidence() == VespaVersion.Confidence.broken) continue;
- for (var nodeVersion : vespaVersion.nodeVersions().asMap().values()) {
+ for (var object : collection) {
+ for (var nodeVersion : nodeVersionsFunction.apply(object).asMap().values()) {
if (!nodeVersion.changing()) continue;
if (nodeVersion.changedAt().isBefore(acceptableInstant)) nodesFailingUpgrade++;
}
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/integration/ConfigServerMock.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/integration/ConfigServerMock.java
index 6da77a967f1..6e7a50b5f81 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/integration/ConfigServerMock.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/integration/ConfigServerMock.java
@@ -151,15 +151,44 @@ public class ConfigServerMock extends AbstractComponent implements ConfigServer
/** Set version for an application in a given zone */
public void setVersion(ApplicationId application, ZoneId zone, Version version) {
- setVersion(application, zone, version, -1);
+ setVersion(application, zone, version, -1, false);
}
/** Set version for nodeCount number of nodes in application in a given zone */
public void setVersion(ApplicationId application, ZoneId zone, Version version, int nodeCount) {
+ setVersion(application, zone, version, nodeCount, false);
+ }
+
+ /** Set OS version for an application in a given zone */
+ public void setOsVersion(ApplicationId application, ZoneId zone, Version version) {
+ setOsVersion(application, zone, version, -1);
+ }
+
+ /** Set OS version for an application in a given zone */
+ public void setOsVersion(ApplicationId application, ZoneId zone, Version version, int nodeCount) {
+ setVersion(application, zone, version, nodeCount, true);
+ }
+
+ private void setVersion(ApplicationId application, ZoneId zone, Version version, int nodeCount, boolean osVersion) {
int n = 0;
for (Node node : nodeRepository().list(zone, application)) {
- nodeRepository().putByHostname(zone, new Node(node.hostname(), node.state(), node.type(), node.owner(),
- version, version));
+ Node newNode;
+ if (osVersion) {
+ newNode = new Node(node.hostname(), node.state(), node.type(), node.owner(), node.currentVersion(),
+ node.wantedVersion(), version, version, node.serviceState(),
+ node.restartGeneration(), node.wantedRestartGeneration(), node.rebootGeneration(),
+ node.wantedRebootGeneration(), node.vcpu(), node.memoryGb(), node.diskGb(),
+ node.bandwidthGbps(), node.fastDisk(), node.cost(), node.canonicalFlavor(),
+ node.clusterId(), node.clusterType());
+ } else {
+ newNode = new Node(node.hostname(), node.state(), node.type(), node.owner(), version,
+ version, node.currentOsVersion(), node.wantedOsVersion(), node.serviceState(),
+ node.restartGeneration(), node.wantedRestartGeneration(), node.rebootGeneration(),
+ node.wantedRebootGeneration(), node.vcpu(), node.memoryGb(), node.diskGb(),
+ node.bandwidthGbps(), node.fastDisk(), node.cost(), node.canonicalFlavor(),
+ node.clusterId(), node.clusterType());
+ }
+ nodeRepository().putByHostname(zone, newNode);
if (++n == nodeCount) break;
}
}
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java
index 9cb40d60677..44785407874 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java
@@ -3,6 +3,7 @@ package com.yahoo.vespa.hosted.controller.maintenance;
import com.yahoo.component.Version;
import com.yahoo.config.provision.ApplicationId;
+import com.yahoo.config.provision.CloudName;
import com.yahoo.config.provision.Environment;
import com.yahoo.config.provision.zone.UpgradePolicy;
import com.yahoo.config.provision.zone.ZoneId;
@@ -262,6 +263,57 @@ public class MetricsReporterTest {
}
}
+ @Test
+ public void test_nodes_failing_os_upgrade() {
+ var tester = new DeploymentTester();
+ var reporter = createReporter(tester.controller());
+ var zone = ZoneApiMock.fromId("prod.eu-west-1");
+ var cloud = CloudName.defaultName();
+ tester.controllerTester().zoneRegistry().setOsUpgradePolicy(cloud, UpgradePolicy.create().upgrade(zone));
+ var osUpgrader = new OsUpgrader(tester.controller(), Duration.ofDays(1),
+ new JobControl(tester.controllerTester().curator()), CloudName.defaultName());;
+ var statusUpdater = new OsVersionStatusUpdater(tester.controller(), Duration.ofDays(1),
+ new JobControl(tester.controller().curator()));
+ tester.configServer().bootstrap(List.of(zone.getId()), SystemApplication.tenantHost);
+
+ // All nodes upgrade to initial OS version
+ var version0 = Version.fromString("8.0");
+ tester.controller().upgradeOsIn(cloud, version0, false);
+ osUpgrader.maintain();
+ tester.configServer().setOsVersion(SystemApplication.tenantHost.id(), zone.getId(), version0);
+ statusUpdater.maintain();
+ reporter.maintain();
+ assertEquals(0, getNodesFailingOsUpgrade());
+
+ for (var version : List.of(Version.fromString("8.1"), Version.fromString("8.2"))) {
+ // System starts upgrading to next OS version
+ tester.controller().upgradeOsIn(cloud, version, false);
+ osUpgrader.maintain();
+ statusUpdater.maintain();
+ reporter.maintain();
+ assertEquals(0, getNodesFailingOsUpgrade());
+
+ // 30 minutes pass and nothing happens
+ tester.clock().advance(Duration.ofMinutes(30));
+ statusUpdater.maintain();
+ reporter.maintain();
+ assertEquals(0, getNodesFailingOsUpgrade());
+
+ // 1/3 nodes upgrade within timeout
+ tester.configServer().setOsVersion(SystemApplication.tenantHost.id(), zone.getId(), version, 1);
+ tester.clock().advance(Duration.ofMinutes(30).plus(Duration.ofSeconds(1)));
+ statusUpdater.maintain();
+ reporter.maintain();
+ assertEquals(2, getNodesFailingOsUpgrade());
+
+ // 3/3 nodes upgrade
+ tester.configServer().setOsVersion(SystemApplication.tenantHost.id(), zone.getId(), version);
+ statusUpdater.maintain();
+ reporter.maintain();
+ assertEquals(0, getNodesFailingOsUpgrade());
+ }
+ }
+
private Duration getAverageDeploymentDuration(ApplicationId id) {
return Duration.ofSeconds(getMetric(MetricsReporter.DEPLOYMENT_AVERAGE_DURATION, id).longValue());
}
@@ -278,6 +330,10 @@ public class MetricsReporterTest {
return metrics.getMetric(MetricsReporter.NODES_FAILING_SYSTEM_UPGRADE).intValue();
}
+ private int getNodesFailingOsUpgrade() {
+ return metrics.getMetric(MetricsReporter.NODES_FAILING_OS_UPGRADE).intValue();
+ }
+
private Number getMetric(String name, ApplicationId id) {
return metrics.getMetric((dimensions) -> id.tenant().value().equals(dimensions.get("tenant")) &&
appDimension(id).equals(dimensions.get("app")),