summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java46
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java61
2 files changed, 71 insertions, 36 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java
index 95b1066b7c9..85f45ded6bd 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java
@@ -1,7 +1,10 @@
// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.controller.maintenance;
+import com.yahoo.component.Version;
import com.yahoo.config.provision.ApplicationId;
+import com.yahoo.config.provision.HostName;
+import com.yahoo.config.provision.zone.ZoneId;
import com.yahoo.jdisc.Metric;
import com.yahoo.vespa.hosted.controller.Application;
import com.yahoo.vespa.hosted.controller.Controller;
@@ -22,8 +25,11 @@ import java.time.Duration;
import java.time.Instant;
import java.util.Collection;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.List;
import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function;
import java.util.stream.Collectors;
@@ -48,6 +54,9 @@ public class MetricsReporter extends ControllerMaintainer {
private final Metric metric;
private final Clock clock;
+ // Tracks hosts and versions for which we have reported change duration metrics
+ private final ConcurrentHashMap<HostName, Set<Version>> reportedVersions = new ConcurrentHashMap<>();
+
public MetricsReporter(Controller controller, Metric metric) {
super(controller, Duration.ofMinutes(1)); // use fixed rate for metrics
this.metric = metric;
@@ -59,7 +68,8 @@ public class MetricsReporter extends ControllerMaintainer {
reportDeploymentMetrics();
reportRemainingRotations();
reportQueuedNameServiceRequests();
- reportChangeDurations();
+ reportChangeDurations(osChangeDurations(), OS_CHANGE_DURATION);
+ reportChangeDurations(platformChangeDurations(), PLATFORM_CHANGE_DURATION);
}
private void reportRemainingRotations() {
@@ -101,14 +111,26 @@ public class MetricsReporter extends ControllerMaintainer {
metric.createContext(Map.of()));
}
- private void reportChangeDurations() {
- Map<NodeVersion, Duration> platformChangeDurations = platformChangeDurations();
- Map<NodeVersion, Duration> osChangeDurations = osChangeDurations();
- platformChangeDurations.forEach((nodeVersion, duration) -> {
- metric.set(PLATFORM_CHANGE_DURATION, duration.toSeconds(), metric.createContext(dimensions(nodeVersion)));
+ private void reportChangeDurations(Map<NodeVersion, Duration> changeDurations, String metricName) {
+ changeDurations.forEach((nodeVersion, duration) -> {
+ // Zero the metric for past versions because our metrics framework remembers the last value for each unique
+ // dimension set for each metric.
+ reportVersion(nodeVersion.hostname(), nodeVersion.currentVersion());
+ for (var reportedVersion : reportedVersions.get(nodeVersion.hostname())) {
+ if (reportedVersion.equals(nodeVersion.currentVersion())) continue;
+ metric.set(metricName, 0, metric.createContext(dimensions(nodeVersion.hostname(), nodeVersion.zone(), reportedVersion)));
+ }
+ metric.set(metricName, duration.toSeconds(), metric.createContext(dimensions(nodeVersion.hostname(), nodeVersion.zone(), nodeVersion.currentVersion())));
});
- osChangeDurations.forEach((nodeVersion, duration) -> {
- metric.set(OS_CHANGE_DURATION, duration.toSeconds(), metric.createContext(dimensions(nodeVersion)));
+ }
+
+ private void reportVersion(HostName hostname, Version version) {
+ reportedVersions.compute(hostname, (ignored, values) -> {
+ if (values == null) {
+ values = new HashSet<>();
+ }
+ values.add(version);
+ return values;
});
}
@@ -185,10 +207,10 @@ public class MetricsReporter extends ControllerMaintainer {
"app",application.application().value() + "." + application.instance().value());
}
- private static Map<String, String> dimensions(NodeVersion nodeVersion) {
- return Map.of("host", nodeVersion.hostname().value(),
- "zone", nodeVersion.zone().value(),
- "currentVersion", nodeVersion.currentVersion().toFullString());
+ private static Map<String, String> dimensions(HostName hostname, ZoneId zone, Version currentVersion) {
+ return Map.of("host", hostname.value(),
+ "zone", zone.value(),
+ "currentVersion", currentVersion.toFullString());
}
}
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java
index 31f25cd17ca..4e4c31c519e 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java
@@ -240,19 +240,22 @@ public class MetricsReporterTest {
tester.upgradeSystem(version0);
reporter.maintain();
var hosts = tester.configServer().nodeRepository().list(zone, SystemApplication.configServer.id());
- assertPlatformChangeDuration(Duration.ZERO, hosts);
+ assertPlatformChangeDuration(Duration.ZERO, hosts, version0);
- for (var version : List.of(Version.fromString("7.1"), Version.fromString("7.2"))) {
+ var targets = List.of(Version.fromString("7.1"), Version.fromString("7.2"));
+ for (int i = 0; i < targets.size(); i++) {
+ var currentVersion = i == 0 ? version0 : targets.get(i - 1);
+ var version = targets.get(i);
// System starts upgrading to next version
tester.upgradeController(version);
reporter.maintain();
- assertPlatformChangeDuration(Duration.ZERO, hosts);
+ assertPlatformChangeDuration(Duration.ZERO, hosts, currentVersion);
systemUpgrader.maintain();
// 30 minutes pass and nothing happens
tester.clock().advance(Duration.ofMinutes(30));
runAll(tester::computeVersionStatus, reporter);
- assertPlatformChangeDuration(Duration.ZERO, hosts);
+ assertPlatformChangeDuration(Duration.ZERO, hosts, currentVersion);
// 1/3 nodes upgrade within timeout
assertEquals("Wanted version is raised for all nodes", version,
@@ -267,13 +270,13 @@ public class MetricsReporterTest {
// 2/3 spend their budget and are reported as failures
tester.clock().advance(Duration.ofHours(1));
runAll(tester::computeVersionStatus, reporter);
- assertPlatformChangeDuration(Duration.ZERO, List.of(firstHost));
- assertPlatformChangeDuration(Duration.ofHours(1), hosts.subList(1, hosts.size()));
+ assertPlatformChangeDuration(Duration.ZERO, List.of(firstHost), version);
+ assertPlatformChangeDuration(Duration.ofHours(1), hosts.subList(1, hosts.size()), currentVersion);
// Remaining nodes eventually upgrade
upgradeTo(version, hosts.subList(1, hosts.size()), zone, tester);
runAll(tester::computeVersionStatus, reporter);
- assertPlatformChangeDuration(Duration.ZERO, hosts);
+ assertPlatformChangeDuration(Duration.ZERO, hosts, version);
assertEquals(version, tester.controller().systemVersion());
}
}
@@ -299,18 +302,21 @@ public class MetricsReporterTest {
tester.configServer().setOsVersion(version0, SystemApplication.configServerHost.id(), zone);
runAll(statusUpdater, reporter);
List<Node> hosts = tester.configServer().nodeRepository().list(zone);
- assertOsChangeDuration(Duration.ZERO, hosts);
+ assertOsChangeDuration(Duration.ZERO, hosts, version0);
- for (var version : List.of(Version.fromString("8.1"), Version.fromString("8.2"))) {
+ var targets = List.of(Version.fromString("8.1"), Version.fromString("8.2"));
+ for (int i = 0; i < targets.size(); i++) {
+ var currentVersion = i == 0 ? version0 : targets.get(i - 1);
+ var version = targets.get(i);
// System starts upgrading to next OS version
tester.controller().upgradeOsIn(cloud, version, false);
runAll(osUpgrader, statusUpdater, reporter);
- assertOsChangeDuration(Duration.ZERO, hosts);
+ assertOsChangeDuration(Duration.ZERO, hosts, currentVersion);
// Over 30 minutes pass and nothing happens
tester.clock().advance(Duration.ofMinutes(30).plus(Duration.ofSeconds(1)));
runAll(statusUpdater, reporter);
- assertOsChangeDuration(Duration.ZERO, hosts);
+ assertOsChangeDuration(Duration.ZERO, hosts, currentVersion);
// Nodes are told to upgrade, but do not suspend yet
assertEquals("Wanted OS version is raised for all nodes", version,
@@ -323,40 +329,40 @@ public class MetricsReporterTest {
// Another 30 minutes pass
tester.clock().advance(Duration.ofMinutes(30));
runAll(statusUpdater, reporter);
- assertOsChangeDuration(Duration.ZERO, hosts);
+ assertOsChangeDuration(Duration.ZERO, hosts, currentVersion);
// 3/6 hosts suspend
var suspendedHosts = hosts.subList(0, 3);
suspend(suspendedHosts, zone, tester);
runAll(statusUpdater, reporter);
- assertOsChangeDuration(Duration.ZERO, hosts);
+ assertOsChangeDuration(Duration.ZERO, hosts, currentVersion);
// Two hosts spend 20 minutes upgrading
var hostsUpgraded = suspendedHosts.subList(0, 2);
tester.clock().advance(Duration.ofMinutes(20));
runAll(statusUpdater, reporter);
- assertOsChangeDuration(Duration.ofMinutes(20), hostsUpgraded);
+ assertOsChangeDuration(Duration.ofMinutes(20), hostsUpgraded, currentVersion);
upgradeOsTo(version, hostsUpgraded, zone, tester);
runAll(statusUpdater, reporter);
- assertOsChangeDuration(Duration.ZERO, hostsUpgraded);
+ assertOsChangeDuration(Duration.ZERO, hostsUpgraded, version);
// One host consumes budget without upgrading
var brokenHost = suspendedHosts.get(2);
tester.clock().advance(Duration.ofMinutes(15));
runAll(statusUpdater, reporter);
- assertOsChangeDuration(Duration.ofMinutes(35), List.of(brokenHost));
+ assertOsChangeDuration(Duration.ofMinutes(35), List.of(brokenHost), currentVersion);
// Host eventually upgrades and is no longer reported
upgradeOsTo(version, List.of(brokenHost), zone, tester);
runAll(statusUpdater, reporter);
- assertOsChangeDuration(Duration.ZERO, List.of(brokenHost));
+ assertOsChangeDuration(Duration.ZERO, List.of(brokenHost), version);
// Remaining hosts suspend and upgrade successfully
var remainingHosts = hosts.subList(3, hosts.size());
suspend(remainingHosts, zone, tester);
upgradeOsTo(version, remainingHosts, zone, tester);
runAll(statusUpdater, reporter);
- assertOsChangeDuration(Duration.ZERO, hosts);
+ assertOsChangeDuration(Duration.ZERO, hosts, version);
}
}
@@ -412,21 +418,28 @@ public class MetricsReporterTest {
return getMetric(MetricsReporter.DEPLOYMENT_WARNINGS, id).intValue();
}
- private Duration getChangeDuration(String metric, HostName hostname) {
- return metrics.getMetric((dimensions) -> hostname.value().equals(dimensions.get("host")), metric)
+ private Duration getChangeDuration(String metric, HostName hostname, Version currentVersion) {
+ return metrics.getMetrics((dimensions) -> hostname.value().equals(dimensions.get("host")) &&
+ currentVersion.toFullString().equals(dimensions.get("currentVersion")))
+ .values()
+ .stream()
+ .map(metrics -> metrics.get(metric))
.map(n -> Duration.ofSeconds(n.longValue()))
+ .findFirst()
.orElseThrow(() -> new IllegalArgumentException("Expected to find metric for " + hostname));
}
- private void assertPlatformChangeDuration(Duration duration, List<Node> nodes) {
+ private void assertPlatformChangeDuration(Duration duration, List<Node> nodes, Version currentVersion) {
for (var node : nodes) {
- assertEquals("Platform change duration of " + node.hostname(), duration, getChangeDuration(MetricsReporter.PLATFORM_CHANGE_DURATION, node.hostname()));
+ assertEquals("Platform change duration of " + node.hostname() + " on version " + currentVersion,
+ duration, getChangeDuration(MetricsReporter.PLATFORM_CHANGE_DURATION, node.hostname(), currentVersion));
}
}
- private void assertOsChangeDuration(Duration duration, List<Node> nodes) {
+ private void assertOsChangeDuration(Duration duration, List<Node> nodes, Version currentVersion) {
for (var node : nodes) {
- assertEquals("OS change duration of " + node.hostname(), duration, getChangeDuration(MetricsReporter.OS_CHANGE_DURATION, node.hostname()));
+ assertEquals("OS change duration of " + node.hostname() + " on version " + currentVersion,
+ duration, getChangeDuration(MetricsReporter.OS_CHANGE_DURATION, node.hostname(), currentVersion));
}
}