summary | refs | log | tree | commit | diff | stats
path: root/controller-server
diff options
context:
space:
mode:
author: Martin Polden <mpolden@mpolden.no> 2020-05-13 16:06:15 +0200
committer: Martin Polden <mpolden@mpolden.no> 2020-05-13 16:15:42 +0200
commit: 1249a3344a891b609e2d6d0b60cddc688baa1e3a (patch)
tree: 12ef383f831c7dad06668d1f76d8dd39a0f5930c /controller-server
parent: ba2fab6058245da7882c634563ff933afabcba04 (diff)
Report node count per version as separate metric
Diffstat (limited to 'controller-server')
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java 78
-rw-r--r-- controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java 32
2 files changed, 99 insertions, 11 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java
index 0cee996ccdd..e83cc6b275f 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java
@@ -1,6 +1,7 @@
// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.controller.maintenance;
+import com.yahoo.component.Version;
import com.yahoo.config.provision.ApplicationId;
import com.yahoo.config.provision.HostName;
import com.yahoo.config.provision.zone.ZoneId;
@@ -24,8 +25,12 @@ import java.time.Duration;
import java.time.Instant;
import java.util.Collection;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.List;
import java.util.Map;
+import java.util.Objects;
+import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function;
import java.util.stream.Collectors;
@@ -44,12 +49,17 @@ public class MetricsReporter extends ControllerMaintainer {
public static final String DEPLOYMENT_WARNINGS = "deployment.warnings";
public static final String OS_CHANGE_DURATION = "deployment.osChangeDuration";
public static final String PLATFORM_CHANGE_DURATION = "deployment.platformChangeDuration";
+ public static final String OS_NODE_COUNT = "deployment.nodeCountByOsVersion";
+ public static final String PLATFORM_NODE_COUNT = "deployment.nodeCountByPlatformVersion";
public static final String REMAINING_ROTATIONS = "remaining_rotations";
public static final String NAME_SERVICE_REQUESTS_QUEUED = "dns.queuedRequests";
private final Metric metric;
private final Clock clock;
+ // Keep track of reported node counts for each version
+ private final ConcurrentHashMap<NodeCountKey, Map<String, Long>> nodeCounts = new ConcurrentHashMap<>();
+
public MetricsReporter(Controller controller, Metric metric) {
super(controller, Duration.ofMinutes(1)); // use fixed rate for metrics
this.metric = metric;
@@ -61,8 +71,16 @@ public class MetricsReporter extends ControllerMaintainer {
reportDeploymentMetrics();
reportRemainingRotations();
reportQueuedNameServiceRequests();
- reportChangeDurations(osChangeDurations(), OS_CHANGE_DURATION);
- reportChangeDurations(platformChangeDurations(), PLATFORM_CHANGE_DURATION);
+ reportInfrastructureUpgradeMetrics();
+ }
+
+    /**
+     * Reports change durations and node counts per version, for both OS and platform.
+     * The change durations are computed once and reused for both kinds of metric.
+     */
+    private void reportInfrastructureUpgradeMetrics() {
+        var durationsByOsNodeVersion = osChangeDurations();
+        var durationsByPlatformNodeVersion = platformChangeDurations();
+
+        // Change duration metrics
+        reportChangeDurations(durationsByOsNodeVersion, OS_CHANGE_DURATION);
+        reportChangeDurations(durationsByPlatformNodeVersion, PLATFORM_CHANGE_DURATION);
+
+        // Node count metrics, derived from the same node versions
+        var osNodeVersions = durationsByOsNodeVersion.keySet();
+        reportNodeCount(osNodeVersions, OS_NODE_COUNT);
+        var platformNodeVersions = durationsByPlatformNodeVersion.keySet();
+        reportNodeCount(platformNodeVersions, PLATFORM_NODE_COUNT);
     }
private void reportRemainingRotations() {
@@ -104,6 +122,31 @@ public class MetricsReporter extends ControllerMaintainer {
metric.createContext(Map.of()));
}
+    /**
+     * Reports the number of nodes on each version, per zone, under the given metric name.
+     *
+     * Version and zone combinations that were reported under this metric name in an earlier run, but
+     * are absent from the given node versions, are reported with a count of zero. Combinations that
+     * have only been recorded under another metric name are not reported at all.
+     *
+     * @param nodeVersions the node versions to count, grouped by their current version and zone
+     * @param metricName   the name of the metric to report the counts under
+     */
+    private void reportNodeCount(Set<NodeVersion> nodeVersions, String metricName) {
+        // Count nodes per (version, zone) in this run
+        Map<NodeCountKey, Long> currentCounts = new HashMap<>();
+        for (var nodeVersion : nodeVersions) {
+            NodeCountKey key = new NodeCountKey(nodeVersion.currentVersion(), nodeVersion.zone());
+            currentCounts.merge(key, 1L, Long::sum);
+        }
+        // Remember every (version, zone) reported under this metric name
+        currentCounts.forEach((key, count) -> {
+            nodeCounts.compute(key, (ignored, countByMetric) -> {
+                if (countByMetric == null) countByMetric = new HashMap<>();
+                countByMetric.put(metricName, count);
+                return countByMetric;
+            });
+        });
+        nodeCounts.forEach((key, countByMetric) -> {
+            // nodeCounts is shared between metric names: a key recorded only under another metric name
+            // has no value for this one, and unboxing the missing value would throw
+            if (!countByMetric.containsKey(metricName)) return;
+            // Decide presence per (version, zone), not per version: a version still found in another
+            // zone must not keep a stale count alive for this zone
+            long nodeCount = currentCounts.getOrDefault(key, 0L);
+            metric.set(metricName, nodeCount, metric.createContext(dimensions(key.zone, key.version)));
+        });
+    }
+
private void reportChangeDurations(Map<NodeVersion, Duration> changeDurations, String metricName) {
changeDurations.forEach((nodeVersion, duration) -> {
metric.set(metricName, duration.toSeconds(), metric.createContext(dimensions(nodeVersion.hostname(), nodeVersion.zone())));
@@ -188,4 +231,35 @@ public class MetricsReporter extends ControllerMaintainer {
"zone", zone.value());
}
+    /** Returns metric dimensions holding the value of the given zone and the full string form of the given version. */
+    private static Map<String, String> dimensions(ZoneId zone, Version currentVersion) {
+        String zoneValue = zone.value();
+        String fullVersion = currentVersion.toFullString();
+        return Map.of("zone", zoneValue, "currentVersion", fullVersion);
+    }
+
+    /** A version paired with a zone. Used as the key when keeping track of reported node counts. */
+    private static class NodeCountKey {
+
+        private final Version version;
+        private final ZoneId zone;
+
+        public NodeCountKey(Version version, ZoneId zone) {
+            this.version = version;
+            this.zone = zone;
+        }
+
+        /** Two keys are equal when they are of the same class and have equal version and zone. */
+        @Override
+        public boolean equals(Object o) {
+            if (o == this) return true;
+            if (o == null) return false;
+            if (o.getClass() != getClass()) return false;
+            NodeCountKey other = (NodeCountKey) o;
+            // Version is compared before zone
+            if (!version.equals(other.version)) return false;
+            return zone.equals(other.zone);
+        }
+
+        @Override
+        public int hashCode() {
+            return Objects.hash(version, zone);
+        }
+
+    }
+
}
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java
index 68a6c8a489d..ad3b130706c 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java
@@ -308,11 +308,12 @@ public class MetricsReporterTest {
var targets = List.of(Version.fromString("8.1"), Version.fromString("8.2"));
for (int i = 0; i < targets.size(); i++) {
var currentVersion = i == 0 ? version0 : targets.get(i - 1);
- var version = targets.get(i);
+ var nextVersion = targets.get(i);
// System starts upgrading to next OS version
- tester.controller().upgradeOsIn(cloud, version, Optional.empty(), false);
+ tester.controller().upgradeOsIn(cloud, nextVersion, Optional.empty(), false);
runAll(osUpgrader, statusUpdater, reporter);
assertOsChangeDuration(Duration.ZERO, hosts);
+ assertOsNodeCount(hosts.size(), currentVersion);
// Over 30 minutes pass and nothing happens
tester.clock().advance(Duration.ofMinutes(30).plus(Duration.ofSeconds(1)));
@@ -320,7 +321,7 @@ public class MetricsReporterTest {
assertOsChangeDuration(Duration.ZERO, hosts);
// Nodes are told to upgrade, but do not suspend yet
- assertEquals("Wanted OS version is raised for all nodes", version,
+ assertEquals("Wanted OS version is raised for all nodes", nextVersion,
tester.configServer().nodeRepository().list(zone, SystemApplication.tenantHost.id()).stream()
.map(Node::wantedOsVersion).min(Comparator.naturalOrder()).get());
assertTrue("No nodes are suspended", tester.controller().serviceRegistry().configServer()
@@ -343,9 +344,10 @@ public class MetricsReporterTest {
tester.clock().advance(Duration.ofMinutes(20));
runAll(statusUpdater, reporter);
assertOsChangeDuration(Duration.ofMinutes(20), hostsUpgraded);
- upgradeOsTo(version, hostsUpgraded, zone, tester);
+ upgradeOsTo(nextVersion, hostsUpgraded, zone, tester);
runAll(statusUpdater, reporter);
assertOsChangeDuration(Duration.ZERO, hostsUpgraded);
+ assertOsNodeCount(hostsUpgraded.size(), nextVersion);
// One host consumes budget without upgrading
var brokenHost = suspendedHosts.get(2);
@@ -354,19 +356,31 @@ public class MetricsReporterTest {
assertOsChangeDuration(Duration.ofMinutes(35), List.of(brokenHost));
// Host eventually upgrades and is no longer reported
- upgradeOsTo(version, List.of(brokenHost), zone, tester);
+ upgradeOsTo(nextVersion, List.of(brokenHost), zone, tester);
runAll(statusUpdater, reporter);
assertOsChangeDuration(Duration.ZERO, List.of(brokenHost));
+ assertOsNodeCount(hostsUpgraded.size() + 1, nextVersion);
// Remaining hosts suspend and upgrade successfully
var remainingHosts = hosts.subList(3, hosts.size());
suspend(remainingHosts, zone, tester);
- upgradeOsTo(version, remainingHosts, zone, tester);
+ upgradeOsTo(nextVersion, remainingHosts, zone, tester);
runAll(statusUpdater, reporter);
assertOsChangeDuration(Duration.ZERO, hosts);
+ assertOsNodeCount(hosts.size(), nextVersion);
+ assertOsNodeCount(0, currentVersion);
}
}
+    /**
+     * Asserts that the first OS node count metric whose "currentVersion" dimension matches the given
+     * version has the value n. Throws if no such metric is found.
+     */
+    private void assertOsNodeCount(int n, Version version) {
+        var reported = metrics.getMetric((dimensions) -> version.toFullString().equals(dimensions.get("currentVersion")),
+                                         MetricsReporter.OS_NODE_COUNT);
+        long nodeCount = reported.stream()
+                                 .map(Number::longValue)
+                                 .findFirst()
+                                 .orElseThrow(() -> new IllegalArgumentException("Expected to find metric for version " + version));
+        String message = "Expected number of nodes are on " + version.toFullString();
+        assertEquals(message, n, nodeCount);
+    }
+
/** Runs each of the given runnables once, in the order they are given. */
private void runAll(Runnable... runnables) {
    for (int i = 0; i < runnables.length; i++) {
        runnables[i].run();
    }
}
@@ -393,9 +407,9 @@ public class MetricsReporterTest {
}
// Lists nodes from the tester's config server node repository in the given zone, passing the hostnames of the given nodes.
// NOTE(review): the removed and added lines below look identical here; presumably they differ only in indentation, which this view does not preserve.
private List<Node> getNodes(ZoneId zone, List<Node> nodes, ControllerTester tester) {
- return tester.configServer().nodeRepository().list(zone, nodes.stream()
- .map(Node::hostname)
- .collect(Collectors.toList()));
+ return tester.configServer().nodeRepository().list(zone, nodes.stream()
+ .map(Node::hostname)
+ .collect(Collectors.toList()));
}
private void updateNodes(List<Node> nodes, UnaryOperator<Node.Builder> builderOps, ZoneId zone,