From 1249a3344a891b609e2d6d0b60cddc688baa1e3a Mon Sep 17 00:00:00 2001 From: Martin Polden Date: Wed, 13 May 2020 16:06:15 +0200 Subject: Report node count per version as separate metric --- .../controller/maintenance/MetricsReporter.java | 78 +++++++++++++++++++++- .../maintenance/MetricsReporterTest.java | 32 ++++++--- 2 files changed, 99 insertions(+), 11 deletions(-) (limited to 'controller-server') diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java index 0cee996ccdd..e83cc6b275f 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java @@ -1,6 +1,7 @@ // Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.controller.maintenance; +import com.yahoo.component.Version; import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.HostName; import com.yahoo.config.provision.zone.ZoneId; @@ -24,8 +25,12 @@ import java.time.Duration; import java.time.Instant; import java.util.Collection; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; import java.util.function.Function; import java.util.stream.Collectors; @@ -44,12 +49,17 @@ public class MetricsReporter extends ControllerMaintainer { public static final String DEPLOYMENT_WARNINGS = "deployment.warnings"; public static final String OS_CHANGE_DURATION = "deployment.osChangeDuration"; public static final String PLATFORM_CHANGE_DURATION = "deployment.platformChangeDuration"; + public static final String OS_NODE_COUNT = "deployment.nodeCountByOsVersion"; 
+ public static final String PLATFORM_NODE_COUNT = "deployment.nodeCountByPlatformVersion"; public static final String REMAINING_ROTATIONS = "remaining_rotations"; public static final String NAME_SERVICE_REQUESTS_QUEUED = "dns.queuedRequests"; private final Metric metric; private final Clock clock; + // Keep track of reported node counts for each version + private final ConcurrentHashMap<NodeCountKey, Map<String, Long>> nodeCounts = new ConcurrentHashMap<>(); + public MetricsReporter(Controller controller, Metric metric) { super(controller, Duration.ofMinutes(1)); // use fixed rate for metrics this.metric = metric; @@ -61,8 +71,16 @@ public class MetricsReporter extends ControllerMaintainer { reportDeploymentMetrics(); reportRemainingRotations(); reportQueuedNameServiceRequests(); - reportChangeDurations(osChangeDurations(), OS_CHANGE_DURATION); - reportChangeDurations(platformChangeDurations(), PLATFORM_CHANGE_DURATION); + reportInfrastructureUpgradeMetrics(); + } + + private void reportInfrastructureUpgradeMetrics() { + Map<NodeVersion, Duration> osChangeDurations = osChangeDurations(); + Map<NodeVersion, Duration> platformChangeDurations = platformChangeDurations(); + reportChangeDurations(osChangeDurations, OS_CHANGE_DURATION); + reportChangeDurations(platformChangeDurations, PLATFORM_CHANGE_DURATION); + reportNodeCount(osChangeDurations.keySet(), OS_NODE_COUNT); + reportNodeCount(platformChangeDurations.keySet(), PLATFORM_NODE_COUNT); } private void reportRemainingRotations() { @@ -104,6 +122,31 @@ public class MetricsReporter extends ControllerMaintainer { metric.createContext(Map.of())); } + private void reportNodeCount(Set<NodeVersion> nodeVersions, String metricName) { + Map<NodeCountKey, Long> nodeCountByVersion = new HashMap<>(); + Set<Version> knownVersions = new HashSet<>(); + for (var nodeVersion : nodeVersions) { + NodeCountKey key = new NodeCountKey(nodeVersion.currentVersion(), nodeVersion.zone()); + long count = nodeCountByVersion.getOrDefault(key, 0L); + nodeCountByVersion.put(key, ++count); + knownVersions.add(key.version); + }
+ nodeCountByVersion.forEach((nodeCountKey, count) -> { + nodeCounts.compute(nodeCountKey, (ignored, values) -> { + if (values == null) values = new HashMap<>(); + values.put(metricName, count); + return values; + }); + }); + nodeCounts.forEach((nodeCountKey, value) -> { + long nodeCount = 0; + if (knownVersions.contains(nodeCountKey.version)) { + nodeCount = value.get(metricName); + } + metric.set(metricName, nodeCount, metric.createContext(dimensions(nodeCountKey.zone, nodeCountKey.version))); + }); + } + private void reportChangeDurations(Map<NodeVersion, Duration> changeDurations, String metricName) { changeDurations.forEach((nodeVersion, duration) -> { metric.set(metricName, duration.toSeconds(), metric.createContext(dimensions(nodeVersion.hostname(), nodeVersion.zone()))); @@ -188,4 +231,35 @@ public class MetricsReporter extends ControllerMaintainer { "zone", zone.value()); } + private static Map<String, String> dimensions(ZoneId zone, Version currentVersion) { + return Map.of("zone", zone.value(), + "currentVersion", currentVersion.toFullString()); + } + + private static class NodeCountKey { + + private final Version version; + private final ZoneId zone; + + public NodeCountKey(Version version, ZoneId zone) { + this.version = version; + this.zone = zone; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + NodeCountKey that = (NodeCountKey) o; + return version.equals(that.version) && + zone.equals(that.zone); + } + + @Override + public int hashCode() { + return Objects.hash(version, zone); + } + + } + } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java index 68a6c8a489d..ad3b130706c 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java +++
b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java @@ -308,11 +308,12 @@ public class MetricsReporterTest { var targets = List.of(Version.fromString("8.1"), Version.fromString("8.2")); for (int i = 0; i < targets.size(); i++) { var currentVersion = i == 0 ? version0 : targets.get(i - 1); - var version = targets.get(i); + var nextVersion = targets.get(i); // System starts upgrading to next OS version - tester.controller().upgradeOsIn(cloud, version, Optional.empty(), false); + tester.controller().upgradeOsIn(cloud, nextVersion, Optional.empty(), false); runAll(osUpgrader, statusUpdater, reporter); assertOsChangeDuration(Duration.ZERO, hosts); + assertOsNodeCount(hosts.size(), currentVersion); // Over 30 minutes pass and nothing happens tester.clock().advance(Duration.ofMinutes(30).plus(Duration.ofSeconds(1))); @@ -320,7 +321,7 @@ public class MetricsReporterTest { assertOsChangeDuration(Duration.ZERO, hosts); // Nodes are told to upgrade, but do not suspend yet - assertEquals("Wanted OS version is raised for all nodes", version, + assertEquals("Wanted OS version is raised for all nodes", nextVersion, tester.configServer().nodeRepository().list(zone, SystemApplication.tenantHost.id()).stream() .map(Node::wantedOsVersion).min(Comparator.naturalOrder()).get()); assertTrue("No nodes are suspended", tester.controller().serviceRegistry().configServer() @@ -343,9 +344,10 @@ public class MetricsReporterTest { tester.clock().advance(Duration.ofMinutes(20)); runAll(statusUpdater, reporter); assertOsChangeDuration(Duration.ofMinutes(20), hostsUpgraded); - upgradeOsTo(version, hostsUpgraded, zone, tester); + upgradeOsTo(nextVersion, hostsUpgraded, zone, tester); runAll(statusUpdater, reporter); assertOsChangeDuration(Duration.ZERO, hostsUpgraded); + assertOsNodeCount(hostsUpgraded.size(), nextVersion); // One host consumes budget without upgrading var brokenHost = suspendedHosts.get(2); @@ -354,19 +356,31 @@ public class 
MetricsReporterTest { assertOsChangeDuration(Duration.ofMinutes(35), List.of(brokenHost)); // Host eventually upgrades and is no longer reported - upgradeOsTo(version, List.of(brokenHost), zone, tester); + upgradeOsTo(nextVersion, List.of(brokenHost), zone, tester); runAll(statusUpdater, reporter); assertOsChangeDuration(Duration.ZERO, List.of(brokenHost)); + assertOsNodeCount(hostsUpgraded.size() + 1, nextVersion); // Remaining hosts suspend and upgrade successfully var remainingHosts = hosts.subList(3, hosts.size()); suspend(remainingHosts, zone, tester); - upgradeOsTo(version, remainingHosts, zone, tester); + upgradeOsTo(nextVersion, remainingHosts, zone, tester); runAll(statusUpdater, reporter); assertOsChangeDuration(Duration.ZERO, hosts); + assertOsNodeCount(hosts.size(), nextVersion); + assertOsNodeCount(0, currentVersion); } } + private void assertOsNodeCount(int n, Version version) { + long nodeCount = metrics.getMetric((dimensions) -> version.toFullString().equals(dimensions.get("currentVersion")), MetricsReporter.OS_NODE_COUNT) + .stream() + .map(Number::longValue) + .findFirst() + .orElseThrow(() -> new IllegalArgumentException("Expected to find metric for version " + version)); + assertEquals("Expected number of nodes are on " + version.toFullString(), n, nodeCount); + } + private void runAll(Runnable... runnables) { for (var r : runnables) r.run(); } @@ -393,9 +407,9 @@ public class MetricsReporterTest { } private List<Node> getNodes(ZoneId zone, List<Node> nodes, ControllerTester tester) { - return tester.configServer().nodeRepository().list(zone, nodes.stream() - .map(Node::hostname) - .collect(Collectors.toList())); + return tester.configServer().nodeRepository().list(zone, nodes.stream() + .map(Node::hostname) + .collect(Collectors.toList())); } private void updateNodes(List nodes, UnaryOperator builderOps, ZoneId zone, -- cgit v1.2.3