diff options
author | Martin Polden <mpolden@mpolden.no> | 2020-04-24 13:42:41 +0200 |
---|---|---|
committer | Martin Polden <mpolden@mpolden.no> | 2020-04-24 13:42:41 +0200 |
commit | 47e3cfe6c39985ba2db5204248fc46652b76bb67 (patch) | |
tree | 36ce448c9159647e8759124771731ab7f3d8b81d /controller-server | |
parent | 8fdbdf6b988f9fcddcc19ecdba7b0b2840f09ad5 (diff) |
Remove unused metrics
Diffstat (limited to 'controller-server')
2 files changed, 44 insertions, 73 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java index d70c65f343a..dfd35d52b2d 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java @@ -40,22 +40,11 @@ public class MetricsReporter extends Maintainer { public static final String DEPLOYMENT_FAILING_UPGRADES = "deployment.failingUpgrades"; public static final String DEPLOYMENT_BUILD_AGE_SECONDS = "deployment.buildAgeSeconds"; public static final String DEPLOYMENT_WARNINGS = "deployment.warnings"; - // TODO(mpolden): Remove these two metrics - public static final String NODES_FAILING_PLATFORM_UPGRADE = "deployment.nodesFailingSystemUpgrade"; - public static final String NODES_FAILING_OS_UPGRADE = "deployment.nodesFailingOsUpgrade"; public static final String OS_CHANGE_DURATION = "deployment.osChangeDuration"; public static final String PLATFORM_CHANGE_DURATION = "deployment.platformChangeDuration"; public static final String REMAINING_ROTATIONS = "remaining_rotations"; public static final String NAME_SERVICE_REQUESTS_QUEUED = "dns.queuedRequests"; - // The time a system application node can spend after suspending for Vespa upgrade until the upgrade is completed. - // Nodes exceeding this budget are counted as failures. - private static final Duration PLATFORM_UPGRADE_BUDGET = Duration.ofMinutes(30); - - // The time a system application node can spend after suspending foor OS upgrade until the upgrade is completed. - // Nodes exceeding this budget are counted as failures. - private static final Duration OS_UPGRADE_BUDGET = Duration.ofMinutes(30); - private final Metric metric; private final Clock clock; @@ -113,16 +102,8 @@ public class MetricsReporter extends Maintainer { } private void reportChangeDurations() { - var platformChangeDurations = platformChangeDurations(); - var osChangeDurations = osChangeDurations(); - var nodesFailingSystemUpgrade = platformChangeDurations.values().stream() - .filter(duration -> duration.compareTo(PLATFORM_UPGRADE_BUDGET) > 0) - .count(); - var nodesFailingOsUpgrade = osChangeDurations.values().stream() - .filter(duration -> duration.compareTo(OS_UPGRADE_BUDGET) > 0) - .count(); - metric.set(NODES_FAILING_PLATFORM_UPGRADE, nodesFailingSystemUpgrade, metric.createContext(Map.of())); - metric.set(NODES_FAILING_OS_UPGRADE, nodesFailingOsUpgrade, metric.createContext(Map.of())); + Map<NodeVersion, Duration> platformChangeDurations = platformChangeDurations(); + Map<NodeVersion, Duration> osChangeDurations = osChangeDurations(); platformChangeDurations.forEach((nodeVersion, duration) -> { metric.set(PLATFORM_CHANGE_DURATION, duration.toSeconds(), metric.createContext(dimensions(nodeVersion))); }); diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java index e2482fc9174..df5c3bcbc3c 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java @@ -240,26 +240,27 @@ public class MetricsReporterTest { var version0 = Version.fromString("7.0"); tester.upgradeSystem(version0); reporter.maintain(); - assertEquals(0, getNodesFailingUpgrade()); + var hosts = tester.configServer().nodeRepository().list(zone, SystemApplication.configServer.id()); + assertPlatformChangeDuration(Duration.ZERO, hosts); for (var version : List.of(Version.fromString("7.1"), Version.fromString("7.2"))) { // System starts upgrading to next version tester.upgradeController(version); reporter.maintain(); - assertEquals(0, getNodesFailingUpgrade()); + assertPlatformChangeDuration(Duration.ZERO, hosts); systemUpgrader.maintain(); // 30 minutes pass and nothing happens tester.clock().advance(Duration.ofMinutes(30)); runAll(tester::computeVersionStatus, reporter); - assertEquals(0, getNodesFailingUpgrade()); + assertPlatformChangeDuration(Duration.ZERO, hosts); // 1/3 nodes upgrade within timeout - var hosts = tester.configServer().nodeRepository().list(zone, SystemApplication.configServer.id()); - assertEquals("Wanted version is raised for all nodes", version, hosts.stream() - .map(Node::wantedVersion) - .min(Comparator.naturalOrder()) - .get()); + assertEquals("Wanted version is raised for all nodes", version, + getNodes(zone, hosts, tester).stream() + .map(Node::wantedVersion) + .min(Comparator.naturalOrder()) + .get()); suspend(hosts, zone, tester); var firstHost = hosts.get(0); upgradeTo(version, List.of(firstHost), zone, tester); @@ -267,21 +268,13 @@ public class MetricsReporterTest { // 2/3 spend their budget and are reported as failures tester.clock().advance(Duration.ofHours(1)); runAll(tester::computeVersionStatus, reporter); - assertEquals(2, getNodesFailingUpgrade()); - assertEquals("Change duration is reset for completed upgrade", Duration.ZERO, - getChangeDuration(MetricsReporter.PLATFORM_CHANGE_DURATION, firstHost.hostname())); - for (var host : hosts.subList(1, hosts.size())) { - assertEquals(Duration.ofHours(1), getChangeDuration(MetricsReporter.PLATFORM_CHANGE_DURATION, - host.hostname())); - } + assertPlatformChangeDuration(Duration.ZERO, List.of(firstHost)); + assertPlatformChangeDuration(Duration.ofHours(1), hosts.subList(1, hosts.size())); // Remaining nodes eventually upgrade - upgradeTo(version, hosts.subList(1, 3), zone, tester); + upgradeTo(version, hosts.subList(1, hosts.size()), zone, tester); runAll(tester::computeVersionStatus, reporter); - assertEquals(0, getNodesFailingUpgrade()); - for (var host : hosts) { - assertEquals(Duration.ZERO, getChangeDuration(MetricsReporter.PLATFORM_CHANGE_DURATION, host.hostname())); - } + assertPlatformChangeDuration(Duration.ZERO, hosts); assertEquals(version, tester.controller().systemVersion()); } } @@ -306,18 +299,19 @@ public class MetricsReporterTest { tester.configServer().setOsVersion(version0, SystemApplication.tenantHost.id(), zone); tester.configServer().setOsVersion(version0, SystemApplication.configServerHost.id(), zone); runAll(statusUpdater, reporter); - assertEquals(0, getNodesFailingOsUpgrade()); + List<Node> hosts = tester.configServer().nodeRepository().list(zone); + assertOsChangeDuration(Duration.ZERO, hosts); for (var version : List.of(Version.fromString("8.1"), Version.fromString("8.2"))) { // System starts upgrading to next OS version tester.controller().upgradeOsIn(cloud, version, false); runAll(osUpgrader, statusUpdater, reporter); - assertEquals(0, getNodesFailingOsUpgrade()); + assertOsChangeDuration(Duration.ZERO, hosts); // Over 30 minutes pass and nothing happens tester.clock().advance(Duration.ofMinutes(30).plus(Duration.ofSeconds(1))); runAll(statusUpdater, reporter); - assertEquals(0, getNodesFailingOsUpgrade()); + assertOsChangeDuration(Duration.ZERO, hosts); // Nodes are told to upgrade, but do not suspend yet assertEquals("Wanted OS version is raised for all nodes", version, @@ -330,52 +324,40 @@ public class MetricsReporterTest { // Another 30 minutes pass tester.clock().advance(Duration.ofMinutes(30)); runAll(statusUpdater, reporter); - assertEquals(0, getNodesFailingOsUpgrade()); + assertOsChangeDuration(Duration.ZERO, hosts); // 3/6 hosts suspend - var hosts = tester.configServer().nodeRepository().list(zone); var suspendedHosts = hosts.subList(0, 3); suspend(suspendedHosts, zone, tester); runAll(statusUpdater, reporter); - assertEquals(0, getNodesFailingOsUpgrade()); + assertOsChangeDuration(Duration.ZERO, hosts); - // Two hosts upgrade after 20 minutes + // Two hosts spend 20 minutes upgrading var hostsUpgraded = suspendedHosts.subList(0, 2); tester.clock().advance(Duration.ofMinutes(20)); runAll(statusUpdater, reporter); - for (var host : suspendedHosts) { - assertEquals(Duration.ofMinutes(20), getChangeDuration(MetricsReporter.OS_CHANGE_DURATION, host.hostname())); - } + assertOsChangeDuration(Duration.ofMinutes(20), hostsUpgraded); upgradeOsTo(version, hostsUpgraded, zone, tester); runAll(statusUpdater, reporter); - assertEquals(0, getNodesFailingOsUpgrade()); - for (var host : hostsUpgraded) { - assertEquals(Duration.ZERO, getChangeDuration(MetricsReporter.OS_CHANGE_DURATION, host.hostname())); - } + assertOsChangeDuration(Duration.ZERO, hostsUpgraded); // One host consumes budget without upgrading var brokenHost = suspendedHosts.get(2); tester.clock().advance(Duration.ofMinutes(15)); runAll(statusUpdater, reporter); - assertEquals(1, getNodesFailingOsUpgrade()); - assertEquals(Duration.ofMinutes(35), getChangeDuration(MetricsReporter.OS_CHANGE_DURATION, - brokenHost.hostname())); + assertOsChangeDuration(Duration.ofMinutes(35), List.of(brokenHost)); // Host eventually upgrades and is no longer reported upgradeOsTo(version, List.of(brokenHost), zone, tester); runAll(statusUpdater, reporter); - assertEquals(0, getNodesFailingOsUpgrade()); - assertEquals(Duration.ZERO, getChangeDuration(MetricsReporter.OS_CHANGE_DURATION, brokenHost.hostname())); + assertOsChangeDuration(Duration.ZERO, List.of(brokenHost)); // Remaining hosts suspend and upgrade successfully var remainingHosts = hosts.subList(3, hosts.size()); suspend(remainingHosts, zone, tester); upgradeOsTo(version, remainingHosts, zone, tester); runAll(statusUpdater, reporter); - assertEquals(0, getNodesFailingOsUpgrade()); - for (var host : hosts) { - assertEquals(Duration.ZERO, getChangeDuration(MetricsReporter.OS_CHANGE_DURATION, host.hostname())); - } + assertOsChangeDuration(Duration.ZERO, hosts); } } @@ -404,11 +386,15 @@ public class MetricsReporterTest { zone, tester); } + private List<Node> getNodes(ZoneId zone, List<Node> nodes, ControllerTester tester) { + return tester.configServer().nodeRepository().list(zone, nodes.stream() + .map(Node::hostname) + .collect(Collectors.toList())); + } + private void updateNodes(List<Node> nodes, UnaryOperator<Node.Builder> builderOps, ZoneId zone, ControllerTester tester) { - var currentNodes = tester.configServer().nodeRepository().list(zone, nodes.stream() - .map(Node::hostname) - .collect(Collectors.toList())); + var currentNodes = getNodes(zone, nodes, tester); var updatedNodes = currentNodes.stream() .map(node -> builderOps.apply(new Node.Builder(node)).build()) .collect(Collectors.toList()); @@ -427,18 +413,22 @@ public class MetricsReporterTest { return getMetric(MetricsReporter.DEPLOYMENT_WARNINGS, id).intValue(); } - private int getNodesFailingUpgrade() { - return metrics.getMetric(MetricsReporter.NODES_FAILING_PLATFORM_UPGRADE).intValue(); - } - private Duration getChangeDuration(String metric, HostName hostname) { return metrics.getMetric((dimensions) -> hostname.value().equals(dimensions.get("host")), metric) .map(n -> Duration.ofSeconds(n.longValue())) .orElseThrow(() -> new IllegalArgumentException("Expected to find metric for " + hostname)); } - private int getNodesFailingOsUpgrade() { - return metrics.getMetric(MetricsReporter.NODES_FAILING_OS_UPGRADE).intValue(); + private void assertPlatformChangeDuration(Duration duration, List<Node> nodes) { + for (var node : nodes) { + assertEquals(duration, getChangeDuration(MetricsReporter.PLATFORM_CHANGE_DURATION, node.hostname())); + } + } + + private void assertOsChangeDuration(Duration duration, List<Node> nodes) { + for (var node : nodes) { + assertEquals(duration, getChangeDuration(MetricsReporter.OS_CHANGE_DURATION, node.hostname())); + } } private Number getMetric(String name, ApplicationId id) { |