diff options
author | Geir Storli <geirst@verizonmedia.com> | 2021-02-08 12:48:26 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-02-08 12:48:26 +0100 |
commit | 4e5d6c8f7ebbf7863726fad05e62c5e69d87ea1d (patch) | |
tree | 92f005b6e7d326b3d893945bd9c4b460d61ffb9e /clustercontroller-core | |
parent | 13f55bd6fbae31fcb1c93f5006de081e22cfc49e (diff) | |
parent | 69d91455d41b70d9a417c491a468a5e8ee426fd0 (diff) |
Merge pull request #16424 from vespa-engine/geirst/resource-usage-metrics-in-cluster-controller
Add and expose resource usage metrics from the cluster controller.
Diffstat (limited to 'clustercontroller-core')
7 files changed, 226 insertions, 30 deletions
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java index e238303b58b..60b14e86f50 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java @@ -383,7 +383,8 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd verifyInControllerThread(); ClusterState baselineState = stateBundle.getBaselineClusterState(); newStates.add(stateBundle); - metricUpdater.updateClusterStateMetrics(cluster, baselineState); + metricUpdater.updateClusterStateMetrics(cluster, baselineState, + ResourceUsageStats.calculateFrom(cluster.getNodeInfo(), options.clusterFeedBlockLimit, stateBundle.getFeedBlock())); lastMetricUpdateCycleCount = cycleCount; systemStateBroadcaster.handleNewClusterStates(stateBundle); // Iff master, always store new version in ZooKeeper _before_ publishing to any @@ -399,7 +400,8 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd if (cycleCount > 300 + lastMetricUpdateCycleCount) { ClusterStateBundle stateBundle = stateVersionTracker.getVersionedClusterStateBundle(); ClusterState baselineState = stateBundle.getBaselineClusterState(); - metricUpdater.updateClusterStateMetrics(cluster, baselineState); + metricUpdater.updateClusterStateMetrics(cluster, baselineState, + ResourceUsageStats.calculateFrom(cluster.getNodeInfo(), options.clusterFeedBlockLimit, stateBundle.getFeedBlock())); lastMetricUpdateCycleCount = cycleCount; return true; } else { diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/MetricUpdater.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/MetricUpdater.java index 650f7756bf9..40b10fe8145 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/MetricUpdater.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/MetricUpdater.java @@ -32,7 +32,7 @@ public class MetricUpdater { + nodeCounts.getOrDefault(State.MAINTENANCE, 0); } - public void updateClusterStateMetrics(ContentCluster cluster, ClusterState state) { + public void updateClusterStateMetrics(ContentCluster cluster, ClusterState state, ResourceUsageStats resourceUsage) { Map<String, String> dimensions = new HashMap<>(); dimensions.put("cluster", cluster.getName()); for (NodeType type : NodeType.getTypes()) { @@ -59,6 +59,10 @@ public class MetricUpdater { dimensions.remove("node-type"); MetricReporter.Context context = createContext(dimensions); metricReporter.add("cluster-state-change", 1, context); + + metricReporter.set("resource_usage.max_disk_utilization", resourceUsage.getMaxDiskUtilization(), context); + metricReporter.set("resource_usage.max_memory_utilization", resourceUsage.getMaxMemoryUtilization(), context); + metricReporter.set("resource_usage.nodes_above_limit", resourceUsage.getNodesAboveLimit(), context); } public void updateMasterElectionMetrics(Map<Integer, Integer> data) { diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ResourceUsageStats.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ResourceUsageStats.java new file mode 100644 index 00000000000..aef5b1be468 --- /dev/null +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ResourceUsageStats.java @@ -0,0 +1,89 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.clustercontroller.core; + +import com.yahoo.vespa.clustercontroller.core.hostinfo.ContentNode; + +import java.util.Collection; +import java.util.Map; +import java.util.Optional; + +/** + * Represents resource usage stats for the cluster that are exposed as metrics. + */ +public class ResourceUsageStats { + + // Max disk utilization (usage / limit) among all content nodes. + private final double maxDiskUtilization; + + // Max memory utilization (usage / limit) among all content nodes. + private final double maxMemoryUtilization; + + // The number of content nodes that are above at least one resource limit. + // When this is above zero feed is blocked in the cluster. + private final int nodesAboveLimit; + + private static final String diskResource = "disk"; + private static final String memoryResource = "memory"; + + public ResourceUsageStats() { + this.maxDiskUtilization = 0.0; + this.maxMemoryUtilization = 0.0; + this.nodesAboveLimit = 0; + } + + public ResourceUsageStats(double maxDiskUtilization, + double maxMemoryUtilization, + int nodesAboveLimit) { + this.maxDiskUtilization = maxDiskUtilization; + this.maxMemoryUtilization = maxMemoryUtilization; + this.nodesAboveLimit = nodesAboveLimit; + } + + public double getMaxDiskUtilization() { + return maxDiskUtilization; + } + + public double getMaxMemoryUtilization() { + return maxMemoryUtilization; + } + + public int getNodesAboveLimit() { + return nodesAboveLimit; + } + + public static ResourceUsageStats calculateFrom(Collection<NodeInfo> nodeInfos, + Map<String, Double> feedBlockLimits, + Optional<ClusterStateBundle.FeedBlock> feedBlock) { + double maxDiskUsage = 0.0; + double maxMemoryUsage = 0.0; + for (NodeInfo info : nodeInfos) { + if (info.isStorage()) { + var node = info.getHostInfo().getContentNode(); + maxDiskUsage = Double.max(maxDiskUsage, resourceUsageOf(diskResource, node)); + maxMemoryUsage = Double.max(maxMemoryUsage, resourceUsageOf(memoryResource, node)); + } + } + return new ResourceUsageStats(maxDiskUsage / limitOf(diskResource, feedBlockLimits), + maxMemoryUsage / limitOf(memoryResource, feedBlockLimits), + calculateNodesAboveLimit(feedBlock)); + } + + private static double resourceUsageOf(String type, ContentNode node) { + var result = node.resourceUsageOf(type); + return result.isPresent() ? result.get().getUsage() : 0.0; + } + + private static int calculateNodesAboveLimit(Optional<ClusterStateBundle.FeedBlock> feedBlock) { + if (!feedBlock.isPresent()) { + return 0; + } + var exhaustions = feedBlock.get().getConcreteExhaustions(); + return (int) exhaustions.stream().map(resource -> resource.node).distinct().count(); + } + + private static double limitOf(String type, Map<String, Double> limits) { + var result = limits.get(type); + return (result != null) ? result : 1.0; + } +} + diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/FeedBlockUtil.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/FeedBlockUtil.java index 650e4dc7888..2254435e629 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/FeedBlockUtil.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/FeedBlockUtil.java @@ -3,6 +3,7 @@ package com.yahoo.vespa.clustercontroller.core; import com.yahoo.vdslib.state.Node; import com.yahoo.vdslib.state.NodeType; +import com.yahoo.vespa.clustercontroller.core.hostinfo.HostInfo; import com.yahoo.vespa.clustercontroller.core.hostinfo.ResourceUsage; import java.util.Arrays; @@ -12,6 +13,8 @@ import java.util.Objects; import java.util.Set; import java.util.stream.Collectors; +import static com.yahoo.vespa.clustercontroller.core.ClusterFixture.storageNode; + public class FeedBlockUtil { static class NodeAndUsages { @@ -90,5 +93,22 @@ public class FeedBlockUtil { return Arrays.stream(exhaustions).collect(Collectors.toCollection(LinkedHashSet::new)); } + static ClusterFixture createFixtureWithReportedUsages(NodeAndUsages... nodeAndUsages) { + var highestIndex = Arrays.stream(nodeAndUsages).mapToInt(u -> u.index).max(); + if (highestIndex.isEmpty()) { + throw new IllegalArgumentException("Can't have an empty cluster"); + } + var cf = ClusterFixture + .forFlatCluster(highestIndex.getAsInt() + 1) + .assignDummyRpcAddresses() + .bringEntireClusterUp(); + for (var nu : nodeAndUsages) { + cf.cluster().getNodeInfo(storageNode(nu.index)) + .setHostInfo(HostInfo.createHostInfo(createResourceUsageJson(nu.usages))); + } + return cf; + } + + } diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/MetricReporterTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/MetricReporterTest.java index cf50c106b2e..68b5bf103a6 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/MetricReporterTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/MetricReporterTest.java @@ -39,20 +39,26 @@ public class MetricReporterTest { } } - private static HasMetricContext.Dimension[] withNodeTypeDimension(String type) { + private static HasMetricContext.Dimension[] withClusterDimension() { // Dimensions that are always present HasMetricContext.Dimension controllerDim = withDimension("controller-index", "0"); HasMetricContext.Dimension clusterDim = withDimension("cluster", "foo"); + return new HasMetricContext.Dimension[] { controllerDim, clusterDim }; + } + + private static HasMetricContext.Dimension[] withNodeTypeDimension(String type) { // Node type-specific dimension HasMetricContext.Dimension nodeType = withDimension("node-type", type); - return new HasMetricContext.Dimension[] { controllerDim, clusterDim, nodeType }; + var otherDims = withClusterDimension(); + return new HasMetricContext.Dimension[] { otherDims[0], otherDims[1], nodeType }; } @Test public void metrics_are_emitted_for_different_node_state_counts() { Fixture f = new Fixture(); f.metricUpdater.updateClusterStateMetrics(f.clusterFixture.cluster(), - ClusterState.stateFromString("distributor:10 .1.s:d storage:9 .1.s:d .2.s:m .4.s:d")); + ClusterState.stateFromString("distributor:10 .1.s:d storage:9 .1.s:d .2.s:m .4.s:d"), + new ResourceUsageStats()); verify(f.mockReporter).set(eq("cluster-controller.up.count"), eq(9), argThat(hasMetricContext(withNodeTypeDimension("distributor")))); @@ -68,7 +74,8 @@ public class MetricReporterTest { private void doTestRatiosInState(String clusterState, double distributorRatio, double storageRatio) { Fixture f = new Fixture(); - f.metricUpdater.updateClusterStateMetrics(f.clusterFixture.cluster(), ClusterState.stateFromString(clusterState)); + f.metricUpdater.updateClusterStateMetrics(f.clusterFixture.cluster(), ClusterState.stateFromString(clusterState), + new ResourceUsageStats()); verify(f.mockReporter).set(eq("cluster-controller.available-nodes.ratio"), doubleThat(closeTo(distributorRatio, 0.0001)), @@ -100,4 +107,24 @@ public class MetricReporterTest { doTestRatiosInState("distributor:10 storage:10 .0.s:m", 1.0, 1.0); } + @Test + public void metrics_are_emitted_for_resource_usage() { + Fixture f = new Fixture(); + f.metricUpdater.updateClusterStateMetrics(f.clusterFixture.cluster(), + ClusterState.stateFromString("distributor:10 storage:10"), + new ResourceUsageStats(0.5, 0.6, 5)); + + verify(f.mockReporter).set(eq("cluster-controller.resource_usage.max_disk_utilization"), + doubleThat(closeTo(0.5, 0.0001)), + argThat(hasMetricContext(withClusterDimension()))); + + verify(f.mockReporter).set(eq("cluster-controller.resource_usage.max_memory_utilization"), + doubleThat(closeTo(0.6, 0.0001)), + argThat(hasMetricContext(withClusterDimension()))); + + verify(f.mockReporter).set(eq("cluster-controller.resource_usage.nodes_above_limit"), + eq(5), + argThat(hasMetricContext(withClusterDimension()))); + } + } diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ResourceExhaustionCalculatorTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ResourceExhaustionCalculatorTest.java index 54686919a7b..f5f7b4676d8 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ResourceExhaustionCalculatorTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ResourceExhaustionCalculatorTest.java @@ -1,19 +1,13 @@ // Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.clustercontroller.core; -import com.yahoo.vespa.clustercontroller.core.hostinfo.HostInfo; import org.junit.Test; -import java.util.Arrays; - import static com.yahoo.vespa.clustercontroller.core.ClusterFixture.storageNode; -import static com.yahoo.vespa.clustercontroller.core.FeedBlockUtil.NodeAndUsages; +import static com.yahoo.vespa.clustercontroller.core.FeedBlockUtil.createFixtureWithReportedUsages; import static com.yahoo.vespa.clustercontroller.core.FeedBlockUtil.forNode; import static com.yahoo.vespa.clustercontroller.core.FeedBlockUtil.mapOf; -import static com.yahoo.vespa.clustercontroller.core.FeedBlockUtil.setOf; import static com.yahoo.vespa.clustercontroller.core.FeedBlockUtil.usage; -import static com.yahoo.vespa.clustercontroller.core.FeedBlockUtil.createResourceUsageJson; - import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNull; @@ -21,22 +15,6 @@ import static org.junit.Assert.assertTrue; public class ResourceExhaustionCalculatorTest { - private static ClusterFixture createFixtureWithReportedUsages(NodeAndUsages... nodeAndUsages) { - var highestIndex = Arrays.stream(nodeAndUsages).mapToInt(u -> u.index).max(); - if (highestIndex.isEmpty()) { - throw new IllegalArgumentException("Can't have an empty cluster"); - } - var cf = ClusterFixture - .forFlatCluster(highestIndex.getAsInt() + 1) - .assignDummyRpcAddresses() - .bringEntireClusterUp(); - for (var nu : nodeAndUsages) { - cf.cluster().getNodeInfo(storageNode(nu.index)) - .setHostInfo(HostInfo.createHostInfo(createResourceUsageJson(nu.usages))); - } - return cf; - } - @Test public void no_feed_block_returned_when_no_resources_lower_than_limit() { var calc = new ResourceExhaustionCalculator(true, mapOf(usage("disk", 0.5), usage("memory", 0.8))); diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ResourceUsageStatsTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ResourceUsageStatsTest.java new file mode 100644 index 00000000000..9eeb36265e0 --- /dev/null +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ResourceUsageStatsTest.java @@ -0,0 +1,76 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.clustercontroller.core; + +import org.junit.Test; + +import java.util.Collection; +import java.util.Collections; +import java.util.Map; +import java.util.Optional; + +import static com.yahoo.vespa.clustercontroller.core.FeedBlockUtil.createFixtureWithReportedUsages; +import static com.yahoo.vespa.clustercontroller.core.FeedBlockUtil.exhaustion; +import static com.yahoo.vespa.clustercontroller.core.FeedBlockUtil.forNode; +import static com.yahoo.vespa.clustercontroller.core.FeedBlockUtil.setOf; +import static com.yahoo.vespa.clustercontroller.core.FeedBlockUtil.usage; + +import static org.junit.Assert.assertEquals; + +public class ResourceUsageStatsTest { + + private final double DELTA = 0.00001; + + @Test + public void disk_and_memory_utilization_is_max_among_all_content_nodes() { + var stats = ResourceUsageStats.calculateFrom(createNodeInfo( + forNode(1, usage("disk", 0.3), usage("memory", 0.6)), + forNode(2, usage("disk", 0.4), usage("memory", 0.5))), + createFeedBlockLimits(0.8, 0.9), + Optional.empty()); + assertEquals(0.4 / 0.8, stats.getMaxDiskUtilization(), DELTA); + assertEquals(0.6 / 0.9, stats.getMaxMemoryUtilization(), DELTA); + } + + @Test + public void disk_and_memory_utilization_is_zero_if_no_samples_are_available() { + var stats = ResourceUsageStats.calculateFrom(createNodeInfo( + forNode(1), forNode(2)), + createFeedBlockLimits(0.8, 0.9), + Optional.empty()); + assertEquals(0.0, stats.getMaxDiskUtilization(), DELTA); + assertEquals(0.0, stats.getMaxMemoryUtilization(), DELTA); + } + + @Test + public void nodes_above_limit_is_zero_without_feed_block_status() { + var stats = ResourceUsageStats.calculateFrom(Collections.emptyList(), Collections.emptyMap(), Optional.empty()); + assertEquals(0, stats.getNodesAboveLimit()); + } + + @Test + public void nodes_above_limit_is_equal_to_node_resource_exhaustions() { + var stats = ResourceUsageStats.calculateFrom(Collections.emptyList(), Collections.emptyMap(), + createFeedBlock(exhaustion(1, "disk"), exhaustion(2, "memory"))); + assertEquals(2, stats.getNodesAboveLimit()); + } + + @Test + public void nodes_above_limit_counts_each_node_only_once() { + var stats = ResourceUsageStats.calculateFrom(Collections.emptyList(), Collections.emptyMap(), + createFeedBlock(exhaustion(1, "disk"), exhaustion(1, "memory"))); + assertEquals(1, stats.getNodesAboveLimit()); + } + + private static Collection<NodeInfo> createNodeInfo(FeedBlockUtil.NodeAndUsages... nodeAndUsages) { + return createFixtureWithReportedUsages(nodeAndUsages).cluster().getNodeInfo(); + } + + private static Map<String, Double> createFeedBlockLimits(double diskLimit, double memoryLimit) { + return Map.of("disk", diskLimit, "memory", memoryLimit); + } + + private static Optional<ClusterStateBundle.FeedBlock> createFeedBlock(NodeResourceExhaustion... exhaustions) { + return Optional.of(new ClusterStateBundle.FeedBlock(true, "", setOf(exhaustions))); + } +} + |