summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Geir Storli <geirst@verizonmedia.com> 2021-02-05 16:27:08 +0000
committer Geir Storli <geirst@verizonmedia.com> 2021-02-05 16:27:08 +0000
commit 6a6d1dd303c60bc96b7020ff2c6b2334dea540d5 (patch)
tree b9db7d07cc77c89a44d55ac4402cdad6b7e55f90
parent 1691b77f23c5cceb9141403a79441d78c3de797d (diff)
Add and expose resource usage metrics from the cluster controller.
-rw-r--r-- clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java 6
-rw-r--r-- clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/MetricUpdater.java 6
-rw-r--r-- clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ResourceUsageStats.java 82
-rw-r--r-- clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/FeedBlockUtil.java 20
-rw-r--r-- clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/MetricReporterTest.java 35
-rw-r--r-- clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ResourceExhaustionCalculatorTest.java 24
-rw-r--r-- clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ResourceUsageStatsTest.java 69
7 files changed, 212 insertions, 30 deletions
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java
index e238303b58b..60b14e86f50 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java
@@ -383,7 +383,8 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd
verifyInControllerThread();
ClusterState baselineState = stateBundle.getBaselineClusterState();
newStates.add(stateBundle);
- metricUpdater.updateClusterStateMetrics(cluster, baselineState);
+ metricUpdater.updateClusterStateMetrics(cluster, baselineState,
+ ResourceUsageStats.calculateFrom(cluster.getNodeInfo(), options.clusterFeedBlockLimit, stateBundle.getFeedBlock()));
lastMetricUpdateCycleCount = cycleCount;
systemStateBroadcaster.handleNewClusterStates(stateBundle);
// Iff master, always store new version in ZooKeeper _before_ publishing to any
@@ -399,7 +400,8 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd
if (cycleCount > 300 + lastMetricUpdateCycleCount) {
ClusterStateBundle stateBundle = stateVersionTracker.getVersionedClusterStateBundle();
ClusterState baselineState = stateBundle.getBaselineClusterState();
- metricUpdater.updateClusterStateMetrics(cluster, baselineState);
+ metricUpdater.updateClusterStateMetrics(cluster, baselineState,
+ ResourceUsageStats.calculateFrom(cluster.getNodeInfo(), options.clusterFeedBlockLimit, stateBundle.getFeedBlock()));
lastMetricUpdateCycleCount = cycleCount;
return true;
} else {
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/MetricUpdater.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/MetricUpdater.java
index 650f7756bf9..40b10fe8145 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/MetricUpdater.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/MetricUpdater.java
@@ -32,7 +32,7 @@ public class MetricUpdater {
+ nodeCounts.getOrDefault(State.MAINTENANCE, 0);
}
- public void updateClusterStateMetrics(ContentCluster cluster, ClusterState state) {
+ public void updateClusterStateMetrics(ContentCluster cluster, ClusterState state, ResourceUsageStats resourceUsage) {
Map<String, String> dimensions = new HashMap<>();
dimensions.put("cluster", cluster.getName());
for (NodeType type : NodeType.getTypes()) {
@@ -59,6 +59,10 @@ public class MetricUpdater {
dimensions.remove("node-type");
MetricReporter.Context context = createContext(dimensions);
metricReporter.add("cluster-state-change", 1, context);
+
+ metricReporter.set("resource_usage.max_disk_utilization", resourceUsage.getMaxDiskUtilization(), context);
+ metricReporter.set("resource_usage.max_memory_utilization", resourceUsage.getMaxMemoryUtilization(), context);
+ metricReporter.set("resource_usage.nodes_above_limit", resourceUsage.getNodesAboveLimit(), context);
}
public void updateMasterElectionMetrics(Map<Integer, Integer> data) {
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ResourceUsageStats.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ResourceUsageStats.java
new file mode 100644
index 00000000000..c02397cb043
--- /dev/null
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ResourceUsageStats.java
@@ -0,0 +1,82 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.clustercontroller.core;
+
+import com.yahoo.vespa.clustercontroller.core.hostinfo.ContentNode;
+
+import java.util.Collection;
+import java.util.Map;
+import java.util.Optional;
+
+/**
+ * Immutable holder of the cluster-wide resource usage stats (max disk utilization, max memory utilization and nodes above limit) that are exposed as metrics.
+ */
+public class ResourceUsageStats {
+
+ // Max disk utilization (usage / limit) among all content nodes; calculateFrom() only samples nodes for which isStorage() is true.
+ private final double maxDiskUtilization;
+
+ // Max memory utilization (usage / limit) among all content nodes; calculateFrom() only samples nodes for which isStorage() is true.
+ private final double maxMemoryUtilization;
+
+ // The number of content nodes that are above at least one resource limit. NOTE(review): calculateFrom() uses the size of the feed block's concrete exhaustions, so a node exhausting several resources is presumably counted once per resource - confirm.
+ // When this is above zero feed is blocked in the cluster.
+ private final int nodesAboveLimit;
+
+ private static final String diskResource = "disk"; // Key used both for the node's reported usage and for the feed block limit.
+ private static final String memoryResource = "memory"; // Key used both for the node's reported usage and for the feed block limit.
+
+ public ResourceUsageStats() { // Creates stats with zero utilization and zero nodes above limit.
+ this.maxDiskUtilization = 0.0;
+ this.maxMemoryUtilization = 0.0;
+ this.nodesAboveLimit = 0;
+ }
+
+ public ResourceUsageStats(double maxDiskUtilization, // Creates stats from already computed values; nothing is validated or normalized here.
+ double maxMemoryUtilization,
+ int nodesAboveLimit) {
+ this.maxDiskUtilization = maxDiskUtilization;
+ this.maxMemoryUtilization = maxMemoryUtilization;
+ this.nodesAboveLimit = nodesAboveLimit;
+ }
+
+ public double getMaxDiskUtilization() {
+ return maxDiskUtilization;
+ }
+
+ public double getMaxMemoryUtilization() {
+ return maxMemoryUtilization;
+ }
+
+ public int getNodesAboveLimit() {
+ return nodesAboveLimit;
+ }
+
+ public static ResourceUsageStats calculateFrom(Collection<NodeInfo> nodeInfos, // Computes the stats from the nodes' reported host info, the configured feed block limits and the current feed block (if any).
+ Map<String, Double> feedBlockLimits,
+ Optional<ClusterStateBundle.FeedBlock> feedBlock) {
+ double maxDiskUsage = 0.0; // Starting at 0.0 means an empty node set (or no reported samples) yields zero utilization.
+ double maxMemoryUsage = 0.0;
+ for (NodeInfo info : nodeInfos) {
+ if (info.isStorage()) { // Nodes that are not storage nodes are skipped.
+ var node = info.getHostInfo().getContentNode();
+ maxDiskUsage = Double.max(maxDiskUsage, resourceUsageOf(diskResource, node));
+ maxMemoryUsage = Double.max(maxMemoryUsage, resourceUsageOf(memoryResource, node));
+ }
+ }
+ int nodesAboveLimit = (feedBlock.isPresent() ? feedBlock.get().getConcreteExhaustions().size() : 0); // Without a feed block nothing is reported as above limit.
+ return new ResourceUsageStats(maxDiskUsage / limitOf(diskResource, feedBlockLimits), // Raw max usage is divided by the limit; NOTE(review): a configured limit of 0.0 is not guarded against and would give Infinity or NaN.
+ maxMemoryUsage / limitOf(memoryResource, feedBlockLimits),
+ nodesAboveLimit);
+ }
+
+ private static double resourceUsageOf(String type, ContentNode node) { // Usage the node reported for the given resource type.
+ var result = node.resourceUsageOf(type);
+ return result.isPresent() ? result.get().getUsage() : 0.0; // A node that has not reported this resource contributes 0.0.
+ }
+
+ private static double limitOf(String type, Map<String, Double> limits) { // Feed block limit configured for the given resource type.
+ var result = limits.get(type);
+ return (result != null) ? result : 1.0; // A missing limit falls back to 1.0, so utilization then equals the raw usage.
+ }
+}
+
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/FeedBlockUtil.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/FeedBlockUtil.java
index 650e4dc7888..2254435e629 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/FeedBlockUtil.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/FeedBlockUtil.java
@@ -3,6 +3,7 @@ package com.yahoo.vespa.clustercontroller.core;
import com.yahoo.vdslib.state.Node;
import com.yahoo.vdslib.state.NodeType;
+import com.yahoo.vespa.clustercontroller.core.hostinfo.HostInfo;
import com.yahoo.vespa.clustercontroller.core.hostinfo.ResourceUsage;
import java.util.Arrays;
@@ -12,6 +13,8 @@ import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;
+import static com.yahoo.vespa.clustercontroller.core.ClusterFixture.storageNode;
+
public class FeedBlockUtil {
static class NodeAndUsages {
@@ -90,5 +93,22 @@ public class FeedBlockUtil {
return Arrays.stream(exhaustions).collect(Collectors.toCollection(LinkedHashSet::new));
}
+ static ClusterFixture createFixtureWithReportedUsages(NodeAndUsages... nodeAndUsages) { // Builds a flat cluster fixture where each given storage node has host info created from its given resource usages.
+ var highestIndex = Arrays.stream(nodeAndUsages).mapToInt(u -> u.index).max();
+ if (highestIndex.isEmpty()) { // max() is empty only when no NodeAndUsages were passed in.
+ throw new IllegalArgumentException("Can't have an empty cluster");
+ }
+ var cf = ClusterFixture
+ .forFlatCluster(highestIndex.getAsInt() + 1) // Sized from the highest referenced index; presumably node indexes are zero-based, so this makes every referenced node exist - confirm.
+ .assignDummyRpcAddresses()
+ .bringEntireClusterUp();
+ for (var nu : nodeAndUsages) { // Only the explicitly listed nodes get host info set here.
+ cf.cluster().getNodeInfo(storageNode(nu.index))
+ .setHostInfo(HostInfo.createHostInfo(createResourceUsageJson(nu.usages)));
+ }
+ return cf;
+ }
+
+
}
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/MetricReporterTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/MetricReporterTest.java
index cf50c106b2e..68b5bf103a6 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/MetricReporterTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/MetricReporterTest.java
@@ -39,20 +39,26 @@ public class MetricReporterTest {
}
}
- private static HasMetricContext.Dimension[] withNodeTypeDimension(String type) {
+ private static HasMetricContext.Dimension[] withClusterDimension() {
// Dimensions that are always present
HasMetricContext.Dimension controllerDim = withDimension("controller-index", "0");
HasMetricContext.Dimension clusterDim = withDimension("cluster", "foo");
+ return new HasMetricContext.Dimension[] { controllerDim, clusterDim };
+ }
+
+ private static HasMetricContext.Dimension[] withNodeTypeDimension(String type) {
// Node type-specific dimension
HasMetricContext.Dimension nodeType = withDimension("node-type", type);
- return new HasMetricContext.Dimension[] { controllerDim, clusterDim, nodeType };
+ var otherDims = withClusterDimension();
+ return new HasMetricContext.Dimension[] { otherDims[0], otherDims[1], nodeType };
}
@Test
public void metrics_are_emitted_for_different_node_state_counts() {
Fixture f = new Fixture();
f.metricUpdater.updateClusterStateMetrics(f.clusterFixture.cluster(),
- ClusterState.stateFromString("distributor:10 .1.s:d storage:9 .1.s:d .2.s:m .4.s:d"));
+ ClusterState.stateFromString("distributor:10 .1.s:d storage:9 .1.s:d .2.s:m .4.s:d"),
+ new ResourceUsageStats());
verify(f.mockReporter).set(eq("cluster-controller.up.count"), eq(9),
argThat(hasMetricContext(withNodeTypeDimension("distributor"))));
@@ -68,7 +74,8 @@ public class MetricReporterTest {
private void doTestRatiosInState(String clusterState, double distributorRatio, double storageRatio) {
Fixture f = new Fixture();
- f.metricUpdater.updateClusterStateMetrics(f.clusterFixture.cluster(), ClusterState.stateFromString(clusterState));
+ f.metricUpdater.updateClusterStateMetrics(f.clusterFixture.cluster(), ClusterState.stateFromString(clusterState),
+ new ResourceUsageStats());
verify(f.mockReporter).set(eq("cluster-controller.available-nodes.ratio"),
doubleThat(closeTo(distributorRatio, 0.0001)),
@@ -100,4 +107,24 @@ public class MetricReporterTest {
doTestRatiosInState("distributor:10 storage:10 .0.s:m", 1.0, 1.0);
}
+ @Test
+ public void metrics_are_emitted_for_resource_usage() { // Verifies that the three resource usage values are passed to the reporter via set() with the expected names, values and dimensions.
+ Fixture f = new Fixture();
+ f.metricUpdater.updateClusterStateMetrics(f.clusterFixture.cluster(),
+ ClusterState.stateFromString("distributor:10 storage:10"),
+ new ResourceUsageStats(0.5, 0.6, 5)); // Arguments are (maxDiskUtilization, maxMemoryUtilization, nodesAboveLimit).
+
+ verify(f.mockReporter).set(eq("cluster-controller.resource_usage.max_disk_utilization"), // NOTE(review): MetricUpdater passes the name without the "cluster-controller." prefix; presumably the reporter set up in Fixture adds it - confirm.
+ doubleThat(closeTo(0.5, 0.0001)),
+ argThat(hasMetricContext(withClusterDimension()))); // Expects only the controller-index and cluster dimensions, without node-type.
+
+ verify(f.mockReporter).set(eq("cluster-controller.resource_usage.max_memory_utilization"),
+ doubleThat(closeTo(0.6, 0.0001)),
+ argThat(hasMetricContext(withClusterDimension())));
+
+ verify(f.mockReporter).set(eq("cluster-controller.resource_usage.nodes_above_limit"),
+ eq(5),
+ argThat(hasMetricContext(withClusterDimension())));
+ }
+
}
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ResourceExhaustionCalculatorTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ResourceExhaustionCalculatorTest.java
index 54686919a7b..f5f7b4676d8 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ResourceExhaustionCalculatorTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ResourceExhaustionCalculatorTest.java
@@ -1,19 +1,13 @@
// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.clustercontroller.core;
-import com.yahoo.vespa.clustercontroller.core.hostinfo.HostInfo;
import org.junit.Test;
-import java.util.Arrays;
-
import static com.yahoo.vespa.clustercontroller.core.ClusterFixture.storageNode;
-import static com.yahoo.vespa.clustercontroller.core.FeedBlockUtil.NodeAndUsages;
+import static com.yahoo.vespa.clustercontroller.core.FeedBlockUtil.createFixtureWithReportedUsages;
import static com.yahoo.vespa.clustercontroller.core.FeedBlockUtil.forNode;
import static com.yahoo.vespa.clustercontroller.core.FeedBlockUtil.mapOf;
-import static com.yahoo.vespa.clustercontroller.core.FeedBlockUtil.setOf;
import static com.yahoo.vespa.clustercontroller.core.FeedBlockUtil.usage;
-import static com.yahoo.vespa.clustercontroller.core.FeedBlockUtil.createResourceUsageJson;
-
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
@@ -21,22 +15,6 @@ import static org.junit.Assert.assertTrue;
public class ResourceExhaustionCalculatorTest {
- private static ClusterFixture createFixtureWithReportedUsages(NodeAndUsages... nodeAndUsages) {
- var highestIndex = Arrays.stream(nodeAndUsages).mapToInt(u -> u.index).max();
- if (highestIndex.isEmpty()) {
- throw new IllegalArgumentException("Can't have an empty cluster");
- }
- var cf = ClusterFixture
- .forFlatCluster(highestIndex.getAsInt() + 1)
- .assignDummyRpcAddresses()
- .bringEntireClusterUp();
- for (var nu : nodeAndUsages) {
- cf.cluster().getNodeInfo(storageNode(nu.index))
- .setHostInfo(HostInfo.createHostInfo(createResourceUsageJson(nu.usages)));
- }
- return cf;
- }
-
@Test
public void no_feed_block_returned_when_no_resources_lower_than_limit() {
var calc = new ResourceExhaustionCalculator(true, mapOf(usage("disk", 0.5), usage("memory", 0.8)));
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ResourceUsageStatsTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ResourceUsageStatsTest.java
new file mode 100644
index 00000000000..0f5d27d46e3
--- /dev/null
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ResourceUsageStatsTest.java
@@ -0,0 +1,69 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.clustercontroller.core;
+
+import org.junit.Test;
+
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Map;
+import java.util.Optional;
+
+import static com.yahoo.vespa.clustercontroller.core.FeedBlockUtil.createFixtureWithReportedUsages;
+import static com.yahoo.vespa.clustercontroller.core.FeedBlockUtil.exhaustion;
+import static com.yahoo.vespa.clustercontroller.core.FeedBlockUtil.forNode;
+import static com.yahoo.vespa.clustercontroller.core.FeedBlockUtil.setOf;
+import static com.yahoo.vespa.clustercontroller.core.FeedBlockUtil.usage;
+
+import static org.junit.Assert.assertEquals;
+
+public class ResourceUsageStatsTest { // Unit tests for ResourceUsageStats.calculateFrom().
+
+ private final double DELTA = 0.00001; // Tolerance used when comparing double values with assertEquals.
+
+ @Test
+ public void disk_and_memory_utilization_is_max_among_all_content_nodes() {
+ var stats = ResourceUsageStats.calculateFrom(createNodeInfo(
+ forNode(1, usage("disk", 0.3), usage("memory", 0.6)),
+ forNode(2, usage("disk", 0.4), usage("memory", 0.5))),
+ createFeedBlockLimits(0.8, 0.9),
+ Optional.empty());
+ assertEquals(0.4 / 0.8, stats.getMaxDiskUtilization(), DELTA); // Highest disk usage (0.4, node 2) divided by the disk limit (0.8).
+ assertEquals(0.6 / 0.9, stats.getMaxMemoryUtilization(), DELTA); // Highest memory usage (0.6, node 1) divided by the memory limit (0.9).
+ }
+
+ @Test
+ public void disk_and_memory_utilization_is_zero_if_no_samples_are_available() {
+ var stats = ResourceUsageStats.calculateFrom(createNodeInfo(
+ forNode(1), forNode(2)), // Nodes are given without any resource usages.
+ createFeedBlockLimits(0.8, 0.9),
+ Optional.empty());
+ assertEquals(0.0, stats.getMaxDiskUtilization(), DELTA);
+ assertEquals(0.0, stats.getMaxMemoryUtilization(), DELTA);
+ }
+
+ @Test
+ public void nodes_above_limit_is_zero_without_feed_block_status() {
+ var stats = ResourceUsageStats.calculateFrom(Collections.emptyList(), Collections.emptyMap(), Optional.empty()); // No nodes and no limits; only the absent feed block matters for this assertion.
+ assertEquals(0, stats.getNodesAboveLimit());
+ }
+
+ @Test
+ public void nodes_above_limit_is_equal_to_node_resource_exhaustions() {
+ var stats = ResourceUsageStats.calculateFrom(Collections.emptyList(), Collections.emptyMap(),
+ createFeedBlock(exhaustion(1, "disk"), exhaustion(2, "memory"))); // Two exhaustions on two different nodes.
+ assertEquals(2, stats.getNodesAboveLimit());
+ }
+
+ private static Collection<NodeInfo> createNodeInfo(FeedBlockUtil.NodeAndUsages... nodeAndUsages) { // Node infos of a cluster fixture built from the given per-node usages.
+ return createFixtureWithReportedUsages(nodeAndUsages).cluster().getNodeInfo();
+ }
+
+ private static Map<String, Double> createFeedBlockLimits(double diskLimit, double memoryLimit) { // Limits keyed by the same "disk" and "memory" names that ResourceUsageStats looks up.
+ return Map.of("disk", diskLimit, "memory", memoryLimit);
+ }
+
+ private static Optional<ClusterStateBundle.FeedBlock> createFeedBlock(NodeResourceExhaustion... exhaustions) { // Wraps the given exhaustions in a present FeedBlock.
+ return Optional.of(new ClusterStateBundle.FeedBlock(true, "", setOf(exhaustions))); // NOTE(review): first two arguments are presumably the block-feed flag and a description - confirm against ClusterStateBundle.FeedBlock.
+ }
+}
+