From d0892bfdb01390b1196f22391760a844c8c37482 Mon Sep 17 00:00:00 2001 From: Tor Brede Vekterli Date: Fri, 3 May 2024 15:34:00 +0000 Subject: Emit single metric for how out of sync the cluster data is With these changes the cluster controller continuously maintains a global aggregate across all content nodes that represents the number of pending and total buckets per bucket space. This aggregate can be sampled in O(1) time. An explicit metric `cluster-buckets-out-of-sync-ratio` has been added, and the value is also emitted as part of the cluster state REST API. Note: only emitted when statistics have been received from _all_ distributors for a particular cluster state, as it would otherwise potentially represent a state somewhere arbitrary between two or more distinct states. --- .../core/ClusterStatsAggregatorTest.java | 27 ++++++++-- .../core/ContentNodeStatsBuilder.java | 8 ++- .../core/GlobalBucketSyncStatsCalculatorTest.java | 59 ++++++++++++++++++++++ .../core/restapiv2/ClusterControllerMock.java | 18 +++++++ .../core/restapiv2/ClusterTest.java | 42 +++++++++++++++ .../core/restapiv2/StateRestApiTest.java | 2 +- 6 files changed, 150 insertions(+), 6 deletions(-) create mode 100644 clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GlobalBucketSyncStatsCalculatorTest.java (limited to 'clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core') diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterStatsAggregatorTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterStatsAggregatorTest.java index aa47ce2ec82..14276c51416 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterStatsAggregatorTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterStatsAggregatorTest.java @@ -33,6 +33,10 @@ public class ClusterStatsAggregatorTest { assertEquals(expectedStats.build(), aggregator.getAggregatedStatsForDistributor(distributorIndex)); } + public void verifyGlobal(ContentNodeStatsBuilder expectedStats) { + assertEquals(expectedStats.build(), aggregator.getAggregatedStats().getGlobalStats()); + } + boolean hasUpdatesFromAllDistributors() { return aggregator.getAggregatedStats().hasUpdatesFromAllDistributors(); } @@ -64,6 +68,10 @@ public class ClusterStatsAggregatorTest { return Sets.newHashSet(indices); } + private static ContentNodeStatsBuilder globalStatsBuilder() { + return ContentNodeStatsBuilder.forNode(-1); + } + @Test void aggregator_handles_updates_to_single_distributor_and_content_node() { Fixture f = new Fixture(distributorNodes(1), contentNodes(3)); @@ -72,6 +80,9 @@ public class ClusterStatsAggregatorTest { .add(3, "global", 11, 2); f.update(1, stats); f.verify(stats); + f.verifyGlobal(globalStatsBuilder() + .add("default", 10, 1) + .add("global", 11, 2)); } @Test @@ -80,9 +91,13 @@ public class ClusterStatsAggregatorTest { f.verify(new ContentClusterStatsBuilder() .add(3, "default", 10 + 14, 1 + 5) - .add(3, "global", 11 + 15, 2 + 6) + .add(3, "global", 11 + 15, 2 + 6) .add(4, "default", 12 + 16, 3 + 7) - .add(4, "global", 13 + 17, 4 + 8)); + .add(4, "global", 13 + 17, 4 + 8)); + + f.verifyGlobal(globalStatsBuilder() + .add("default", (10 + 14) + (12 + 16), (1 + 5) + (3 + 7)) + .add("global", (11 + 15) + (13 + 17), (2 + 6) + (4 + 8))); } @Test @@ -94,28 +109,34 @@ public class ClusterStatsAggregatorTest { f.update(2, new ContentClusterStatsBuilder().add(3, "default", 10, 1)); f.verify(new ContentClusterStatsBuilder().addInvalid(3, "default", 10, 1)); + f.verifyGlobal(globalStatsBuilder().addInvalid("default", 10, 1)); f.update(1, new ContentClusterStatsBuilder().add(3, "default", 11, 2)); f.verify(new ContentClusterStatsBuilder().add(3, "default", 10 + 11, 1 + 2)); + f.verifyGlobal(globalStatsBuilder().add("default", 10 + 11, 1 + 2)); f.update(2, new ContentClusterStatsBuilder().add(3, "default", 15, 6)); f.verify(new ContentClusterStatsBuilder().add(3, "default", 11 + 15, 2 + 6)); + f.verifyGlobal(globalStatsBuilder().add("default", 11 + 15, 2 + 6)); f.update(1, new ContentClusterStatsBuilder().add(3, "default", 16, 7)); f.verify(new ContentClusterStatsBuilder().add(3, "default", 15 + 16, 6 + 7)); + f.verifyGlobal(globalStatsBuilder().add("default", 15 + 16, 6 + 7)); f.update(2, new ContentClusterStatsBuilder().add(3, "default", 12, 3)); f.verify(new ContentClusterStatsBuilder().add(3, "default", 16 + 12, 7 + 3)); + f.verifyGlobal(globalStatsBuilder().add("default", 16 + 12, 7 + 3)); } @Test - void aggregator_handles_more_content_nodes_that_distributors() { + void aggregator_handles_more_content_nodes_than_distributors() { Fixture f = new Fixture(distributorNodes(1), contentNodes(3, 4)); ContentClusterStatsBuilder stats = new ContentClusterStatsBuilder() .add(3, "default", 10, 1) .add(4, "default", 11, 2); f.update(1, stats); f.verify(stats); + f.verifyGlobal(globalStatsBuilder().add("default", 10 + 11, 1 + 2)); } @Test diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ContentNodeStatsBuilder.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ContentNodeStatsBuilder.java index 9d4664a9362..34035793e75 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ContentNodeStatsBuilder.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ContentNodeStatsBuilder.java @@ -13,7 +13,7 @@ public class ContentNodeStatsBuilder { this.nodeIndex = nodeIndex; } - static ContentNodeStatsBuilder forNode(int nodeIndex) { + public static ContentNodeStatsBuilder forNode(int nodeIndex) { return new ContentNodeStatsBuilder(nodeIndex); } @@ -21,12 +21,16 @@ public class ContentNodeStatsBuilder { return add(bucketSpace, ContentNodeStats.BucketSpaceStats.of(bucketsTotal, bucketsPending)); } + public ContentNodeStatsBuilder addInvalid(String bucketSpace, long bucketsTotal, long bucketsPending) { + return add(bucketSpace, ContentNodeStats.BucketSpaceStats.invalid(bucketsTotal, bucketsPending)); + } + public ContentNodeStatsBuilder add(String bucketSpace, ContentNodeStats.BucketSpaceStats bucketSpaceStats) { stats.put(bucketSpace, bucketSpaceStats); return this; } - ContentNodeStats build() { + public ContentNodeStats build() { return new ContentNodeStats(nodeIndex, stats); } } diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GlobalBucketSyncStatsCalculatorTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GlobalBucketSyncStatsCalculatorTest.java new file mode 100644 index 00000000000..d44aaa54a1d --- /dev/null +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GlobalBucketSyncStatsCalculatorTest.java @@ -0,0 +1,59 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.clustercontroller.core; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class GlobalBucketSyncStatsCalculatorTest { + + private static ContentNodeStatsBuilder globalStatsBuilder() { + return ContentNodeStatsBuilder.forNode(-1); + } + + private static void assertComputedRatio(double expected, ContentNodeStatsBuilder statsBuilder) { + var maybeRatio = GlobalBucketSyncStatsCalculator.clusterBucketsOutOfSyncRatio(statsBuilder.build()); + if (maybeRatio.isEmpty()) { + throw new IllegalArgumentException("Expected calculation to yield a value, but was empty"); + } + assertEquals(expected, maybeRatio.get(), 0.00001); + } + + private static void assertEmptyComputedRatio(ContentNodeStatsBuilder statsBuilder) { + var maybeRatio = GlobalBucketSyncStatsCalculator.clusterBucketsOutOfSyncRatio(statsBuilder.build()); + assertTrue(maybeRatio.isEmpty()); + } + + @Test + void no_buckets_imply_fully_in_sync() { + // Can't have anything out of sync if you don't have anything to be out of sync with *taps side of head* + assertComputedRatio(0.0, globalStatsBuilder().add("default", 0, 0)); + } + + @Test + void no_pending_buckets_implies_fully_in_sync() { + assertComputedRatio(0.0, globalStatsBuilder().add("default", 100, 0)); + assertComputedRatio(0.0, globalStatsBuilder().add("default", 100, 0).add("global", 50, 0)); + } + + @Test + void invalid_stats_returns_empty() { + assertEmptyComputedRatio(globalStatsBuilder().add("default", ContentNodeStats.BucketSpaceStats.invalid())); + assertEmptyComputedRatio(globalStatsBuilder() + .add("default", 100, 0) + .add("global", ContentNodeStats.BucketSpaceStats.invalid())); + } + + @Test + void pending_buckets_return_expected_ratio() { + assertComputedRatio(0.50, globalStatsBuilder().add("default", 10, 5)); + assertComputedRatio(0.80, globalStatsBuilder().add("default", 10, 8)); + assertComputedRatio(0.10, globalStatsBuilder().add("default", 100, 10)); + assertComputedRatio(0.01, globalStatsBuilder().add("default", 100, 1)); + assertComputedRatio(0.05, globalStatsBuilder().add("default", 50, 5).add("global", 50, 0)); + assertComputedRatio(0.05, globalStatsBuilder().add("default", 50, 0).add("global", 50, 5)); + assertComputedRatio(0.10, globalStatsBuilder().add("default", 50, 5).add("global", 50, 5)); + } + +} diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/ClusterControllerMock.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/ClusterControllerMock.java index d06cc730b3f..902b1bce24a 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/ClusterControllerMock.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/ClusterControllerMock.java @@ -15,6 +15,8 @@ public class ClusterControllerMock implements RemoteClusterControllerTaskSchedul private final int fleetControllerIndex; Integer fleetControllerMaster; private final StringBuilder events = new StringBuilder(); + ContentNodeStats globalClusterStats = new ContentNodeStats(-1); + boolean enableGlobalStatsReporting = false; ClusterControllerMock(ContentCluster cluster, ClusterState state, ClusterStateBundle publishedClusterStateBundle, @@ -88,6 +90,22 @@ public class ClusterControllerMock implements RemoteClusterControllerTaskSchedul } }; + context.aggregatedClusterStats = new AggregatedClusterStats() { + @Override + public boolean hasUpdatesFromAllDistributors() { + return enableGlobalStatsReporting; + } + + @Override + public ContentClusterStats getStats() { + return null; + } + + @Override + public ContentNodeStats getGlobalStats() { + return globalClusterStats; + } + }; } @Override diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/ClusterTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/ClusterTest.java index e4b3c0b9f2c..cb1213542ce 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/ClusterTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/ClusterTest.java @@ -1,6 +1,7 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.clustercontroller.core.restapiv2; +import com.yahoo.vespa.clustercontroller.core.ContentNodeStatsBuilder; import com.yahoo.vespa.clustercontroller.utils.staterestapi.response.UnitResponse; import org.junit.jupiter.api.Test; @@ -105,4 +106,45 @@ public class ClusterTest extends StateRestApiTest { }""", jsonWriter.createJson(response).toPrettyString()); } + + @Test + void emit_cluster_stats_if_present() throws Exception { + setUp(true); + books.globalClusterStats.add(ContentNodeStatsBuilder.forNode(-1).add("default", 10, 4).build()); + books.enableGlobalStatsReporting = true; + UnitResponse response = restAPI.getState(new StateRequest("books", 0)); + assertEquals(""" + { + "state" : { + "generated" : { + "state" : "up", + "reason" : "" + } + }, + "metrics" : { + "cluster-buckets-out-of-sync-ratio" : 0.4 + }, + "service" : { + "storage" : { + "link" : "/cluster/v2/books/storage" + }, + "distributor" : { + "link" : "/cluster/v2/books/distributor" + } + }, + "distribution-states" : { + "published" : { + "baseline" : "distributor:4 storage:4", + "bucket-spaces" : [ { + "name" : "default", + "state" : "distributor:4 storage:4 .3.s:m" + }, { + "name" : "global", + "state" : "distributor:4 storage:4" + } ] + } + } + }""", + jsonWriter.createJson(response).toPrettyString()); + } } diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/StateRestApiTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/StateRestApiTest.java index dfd9783ecef..1ad5f6828b7 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/StateRestApiTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/StateRestApiTest.java @@ -30,7 +30,7 @@ import java.util.stream.Collectors; public abstract class StateRestApiTest { - private ClusterControllerMock books; + ClusterControllerMock books; ClusterControllerMock music; StateRestAPI restAPI; JsonWriter jsonWriter = new JsonWriter(); -- cgit v1.2.3