aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTor Brede Vekterli <vekterli@vespa.ai>2024-01-29 13:42:45 +0100
committerGitHub <noreply@github.com>2024-01-29 13:42:45 +0100
commitad797f8f6b74d9e1079e08e35d11f1386d5d7660 (patch)
treec9d09c11bbf1680648c23972869d1eec78b8a8d3
parent3bc84f0fd2d864fae946ca39d63341dc4e5ff86e (diff)
parentc7a0edb4b093520caccd59a4b4021ee163cba059 (diff)
Merge pull request #30076 from vespa-engine/vekterli/use-storage-entry-count-as-permanent-down-metricv8.295.13
Use stored entry count rather than bucket count for (dis-)allowing permanent node down edge
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java69
-rw-r--r--clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java233
-rw-r--r--metrics/src/main/java/ai/vespa/metrics/StorageMetrics.java1
3 files changed, 193 insertions, 110 deletions
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java
index 93cf9beb70f..814cb48c49f 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java
@@ -11,6 +11,7 @@ import com.yahoo.vdslib.state.Node;
import com.yahoo.vdslib.state.NodeState;
import com.yahoo.vdslib.state.State;
import com.yahoo.vespa.clustercontroller.core.hostinfo.HostInfo;
+import com.yahoo.vespa.clustercontroller.core.hostinfo.Metrics;
import com.yahoo.vespa.clustercontroller.core.hostinfo.StorageNode;
import com.yahoo.vespa.clustercontroller.utils.staterestapi.requests.SetUnitStateRequest;
@@ -25,6 +26,7 @@ import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
+import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;
@@ -49,7 +51,9 @@ public class NodeStateChangeChecker {
private static final Logger log = Logger.getLogger(NodeStateChangeChecker.class.getName());
private static final String BUCKETS_METRIC_NAME = StorageMetrics.VDS_DATASTORED_BUCKET_SPACE_BUCKETS_TOTAL.baseName();
- private static final Map<String, String> BUCKETS_METRIC_DIMENSIONS = Map.of("bucketSpace", "default");
+ private static final String ENTRIES_METRIC_NAME = StorageMetrics.VDS_DATASTORED_BUCKET_SPACE_ENTRIES.baseName();
+ private static final String DOCS_METRIC_NAME = StorageMetrics.VDS_DATASTORED_BUCKET_SPACE_DOCS.baseName();
+ private static final Map<String, String> DEFAULT_SPACE_METRIC_DIMENSIONS = Map.of("bucketSpace", "default");
private final int requiredRedundancy;
private final HierarchicalGroupVisiting groupVisiting;
@@ -107,6 +111,50 @@ public class NodeStateChangeChecker {
&& Objects.equals(newWantedState.getDescription(), oldWantedState.getDescription());
}
+ private record NodeDataMetrics(Optional<Metrics.Value> buckets,
+ Optional<Metrics.Value> entries,
+ Optional<Metrics.Value> docs) {}
+
+ private static NodeDataMetrics dataMetricsFromHostInfo(HostInfo hostInfo) {
+ return new NodeDataMetrics(
+ hostInfo.getMetrics().getValueAt(BUCKETS_METRIC_NAME, DEFAULT_SPACE_METRIC_DIMENSIONS),
+ hostInfo.getMetrics().getValueAt(ENTRIES_METRIC_NAME, DEFAULT_SPACE_METRIC_DIMENSIONS),
+ hostInfo.getMetrics().getValueAt(DOCS_METRIC_NAME, DEFAULT_SPACE_METRIC_DIMENSIONS));
+ }
+
+ private static Optional<Result> checkZeroEntriesStoredOnContentNode(NodeDataMetrics metrics, int nodeIndex) {
+ if (metrics.entries.isEmpty() || metrics.entries.get().getLast() == null) {
+ // To allow for rolling upgrades in clusters with content node versions that do not report
+ // an entry count, defer to legacy bucket count check if the entry metric can't be found.
+ return Optional.empty();
+ }
+ if (metrics.docs.isEmpty() || metrics.docs.get().getLast() == null) {
+ log.log(Level.WARNING, "Host info inconsistency: storage node %d reports entry count but not document count".formatted(nodeIndex));
+ return Optional.of(disallow("The storage node host info reports stored entry count, but not document count"));
+ }
+ long lastEntries = metrics.entries.get().getLast();
+ long lastDocs = metrics.docs.get().getLast();
+ if (lastEntries != 0) {
+ long buckets = metrics.buckets.map(Metrics.Value::getLast).orElse(-1L);
+ long tombstones = lastEntries - lastDocs; // docs are a subset of entries, so |docs| <= |entries|
+ return Optional.of(disallow("The storage node stores %d documents and %d tombstones across %d buckets".formatted(lastDocs, tombstones, buckets)));
+ }
+ // At this point we believe we have zero entries. Cross-check with visible doc count; it should
+ // always be present when an entry count of zero is present and transitively always be zero.
+ if (lastDocs != 0) {
+ log.log(Level.WARNING, "Host info inconsistency: storage node %d reports 0 entries, but %d documents".formatted(nodeIndex, lastDocs));
+ return Optional.of(disallow("The storage node reports 0 entries, but %d documents".formatted(lastDocs)));
+ }
+ return Optional.of(allow());
+ }
+
+ private static Result checkLegacyZeroBucketsStoredOnContentNode(long lastBuckets) {
+ if (lastBuckets != 0) {
+ return disallow("The storage node manages %d buckets".formatted(lastBuckets));
+ }
+ return allow();
+ }
+
private Result canSetStateDownPermanently(NodeInfo nodeInfo, ClusterState clusterState, String newDescription) {
var result = checkIfStateSetWithDifferentDescription(nodeInfo, newDescription);
if (result.notAllowed())
@@ -129,15 +177,20 @@ public class NodeStateChangeChecker {
" got info for storage node " + nodeIndex + " at a different version " +
hostInfoNodeVersion);
- var bucketsMetric = hostInfo.getMetrics().getValueAt(BUCKETS_METRIC_NAME, BUCKETS_METRIC_DIMENSIONS);
- if (bucketsMetric.isEmpty() || bucketsMetric.get().getLast() == null)
+ var metrics = dataMetricsFromHostInfo(hostInfo);
+ // Bucket count metric should always be present
+ if (metrics.buckets.isEmpty() || metrics.buckets.get().getLast() == null) {
return disallow("Missing last value of the " + BUCKETS_METRIC_NAME + " metric for storage node " + nodeIndex);
+ }
- long lastBuckets = bucketsMetric.get().getLast();
- if (lastBuckets > 0)
- return disallow("The storage node manages " + lastBuckets + " buckets");
-
- return allow();
+ // TODO should also ideally check merge pending from the distributors' perspectives.
+ // - This goes in particular for the global space, as we only check for zero entries in
+ // the _default_ space (as global entries are retained even for retired nodes).
+ // - Due to global merges being prioritized above everything else, it is highly unlikely
+ // that there will be any pending global merges, but the possibility still exists.
+ // - Would need wiring of aggregated content cluster stats
+ var entriesCheckResult = checkZeroEntriesStoredOnContentNode(metrics, nodeIndex);
+ return entriesCheckResult.orElseGet(() -> checkLegacyZeroBucketsStoredOnContentNode(metrics.buckets.get().getLast()));
}
private Result canSetStateUp(NodeInfo nodeInfo, NodeState oldWantedState) {
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java
index 199a23f49ba..c1acc19ae9e 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java
@@ -1,7 +1,6 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.clustercontroller.core;
-import com.yahoo.log.LogSetup;
import com.yahoo.vdslib.distribution.ConfiguredNode;
import com.yahoo.vdslib.distribution.Distribution;
import com.yahoo.vdslib.state.ClusterState;
@@ -15,6 +14,8 @@ import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;
import java.util.ArrayList;
import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
import static com.yahoo.vdslib.state.NodeType.DISTRIBUTOR;
import static com.yahoo.vdslib.state.NodeType.STORAGE;
@@ -709,6 +710,10 @@ public class NodeStateChangeCheckerTest {
assertFalse(result.isAlreadySet());
}
+ private record HostInfoMetrics(int bucketCount, Integer entryCountOrNull, Integer docCountOrNull) {
+ static HostInfoMetrics zero() { return new HostInfoMetrics(0, 0, 0); }
+ }
+
@ParameterizedTest
@ValueSource(ints = {-1, 1})
void testDownDisallowedByNonRetiredState(int maxNumberOfGroupsAllowedToBeDown) {
@@ -716,25 +721,43 @@ public class NodeStateChangeCheckerTest {
defaultAllUpClusterState(),
UP,
currentClusterStateVersion,
- 0,
+ HostInfoMetrics.zero(),
maxNumberOfGroupsAllowedToBeDown);
assertFalse(result.allowed());
assertFalse(result.isAlreadySet());
assertEquals("Only retired nodes are allowed to be set to DOWN in safe mode - is Up", result.reason());
}
+ private record MetricsAndMessage(HostInfoMetrics hostInfoMetrics, String expectedMessage) {}
+
@ParameterizedTest
@ValueSource(ints = {-1, 1})
- void testDownDisallowedByBuckets(int maxNumberOfGroupsAllowedToBeDown) {
- Result result = evaluateDownTransition(
- retiredClusterStateSuffix(),
- UP,
- currentClusterStateVersion,
- 1,
- maxNumberOfGroupsAllowedToBeDown);
- assertFalse(result.allowed());
- assertFalse(result.isAlreadySet());
- assertEquals("The storage node manages 1 buckets", result.reason());
+ void down_disallowed_by_host_info_metrics_implying_node_still_stores_data(int maxNumberOfGroupsAllowedToBeDown) {
+ var disallowCases = List.of(
+ // Non-zero bucket count, and no entry/doc count metrics
+ new MetricsAndMessage(new HostInfoMetrics(1, null, null), "The storage node manages 1 buckets"),
+ // Non-zero bucket count and non-zero entries (note that we prefer reporting the entry count over the bucket count)
+ new MetricsAndMessage(new HostInfoMetrics(1, 2, 1), "The storage node stores 1 documents and 1 tombstones across 1 buckets"),
+
+ // These are cases that should not normally happen, but we test them nevertheless:
+ // Bucket count should never be zero if the entry count is > 0
+ new MetricsAndMessage(new HostInfoMetrics(0, 2, 1), "The storage node stores 1 documents and 1 tombstones across 0 buckets"),
+ // Entry count should never be zero if the document count is > 0
+ new MetricsAndMessage(new HostInfoMetrics(0, 0, 2), "The storage node reports 0 entries, but 2 documents"),
+ // Document count should always be present alongside entry count
+ new MetricsAndMessage(new HostInfoMetrics(0, 0, null), "The storage node host info reports stored entry count, but not document count")
+ );
+ for (var dc : disallowCases) {
+ Result result = evaluateDownTransition(
+ retiredClusterStateSuffix(),
+ UP,
+ currentClusterStateVersion,
+ dc.hostInfoMetrics,
+ maxNumberOfGroupsAllowedToBeDown);
+ assertFalse(result.allowed());
+ assertFalse(result.isAlreadySet());
+ assertEquals(dc.expectedMessage, result.reason());
+ }
}
@ParameterizedTest
@@ -744,7 +767,7 @@ public class NodeStateChangeCheckerTest {
retiredClusterStateSuffix(),
INITIALIZING,
currentClusterStateVersion,
- 0,
+ HostInfoMetrics.zero(),
maxNumberOfGroupsAllowedToBeDown);
assertFalse(result.allowed());
assertFalse(result.isAlreadySet());
@@ -758,7 +781,7 @@ public class NodeStateChangeCheckerTest {
retiredClusterStateSuffix(),
UP,
currentClusterStateVersion - 1,
- 0,
+ HostInfoMetrics.zero(),
maxNumberOfGroupsAllowedToBeDown);
assertFalse(result.allowed());
assertFalse(result.isAlreadySet());
@@ -766,14 +789,42 @@ public class NodeStateChangeCheckerTest {
result.reason());
}
+ // Legacy fallback when the content node does not report stored entry count (docs + tombstones)
+ @ParameterizedTest
+ @ValueSource(ints = {-1, 1})
+ void allowed_to_set_down_when_no_buckets_without_entry_metrics(int maxNumberOfGroupsAllowedToBeDown) {
+ Result result = evaluateDownTransition(
+ retiredClusterStateSuffix(),
+ UP,
+ currentClusterStateVersion,
+ new HostInfoMetrics(0, null, null),
+ maxNumberOfGroupsAllowedToBeDown);
+ assertTrue(result.allowed());
+ assertFalse(result.isAlreadySet());
+ }
+
+ @ParameterizedTest
+ @ValueSource(ints = {-1, 1})
+ void allowed_to_set_down_when_no_stored_entries_or_buckets(int maxNumberOfGroupsAllowedToBeDown) {
+ Result result = evaluateDownTransition(
+ retiredClusterStateSuffix(),
+ UP,
+ currentClusterStateVersion,
+ HostInfoMetrics.zero(),
+ maxNumberOfGroupsAllowedToBeDown);
+ assertTrue(result.allowed());
+ assertFalse(result.isAlreadySet());
+ }
+
@ParameterizedTest
@ValueSource(ints = {-1, 1})
- void testAllowedToSetDown(int maxNumberOfGroupsAllowedToBeDown) {
+ void allowed_to_set_down_when_no_stored_entries_but_empty_buckets_are_present(int maxNumberOfGroupsAllowedToBeDown) {
Result result = evaluateDownTransition(
retiredClusterStateSuffix(),
UP,
currentClusterStateVersion,
- 0,
+ // The node has (orphaned) buckets, but nothing is stored in them
+ new HostInfoMetrics(100, 0, 0),
maxNumberOfGroupsAllowedToBeDown);
assertTrue(result.allowed());
assertFalse(result.isAlreadySet());
@@ -782,14 +833,14 @@ public class NodeStateChangeCheckerTest {
private Result evaluateDownTransition(ClusterState clusterState,
State reportedState,
int hostInfoClusterStateVersion,
- int lastAlldisksBuckets,
+ HostInfoMetrics hostInfoMetrics,
int maxNumberOfGroupsAllowedToBeDown) {
ContentCluster cluster = createCluster(4, maxNumberOfGroupsAllowedToBeDown);
NodeStateChangeChecker nodeStateChangeChecker = createChangeChecker(cluster);
StorageNodeInfo nodeInfo = cluster.clusterInfo().getStorageNodeInfo(nodeStorage.getIndex());
nodeInfo.setReportedState(new NodeState(STORAGE, reportedState), 0);
- nodeInfo.setHostInfo(createHostInfoWithMetrics(hostInfoClusterStateVersion, lastAlldisksBuckets));
+ nodeInfo.setHostInfo(createHostInfoWithMetrics(hostInfoClusterStateVersion, hostInfoMetrics));
return nodeStateChangeChecker.evaluateTransition(
nodeStorage, clusterState, SAFE,
@@ -802,90 +853,68 @@ public class NodeStateChangeCheckerTest {
nodeStorage.getIndex()));
}
- private static HostInfo createHostInfoWithMetrics(int clusterStateVersion, int lastAlldisksBuckets) {
- return HostInfo.createHostInfo(String.format("{\n" +
- " \"metrics\":\n" +
- " {\n" +
- " \"snapshot\":\n" +
- " {\n" +
- " \"from\":1494940706,\n" +
- " \"to\":1494940766\n" +
- " },\n" +
- " \"values\":\n" +
- " [\n" +
- " {\n" +
- " \"name\":\"vds.datastored.alldisks.buckets\",\n" +
- " \"description\":\"buckets managed\",\n" +
- " \"values\":\n" +
- " {\n" +
- " \"average\":262144.0,\n" +
- " \"count\":1,\n" +
- " \"rate\":0.016666,\n" +
- " \"min\":262144,\n" +
- " \"max\":262144,\n" +
- " \"last\":%d\n" +
- " },\n" +
- " \"dimensions\":\n" +
- " {\n" +
- " }\n" +
- " },\n" +
- " {\n" +
- " \"name\":\"vds.datastored.alldisks.docs\",\n" +
- " \"description\":\"documents stored\",\n" +
- " \"values\":\n" +
- " {\n" +
- " \"average\":154689587.0,\n" +
- " \"count\":1,\n" +
- " \"rate\":0.016666,\n" +
- " \"min\":154689587,\n" +
- " \"max\":154689587,\n" +
- " \"last\":154689587\n" +
- " },\n" +
- " \"dimensions\":\n" +
- " {\n" +
- " }\n" +
- " },\n" +
- " {\n" +
- " \"name\":\"vds.datastored.bucket_space.buckets_total\",\n" +
- " \"description\":\"Total number buckets present in the bucket space (ready + not ready)\",\n" +
- " \"values\":\n" +
- " {\n" +
- " \"average\":0.0,\n" +
- " \"sum\":0.0,\n" +
- " \"count\":1,\n" +
- " \"rate\":0.016666,\n" +
- " \"min\":0,\n" +
- " \"max\":0,\n" +
- " \"last\":0\n" +
- " },\n" +
- " \"dimensions\":\n" +
- " {\n" +
- " \"bucketSpace\":\"global\"\n" +
- " }\n" +
- " },\n" +
- " {\n" +
- " \"name\":\"vds.datastored.bucket_space.buckets_total\",\n" +
- " \"description\":\"Total number buckets present in the bucket space (ready + not ready)\",\n" +
- " \"values\":\n" +
- " {\n" +
- " \"average\":129.0,\n" +
- " \"sum\":129.0,\n" +
- " \"count\":1,\n" +
- " \"rate\":0.016666,\n" +
- " \"min\":129,\n" +
- " \"max\":129,\n" +
- " \"last\":%d\n" +
- " },\n" +
- " \"dimensions\":\n" +
- " {\n" +
- " \"bucketSpace\":\"default\"\n" +
- " }\n" +
- " }\n" +
- " ]\n" +
- " },\n" +
- " \"cluster-state-version\":%d\n" +
- "}",
- lastAlldisksBuckets, lastAlldisksBuckets, clusterStateVersion));
+ private static String bucketSpacesMetricJsonIfNonNull(String metric, Integer lastValueOrNull) {
+ if (lastValueOrNull != null) {
+ // We fake the value for the global space; its actual value does not matter, as global
+ // document [entry] count is not taken into consideration for node removals (they are never
+ // moved away during retirement). We just want a unique value to test that we don't
+ // accidentally use the wrong dimension.
+ return Stream.of("default", "global")
+ .map(bucketSpace ->
+ """
+ {
+ "name":"vds.datastored.bucket_space.%s",
+ "values":{"last":%d},
+ "dimensions":{"bucketSpace":"%s"}
+ },""".formatted(metric, (lastValueOrNull + (bucketSpace.equals("default") ? 0 : 123)), bucketSpace))
+ .collect(Collectors.joining("\n"));
+ }
+ return "";
+ }
+
+ private static HostInfo createHostInfoWithMetrics(int clusterStateVersion, HostInfoMetrics hostInfoMetrics) {
+ return HostInfo.createHostInfo(String.format("""
+ {
+ "metrics":
+ {
+ "snapshot":
+ {
+ "from":1494940706,
+ "to":1494940766
+ },
+ "values":
+ [
+ %s
+ %s
+ %s
+ {
+ "name":"vds.datastored.alldisks.buckets",
+ "values":
+ {
+ "last":%d
+ },
+ "dimensions":
+ {
+ }
+ },
+ {
+ "name":"vds.datastored.alldisks.docs",
+ "values":
+ {
+ "last":154689587
+ },
+ "dimensions":
+ {
+ }
+ }
+ ]
+ },
+ "cluster-state-version":%d
+ }""",
+ bucketSpacesMetricJsonIfNonNull("buckets_total", hostInfoMetrics.bucketCount),
+ bucketSpacesMetricJsonIfNonNull("entries", hostInfoMetrics.entryCountOrNull),
+ bucketSpacesMetricJsonIfNonNull("docs", hostInfoMetrics.docCountOrNull),
+ hostInfoMetrics.bucketCount, clusterStateVersion));
}
private List<ConfiguredNode> createNodes(int count) {
diff --git a/metrics/src/main/java/ai/vespa/metrics/StorageMetrics.java b/metrics/src/main/java/ai/vespa/metrics/StorageMetrics.java
index dd8c2a2a1af..4d91cc1d989 100644
--- a/metrics/src/main/java/ai/vespa/metrics/StorageMetrics.java
+++ b/metrics/src/main/java/ai/vespa/metrics/StorageMetrics.java
@@ -152,6 +152,7 @@ public enum StorageMetrics implements VespaMetrics {
VDS_DATASTORED_BUCKET_SPACE_BUCKET_DB_MEMORY_USAGE_ONHOLD_BYTES("vds.datastored.bucket_space.bucket_db.memory_usage.onhold_bytes", Unit.BYTE, "The number of bytes on hold"),
VDS_DATASTORED_BUCKET_SPACE_BUCKET_DB_MEMORY_USAGE_USED_BYTES("vds.datastored.bucket_space.bucket_db.memory_usage.used_bytes", Unit.BYTE, "The number of used bytes (<= allocated_bytes)"),
VDS_DATASTORED_BUCKET_SPACE_BUCKETS_TOTAL("vds.datastored.bucket_space.buckets_total", Unit.BUCKET, "Total number buckets present in the bucket space (ready + not ready)"),
+ VDS_DATASTORED_BUCKET_SPACE_ENTRIES("vds.datastored.bucket_space.entries", Unit.DOCUMENT, "Number of entries (documents + tombstones) stored in the bucket space"),
VDS_DATASTORED_BUCKET_SPACE_BYTES("vds.datastored.bucket_space.bytes", Unit.BYTE, "Bytes stored across all documents in the bucket space"),
VDS_DATASTORED_BUCKET_SPACE_DOCS("vds.datastored.bucket_space.docs", Unit.DOCUMENT, "Documents stored in the bucket space"),
VDS_DATASTORED_BUCKET_SPACE_READY_BUCKETS("vds.datastored.bucket_space.ready_buckets", Unit.BUCKET, "Number of ready buckets in the bucket space"),