summaryrefslogtreecommitdiffstats
path: root/clustercontroller-core
diff options
context:
space:
mode:
authorTor Brede Vekterli <vekterli@vespa.ai>2024-01-26 13:01:46 +0100
committerTor Brede Vekterli <vekterli@vespa.ai>2024-01-26 14:31:37 +0100
commitac1f180cb7524da83bb4fbd18933b73c087cf4b6 (patch)
tree343eb81e069fcec1f9e10f71ad6bffd060976ce3 /clustercontroller-core
parent8dc79844053124419d3f217e904f3473483b9da2 (diff)
Use stored entry count rather than bucket count for (dis-)allowing permanent node down edge
The stored entry count encompasses both visible documents and tombstones. Using this count rather than bucket count avoids any issues where a node only containing empty buckets (i.e. no actual data) is prohibited from being marked as permanently down. Entry count is cross-checked with the visible document count; if the former is zero, the latter should always be zero as well. Since entry/doc counts were only recently introduced as part of the HostInfo payload, we have to handle the case where these do not exist. If entry count is not present, the decision to allow or disallow the transition falls back to the bucket count check.
Diffstat (limited to 'clustercontroller-core')
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java61
-rw-r--r--clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java233
2 files changed, 182 insertions, 112 deletions
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java
index 93cf9beb70f..5bcbb88ed88 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java
@@ -25,6 +25,7 @@ import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
+import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;
@@ -49,7 +50,9 @@ public class NodeStateChangeChecker {
private static final Logger log = Logger.getLogger(NodeStateChangeChecker.class.getName());
private static final String BUCKETS_METRIC_NAME = StorageMetrics.VDS_DATASTORED_BUCKET_SPACE_BUCKETS_TOTAL.baseName();
- private static final Map<String, String> BUCKETS_METRIC_DIMENSIONS = Map.of("bucketSpace", "default");
+ private static final String ENTRIES_METRIC_NAME = StorageMetrics.VDS_DATASTORED_BUCKET_SPACE_ENTRIES.baseName();
+ private static final String DOCS_METRIC_NAME = StorageMetrics.VDS_DATASTORED_BUCKET_SPACE_DOCS.baseName();
+ private static final Map<String, String> DEFAULT_SPACE_METRIC_DIMENSIONS = Map.of("bucketSpace", "default");
private final int requiredRedundancy;
private final HierarchicalGroupVisiting groupVisiting;
@@ -107,6 +110,45 @@ public class NodeStateChangeChecker {
&& Objects.equals(newWantedState.getDescription(), oldWantedState.getDescription());
}
+ private static Optional<Result> checkZeroEntriesStoredOnContentNode(HostInfo hostInfo, int nodeIndex) {
+ var entriesMetric = hostInfo.getMetrics().getValueAt(ENTRIES_METRIC_NAME, DEFAULT_SPACE_METRIC_DIMENSIONS);
+ if (entriesMetric.isEmpty() || entriesMetric.get().getLast() == null) {
+ // To allow for rolling upgrades in clusters with content node versions that do not report
+ // an entry count, defer to legacy bucket count check if the entry metric can't be found.
+ return Optional.empty();
+ }
+ long lastEntries = entriesMetric.get().getLast();
+ if (lastEntries > 0) {
+ return Optional.of(disallow("The storage node stores %d document entries".formatted(lastEntries)));
+ }
+ // At this point we believe we have zero entries. Cross-check with visible doc count; it should
+ // always be present when an entry count of zero is present and transitively always be zero.
+ var docsMetric = hostInfo.getMetrics().getValueAt(DOCS_METRIC_NAME, DEFAULT_SPACE_METRIC_DIMENSIONS);
+ if (docsMetric.isEmpty() || docsMetric.get().getLast() == null) {
+ log.log(Level.WARNING, "Host info inconsistency: storage node %d reports entry count but not document count".formatted(nodeIndex));
+ return Optional.of(disallow("The storage node host info reports stored entry count, but not document count"));
+ }
+ long lastDocs = docsMetric.get().getLast();
+ if (lastDocs > 0) {
+ log.log(Level.WARNING, "Host info inconsistency: storage node %d reports 0 entries, but %d documents".formatted(nodeIndex, lastDocs));
+ return Optional.of(disallow("The storage node reports 0 entries, but %d documents".formatted(lastDocs)));
+ }
+ return Optional.of(allow());
+ }
+
+ private static Result checkLegacyZeroBucketsStoredOnContentNode(HostInfo hostInfo, int nodeIndex) {
+ var bucketsMetric = hostInfo.getMetrics().getValueAt(BUCKETS_METRIC_NAME, DEFAULT_SPACE_METRIC_DIMENSIONS);
+ if (bucketsMetric.isEmpty() || bucketsMetric.get().getLast() == null) {
+ return disallow("Missing last value of the " + BUCKETS_METRIC_NAME + " metric for storage node " + nodeIndex);
+ }
+
+ long lastBuckets = bucketsMetric.get().getLast();
+ if (lastBuckets > 0) {
+ return disallow("The storage node manages " + lastBuckets + " buckets");
+ }
+ return allow();
+ }
+
private Result canSetStateDownPermanently(NodeInfo nodeInfo, ClusterState clusterState, String newDescription) {
var result = checkIfStateSetWithDifferentDescription(nodeInfo, newDescription);
if (result.notAllowed())
@@ -129,15 +171,14 @@ public class NodeStateChangeChecker {
" got info for storage node " + nodeIndex + " at a different version " +
hostInfoNodeVersion);
- var bucketsMetric = hostInfo.getMetrics().getValueAt(BUCKETS_METRIC_NAME, BUCKETS_METRIC_DIMENSIONS);
- if (bucketsMetric.isEmpty() || bucketsMetric.get().getLast() == null)
- return disallow("Missing last value of the " + BUCKETS_METRIC_NAME + " metric for storage node " + nodeIndex);
-
- long lastBuckets = bucketsMetric.get().getLast();
- if (lastBuckets > 0)
- return disallow("The storage node manages " + lastBuckets + " buckets");
-
- return allow();
+ // TODO should also ideally check merge pending from the distributors' perspectives.
+ // - This goes in particular for the global space, as we only check for zero entries in
+ // the _default_ space (as global entries are retained even for retired nodes).
+ // - Due to global merges being prioritized above everything else, it is highly unlikely
+ // that there will be any pending global merges, but the possibility still exists.
+ // - Would need wiring of aggregated content cluster stats
+ var entriesCheckResult = checkZeroEntriesStoredOnContentNode(hostInfo, nodeIndex);
+ return entriesCheckResult.orElseGet(() -> checkLegacyZeroBucketsStoredOnContentNode(hostInfo, nodeIndex));
}
private Result canSetStateUp(NodeInfo nodeInfo, NodeState oldWantedState) {
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java
index 199a23f49ba..7baa2bae03b 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java
@@ -1,7 +1,6 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.clustercontroller.core;
-import com.yahoo.log.LogSetup;
import com.yahoo.vdslib.distribution.ConfiguredNode;
import com.yahoo.vdslib.distribution.Distribution;
import com.yahoo.vdslib.state.ClusterState;
@@ -15,6 +14,8 @@ import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;
import java.util.ArrayList;
import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
import static com.yahoo.vdslib.state.NodeType.DISTRIBUTOR;
import static com.yahoo.vdslib.state.NodeType.STORAGE;
@@ -709,6 +710,10 @@ public class NodeStateChangeCheckerTest {
assertFalse(result.isAlreadySet());
}
+ private record HostInfoMetrics(int bucketCount, Integer entryCountOrNull, Integer docCountOrNull) {
+ static HostInfoMetrics zero() { return new HostInfoMetrics(0, 0, 0); }
+ }
+
@ParameterizedTest
@ValueSource(ints = {-1, 1})
void testDownDisallowedByNonRetiredState(int maxNumberOfGroupsAllowedToBeDown) {
@@ -716,25 +721,43 @@ public class NodeStateChangeCheckerTest {
defaultAllUpClusterState(),
UP,
currentClusterStateVersion,
- 0,
+ HostInfoMetrics.zero(),
maxNumberOfGroupsAllowedToBeDown);
assertFalse(result.allowed());
assertFalse(result.isAlreadySet());
assertEquals("Only retired nodes are allowed to be set to DOWN in safe mode - is Up", result.reason());
}
+ private record MetricsAndMessage(HostInfoMetrics hostInfoMetrics, String expectedMessage) {}
+
@ParameterizedTest
@ValueSource(ints = {-1, 1})
- void testDownDisallowedByBuckets(int maxNumberOfGroupsAllowedToBeDown) {
- Result result = evaluateDownTransition(
- retiredClusterStateSuffix(),
- UP,
- currentClusterStateVersion,
- 1,
- maxNumberOfGroupsAllowedToBeDown);
- assertFalse(result.allowed());
- assertFalse(result.isAlreadySet());
- assertEquals("The storage node manages 1 buckets", result.reason());
+ void down_disallowed_by_host_info_metrics_implying_node_still_stores_data(int maxNumberOfGroupsAllowedToBeDown) {
+ var disallowCases = List.of(
+ // Non-zero bucket count, and no entry/doc count metrics
+ new MetricsAndMessage(new HostInfoMetrics(1, null, null), "The storage node manages 1 buckets"),
+ // Non-zero bucket count and non-zero entries (note that we prefer reporting the entry count over the bucket count)
+ new MetricsAndMessage(new HostInfoMetrics(1, 2, 1), "The storage node stores 2 document entries"),
+
+ // These are cases that should not normally happen, but we test them nevertheless:
+ // Bucket count should never be zero if the entry count is > 0
+ new MetricsAndMessage(new HostInfoMetrics(0, 2, 1), "The storage node stores 2 document entries"),
+ // Entry count should never be zero if the document count is > 0
+ new MetricsAndMessage(new HostInfoMetrics(0, 0, 2), "The storage node reports 0 entries, but 2 documents"),
+ // Document count should always be present alongside entry count
+ new MetricsAndMessage(new HostInfoMetrics(0, 0, null), "The storage node host info reports stored entry count, but not document count")
+ );
+ for (var dc : disallowCases) {
+ Result result = evaluateDownTransition(
+ retiredClusterStateSuffix(),
+ UP,
+ currentClusterStateVersion,
+ dc.hostInfoMetrics,
+ maxNumberOfGroupsAllowedToBeDown);
+ assertFalse(result.allowed());
+ assertFalse(result.isAlreadySet());
+ assertEquals(dc.expectedMessage, result.reason());
+ }
}
@ParameterizedTest
@@ -744,7 +767,7 @@ public class NodeStateChangeCheckerTest {
retiredClusterStateSuffix(),
INITIALIZING,
currentClusterStateVersion,
- 0,
+ HostInfoMetrics.zero(),
maxNumberOfGroupsAllowedToBeDown);
assertFalse(result.allowed());
assertFalse(result.isAlreadySet());
@@ -758,7 +781,7 @@ public class NodeStateChangeCheckerTest {
retiredClusterStateSuffix(),
UP,
currentClusterStateVersion - 1,
- 0,
+ HostInfoMetrics.zero(),
maxNumberOfGroupsAllowedToBeDown);
assertFalse(result.allowed());
assertFalse(result.isAlreadySet());
@@ -766,14 +789,42 @@ public class NodeStateChangeCheckerTest {
result.reason());
}
+ // Legacy fallback when the content node does not report stored entry count (docs + tombstones)
+ @ParameterizedTest
+ @ValueSource(ints = {-1, 1})
+ void allowed_to_set_down_when_no_buckets_without_entry_metrics(int maxNumberOfGroupsAllowedToBeDown) {
+ Result result = evaluateDownTransition(
+ retiredClusterStateSuffix(),
+ UP,
+ currentClusterStateVersion,
+ new HostInfoMetrics(0, null, null),
+ maxNumberOfGroupsAllowedToBeDown);
+ assertTrue(result.allowed());
+ assertFalse(result.isAlreadySet());
+ }
+
+ @ParameterizedTest
+ @ValueSource(ints = {-1, 1})
+ void allowed_to_set_down_when_no_stored_entries_or_buckets(int maxNumberOfGroupsAllowedToBeDown) {
+ Result result = evaluateDownTransition(
+ retiredClusterStateSuffix(),
+ UP,
+ currentClusterStateVersion,
+ HostInfoMetrics.zero(),
+ maxNumberOfGroupsAllowedToBeDown);
+ assertTrue(result.allowed());
+ assertFalse(result.isAlreadySet());
+ }
+
@ParameterizedTest
@ValueSource(ints = {-1, 1})
- void testAllowedToSetDown(int maxNumberOfGroupsAllowedToBeDown) {
+ void allowed_to_set_down_when_no_stored_entries_but_empty_buckets_are_present(int maxNumberOfGroupsAllowedToBeDown) {
Result result = evaluateDownTransition(
retiredClusterStateSuffix(),
UP,
currentClusterStateVersion,
- 0,
+ // The node has (orphaned) buckets, but nothing is stored in them
+ new HostInfoMetrics(100, 0, 0),
maxNumberOfGroupsAllowedToBeDown);
assertTrue(result.allowed());
assertFalse(result.isAlreadySet());
@@ -782,14 +833,14 @@ public class NodeStateChangeCheckerTest {
private Result evaluateDownTransition(ClusterState clusterState,
State reportedState,
int hostInfoClusterStateVersion,
- int lastAlldisksBuckets,
+ HostInfoMetrics hostInfoMetrics,
int maxNumberOfGroupsAllowedToBeDown) {
ContentCluster cluster = createCluster(4, maxNumberOfGroupsAllowedToBeDown);
NodeStateChangeChecker nodeStateChangeChecker = createChangeChecker(cluster);
StorageNodeInfo nodeInfo = cluster.clusterInfo().getStorageNodeInfo(nodeStorage.getIndex());
nodeInfo.setReportedState(new NodeState(STORAGE, reportedState), 0);
- nodeInfo.setHostInfo(createHostInfoWithMetrics(hostInfoClusterStateVersion, lastAlldisksBuckets));
+ nodeInfo.setHostInfo(createHostInfoWithMetrics(hostInfoClusterStateVersion, hostInfoMetrics));
return nodeStateChangeChecker.evaluateTransition(
nodeStorage, clusterState, SAFE,
@@ -802,90 +853,68 @@ public class NodeStateChangeCheckerTest {
nodeStorage.getIndex()));
}
- private static HostInfo createHostInfoWithMetrics(int clusterStateVersion, int lastAlldisksBuckets) {
- return HostInfo.createHostInfo(String.format("{\n" +
- " \"metrics\":\n" +
- " {\n" +
- " \"snapshot\":\n" +
- " {\n" +
- " \"from\":1494940706,\n" +
- " \"to\":1494940766\n" +
- " },\n" +
- " \"values\":\n" +
- " [\n" +
- " {\n" +
- " \"name\":\"vds.datastored.alldisks.buckets\",\n" +
- " \"description\":\"buckets managed\",\n" +
- " \"values\":\n" +
- " {\n" +
- " \"average\":262144.0,\n" +
- " \"count\":1,\n" +
- " \"rate\":0.016666,\n" +
- " \"min\":262144,\n" +
- " \"max\":262144,\n" +
- " \"last\":%d\n" +
- " },\n" +
- " \"dimensions\":\n" +
- " {\n" +
- " }\n" +
- " },\n" +
- " {\n" +
- " \"name\":\"vds.datastored.alldisks.docs\",\n" +
- " \"description\":\"documents stored\",\n" +
- " \"values\":\n" +
- " {\n" +
- " \"average\":154689587.0,\n" +
- " \"count\":1,\n" +
- " \"rate\":0.016666,\n" +
- " \"min\":154689587,\n" +
- " \"max\":154689587,\n" +
- " \"last\":154689587\n" +
- " },\n" +
- " \"dimensions\":\n" +
- " {\n" +
- " }\n" +
- " },\n" +
- " {\n" +
- " \"name\":\"vds.datastored.bucket_space.buckets_total\",\n" +
- " \"description\":\"Total number buckets present in the bucket space (ready + not ready)\",\n" +
- " \"values\":\n" +
- " {\n" +
- " \"average\":0.0,\n" +
- " \"sum\":0.0,\n" +
- " \"count\":1,\n" +
- " \"rate\":0.016666,\n" +
- " \"min\":0,\n" +
- " \"max\":0,\n" +
- " \"last\":0\n" +
- " },\n" +
- " \"dimensions\":\n" +
- " {\n" +
- " \"bucketSpace\":\"global\"\n" +
- " }\n" +
- " },\n" +
- " {\n" +
- " \"name\":\"vds.datastored.bucket_space.buckets_total\",\n" +
- " \"description\":\"Total number buckets present in the bucket space (ready + not ready)\",\n" +
- " \"values\":\n" +
- " {\n" +
- " \"average\":129.0,\n" +
- " \"sum\":129.0,\n" +
- " \"count\":1,\n" +
- " \"rate\":0.016666,\n" +
- " \"min\":129,\n" +
- " \"max\":129,\n" +
- " \"last\":%d\n" +
- " },\n" +
- " \"dimensions\":\n" +
- " {\n" +
- " \"bucketSpace\":\"default\"\n" +
- " }\n" +
- " }\n" +
- " ]\n" +
- " },\n" +
- " \"cluster-state-version\":%d\n" +
- "}",
- lastAlldisksBuckets, lastAlldisksBuckets, clusterStateVersion));
+ private static String bucketSpacesMetricJsonIfNonNull(String metric, Integer lastValueOrNull) {
+ if (lastValueOrNull != null) {
+ // We fake the value for the global space; its actual value does not matter, as global
+ // document [entry] count is not taken into consideration for node removals (they are never
+ // moved away during retirement). We just want a unique value to test that we don't
+ // accidentally use the wrong dimension.
+ return Stream.of("default", "global")
+ .map(bucketSpace ->
+ """
+ {
+ "name":"vds.datastored.bucket_space.%s",
+ "values":{"last":%d},
+ "dimensions":{"bucketSpace":"%s"}
+ },""".formatted(metric, (lastValueOrNull + (bucketSpace.equals("default") ? 0 : 123)), bucketSpace))
+ .collect(Collectors.joining("\n"));
+ }
+ return "";
+ }
+
+ private static HostInfo createHostInfoWithMetrics(int clusterStateVersion, HostInfoMetrics hostInfoMetrics) {
+ return HostInfo.createHostInfo(String.format("""
+ {
+ "metrics":
+ {
+ "snapshot":
+ {
+ "from":1494940706,
+ "to":1494940766
+ },
+ "values":
+ [
+ %s
+ %s
+ %s
+ {
+ "name":"vds.datastored.alldisks.buckets",
+ "values":
+ {
+ "last":%d
+ },
+ "dimensions":
+ {
+ }
+ },
+ {
+ "name":"vds.datastored.alldisks.docs",
+ "values":
+ {
+ "last":154689587
+ },
+ "dimensions":
+ {
+ }
+ }
+ ]
+ },
+ "cluster-state-version":%d
+ }""",
+ bucketSpacesMetricJsonIfNonNull("buckets_total", hostInfoMetrics.bucketCount),
+ bucketSpacesMetricJsonIfNonNull("entries", hostInfoMetrics.entryCountOrNull),
+ bucketSpacesMetricJsonIfNonNull("docs", hostInfoMetrics.docCountOrNull),
+ hostInfoMetrics.bucketCount, clusterStateVersion));
}
private List<ConfiguredNode> createNodes(int count) {