summaryrefslogtreecommitdiffstats
path: root/clustercontroller-core
diff options
context:
space:
mode:
authorTor Brede Vekterli <vekterli@vespa.ai>2024-01-29 12:56:15 +0100
committerTor Brede Vekterli <vekterli@vespa.ai>2024-01-29 12:56:15 +0100
commite9a8055e83560ba83712419ff3bc6b8a659f0266 (patch)
tree21dfa9fbc55633ff71c1a0bde2f12385d0cfbb55 /clustercontroller-core
parentac1f180cb7524da83bb4fbd18933b73c087cf4b6 (diff)
Explicitly report docs, tombstones and buckets in disallow-message
Diffstat (limited to 'clustercontroller-core')
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java60
-rw-r--r--clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java4
2 files changed, 38 insertions, 26 deletions
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java
index 5bcbb88ed88..eed8bc7ce92 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java
@@ -11,6 +11,7 @@ import com.yahoo.vdslib.state.Node;
import com.yahoo.vdslib.state.NodeState;
import com.yahoo.vdslib.state.State;
import com.yahoo.vespa.clustercontroller.core.hostinfo.HostInfo;
+import com.yahoo.vespa.clustercontroller.core.hostinfo.Metrics;
import com.yahoo.vespa.clustercontroller.core.hostinfo.StorageNode;
import com.yahoo.vespa.clustercontroller.utils.staterestapi.requests.SetUnitStateRequest;
@@ -110,41 +111,46 @@ public class NodeStateChangeChecker {
&& Objects.equals(newWantedState.getDescription(), oldWantedState.getDescription());
}
- private static Optional<Result> checkZeroEntriesStoredOnContentNode(HostInfo hostInfo, int nodeIndex) {
- var entriesMetric = hostInfo.getMetrics().getValueAt(ENTRIES_METRIC_NAME, DEFAULT_SPACE_METRIC_DIMENSIONS);
- if (entriesMetric.isEmpty() || entriesMetric.get().getLast() == null) {
+ private record NodeDataMetrics(Optional<Metrics.Value> buckets,
+ Optional<Metrics.Value> entries,
+ Optional<Metrics.Value> docs) {}
+
+ private static NodeDataMetrics dataMetricsFromHostInfo(HostInfo hostInfo) {
+ return new NodeDataMetrics(
+ hostInfo.getMetrics().getValueAt(BUCKETS_METRIC_NAME, DEFAULT_SPACE_METRIC_DIMENSIONS),
+ hostInfo.getMetrics().getValueAt(ENTRIES_METRIC_NAME, DEFAULT_SPACE_METRIC_DIMENSIONS),
+ hostInfo.getMetrics().getValueAt(DOCS_METRIC_NAME, DEFAULT_SPACE_METRIC_DIMENSIONS));
+ }
+
+ private static Optional<Result> checkZeroEntriesStoredOnContentNode(NodeDataMetrics metrics, int nodeIndex) {
+ if (metrics.entries.isEmpty() || metrics.entries.get().getLast() == null) {
// To allow for rolling upgrades in clusters with content node versions that do not report
// an entry count, defer to legacy bucket count check if the entry metric can't be found.
return Optional.empty();
}
- long lastEntries = entriesMetric.get().getLast();
- if (lastEntries > 0) {
- return Optional.of(disallow("The storage node stores %d document entries".formatted(lastEntries)));
- }
- // At this point we believe we have zero entries. Cross-check with visible doc count; it should
- // always be present when an entry count of zero is present and transitively always be zero.
- var docsMetric = hostInfo.getMetrics().getValueAt(DOCS_METRIC_NAME, DEFAULT_SPACE_METRIC_DIMENSIONS);
- if (docsMetric.isEmpty() || docsMetric.get().getLast() == null) {
+ if (metrics.docs.isEmpty() || metrics.docs.get().getLast() == null) {
log.log(Level.WARNING, "Host info inconsistency: storage node %d reports entry count but not document count".formatted(nodeIndex));
return Optional.of(disallow("The storage node host info reports stored entry count, but not document count"));
}
- long lastDocs = docsMetric.get().getLast();
- if (lastDocs > 0) {
+ long lastEntries = metrics.entries.get().getLast();
+ long lastDocs = metrics.docs.get().getLast();
+ if (lastEntries != 0) {
+ long buckets = metrics.buckets.orElseThrow().getLast();
+ long tombstones = lastEntries - lastDocs; // docs are a subset of entries, so |docs| <= |entries|
+ return Optional.of(disallow("The storage node stores %d documents and %d tombstones across %d buckets".formatted(lastDocs, tombstones, buckets)));
+ }
+ // At this point we believe we have zero entries. Cross-check with visible doc count; it should
+ // always be present when an entry count of zero is present and transitively always be zero.
+ if (lastDocs != 0) {
log.log(Level.WARNING, "Host info inconsistency: storage node %d reports 0 entries, but %d documents".formatted(nodeIndex, lastDocs));
return Optional.of(disallow("The storage node reports 0 entries, but %d documents".formatted(lastDocs)));
}
return Optional.of(allow());
}
- private static Result checkLegacyZeroBucketsStoredOnContentNode(HostInfo hostInfo, int nodeIndex) {
- var bucketsMetric = hostInfo.getMetrics().getValueAt(BUCKETS_METRIC_NAME, DEFAULT_SPACE_METRIC_DIMENSIONS);
- if (bucketsMetric.isEmpty() || bucketsMetric.get().getLast() == null) {
- return disallow("Missing last value of the " + BUCKETS_METRIC_NAME + " metric for storage node " + nodeIndex);
- }
-
- long lastBuckets = bucketsMetric.get().getLast();
- if (lastBuckets > 0) {
- return disallow("The storage node manages " + lastBuckets + " buckets");
+ private static Result checkLegacyZeroBucketsStoredOnContentNode(long lastBuckets) {
+ if (lastBuckets != 0) {
+ return disallow("The storage node manages %d buckets".formatted(lastBuckets));
}
return allow();
}
@@ -171,14 +177,20 @@ public class NodeStateChangeChecker {
" got info for storage node " + nodeIndex + " at a different version " +
hostInfoNodeVersion);
+ var metrics = dataMetricsFromHostInfo(hostInfo);
+ // Bucket count metric should always be present
+ if (metrics.buckets.isEmpty() || metrics.buckets.get().getLast() == null) {
+ return disallow("Missing last value of the " + BUCKETS_METRIC_NAME + " metric for storage node " + nodeIndex);
+ }
+
// TODO should also ideally check merge pending from the distributors' perspectives.
// - This goes in particular for the global space, as we only check for zero entries in
// the _default_ space (as global entries are retained even for retired nodes).
// - Due to global merges being prioritized above everything else, it is highly unlikely
// that there will be any pending global merges, but the possibility still exists.
// - Would need wiring of aggregated content cluster stats
- var entriesCheckResult = checkZeroEntriesStoredOnContentNode(hostInfo, nodeIndex);
- return entriesCheckResult.orElseGet(() -> checkLegacyZeroBucketsStoredOnContentNode(hostInfo, nodeIndex));
+ var entriesCheckResult = checkZeroEntriesStoredOnContentNode(metrics, nodeIndex);
+ return entriesCheckResult.orElseGet(() -> checkLegacyZeroBucketsStoredOnContentNode(metrics.buckets.get().getLast()));
}
private Result canSetStateUp(NodeInfo nodeInfo, NodeState oldWantedState) {
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java
index 7baa2bae03b..c1acc19ae9e 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java
@@ -737,11 +737,11 @@ public class NodeStateChangeCheckerTest {
// Non-zero bucket count, and no entry/doc count metrics
new MetricsAndMessage(new HostInfoMetrics(1, null, null), "The storage node manages 1 buckets"),
// Non-zero bucket count and non-zero entries (note that we prefer reporting the entry count over the bucket count)
- new MetricsAndMessage(new HostInfoMetrics(1, 2, 1), "The storage node stores 2 document entries"),
+ new MetricsAndMessage(new HostInfoMetrics(1, 2, 1), "The storage node stores 1 documents and 1 tombstones across 1 buckets"),
// These are cases that should not normally happen, but we test them nevertheless:
// Bucket count should never be zero if the entry count is > 0
- new MetricsAndMessage(new HostInfoMetrics(0, 2, 1), "The storage node stores 2 document entries"),
+ new MetricsAndMessage(new HostInfoMetrics(0, 2, 1), "The storage node stores 1 documents and 1 tombstones across 0 buckets"),
// Entry count should never be zero if the document count is > 0
new MetricsAndMessage(new HostInfoMetrics(0, 0, 2), "The storage node reports 0 entries, but 2 documents"),
// Document count should always be present alongside entry count