diff options
2 files changed, 38 insertions, 26 deletions
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java index 5bcbb88ed88..eed8bc7ce92 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java @@ -11,6 +11,7 @@ import com.yahoo.vdslib.state.Node; import com.yahoo.vdslib.state.NodeState; import com.yahoo.vdslib.state.State; import com.yahoo.vespa.clustercontroller.core.hostinfo.HostInfo; +import com.yahoo.vespa.clustercontroller.core.hostinfo.Metrics; import com.yahoo.vespa.clustercontroller.core.hostinfo.StorageNode; import com.yahoo.vespa.clustercontroller.utils.staterestapi.requests.SetUnitStateRequest; @@ -110,41 +111,46 @@ public class NodeStateChangeChecker { && Objects.equals(newWantedState.getDescription(), oldWantedState.getDescription()); } - private static Optional<Result> checkZeroEntriesStoredOnContentNode(HostInfo hostInfo, int nodeIndex) { - var entriesMetric = hostInfo.getMetrics().getValueAt(ENTRIES_METRIC_NAME, DEFAULT_SPACE_METRIC_DIMENSIONS); - if (entriesMetric.isEmpty() || entriesMetric.get().getLast() == null) { + private record NodeDataMetrics(Optional<Metrics.Value> buckets, + Optional<Metrics.Value> entries, + Optional<Metrics.Value> docs) {} + + private static NodeDataMetrics dataMetricsFromHostInfo(HostInfo hostInfo) { + return new NodeDataMetrics( + hostInfo.getMetrics().getValueAt(BUCKETS_METRIC_NAME, DEFAULT_SPACE_METRIC_DIMENSIONS), + hostInfo.getMetrics().getValueAt(ENTRIES_METRIC_NAME, DEFAULT_SPACE_METRIC_DIMENSIONS), + hostInfo.getMetrics().getValueAt(DOCS_METRIC_NAME, DEFAULT_SPACE_METRIC_DIMENSIONS)); + } + + private static Optional<Result> checkZeroEntriesStoredOnContentNode(NodeDataMetrics metrics, int nodeIndex) { + if (metrics.entries.isEmpty() || metrics.entries.get().getLast() == null) { // To allow for rolling upgrades in clusters with content node versions that do not report // an entry count, defer to legacy bucket count check if the entry metric can't be found. return Optional.empty(); } - long lastEntries = entriesMetric.get().getLast(); - if (lastEntries > 0) { - return Optional.of(disallow("The storage node stores %d document entries".formatted(lastEntries))); - } - // At this point we believe we have zero entries. Cross-check with visible doc count; it should - // always be present when an entry count of zero is present and transitively always be zero. - var docsMetric = hostInfo.getMetrics().getValueAt(DOCS_METRIC_NAME, DEFAULT_SPACE_METRIC_DIMENSIONS); - if (docsMetric.isEmpty() || docsMetric.get().getLast() == null) { + if (metrics.docs.isEmpty() || metrics.docs.get().getLast() == null) { log.log(Level.WARNING, "Host info inconsistency: storage node %d reports entry count but not document count".formatted(nodeIndex)); return Optional.of(disallow("The storage node host info reports stored entry count, but not document count")); } - long lastDocs = docsMetric.get().getLast(); - if (lastDocs > 0) { + long lastEntries = metrics.entries.get().getLast(); + long lastDocs = metrics.docs.get().getLast(); + if (lastEntries != 0) { + long buckets = metrics.buckets.orElseThrow().getLast(); + long tombstones = lastEntries - lastDocs; // docs are a subset of entries, so |docs| <= |entries| + return Optional.of(disallow("The storage node stores %d documents and %d tombstones across %d buckets".formatted(lastDocs, tombstones, buckets))); + } + // At this point we believe we have zero entries. Cross-check with visible doc count; it should + // always be present when an entry count of zero is present and transitively always be zero. + if (lastDocs != 0) { log.log(Level.WARNING, "Host info inconsistency: storage node %d reports 0 entries, but %d documents".formatted(nodeIndex, lastDocs)); return Optional.of(disallow("The storage node reports 0 entries, but %d documents".formatted(lastDocs))); } return Optional.of(allow()); } - private static Result checkLegacyZeroBucketsStoredOnContentNode(HostInfo hostInfo, int nodeIndex) { - var bucketsMetric = hostInfo.getMetrics().getValueAt(BUCKETS_METRIC_NAME, DEFAULT_SPACE_METRIC_DIMENSIONS); - if (bucketsMetric.isEmpty() || bucketsMetric.get().getLast() == null) { - return disallow("Missing last value of the " + BUCKETS_METRIC_NAME + " metric for storage node " + nodeIndex); - } - - long lastBuckets = bucketsMetric.get().getLast(); - if (lastBuckets > 0) { - return disallow("The storage node manages " + lastBuckets + " buckets"); + private static Result checkLegacyZeroBucketsStoredOnContentNode(long lastBuckets) { + if (lastBuckets != 0) { + return disallow("The storage node manages %d buckets".formatted(lastBuckets)); } return allow(); } @@ -171,14 +177,20 @@ public class NodeStateChangeChecker { " got info for storage node " + nodeIndex + " at a different version " + hostInfoNodeVersion); + var metrics = dataMetricsFromHostInfo(hostInfo); + // Bucket count metric should always be present + if (metrics.buckets.isEmpty() || metrics.buckets.get().getLast() == null) { + return disallow("Missing last value of the " + BUCKETS_METRIC_NAME + " metric for storage node " + nodeIndex); + } + // TODO should also ideally check merge pending from the distributors' perspectives. // - This goes in particular for the global space, as we only check for zero entries in // the _default_ space (as global entries are retained even for retired nodes). // - Due to global merges being prioritized above everything else, it is highly unlikely // that there will be any pending global merges, but the possibility still exists. // - Would need wiring of aggregated content cluster stats - var entriesCheckResult = checkZeroEntriesStoredOnContentNode(hostInfo, nodeIndex); - return entriesCheckResult.orElseGet(() -> checkLegacyZeroBucketsStoredOnContentNode(hostInfo, nodeIndex)); + var entriesCheckResult = checkZeroEntriesStoredOnContentNode(metrics, nodeIndex); + return entriesCheckResult.orElseGet(() -> checkLegacyZeroBucketsStoredOnContentNode(metrics.buckets.get().getLast())); } private Result canSetStateUp(NodeInfo nodeInfo, NodeState oldWantedState) { diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java index 7baa2bae03b..c1acc19ae9e 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java @@ -737,11 +737,11 @@ public class NodeStateChangeCheckerTest { // Non-zero bucket count, and no entry/doc count metrics new MetricsAndMessage(new HostInfoMetrics(1, null, null), "The storage node manages 1 buckets"), // Non-zero bucket count and non-zero entries (note that we prefer reporting the entry count over the bucket count) - new MetricsAndMessage(new HostInfoMetrics(1, 2, 1), "The storage node stores 2 document entries"), + new MetricsAndMessage(new HostInfoMetrics(1, 2, 1), "The storage node stores 1 documents and 1 tombstones across 1 buckets"), // These are cases that should not normally happen, but we test them nevertheless: // Bucket count should never be zero if the entry count is > 0 - new MetricsAndMessage(new HostInfoMetrics(0, 2, 1), "The storage node stores 2 document entries"), + new MetricsAndMessage(new HostInfoMetrics(0, 2, 1), "The storage node stores 1 documents and 1 tombstones across 0 buckets"), // Entry count should never be zero if the document count is > 0 new MetricsAndMessage(new HostInfoMetrics(0, 0, 2), "The storage node reports 0 entries, but 2 documents"), // Document count should always be present alongside entry count |