aboutsummaryrefslogtreecommitdiffstats
path: root/clustercontroller-core/src/main
diff options
context:
space:
mode:
Diffstat (limited to 'clustercontroller-core/src/main')
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/AggregatedClusterStats.java2
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStatsAggregator.java9
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java10
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/GlobalBucketSyncStatsCalculator.java45
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/MetricUpdater.java4
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java1
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Response.java6
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/ClusterStateRequest.java6
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/LegacyIndexPageRequestHandler.java55
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/statuspage/VdsClusterHtmlRenderer.java4
10 files changed, 131 insertions, 11 deletions
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/AggregatedClusterStats.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/AggregatedClusterStats.java
index 37698a3ad00..aa2a1d29ec0 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/AggregatedClusterStats.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/AggregatedClusterStats.java
@@ -10,4 +10,6 @@ public interface AggregatedClusterStats {
ContentClusterStats getStats();
+ ContentNodeStats getGlobalStats();
+
}
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStatsAggregator.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStatsAggregator.java
index f1c19bac9b6..6fb31cc1b1c 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStatsAggregator.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStatsAggregator.java
@@ -38,6 +38,9 @@ public class ClusterStatsAggregator {
// Maps the content node index to the content node stats for that node.
// This MUST be kept up-to-date with distributorToStats;
private final ContentClusterStats aggregatedStats;
+ // This is the aggregate of aggregates across content nodes, allowing a reader to
+ // get an O(1) view of all merges pending in the cluster.
+ private final ContentNodeStats globallyAggregatedNodeStats = new ContentNodeStats(-1);
ClusterStatsAggregator(Set<Integer> distributors, Set<Integer> storageNodes) {
this.distributors = distributors;
@@ -58,6 +61,10 @@ public class ClusterStatsAggregator {
return aggregatedStats;
}
+            /**
+             * Exposes the cluster-wide sum of the per-content-node statistics.
+             * The aggregator's own field is handed out directly; no copy is made.
+             */
+            @Override
+            public ContentNodeStats getGlobalStats() {
+                return globallyAggregatedNodeStats;
+            }
};
}
@@ -96,12 +103,14 @@ public class ClusterStatsAggregator {
ContentNodeStats statsToAdd = clusterStats.getNodeStats(nodeIndex);
if (statsToAdd != null) {
contentNode.add(statsToAdd);
+ globallyAggregatedNodeStats.add(statsToAdd);
}
if (prevClusterStats != null) {
ContentNodeStats statsToSubtract = prevClusterStats.getNodeStats(nodeIndex);
if (statsToSubtract != null) {
contentNode.subtract(statsToSubtract);
+ globallyAggregatedNodeStats.subtract(statsToSubtract);
}
}
}
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java
index 3e520d95d2c..3f7214c31e2 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java
@@ -542,6 +542,7 @@ public class FleetController implements NodeListener, SlobrokListener, SystemSta
didWork |= metricUpdater.forWork("processNextQueuedRemoteTask", this::processNextQueuedRemoteTask);
didWork |= metricUpdater.forWork("completeSatisfiedVersionDependentTasks", this::completeSatisfiedVersionDependentTasks);
didWork |= metricUpdater.forWork("maybePublishOldMetrics", this::maybePublishOldMetrics);
+ updateClusterSyncMetrics();
processingCycle = false;
++cycleCount;
@@ -563,6 +564,14 @@ public class FleetController implements NodeListener, SlobrokListener, SystemSta
}
}
+    /**
+     * Publishes the cluster-wide "buckets out of sync" ratio through the metric updater.
+     * The metric is left untouched when not all distributors have reported statistics,
+     * or when the calculator cannot produce a ratio from the aggregated statistics.
+     */
+    private void updateClusterSyncMetrics() {
+        var aggregated = stateVersionTracker.getAggregatedClusterStats().getAggregatedStats();
+        if (!aggregated.hasUpdatesFromAllDistributors()) {
+            return; // Incomplete view of the cluster; nothing to report this cycle
+        }
+        var maybeRatio = GlobalBucketSyncStatsCalculator.clusterBucketsOutOfSyncRatio(aggregated.getGlobalStats());
+        maybeRatio.ifPresent(metricUpdater::updateClusterBucketsOutOfSyncRatio);
+    }
+
private boolean updateMasterElectionState() {
try {
return masterElectionHandler.watchMasterElection(database, databaseContext);
@@ -689,6 +698,7 @@ public class FleetController implements NodeListener, SlobrokListener, SystemSta
context.cluster = cluster;
context.currentConsolidatedState = consolidatedClusterState();
context.publishedClusterStateBundle = stateVersionTracker.getVersionedClusterStateBundle();
+ context.aggregatedClusterStats = stateVersionTracker.getAggregatedClusterStats().getAggregatedStats();
context.masterInfo = new MasterInterface() {
@Override public boolean isMaster() { return isMaster; }
@Override public Integer getMaster() { return masterElectionHandler.getMaster(); }
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/GlobalBucketSyncStatsCalculator.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/GlobalBucketSyncStatsCalculator.java
new file mode 100644
index 00000000000..0137ea2c29e
--- /dev/null
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/GlobalBucketSyncStatsCalculator.java
@@ -0,0 +1,45 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.clustercontroller.core;
+
+import java.util.Optional;
+
+/**
+ * Stateless helper that derives cluster-wide bucket sync figures from globally
+ * aggregated content node statistics.
+ *
+ * @author vekterli
+ */
+public class GlobalBucketSyncStatsCalculator {
+
+    /**
+     * Computes the fraction of the cluster's buckets that are pending merge, i.e. out of sync.
+     * A result of 0 means all buckets are in sync, while 1 means all buckets are out of sync.
+     * Bucket counts are summed across all bucket spaces before the ratio is taken.
+     *
+     * @param globalStats Globally aggregated content node statistics for the entire cluster.
+     * @return Optional holding the ratio of pending buckets to total buckets, capped at 1,
+     *         or an empty optional if any bucket space in the statistics is not valid.
+     */
+    public static Optional<Double> clusterBucketsOutOfSyncRatio(ContentNodeStats globalStats) {
+        long bucketsInCluster = 0;
+        long bucketsAwaitingMerge = 0;
+        for (var bucketSpace : globalStats.getBucketSpaces().values()) {
+            if (!bucketSpace.valid()) {
+                return Optional.empty();
+            }
+            bucketsInCluster += bucketSpace.getBucketsTotal();
+            bucketsAwaitingMerge += bucketSpace.getBucketsPending();
+        }
+        if (bucketsInCluster <= 0) {
+            return Optional.of(0.0); // No buckets; cannot be out of sync by definition
+        }
+        // The reported pending count may exceed the total count. Example: a bucket present on
+        // a single node that should have been replicated to 9 more nodes is counted as 9 pending
+        // and 1 total, since counts are not normalized across content nodes for a given bucket.
+        // Eventually this settles as 0 pending and 10 total. Cap the numerator so the ratio
+        // never exceeds 1.
+        // TODO report node-normalized pending/total counts from distributors and use these.
+        long cappedPending = Math.min(bucketsAwaitingMerge, bucketsInCluster);
+        return Optional.of((double) cappedPending / (double) bucketsInCluster);
+    }
+
+}
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/MetricUpdater.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/MetricUpdater.java
index 419cb652671..d149d4043e4 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/MetricUpdater.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/MetricUpdater.java
@@ -93,6 +93,10 @@ public class MetricUpdater {
metricReporter.set("is-master", isMaster ? 1 : 0);
}
+    /**
+     * Reports the given ratio to the metric reporter under the
+     * "cluster-buckets-out-of-sync-ratio" metric name.
+     *
+     * @param ratio the out of sync ratio to publish
+     */
+    public void updateClusterBucketsOutOfSyncRatio(double ratio) {
+        final String metricName = "cluster-buckets-out-of-sync-ratio";
+        metricReporter.set(metricName, ratio);
+    }
+
public void addTickTime(long millis, boolean didWork) {
if (didWork) {
metricReporter.set("busy-tick-time-ms", millis);
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java
index efb161cebec..e1b774e64ff 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java
@@ -17,6 +17,7 @@ public abstract class RemoteClusterControllerTask {
public MasterInterface masterInfo;
public NodeListener nodeListener;
public SlobrokListener slobrokListener;
+ public AggregatedClusterStats aggregatedClusterStats;
}
private final Object monitor = new Object();
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Response.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Response.java
index 636d01dbfa3..7af5f93fa21 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Response.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Response.java
@@ -76,7 +76,7 @@ public class Response {
{
protected final Map<String, String> attributes = new LinkedHashMap<>();
protected final Map<String, SubUnitList> subUnits = new LinkedHashMap<>();
- protected final Map<String, Long> metrics = new LinkedHashMap<>();
+ protected final Map<String, Number> metrics = new LinkedHashMap<>();
protected final Map<String, UnitState> stateMap = new LinkedHashMap<>();
protected DistributionState publishedState = null;
@@ -94,7 +94,7 @@ public class Response {
}
@Override
- public Map<String, Long> getMetricMap() { return metrics; }
+ public Map<String, Number> getMetricMap() { return metrics; }
@Override
public Map<String, UnitState> getStatePerType() { return stateMap; }
@Override
@@ -122,7 +122,7 @@ public class Response {
list.addUnit(unit, response);
return this;
}
- public EmptyResponse<T> addMetric(String name, Long value) {
+ public EmptyResponse<T> addMetric(String name, Number value) {
metrics.put(name, value);
return this;
}
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/ClusterStateRequest.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/ClusterStateRequest.java
index 1df37637dcf..3006effecd4 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/ClusterStateRequest.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/ClusterStateRequest.java
@@ -3,6 +3,7 @@ package com.yahoo.vespa.clustercontroller.core.restapiv2.requests;
import com.yahoo.vdslib.state.NodeType;
import com.yahoo.vespa.clustercontroller.core.ClusterStateBundle;
+import com.yahoo.vespa.clustercontroller.core.GlobalBucketSyncStatsCalculator;
import com.yahoo.vespa.clustercontroller.core.RemoteClusterControllerTask;
import com.yahoo.vespa.clustercontroller.core.restapiv2.Id;
import com.yahoo.vespa.clustercontroller.core.restapiv2.Request;
@@ -36,6 +37,11 @@ public class ClusterStateRequest extends Request<Response.ClusterResponse> {
}
}
result.setPublishedState(bundleToDistributionState(context.publishedClusterStateBundle));
+ if (context.aggregatedClusterStats.hasUpdatesFromAllDistributors()) {
+ var stats = context.aggregatedClusterStats.getGlobalStats();
+ var maybeRatio = GlobalBucketSyncStatsCalculator.clusterBucketsOutOfSyncRatio(stats);
+ maybeRatio.ifPresent(r -> result.addMetric("cluster-buckets-out-of-sync-ratio", r));
+ }
return result;
}
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/LegacyIndexPageRequestHandler.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/LegacyIndexPageRequestHandler.java
index 51bda17860e..89095e268cb 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/LegacyIndexPageRequestHandler.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/LegacyIndexPageRequestHandler.java
@@ -10,6 +10,7 @@ import com.yahoo.vespa.clustercontroller.core.ClusterStateHistoryEntry;
import com.yahoo.vespa.clustercontroller.core.ContentCluster;
import com.yahoo.vespa.clustercontroller.core.EventLog;
import com.yahoo.vespa.clustercontroller.core.FleetControllerOptions;
+import com.yahoo.vespa.clustercontroller.core.GlobalBucketSyncStatsCalculator;
import com.yahoo.vespa.clustercontroller.core.LeafGroups;
import com.yahoo.vespa.clustercontroller.core.MasterElectionHandler;
import com.yahoo.vespa.clustercontroller.core.NodeInfo;
@@ -174,11 +175,8 @@ public class LegacyIndexPageRequestHandler implements StatusPageServer.RequestHa
VdsClusterHtmlRenderer.Table table = renderer.createNewClusterHtmlTable(cluster.getName(), cluster.getSlobrokGenerationCount());
ClusterStateBundle state = stateVersionTracker.getVersionedClusterStateBundle();
- if (state.clusterFeedIsBlocked()) { // Implies FeedBlock != null
- table.appendRaw("<h3 style=\"color: red\">Cluster feeding is blocked!</h3>\n");
- table.appendRaw(String.format("<p>Summary: <strong>%s</strong></p>\n",
- HtmlTable.escape(state.getFeedBlockOrNull().getDescription())));
- }
+ renderClusterFeedBlockIfPresent(state, table);
+ renderClusterOutOfSyncRatio(state, stateVersionTracker, table);
List<Group> groups = LeafGroups.enumerateFrom(cluster.getDistribution().getRootGroup());
for (Group group : groups) {
@@ -206,6 +204,53 @@ public class LegacyIndexPageRequestHandler implements StatusPageServer.RequestHa
table.addTable(sb, options.stableStateTimePeriod());
}
+    /**
+     * Appends a red "feeding is blocked" banner and the HTML-escaped feed block description
+     * to the table. Does nothing when the cluster feed is not blocked.
+     */
+    private static void renderClusterFeedBlockIfPresent(ClusterStateBundle state, VdsClusterHtmlRenderer.Table table) {
+        if (!state.clusterFeedIsBlocked()) {
+            return;
+        }
+        // A blocked feed implies FeedBlock != null, so the description can be read directly
+        table.appendRaw("<h3 style=\"color: red\">Cluster feeding is blocked!</h3>\n");
+        var escapedDescription = HtmlTable.escape(state.getFeedBlockOrNull().getDescription());
+        table.appendRaw("<p>Summary: <strong>%s</strong></p>\n".formatted(escapedDescription));
+    }
+
+    /**
+     * Appends a paragraph to the status page table describing how large a share of the
+     * cluster's buckets is currently out of sync.
+     *
+     * One of the following is rendered:
+     * an explanation that the ratio cannot be computed (not all distributors have reported,
+     * or the calculator yields no ratio), a plain "in sync" message, or the ratio as a
+     * percentage with two decimals. When at least one storage node is in maintenance mode
+     * the percentage is always shown, followed by a caveat that the true ratio may be higher.
+     *
+     * @param state bundle whose baseline cluster state is checked for maintenance nodes
+     * @param stateVersionTracker source of the aggregated cluster statistics
+     * @param table table that receives the rendered HTML
+     */
+    private static void renderClusterOutOfSyncRatio(ClusterStateBundle state, StateVersionTracker stateVersionTracker,
+                                                    VdsClusterHtmlRenderer.Table table) {
+        var stats = stateVersionTracker.getAggregatedClusterStats().getAggregatedStats();
+        if (!stats.hasUpdatesFromAllDistributors()) {
+            table.appendRaw("<p>Current cluster out of sync ratio cannot be computed, as not all " +
+                            "distributors have reported in statistics for the most recent cluster state.</p>\n");
+            return;
+        }
+        var outOfSync = GlobalBucketSyncStatsCalculator.clusterBucketsOutOfSyncRatio(stats.getGlobalStats());
+        if (outOfSync.isEmpty()) {
+            table.appendRaw("<p>Current cluster out of sync ratio cannot be computed, as not all " +
+                            "distributors have reported valid statistics.</p>\n");
+            return;
+        }
+        double ratio = outOfSync.get();
+        boolean hasMaintenance = stateHasAtLeastOneMaintenanceNode(state);
+        if (!hasMaintenance && ratio == 0.0) {
+            table.appendRaw("<p>Cluster is currently in sync.</p>\n");
+            return;
+        }
+        // Use a fixed locale so the decimal separator is always '.', regardless of the
+        // default locale of the JVM serving the status page.
+        table.appendRaw(String.format(java.util.Locale.ROOT,
+                                      "<p>Cluster is currently <strong>%.2f%% out of sync</strong>.</p>\n",
+                                      ratio * 100.0));
+        if (hasMaintenance) {
+            // It is intentional that a cluster with no pending buckets, but with nodes in maintenance
+            // mode, emits "0% out of sync" with a caveat instead of "in sync", as we cannot know the
+            // latter for sure.
+            table.appendRaw("<p><strong>Note:</strong> since one or more nodes are currently in " +
+                            "Maintenance mode, the true out of sync ratio may be higher.</p>\n");
+        }
+    }
+
+    /**
+     * Scans storage node indices [0, node count) of the bundle's baseline cluster state and
+     * tells whether any of them has a state matching the code "m" (maintenance, going by
+     * this method's name).
+     *
+     * @return true as soon as one matching storage node is found, false if none match
+     */
+    private static boolean stateHasAtLeastOneMaintenanceNode(ClusterStateBundle state) {
+        var baselineState = state.getBaselineClusterState();
+        final int storageNodeCount = baselineState.getNodeCount(NodeType.STORAGE);
+        int index = 0;
+        while (index < storageNodeCount) {
+            var nodeState = baselineState.getNodeState(Node.ofStorage(index));
+            if (nodeState.getState().oneOf("m")) {
+                return true;
+            }
+            ++index;
+        }
+        return false;
+    }
+
private void storeNodeInfo(ContentCluster cluster, int nodeIndex, NodeType nodeType, Map<Integer, NodeInfo> nodeInfoByIndex) {
NodeInfo nodeInfo = cluster.getNodeInfo(new Node(nodeType, nodeIndex));
if (nodeInfo == null) return;
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/statuspage/VdsClusterHtmlRenderer.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/statuspage/VdsClusterHtmlRenderer.java
index 95f648447f4..0053c02c269 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/statuspage/VdsClusterHtmlRenderer.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/statuspage/VdsClusterHtmlRenderer.java
@@ -308,9 +308,7 @@ public class VdsClusterHtmlRenderer {
int nodeEvents = eventLog.getNodeEventsSince(nodeInfo.getNode(),
currentTime - eventLog.getRecentTimePeriod());
row.addCell(new HtmlTable.Cell("" + nodeEvents));
- if (nodeEvents > 20) {
- row.getLastCell().addProperties(ERROR_PROPERTY);
- } else if (nodeEvents > 3) {
+ if (nodeEvents > 3) {
row.getLastCell().addProperties(WARNING_PROPERTY);
}
}