about summary refs log tree commit diff stats
path: root/clustercontroller-core/src/main/java/com
diff options
context:
space:
mode:
Diffstat (limited to 'clustercontroller-core/src/main/java/com')
-rw-r--r-- clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/AggregatedClusterStats.java 2
-rw-r--r-- clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStatsAggregator.java 9
-rw-r--r-- clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java 34
-rw-r--r-- clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/GlobalBucketSyncStatsCalculator.java 45
-rw-r--r-- clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/MetricUpdater.java 33
-rw-r--r-- clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RealTimer.java 6
-rw-r--r-- clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java 1
-rw-r--r-- clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateBroadcaster.java 7
-rw-r--r-- clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/Timer.java 8
-rw-r--r-- clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Response.java 6
-rw-r--r-- clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/ClusterStateRequest.java 6
-rw-r--r-- clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/LegacyIndexPageRequestHandler.java 55
-rw-r--r-- clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/statuspage/VdsClusterHtmlRenderer.java 4
13 files changed, 191 insertions, 25 deletions
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/AggregatedClusterStats.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/AggregatedClusterStats.java
index 37698a3ad00..aa2a1d29ec0 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/AggregatedClusterStats.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/AggregatedClusterStats.java
@@ -10,4 +10,6 @@ public interface AggregatedClusterStats {
ContentClusterStats getStats();
+ ContentNodeStats getGlobalStats();
+
}
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStatsAggregator.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStatsAggregator.java
index f1c19bac9b6..6fb31cc1b1c 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStatsAggregator.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStatsAggregator.java
@@ -38,6 +38,9 @@ public class ClusterStatsAggregator {
// Maps the content node index to the content node stats for that node.
// This MUST be kept up-to-date with distributorToStats;
private final ContentClusterStats aggregatedStats;
+ // This is the aggregate of aggregates across content nodes, allowing a reader to
+ // get an O(1) view of all merges pending in the cluster.
+ private final ContentNodeStats globallyAggregatedNodeStats = new ContentNodeStats(-1);
ClusterStatsAggregator(Set<Integer> distributors, Set<Integer> storageNodes) {
this.distributors = distributors;
@@ -58,6 +61,10 @@ public class ClusterStatsAggregator {
return aggregatedStats;
}
+ @Override
+ public ContentNodeStats getGlobalStats() {
+ return globallyAggregatedNodeStats;
+ }
};
}
@@ -96,12 +103,14 @@ public class ClusterStatsAggregator {
ContentNodeStats statsToAdd = clusterStats.getNodeStats(nodeIndex);
if (statsToAdd != null) {
contentNode.add(statsToAdd);
+ globallyAggregatedNodeStats.add(statsToAdd);
}
if (prevClusterStats != null) {
ContentNodeStats statsToSubtract = prevClusterStats.getNodeStats(nodeIndex);
if (statsToSubtract != null) {
contentNode.subtract(statsToSubtract);
+ globallyAggregatedNodeStats.subtract(statsToSubtract);
}
}
}
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java
index 3e520d95d2c..5cffa4957c6 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java
@@ -139,7 +139,7 @@ public class FleetController implements NodeListener, SlobrokListener, SystemSta
public static FleetController create(FleetControllerOptions options, MetricReporter metricReporter) throws Exception {
var context = new FleetControllerContextImpl(options);
var timer = new RealTimer();
- var metricUpdater = new MetricUpdater(metricReporter, options.fleetControllerIndex(), options.clusterName());
+ var metricUpdater = new MetricUpdater(metricReporter, timer, options.fleetControllerIndex(), options.clusterName());
var log = new EventLog(timer, metricUpdater);
var cluster = new ContentCluster(options);
var stateGatherer = new NodeStateGatherer(timer, timer, log);
@@ -348,7 +348,8 @@ public class FleetController implements NodeListener, SlobrokListener, SystemSta
ClusterState baselineState = stateBundle.getBaselineClusterState();
newStates.add(stateBundle);
metricUpdater.updateClusterStateMetrics(cluster, baselineState,
- ResourceUsageStats.calculateFrom(cluster.getNodeInfos(), options.clusterFeedBlockLimit(), stateBundle.getFeedBlock()));
+ ResourceUsageStats.calculateFrom(cluster.getNodeInfos(), options.clusterFeedBlockLimit(), stateBundle.getFeedBlock()),
+ systemStateBroadcaster.getLastStateBroadcastTimePoint());
lastMetricUpdateCycleCount = cycleCount;
systemStateBroadcaster.handleNewClusterStates(stateBundle);
// Iff master, always store new version in ZooKeeper _before_ publishing to any
@@ -365,12 +366,20 @@ public class FleetController implements NodeListener, SlobrokListener, SystemSta
private boolean maybePublishOldMetrics() {
verifyInControllerThread();
- if (isMaster() && cycleCount > 300 + lastMetricUpdateCycleCount) {
- ClusterStateBundle stateBundle = stateVersionTracker.getVersionedClusterStateBundle();
- ClusterState baselineState = stateBundle.getBaselineClusterState();
- metricUpdater.updateClusterStateMetrics(cluster, baselineState,
- ResourceUsageStats.calculateFrom(cluster.getNodeInfos(), options.clusterFeedBlockLimit(), stateBundle.getFeedBlock()));
- lastMetricUpdateCycleCount = cycleCount;
+ if (cycleCount > 300 + lastMetricUpdateCycleCount) {
+ if (isMaster()) {
+ updateMasterClusterSyncMetrics();
+ ClusterStateBundle stateBundle = stateVersionTracker.getVersionedClusterStateBundle();
+ ClusterState baselineState = stateBundle.getBaselineClusterState();
+ metricUpdater.updateClusterStateMetrics(cluster, baselineState,
+ ResourceUsageStats.calculateFrom(cluster.getNodeInfos(), options.clusterFeedBlockLimit(), stateBundle.getFeedBlock()),
+ systemStateBroadcaster.getLastStateBroadcastTimePoint());
+ lastMetricUpdateCycleCount = cycleCount;
+ } else {
+ // If we're not the master we don't have any authoritative information about
+ // how out of sync the cluster nodes are, so reset the metric.
+ metricUpdater.updateClusterBucketsOutOfSyncRatio(0);
+ }
return true;
} else {
return false;
@@ -563,6 +572,14 @@ public class FleetController implements NodeListener, SlobrokListener, SystemSta
}
}
+ private void updateMasterClusterSyncMetrics() {
+ var stats = stateVersionTracker.getAggregatedClusterStats().getAggregatedStats();
+ if (stats.hasUpdatesFromAllDistributors()) {
+ GlobalBucketSyncStatsCalculator.clusterBucketsOutOfSyncRatio(stats.getGlobalStats())
+ .ifPresent(metricUpdater::updateClusterBucketsOutOfSyncRatio);
+ }
+ }
+
private boolean updateMasterElectionState() {
try {
return masterElectionHandler.watchMasterElection(database, databaseContext);
@@ -689,6 +706,7 @@ public class FleetController implements NodeListener, SlobrokListener, SystemSta
context.cluster = cluster;
context.currentConsolidatedState = consolidatedClusterState();
context.publishedClusterStateBundle = stateVersionTracker.getVersionedClusterStateBundle();
+ context.aggregatedClusterStats = stateVersionTracker.getAggregatedClusterStats().getAggregatedStats();
context.masterInfo = new MasterInterface() {
@Override public boolean isMaster() { return isMaster; }
@Override public Integer getMaster() { return masterElectionHandler.getMaster(); }
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/GlobalBucketSyncStatsCalculator.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/GlobalBucketSyncStatsCalculator.java
new file mode 100644
index 00000000000..0137ea2c29e
--- /dev/null
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/GlobalBucketSyncStatsCalculator.java
@@ -0,0 +1,45 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.clustercontroller.core;
+
+import java.util.Optional;
+
+/**
+ * @author vekterli
+ */
+public class GlobalBucketSyncStatsCalculator {
+
+ /**
+ * Compute a value in [0, 1] representing how much of the cluster's data space is currently
+ * out of sync, i.e. pending merging. In other words, if the value is 1 all buckets are out
+ * of sync, and conversely if it's 0 all buckets are in sync. This number applies across bucket
+ * spaces.
+ *
+ * @param globalStats Globally aggregated content node statistics for the entire cluster.
+ * @return Optional containing a value [0, 1] representing the ratio of buckets pending merge
+ * in relation to the total number of buckets in the cluster, or an empty optional if
+ * the underlying global statistics contain invalid/incomplete information.
+ */
+ public static Optional<Double> clusterBucketsOutOfSyncRatio(ContentNodeStats globalStats) {
+ long totalBuckets = 0;
+ long pendingBuckets = 0;
+ for (var space : globalStats.getBucketSpaces().values()) {
+ if (!space.valid()) {
+ return Optional.empty();
+ }
+ totalBuckets += space.getBucketsTotal();
+ pendingBuckets += space.getBucketsPending();
+ }
+ // It's currently possible for the reported number of pending buckets to be greater than
+ // the number of total buckets. Example: this can happen if a bucket is present on a single
+ // node, but should have been replicated to 9 more nodes. Since counts are not normalized
+ // across content nodes for a given bucket, this will be counted as 9 pending and 1 total.
+ // Eventually this will settle as 0 pending and 10 total.
+ // TODO report node-normalized pending/total counts from distributors and use these.
+ pendingBuckets = Math.min(pendingBuckets, totalBuckets);
+ if (totalBuckets <= 0) {
+ return Optional.of(0.0); // No buckets; cannot be out of sync by definition
+ }
+ return Optional.of((double)pendingBuckets / (double)totalBuckets);
+ }
+
+}
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/MetricUpdater.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/MetricUpdater.java
index 419cb652671..d72ede7199e 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/MetricUpdater.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/MetricUpdater.java
@@ -10,6 +10,7 @@ import com.yahoo.vespa.clustercontroller.utils.util.ComponentMetricReporter;
import com.yahoo.vespa.clustercontroller.utils.util.MetricReporter;
import java.time.Duration;
+import java.time.Instant;
import java.util.HashMap;
import java.util.Map;
import java.util.SortedSet;
@@ -19,17 +20,27 @@ import java.util.function.BooleanSupplier;
public class MetricUpdater {
private final ComponentMetricReporter metricReporter;
+ private final Timer timer;
+ // Publishing and converging on a cluster state version is neither instant nor atomic, but
+ // it usually completes within a few seconds. If convergence does not happen for more than
+ // 30 seconds, it's a sign something has stalled.
+ private Duration stateVersionConvergenceGracePeriod = Duration.ofSeconds(30);
- public MetricUpdater(MetricReporter metricReporter, int controllerIndex, String clusterName) {
+ public MetricUpdater(MetricReporter metricReporter, Timer timer, int controllerIndex, String clusterName) {
this.metricReporter = new ComponentMetricReporter(metricReporter, "cluster-controller.");
this.metricReporter.addDimension("controller-index", String.valueOf(controllerIndex));
this.metricReporter.addDimension("clusterid", clusterName);
+ this.timer = timer;
}
public MetricReporter.Context createContext(Map<String, String> dimensions) {
return metricReporter.createContext(dimensions);
}
+ public void setStateVersionConvergenceGracePeriod(Duration gracePeriod) {
+ stateVersionConvergenceGracePeriod = gracePeriod;
+ }
+
private static int nodesInAvailableState(Map<State, Integer> nodeCounts) {
return nodeCounts.getOrDefault(State.INITIALIZING, 0)
+ nodeCounts.getOrDefault(State.RETIRED, 0)
@@ -39,10 +50,13 @@ public class MetricUpdater {
+ nodeCounts.getOrDefault(State.MAINTENANCE, 0);
}
- public void updateClusterStateMetrics(ContentCluster cluster, ClusterState state, ResourceUsageStats resourceUsage) {
+ public void updateClusterStateMetrics(ContentCluster cluster, ClusterState state,
+ ResourceUsageStats resourceUsage, Instant lastStateBroadcastTimePoint) {
Map<String, String> dimensions = new HashMap<>();
dimensions.put("cluster", cluster.getName());
dimensions.put("clusterid", cluster.getName());
+ Instant now = timer.getCurrentWallClockTime();
+ boolean convergenceDeadlinePassed = lastStateBroadcastTimePoint.plus(stateVersionConvergenceGracePeriod).isBefore(now);
for (NodeType type : NodeType.getTypes()) {
dimensions.put("node-type", type.toString().toLowerCase());
MetricReporter.Context context = createContext(dimensions);
@@ -50,10 +64,18 @@ public class MetricUpdater {
for (State s : State.values()) {
nodeCounts.put(s, 0);
}
+ int nodesNotConverged = 0;
for (Integer i : cluster.getConfiguredNodes().keySet()) {
- NodeState s = state.getNodeState(new Node(type, i));
+ var node = new Node(type, i);
+ NodeState s = state.getNodeState(node);
Integer count = nodeCounts.get(s.getState());
nodeCounts.put(s.getState(), count + 1);
+ var info = cluster.getNodeInfo(node);
+ if (info != null && convergenceDeadlinePassed && s.getState().oneOf("uir")) {
+ if (info.getClusterStateVersionBundleAcknowledged() != state.getVersion()) {
+ nodesNotConverged++;
+ }
+ }
}
for (State s : State.values()) {
String name = s.toString().toLowerCase() + ".count";
@@ -63,6 +85,7 @@ public class MetricUpdater {
final int availableNodes = nodesInAvailableState(nodeCounts);
final int totalNodes = Math.max(cluster.getConfiguredNodes().size(), 1); // Assumes 1-1 between distributor and storage
metricReporter.set("available-nodes.ratio", (double)availableNodes / totalNodes, context);
+ metricReporter.set("nodes-not-converged", nodesNotConverged, context);
}
dimensions.remove("node-type");
MetricReporter.Context context = createContext(dimensions);
@@ -93,6 +116,10 @@ public class MetricUpdater {
metricReporter.set("is-master", isMaster ? 1 : 0);
}
+ public void updateClusterBucketsOutOfSyncRatio(double ratio) {
+ metricReporter.set("cluster-buckets-out-of-sync-ratio", ratio);
+ }
+
public void addTickTime(long millis, boolean didWork) {
if (didWork) {
metricReporter.set("busy-tick-time-ms", millis);
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RealTimer.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RealTimer.java
index b4563c09b66..482b40381df 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RealTimer.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RealTimer.java
@@ -1,6 +1,7 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.clustercontroller.core;
+import java.time.Instant;
import java.util.Calendar;
import java.util.Locale;
import java.util.TimeZone;
@@ -10,8 +11,9 @@ import java.util.TimeZone;
*/
public class RealTimer implements Timer {
- public long getCurrentTimeInMillis() {
- return System.currentTimeMillis();
+ @Override
+ public Instant getCurrentWallClockTime() {
+ return Instant.now();
}
public static String printDuration(long time) {
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java
index efb161cebec..e1b774e64ff 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java
@@ -17,6 +17,7 @@ public abstract class RemoteClusterControllerTask {
public MasterInterface masterInfo;
public NodeListener nodeListener;
public SlobrokListener slobrokListener;
+ public AggregatedClusterStats aggregatedClusterStats;
}
private final Object monitor = new Object();
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateBroadcaster.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateBroadcaster.java
index c74a846fe30..0337e187b8e 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateBroadcaster.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateBroadcaster.java
@@ -8,6 +8,7 @@ import com.yahoo.vdslib.state.NodeState;
import com.yahoo.vdslib.state.State;
import com.yahoo.vespa.clustercontroller.core.database.DatabaseHandler;
+import java.time.Instant;
import java.util.logging.Level;
import java.util.LinkedList;
import java.util.List;
@@ -29,6 +30,7 @@ public class SystemStateBroadcaster {
private final static long minTimeBetweenNodeErrorLogging = 10 * 60 * 1000;
private final Map<Node, Long> lastErrorReported = new TreeMap<>();
+ private Instant lastStateBroadcastTimePoint = Instant.EPOCH;
private int lastOfficialStateVersion = -1;
private int lastStateVersionBundleAcked = 0;
private int lastClusterStateVersionConverged = 0;
@@ -45,6 +47,7 @@ public class SystemStateBroadcaster {
public void handleNewClusterStates(ClusterStateBundle state) {
clusterStateBundle = state;
+ lastStateBroadcastTimePoint = Instant.ofEpochMilli(timer.getCurrentTimeInMillis());
}
public ClusterState getClusterState() {
@@ -67,6 +70,10 @@ public class SystemStateBroadcaster {
return lastClusterStateBundleConverged;
}
+ public Instant getLastStateBroadcastTimePoint() {
+ return lastStateBroadcastTimePoint;
+ }
+
private void reportNodeError(boolean nodeOk, NodeInfo info, String message) {
long time = timer.getCurrentTimeInMillis();
Long lastReported = lastErrorReported.get(info.getNode());
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/Timer.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/Timer.java
index 6c7da15b1a5..eaa60b3d675 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/Timer.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/Timer.java
@@ -1,12 +1,18 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.clustercontroller.core;
+import java.time.Instant;
+
/**
* Interface used to get time. This is separated into its own class, such that unit tests can fake timing to do timing related
* tests without relying on the speed of the unit test processing.
*/
public interface Timer {
- long getCurrentTimeInMillis();
+ Instant getCurrentWallClockTime();
+
+ default long getCurrentTimeInMillis() {
+ return getCurrentWallClockTime().toEpochMilli();
+ }
}
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Response.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Response.java
index 636d01dbfa3..7af5f93fa21 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Response.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Response.java
@@ -76,7 +76,7 @@ public class Response {
{
protected final Map<String, String> attributes = new LinkedHashMap<>();
protected final Map<String, SubUnitList> subUnits = new LinkedHashMap<>();
- protected final Map<String, Long> metrics = new LinkedHashMap<>();
+ protected final Map<String, Number> metrics = new LinkedHashMap<>();
protected final Map<String, UnitState> stateMap = new LinkedHashMap<>();
protected DistributionState publishedState = null;
@@ -94,7 +94,7 @@ public class Response {
}
@Override
- public Map<String, Long> getMetricMap() { return metrics; }
+ public Map<String, Number> getMetricMap() { return metrics; }
@Override
public Map<String, UnitState> getStatePerType() { return stateMap; }
@Override
@@ -122,7 +122,7 @@ public class Response {
list.addUnit(unit, response);
return this;
}
- public EmptyResponse<T> addMetric(String name, Long value) {
+ public EmptyResponse<T> addMetric(String name, Number value) {
metrics.put(name, value);
return this;
}
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/ClusterStateRequest.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/ClusterStateRequest.java
index 1df37637dcf..3006effecd4 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/ClusterStateRequest.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/ClusterStateRequest.java
@@ -3,6 +3,7 @@ package com.yahoo.vespa.clustercontroller.core.restapiv2.requests;
import com.yahoo.vdslib.state.NodeType;
import com.yahoo.vespa.clustercontroller.core.ClusterStateBundle;
+import com.yahoo.vespa.clustercontroller.core.GlobalBucketSyncStatsCalculator;
import com.yahoo.vespa.clustercontroller.core.RemoteClusterControllerTask;
import com.yahoo.vespa.clustercontroller.core.restapiv2.Id;
import com.yahoo.vespa.clustercontroller.core.restapiv2.Request;
@@ -36,6 +37,11 @@ public class ClusterStateRequest extends Request<Response.ClusterResponse> {
}
}
result.setPublishedState(bundleToDistributionState(context.publishedClusterStateBundle));
+ if (context.aggregatedClusterStats.hasUpdatesFromAllDistributors()) {
+ var stats = context.aggregatedClusterStats.getGlobalStats();
+ var maybeRatio = GlobalBucketSyncStatsCalculator.clusterBucketsOutOfSyncRatio(stats);
+ maybeRatio.ifPresent(r -> result.addMetric("cluster-buckets-out-of-sync-ratio", r));
+ }
return result;
}
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/LegacyIndexPageRequestHandler.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/LegacyIndexPageRequestHandler.java
index 51bda17860e..89095e268cb 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/LegacyIndexPageRequestHandler.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/LegacyIndexPageRequestHandler.java
@@ -10,6 +10,7 @@ import com.yahoo.vespa.clustercontroller.core.ClusterStateHistoryEntry;
import com.yahoo.vespa.clustercontroller.core.ContentCluster;
import com.yahoo.vespa.clustercontroller.core.EventLog;
import com.yahoo.vespa.clustercontroller.core.FleetControllerOptions;
+import com.yahoo.vespa.clustercontroller.core.GlobalBucketSyncStatsCalculator;
import com.yahoo.vespa.clustercontroller.core.LeafGroups;
import com.yahoo.vespa.clustercontroller.core.MasterElectionHandler;
import com.yahoo.vespa.clustercontroller.core.NodeInfo;
@@ -174,11 +175,8 @@ public class LegacyIndexPageRequestHandler implements StatusPageServer.RequestHa
VdsClusterHtmlRenderer.Table table = renderer.createNewClusterHtmlTable(cluster.getName(), cluster.getSlobrokGenerationCount());
ClusterStateBundle state = stateVersionTracker.getVersionedClusterStateBundle();
- if (state.clusterFeedIsBlocked()) { // Implies FeedBlock != null
- table.appendRaw("<h3 style=\"color: red\">Cluster feeding is blocked!</h3>\n");
- table.appendRaw(String.format("<p>Summary: <strong>%s</strong></p>\n",
- HtmlTable.escape(state.getFeedBlockOrNull().getDescription())));
- }
+ renderClusterFeedBlockIfPresent(state, table);
+ renderClusterOutOfSyncRatio(state, stateVersionTracker, table);
List<Group> groups = LeafGroups.enumerateFrom(cluster.getDistribution().getRootGroup());
for (Group group : groups) {
@@ -206,6 +204,53 @@ public class LegacyIndexPageRequestHandler implements StatusPageServer.RequestHa
table.addTable(sb, options.stableStateTimePeriod());
}
+ private static void renderClusterFeedBlockIfPresent(ClusterStateBundle state, VdsClusterHtmlRenderer.Table table) {
+ if (state.clusterFeedIsBlocked()) { // Implies FeedBlock != null
+ table.appendRaw("<h3 style=\"color: red\">Cluster feeding is blocked!</h3>\n");
+ table.appendRaw(String.format("<p>Summary: <strong>%s</strong></p>\n",
+ HtmlTable.escape(state.getFeedBlockOrNull().getDescription())));
+ }
+ }
+
+ private static void renderClusterOutOfSyncRatio(ClusterStateBundle state, StateVersionTracker stateVersionTracker,
+ VdsClusterHtmlRenderer.Table table) {
+ var stats = stateVersionTracker.getAggregatedClusterStats().getAggregatedStats();
+ if (!stats.hasUpdatesFromAllDistributors()) {
+ table.appendRaw("<p>Current cluster out of sync ratio cannot be computed, as not all " +
+ "distributors have reported in statistics for the most recent cluster state.</p>\n");
+ return;
+ }
+ var outOfSync = GlobalBucketSyncStatsCalculator.clusterBucketsOutOfSyncRatio(stats.getGlobalStats());
+ if (outOfSync.isEmpty()) {
+ table.appendRaw("<p>Current cluster out of sync ratio cannot be computed, as not all " +
+ "distributors have reported valid statistics.</p>\n");
+ return;
+ }
+ boolean hasMaintenance = stateHasAtLeastOneMaintenanceNode(state);
+ if (!hasMaintenance && outOfSync.get() == 0.0) {
+ table.appendRaw("<p>Cluster is currently in sync.</p>\n");
+ } else {
+ table.appendRaw("<p>Cluster is currently <strong>%.2f%% out of sync</strong>.</p>\n".formatted(outOfSync.get() * 100.0));
+ if (hasMaintenance) {
+ // It is intentional that a cluster with no pending buckets but with nodes in maintenance mode
+ // emits "0% out of sync" with a caveat rather than "in sync", as we don't know the latter for sure.
+ table.appendRaw("<p><strong>Note:</strong> since one or more nodes are currently in " +
+ "Maintenance mode, the true out of sync ratio may be higher.</p>\n");
+ }
+ }
+ }
+
+ private static boolean stateHasAtLeastOneMaintenanceNode(ClusterStateBundle state) {
+ var baseline = state.getBaselineClusterState();
+ int nodes = baseline.getNodeCount(NodeType.STORAGE);
+ for (int i = 0; i < nodes; ++i) {
+ if (baseline.getNodeState(Node.ofStorage(i)).getState().oneOf("m")) {
+ return true;
+ }
+ }
+ return false;
+ }
+
private void storeNodeInfo(ContentCluster cluster, int nodeIndex, NodeType nodeType, Map<Integer, NodeInfo> nodeInfoByIndex) {
NodeInfo nodeInfo = cluster.getNodeInfo(new Node(nodeType, nodeIndex));
if (nodeInfo == null) return;
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/statuspage/VdsClusterHtmlRenderer.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/statuspage/VdsClusterHtmlRenderer.java
index 95f648447f4..0053c02c269 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/statuspage/VdsClusterHtmlRenderer.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/statuspage/VdsClusterHtmlRenderer.java
@@ -308,9 +308,7 @@ public class VdsClusterHtmlRenderer {
int nodeEvents = eventLog.getNodeEventsSince(nodeInfo.getNode(),
currentTime - eventLog.getRecentTimePeriod());
row.addCell(new HtmlTable.Cell("" + nodeEvents));
- if (nodeEvents > 20) {
- row.getLastCell().addProperties(ERROR_PROPERTY);
- } else if (nodeEvents > 3) {
+ if (nodeEvents > 3) {
row.getLastCell().addProperties(WARNING_PROPERTY);
}
}