diff options
Diffstat (limited to 'clustercontroller-core/src/main/java/com')
13 files changed, 191 insertions, 25 deletions
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/AggregatedClusterStats.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/AggregatedClusterStats.java index 37698a3ad00..aa2a1d29ec0 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/AggregatedClusterStats.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/AggregatedClusterStats.java @@ -10,4 +10,6 @@ public interface AggregatedClusterStats { ContentClusterStats getStats(); + ContentNodeStats getGlobalStats(); + } diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStatsAggregator.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStatsAggregator.java index f1c19bac9b6..6fb31cc1b1c 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStatsAggregator.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStatsAggregator.java @@ -38,6 +38,9 @@ public class ClusterStatsAggregator { // Maps the content node index to the content node stats for that node. // This MUST be kept up-to-date with distributorToStats; private final ContentClusterStats aggregatedStats; + // This is the aggregate of aggregates across content nodes, allowing a reader to + // get an O(1) view of all merges pending in the cluster. 
+ private final ContentNodeStats globallyAggregatedNodeStats = new ContentNodeStats(-1); ClusterStatsAggregator(Set<Integer> distributors, Set<Integer> storageNodes) { this.distributors = distributors; @@ -58,6 +61,10 @@ public class ClusterStatsAggregator { return aggregatedStats; } + @Override + public ContentNodeStats getGlobalStats() { + return globallyAggregatedNodeStats; + } }; } @@ -96,12 +103,14 @@ public class ClusterStatsAggregator { ContentNodeStats statsToAdd = clusterStats.getNodeStats(nodeIndex); if (statsToAdd != null) { contentNode.add(statsToAdd); + globallyAggregatedNodeStats.add(statsToAdd); } if (prevClusterStats != null) { ContentNodeStats statsToSubtract = prevClusterStats.getNodeStats(nodeIndex); if (statsToSubtract != null) { contentNode.subtract(statsToSubtract); + globallyAggregatedNodeStats.subtract(statsToSubtract); } } } diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java index 3e520d95d2c..5cffa4957c6 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java @@ -139,7 +139,7 @@ public class FleetController implements NodeListener, SlobrokListener, SystemSta public static FleetController create(FleetControllerOptions options, MetricReporter metricReporter) throws Exception { var context = new FleetControllerContextImpl(options); var timer = new RealTimer(); - var metricUpdater = new MetricUpdater(metricReporter, options.fleetControllerIndex(), options.clusterName()); + var metricUpdater = new MetricUpdater(metricReporter, timer, options.fleetControllerIndex(), options.clusterName()); var log = new EventLog(timer, metricUpdater); var cluster = new ContentCluster(options); var stateGatherer = new NodeStateGatherer(timer, timer, 
log); @@ -348,7 +348,8 @@ public class FleetController implements NodeListener, SlobrokListener, SystemSta ClusterState baselineState = stateBundle.getBaselineClusterState(); newStates.add(stateBundle); metricUpdater.updateClusterStateMetrics(cluster, baselineState, - ResourceUsageStats.calculateFrom(cluster.getNodeInfos(), options.clusterFeedBlockLimit(), stateBundle.getFeedBlock())); + ResourceUsageStats.calculateFrom(cluster.getNodeInfos(), options.clusterFeedBlockLimit(), stateBundle.getFeedBlock()), + systemStateBroadcaster.getLastStateBroadcastTimePoint()); lastMetricUpdateCycleCount = cycleCount; systemStateBroadcaster.handleNewClusterStates(stateBundle); // Iff master, always store new version in ZooKeeper _before_ publishing to any @@ -365,12 +366,20 @@ public class FleetController implements NodeListener, SlobrokListener, SystemSta private boolean maybePublishOldMetrics() { verifyInControllerThread(); - if (isMaster() && cycleCount > 300 + lastMetricUpdateCycleCount) { - ClusterStateBundle stateBundle = stateVersionTracker.getVersionedClusterStateBundle(); - ClusterState baselineState = stateBundle.getBaselineClusterState(); - metricUpdater.updateClusterStateMetrics(cluster, baselineState, - ResourceUsageStats.calculateFrom(cluster.getNodeInfos(), options.clusterFeedBlockLimit(), stateBundle.getFeedBlock())); - lastMetricUpdateCycleCount = cycleCount; + if (cycleCount > 300 + lastMetricUpdateCycleCount) { + if (isMaster()) { + updateMasterClusterSyncMetrics(); + ClusterStateBundle stateBundle = stateVersionTracker.getVersionedClusterStateBundle(); + ClusterState baselineState = stateBundle.getBaselineClusterState(); + metricUpdater.updateClusterStateMetrics(cluster, baselineState, + ResourceUsageStats.calculateFrom(cluster.getNodeInfos(), options.clusterFeedBlockLimit(), stateBundle.getFeedBlock()), + systemStateBroadcaster.getLastStateBroadcastTimePoint()); + lastMetricUpdateCycleCount = cycleCount; + } else { + // If we're not the master we don't have 
any authoritative information about + // how out of sync the cluster nodes are, so reset the metric. + metricUpdater.updateClusterBucketsOutOfSyncRatio(0); + } return true; } else { return false; @@ -563,6 +572,14 @@ public class FleetController implements NodeListener, SlobrokListener, SystemSta } } + private void updateMasterClusterSyncMetrics() { + var stats = stateVersionTracker.getAggregatedClusterStats().getAggregatedStats(); + if (stats.hasUpdatesFromAllDistributors()) { + GlobalBucketSyncStatsCalculator.clusterBucketsOutOfSyncRatio(stats.getGlobalStats()) + .ifPresent(metricUpdater::updateClusterBucketsOutOfSyncRatio); + } + } + private boolean updateMasterElectionState() { try { return masterElectionHandler.watchMasterElection(database, databaseContext); @@ -689,6 +706,7 @@ public class FleetController implements NodeListener, SlobrokListener, SystemSta context.cluster = cluster; context.currentConsolidatedState = consolidatedClusterState(); context.publishedClusterStateBundle = stateVersionTracker.getVersionedClusterStateBundle(); + context.aggregatedClusterStats = stateVersionTracker.getAggregatedClusterStats().getAggregatedStats(); context.masterInfo = new MasterInterface() { @Override public boolean isMaster() { return isMaster; } @Override public Integer getMaster() { return masterElectionHandler.getMaster(); } diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/GlobalBucketSyncStatsCalculator.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/GlobalBucketSyncStatsCalculator.java new file mode 100644 index 00000000000..0137ea2c29e --- /dev/null +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/GlobalBucketSyncStatsCalculator.java @@ -0,0 +1,45 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+package com.yahoo.vespa.clustercontroller.core; + +import java.util.Optional; + +/** + * @author vekterli + */ +public class GlobalBucketSyncStatsCalculator { + + /** + * Compute a value in [0, 1] representing how much of the cluster's data space is currently + * out of sync, i.e. pending merging. In other words, if the value is 1 all buckets are out + * of sync, and conversely if it's 0 all buckets are in sync. This number applies across bucket + * spaces. + * + * @param globalStats Globally aggregated content node statistics for the entire cluster. + * @return Optional containing a value [0, 1] representing the ratio of buckets pending merge + * in relation to the total number of buckets in the cluster, or an empty optional if + * the underlying global statistics contains invalid/incomplete information. + */ + public static Optional<Double> clusterBucketsOutOfSyncRatio(ContentNodeStats globalStats) { + long totalBuckets = 0; + long pendingBuckets = 0; + for (var space : globalStats.getBucketSpaces().values()) { + if (!space.valid()) { + return Optional.empty(); + } + totalBuckets += space.getBucketsTotal(); + pendingBuckets += space.getBucketsPending(); + } + // It's currently possible for the reported number of pending buckets to be greater than + // the number of total buckets. Example: this can happen if a bucket is present on a single + // node, but should have been replicated to 9 more nodes. Since counts are not normalized + // across content nodes for a given bucket, this will be counted as 9 pending and 1 total. + // Eventually this will settle as 0 pending and 10 total. + // TODO report node-normalized pending/total counts from distributors and use these. 
+ pendingBuckets = Math.min(pendingBuckets, totalBuckets); + if (totalBuckets <= 0) { + return Optional.of(0.0); // No buckets; cannot be out of sync by definition + } + return Optional.of((double)pendingBuckets / (double)totalBuckets); + } + +} diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/MetricUpdater.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/MetricUpdater.java index 419cb652671..d72ede7199e 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/MetricUpdater.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/MetricUpdater.java @@ -10,6 +10,7 @@ import com.yahoo.vespa.clustercontroller.utils.util.ComponentMetricReporter; import com.yahoo.vespa.clustercontroller.utils.util.MetricReporter; import java.time.Duration; +import java.time.Instant; import java.util.HashMap; import java.util.Map; import java.util.SortedSet; @@ -19,17 +20,27 @@ import java.util.function.BooleanSupplier; public class MetricUpdater { private final ComponentMetricReporter metricReporter; + private final Timer timer; + // Publishing and converging on a cluster state version is never instant nor atomic, but + // it usually completes within a few seconds. If convergence does not happen for more than + // 30 seconds, it's a sign something has stalled. 
+ private Duration stateVersionConvergenceGracePeriod = Duration.ofSeconds(30); - public MetricUpdater(MetricReporter metricReporter, int controllerIndex, String clusterName) { + public MetricUpdater(MetricReporter metricReporter, Timer timer, int controllerIndex, String clusterName) { this.metricReporter = new ComponentMetricReporter(metricReporter, "cluster-controller."); this.metricReporter.addDimension("controller-index", String.valueOf(controllerIndex)); this.metricReporter.addDimension("clusterid", clusterName); + this.timer = timer; } public MetricReporter.Context createContext(Map<String, String> dimensions) { return metricReporter.createContext(dimensions); } + public void setStateVersionConvergenceGracePeriod(Duration gracePeriod) { + stateVersionConvergenceGracePeriod = gracePeriod; + } + private static int nodesInAvailableState(Map<State, Integer> nodeCounts) { return nodeCounts.getOrDefault(State.INITIALIZING, 0) + nodeCounts.getOrDefault(State.RETIRED, 0) @@ -39,10 +50,13 @@ public class MetricUpdater { + nodeCounts.getOrDefault(State.MAINTENANCE, 0); } - public void updateClusterStateMetrics(ContentCluster cluster, ClusterState state, ResourceUsageStats resourceUsage) { + public void updateClusterStateMetrics(ContentCluster cluster, ClusterState state, + ResourceUsageStats resourceUsage, Instant lastStateBroadcastTimePoint) { Map<String, String> dimensions = new HashMap<>(); dimensions.put("cluster", cluster.getName()); dimensions.put("clusterid", cluster.getName()); + Instant now = timer.getCurrentWallClockTime(); + boolean convergenceDeadlinePassed = lastStateBroadcastTimePoint.plus(stateVersionConvergenceGracePeriod).isBefore(now); for (NodeType type : NodeType.getTypes()) { dimensions.put("node-type", type.toString().toLowerCase()); MetricReporter.Context context = createContext(dimensions); @@ -50,10 +64,18 @@ public class MetricUpdater { for (State s : State.values()) { nodeCounts.put(s, 0); } + int nodesNotConverged = 0; for (Integer i : 
cluster.getConfiguredNodes().keySet()) { - NodeState s = state.getNodeState(new Node(type, i)); + var node = new Node(type, i); + NodeState s = state.getNodeState(node); Integer count = nodeCounts.get(s.getState()); nodeCounts.put(s.getState(), count + 1); + var info = cluster.getNodeInfo(node); + if (info != null && convergenceDeadlinePassed && s.getState().oneOf("uir")) { + if (info.getClusterStateVersionBundleAcknowledged() != state.getVersion()) { + nodesNotConverged++; + } + } } for (State s : State.values()) { String name = s.toString().toLowerCase() + ".count"; @@ -63,6 +85,7 @@ public class MetricUpdater { final int availableNodes = nodesInAvailableState(nodeCounts); final int totalNodes = Math.max(cluster.getConfiguredNodes().size(), 1); // Assumes 1-1 between distributor and storage metricReporter.set("available-nodes.ratio", (double)availableNodes / totalNodes, context); + metricReporter.set("nodes-not-converged", nodesNotConverged, context); } dimensions.remove("node-type"); MetricReporter.Context context = createContext(dimensions); @@ -93,6 +116,10 @@ public class MetricUpdater { metricReporter.set("is-master", isMaster ? 1 : 0); } + public void updateClusterBucketsOutOfSyncRatio(double ratio) { + metricReporter.set("cluster-buckets-out-of-sync-ratio", ratio); + } + public void addTickTime(long millis, boolean didWork) { if (didWork) { metricReporter.set("busy-tick-time-ms", millis); diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RealTimer.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RealTimer.java index b4563c09b66..482b40381df 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RealTimer.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RealTimer.java @@ -1,6 +1,7 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
package com.yahoo.vespa.clustercontroller.core; +import java.time.Instant; import java.util.Calendar; import java.util.Locale; import java.util.TimeZone; @@ -10,8 +11,9 @@ import java.util.TimeZone; */ public class RealTimer implements Timer { - public long getCurrentTimeInMillis() { - return System.currentTimeMillis(); + @Override + public Instant getCurrentWallClockTime() { + return Instant.now(); } public static String printDuration(long time) { diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java index efb161cebec..e1b774e64ff 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java @@ -17,6 +17,7 @@ public abstract class RemoteClusterControllerTask { public MasterInterface masterInfo; public NodeListener nodeListener; public SlobrokListener slobrokListener; + public AggregatedClusterStats aggregatedClusterStats; } private final Object monitor = new Object(); diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateBroadcaster.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateBroadcaster.java index c74a846fe30..0337e187b8e 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateBroadcaster.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateBroadcaster.java @@ -8,6 +8,7 @@ import com.yahoo.vdslib.state.NodeState; import com.yahoo.vdslib.state.State; import com.yahoo.vespa.clustercontroller.core.database.DatabaseHandler; +import java.time.Instant; import java.util.logging.Level; import java.util.LinkedList; import java.util.List; @@ 
-29,6 +30,7 @@ public class SystemStateBroadcaster { private final static long minTimeBetweenNodeErrorLogging = 10 * 60 * 1000; private final Map<Node, Long> lastErrorReported = new TreeMap<>(); + private Instant lastStateBroadcastTimePoint = Instant.EPOCH; private int lastOfficialStateVersion = -1; private int lastStateVersionBundleAcked = 0; private int lastClusterStateVersionConverged = 0; @@ -45,6 +47,7 @@ public class SystemStateBroadcaster { public void handleNewClusterStates(ClusterStateBundle state) { clusterStateBundle = state; + lastStateBroadcastTimePoint = Instant.ofEpochMilli(timer.getCurrentTimeInMillis()); } public ClusterState getClusterState() { @@ -67,6 +70,10 @@ public class SystemStateBroadcaster { return lastClusterStateBundleConverged; } + public Instant getLastStateBroadcastTimePoint() { + return lastStateBroadcastTimePoint; + } + private void reportNodeError(boolean nodeOk, NodeInfo info, String message) { long time = timer.getCurrentTimeInMillis(); Long lastReported = lastErrorReported.get(info.getNode()); diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/Timer.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/Timer.java index 6c7da15b1a5..eaa60b3d675 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/Timer.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/Timer.java @@ -1,12 +1,18 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.clustercontroller.core; +import java.time.Instant; + /** * Interface used to get time. This is separated into its own class, such that unit tests can fake timing to do timing related * tests without relying on the speed of the unit test processing. 
*/ public interface Timer { - long getCurrentTimeInMillis(); + Instant getCurrentWallClockTime(); + + default long getCurrentTimeInMillis() { + return getCurrentWallClockTime().toEpochMilli(); + } } diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Response.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Response.java index 636d01dbfa3..7af5f93fa21 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Response.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Response.java @@ -76,7 +76,7 @@ public class Response { { protected final Map<String, String> attributes = new LinkedHashMap<>(); protected final Map<String, SubUnitList> subUnits = new LinkedHashMap<>(); - protected final Map<String, Long> metrics = new LinkedHashMap<>(); + protected final Map<String, Number> metrics = new LinkedHashMap<>(); protected final Map<String, UnitState> stateMap = new LinkedHashMap<>(); protected DistributionState publishedState = null; @@ -94,7 +94,7 @@ public class Response { } @Override - public Map<String, Long> getMetricMap() { return metrics; } + public Map<String, Number> getMetricMap() { return metrics; } @Override public Map<String, UnitState> getStatePerType() { return stateMap; } @Override @@ -122,7 +122,7 @@ public class Response { list.addUnit(unit, response); return this; } - public EmptyResponse<T> addMetric(String name, Long value) { + public EmptyResponse<T> addMetric(String name, Number value) { metrics.put(name, value); return this; } diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/ClusterStateRequest.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/ClusterStateRequest.java index 1df37637dcf..3006effecd4 100644 --- 
a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/ClusterStateRequest.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/ClusterStateRequest.java @@ -3,6 +3,7 @@ package com.yahoo.vespa.clustercontroller.core.restapiv2.requests; import com.yahoo.vdslib.state.NodeType; import com.yahoo.vespa.clustercontroller.core.ClusterStateBundle; +import com.yahoo.vespa.clustercontroller.core.GlobalBucketSyncStatsCalculator; import com.yahoo.vespa.clustercontroller.core.RemoteClusterControllerTask; import com.yahoo.vespa.clustercontroller.core.restapiv2.Id; import com.yahoo.vespa.clustercontroller.core.restapiv2.Request; @@ -36,6 +37,11 @@ public class ClusterStateRequest extends Request<Response.ClusterResponse> { } } result.setPublishedState(bundleToDistributionState(context.publishedClusterStateBundle)); + if (context.aggregatedClusterStats.hasUpdatesFromAllDistributors()) { + var stats = context.aggregatedClusterStats.getGlobalStats(); + var maybeRatio = GlobalBucketSyncStatsCalculator.clusterBucketsOutOfSyncRatio(stats); + maybeRatio.ifPresent(r -> result.addMetric("cluster-buckets-out-of-sync-ratio", r)); + } return result; } diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/LegacyIndexPageRequestHandler.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/LegacyIndexPageRequestHandler.java index 51bda17860e..89095e268cb 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/LegacyIndexPageRequestHandler.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/LegacyIndexPageRequestHandler.java @@ -10,6 +10,7 @@ import com.yahoo.vespa.clustercontroller.core.ClusterStateHistoryEntry; import com.yahoo.vespa.clustercontroller.core.ContentCluster; import com.yahoo.vespa.clustercontroller.core.EventLog; import 
com.yahoo.vespa.clustercontroller.core.FleetControllerOptions; +import com.yahoo.vespa.clustercontroller.core.GlobalBucketSyncStatsCalculator; import com.yahoo.vespa.clustercontroller.core.LeafGroups; import com.yahoo.vespa.clustercontroller.core.MasterElectionHandler; import com.yahoo.vespa.clustercontroller.core.NodeInfo; @@ -174,11 +175,8 @@ public class LegacyIndexPageRequestHandler implements StatusPageServer.RequestHa VdsClusterHtmlRenderer.Table table = renderer.createNewClusterHtmlTable(cluster.getName(), cluster.getSlobrokGenerationCount()); ClusterStateBundle state = stateVersionTracker.getVersionedClusterStateBundle(); - if (state.clusterFeedIsBlocked()) { // Implies FeedBlock != null - table.appendRaw("<h3 style=\"color: red\">Cluster feeding is blocked!</h3>\n"); - table.appendRaw(String.format("<p>Summary: <strong>%s</strong></p>\n", - HtmlTable.escape(state.getFeedBlockOrNull().getDescription()))); - } + renderClusterFeedBlockIfPresent(state, table); + renderClusterOutOfSyncRatio(state, stateVersionTracker, table); List<Group> groups = LeafGroups.enumerateFrom(cluster.getDistribution().getRootGroup()); for (Group group : groups) { @@ -206,6 +204,53 @@ public class LegacyIndexPageRequestHandler implements StatusPageServer.RequestHa table.addTable(sb, options.stableStateTimePeriod()); } + private static void renderClusterFeedBlockIfPresent(ClusterStateBundle state, VdsClusterHtmlRenderer.Table table) { + if (state.clusterFeedIsBlocked()) { // Implies FeedBlock != null + table.appendRaw("<h3 style=\"color: red\">Cluster feeding is blocked!</h3>\n"); + table.appendRaw(String.format("<p>Summary: <strong>%s</strong></p>\n", + HtmlTable.escape(state.getFeedBlockOrNull().getDescription()))); + } + } + + private static void renderClusterOutOfSyncRatio(ClusterStateBundle state, StateVersionTracker stateVersionTracker, + VdsClusterHtmlRenderer.Table table) { + var stats = stateVersionTracker.getAggregatedClusterStats().getAggregatedStats(); + if 
(!stats.hasUpdatesFromAllDistributors()) { + table.appendRaw("<p>Current cluster out of sync ratio cannot be computed, as not all " + + "distributors have reported in statistics for the most recent cluster state.</p>\n"); + return; + } + var outOfSync = GlobalBucketSyncStatsCalculator.clusterBucketsOutOfSyncRatio(stats.getGlobalStats()); + if (outOfSync.isEmpty()) { + table.appendRaw("<p>Current cluster out of sync ratio cannot be computed, as not all " + + "distributors have reported valid statistics.</p>\n"); + return; + } + boolean hasMaintenance = stateHasAtLeastOneMaintenanceNode(state); + if (!hasMaintenance && outOfSync.get() == 0.0) { + table.appendRaw("<p>Cluster is currently in sync.</p>\n"); + } else { + table.appendRaw("<p>Cluster is currently <strong>%.2f%% out of sync</strong>.</p>\n".formatted(outOfSync.get() * 100.0)); + if (hasMaintenance) { + // It is intentional that a cluster with no pending buckets but with nodes in maintenance mode + // emits "0% out of sync" with a caveat rather than "in sync", as we don't know the latter for sure. 
+ table.appendRaw("<p><strong>Note:</strong> since one or more nodes are currently in " + + "Maintenance mode, the true out of sync ratio may be higher.</p>\n"); + } + } + } + + private static boolean stateHasAtLeastOneMaintenanceNode(ClusterStateBundle state) { + var baseline = state.getBaselineClusterState(); + int nodes = baseline.getNodeCount(NodeType.STORAGE); + for (int i = 0; i < nodes; ++i) { + if (baseline.getNodeState(Node.ofStorage(i)).getState().oneOf("m")) { + return true; + } + } + return false; + } + private void storeNodeInfo(ContentCluster cluster, int nodeIndex, NodeType nodeType, Map<Integer, NodeInfo> nodeInfoByIndex) { NodeInfo nodeInfo = cluster.getNodeInfo(new Node(nodeType, nodeIndex)); if (nodeInfo == null) return; diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/statuspage/VdsClusterHtmlRenderer.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/statuspage/VdsClusterHtmlRenderer.java index 95f648447f4..0053c02c269 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/statuspage/VdsClusterHtmlRenderer.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/statuspage/VdsClusterHtmlRenderer.java @@ -308,9 +308,7 @@ public class VdsClusterHtmlRenderer { int nodeEvents = eventLog.getNodeEventsSince(nodeInfo.getNode(), currentTime - eventLog.getRecentTimePeriod()); row.addCell(new HtmlTable.Cell("" + nodeEvents)); - if (nodeEvents > 20) { - row.getLastCell().addProperties(ERROR_PROPERTY); - } else if (nodeEvents > 3) { + if (nodeEvents > 3) { row.getLastCell().addProperties(WARNING_PROPERTY); } } |