diff options
author | Harald Musum <musum@verizonmedia.com> | 2022-08-30 17:02:05 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-08-30 17:02:05 +0200 |
commit | c81780000ed7ca2be480f16e3d442e23f86bd808 (patch) | |
tree | 1fe8364f500755fdce33d80805ca22e4909905bb | |
parent | 106374aac1cc4ab9ac682b5a183b347460e8cdc1 (diff) | |
parent | 7455d58e8eeae70e8909964b9584a4e81e501bd4 (diff) |
Merge pull request #23855 from vespa-engine/hmusum/cleanup-20
Cluster controller cleanup, part 7 [run-systemtest]
12 files changed, 159 insertions, 170 deletions
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java index 3f3bf62bf4d..a1e213fc1f9 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java @@ -37,6 +37,7 @@ import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Queue; import java.util.Set; import java.util.TimeZone; @@ -130,12 +131,9 @@ public class FleetController implements NodeListener, SlobrokListener, SystemSta this.systemStateBroadcaster = systemStateBroadcaster; this.stateVersionTracker = new StateVersionTracker(options.minMergeCompletionRatio); this.metricUpdater = metricUpdater; - - this.statusPageServer = statusPage; + this.statusPageServer = Objects.requireNonNull(statusPage, "statusPage cannot be null"); this.rpcServer = server; - this.masterElectionHandler = masterElectionHandler; - this.statusRequestRouter.addHandler( "^/node=([a-z]+)\\.(\\d+)$", new LegacyNodePageRequestHandler(timer, eventLog, cluster)); @@ -277,9 +275,7 @@ public class FleetController implements NodeListener, SlobrokListener, SystemSta controllerThreadId = Thread.currentThread().getId(); database.shutdown(databaseContext); - if (statusPageServer != null) { - statusPageServer.shutdown(); - } + statusPageServer.shutdown(); if (rpcServer != null) { rpcServer.shutdown(); } @@ -530,12 +526,10 @@ public class FleetController implements NodeListener, SlobrokListener, SystemSta } } - if (statusPageServer != null) { - try{ - statusPageServer.setPort(options.httpPort); - } catch (Exception e) { - context.log(logger, Level.WARNING, "Failed to initialize status server socket. This may be natural if cluster has altered the services running on this node: " + e.getMessage()); - } + try { + statusPageServer.setPort(options.httpPort); + } catch (Exception e) { + context.log(logger, Level.WARNING, "Failed to initialize status server socket. This may be natural if cluster has altered the services running on this node: " + e.getMessage()); } long currentTime = timer.getCurrentTimeInMillis(); @@ -679,12 +673,10 @@ public class FleetController implements NodeListener, SlobrokListener, SystemSta } private boolean processAnyPendingStatusPageRequest() { - if (statusPageServer != null) { - StatusPageServer.HttpRequest statusRequest = statusPageServer.getCurrentHttpRequest(); - if (statusRequest != null) { - statusPageServer.answerCurrentStatusRequest(fetchStatusPage(statusRequest)); - return true; - } + StatusPageServer.HttpRequest statusRequest = statusPageServer.getCurrentHttpRequest(); + if (statusRequest != null) { + statusPageServer.answerCurrentStatusRequest(fetchStatusPage(statusRequest)); + return true; } return false; } @@ -1224,8 +1216,6 @@ public class FleetController implements NodeListener, SlobrokListener, SystemSta public ContentCluster getCluster() { return cluster; } - public List<NodeEvent> getNodeEvents(Node n) { return eventLog.getNodeEvents(n); } - public EventLog getEventLog() { return eventLog; } diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java index 16e0e1c0673..750d44c8fbf 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java @@ -5,18 +5,13 @@ import com.yahoo.jrt.slobrok.api.BackOffPolicy; import com.yahoo.vdslib.distribution.ConfiguredNode; import com.yahoo.vdslib.distribution.Distribution; import com.yahoo.vdslib.state.NodeType; - -import java.text.DecimalFormat; -import java.text.DecimalFormatSymbols; import java.time.Duration; import java.util.Collection; import java.util.Collections; -import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; -import java.util.stream.Collectors; /** * This class represents all the options that can be set in the fleetcontroller. @@ -125,7 +120,7 @@ public class FleetControllerOptions implements Cloneable { // TODO: Get rid of this by always getting nodes by distribution.getNodes() public Set<ConfiguredNode> nodes; - private Duration maxDeferredTaskVersionWaitTime = Duration.ofSeconds(30); + public Duration maxDeferredTaskVersionWaitTime = Duration.ofSeconds(30); public boolean clusterHasGlobalDocumentTypes = false; @@ -175,82 +170,4 @@ public class FleetControllerOptions implements Cloneable { } } - public static String splitZooKeeperAddress(String s) { - StringBuilder sb = new StringBuilder(); - while (true) { - int index = s.indexOf(','); - if (index > 0) { - sb.append(s.substring(0, index + 1)).append(' '); - s = s.substring(index+1); - } else { - break; - } - } - sb.append(s); - return sb.toString(); - } - - static DecimalFormat DecimalDot2 = new DecimalFormat("0.00", new DecimalFormatSymbols(Locale.ENGLISH)); - - public void writeHtmlState(StringBuilder sb) { - String slobrokspecs = ""; - for (int i=0; i<slobrokConnectionSpecs.length; ++i) { - if (i != 0) slobrokspecs += "<br>"; - slobrokspecs += slobrokConnectionSpecs[i]; - } - sb.append("<h1>Current config</h1>\n") - .append("<p>Fleet controller config id: ").append(fleetControllerConfigId == null ? null : fleetControllerConfigId.replaceAll("\n", "<br>\n")).append("</p>\n") - .append("<p>Slobrok config id: ").append(slobrokConfigId == null ? null : slobrokConfigId.replaceAll("\n", "<br>\n")).append("</p>\n") - .append("<table border=\"1\" cellspacing=\"0\"><tr><th>Property</th><th>Value</th></tr>\n"); - - sb.append("<tr><td><nobr>Cluster name</nobr></td><td align=\"right\">").append(clusterName).append("</td></tr>"); - sb.append("<tr><td><nobr>Fleet controller index</nobr></td><td align=\"right\">").append(fleetControllerIndex).append("/").append(fleetControllerCount).append("</td></tr>"); - sb.append("<tr><td><nobr>Number of fleetcontrollers gathering states from nodes</nobr></td><td align=\"right\">").append(stateGatherCount).append("</td></tr>"); - - sb.append("<tr><td><nobr>Slobrok connection spec</nobr></td><td align=\"right\">").append(slobrokspecs).append("</td></tr>"); - sb.append("<tr><td><nobr>RPC port</nobr></td><td align=\"right\">").append(rpcPort == 0 ? "Pick random available" : rpcPort).append("</td></tr>"); - sb.append("<tr><td><nobr>HTTP port</nobr></td><td align=\"right\">").append(httpPort == 0 ? "Pick random available" : httpPort).append("</td></tr>"); - sb.append("<tr><td><nobr>Master cooldown period</nobr></td><td align=\"right\">").append(RealTimer.printDuration(masterZooKeeperCooldownPeriod)).append("</td></tr>"); - String zooKeeperAddress = (zooKeeperServerAddress == null ? "Not using Zookeeper" : splitZooKeeperAddress(zooKeeperServerAddress)); - sb.append("<tr><td><nobr>Zookeeper server address</nobr></td><td align=\"right\">").append(zooKeeperAddress).append("</td></tr>"); - sb.append("<tr><td><nobr>Zookeeper session timeout</nobr></td><td align=\"right\">").append(RealTimer.printDuration(zooKeeperSessionTimeout)).append("</td></tr>"); - - sb.append("<tr><td><nobr>Cycle wait time</nobr></td><td align=\"right\">").append(cycleWaitTime).append(" ms</td></tr>"); - sb.append("<tr><td><nobr>Minimum time before first clusterstate broadcast as master</nobr></td><td align=\"right\">").append(RealTimer.printDuration(minTimeBeforeFirstSystemStateBroadcast)).append("</td></tr>"); - sb.append("<tr><td><nobr>Minimum time between official cluster states</nobr></td><td align=\"right\">").append(RealTimer.printDuration(minTimeBetweenNewSystemStates)).append("</td></tr>"); - sb.append("<tr><td><nobr>Slobrok mirror backoff policy</nobr></td><td align=\"right\">").append(slobrokBackOffPolicy == null ? "default" : "overridden").append("</td></tr>"); - - sb.append("<tr><td><nobr>Node state request timeout</nobr></td><td align=\"right\">").append(RealTimer.printDuration(nodeStateRequestTimeoutMS)).append("</td></tr>"); - sb.append("<tr><td><nobr>VDS 4.1 node state polling frequency</nobr></td><td align=\"right\">").append(RealTimer.printDuration(statePollingFrequency)).append("</td></tr>"); - sb.append("<tr><td><nobr>Maximum distributor transition time</nobr></td><td align=\"right\">").append(RealTimer.printDuration(maxTransitionTime.get(NodeType.DISTRIBUTOR))).append("</td></tr>"); - sb.append("<tr><td><nobr>Maximum storage transition time</nobr></td><td align=\"right\">").append(RealTimer.printDuration(maxTransitionTime.get(NodeType.STORAGE))).append("</td></tr>"); - sb.append("<tr><td><nobr>Maximum initialize without progress time</nobr></td><td align=\"right\">").append(RealTimer.printDuration(maxInitProgressTime)).append("</td></tr>"); - sb.append("<tr><td><nobr>Maximum premature crashes</nobr></td><td align=\"right\">").append(maxPrematureCrashes).append("</td></tr>"); - sb.append("<tr><td><nobr>Stable state time period</nobr></td><td align=\"right\">").append(RealTimer.printDuration(stableStateTimePeriod)).append("</td></tr>"); - sb.append("<tr><td><nobr>Slobrok disconnect grace period</nobr></td><td align=\"right\">").append(RealTimer.printDuration(maxSlobrokDisconnectGracePeriod)).append("</td></tr>"); - - sb.append("<tr><td><nobr>Number of distributor nodes</nobr></td><td align=\"right\">").append(nodes == null ? "Autodetect" : nodes.size()).append("</td></tr>"); - sb.append("<tr><td><nobr>Number of storage nodes</nobr></td><td align=\"right\">").append(nodes == null ? "Autodetect" : nodes.size()).append("</td></tr>"); - sb.append("<tr><td><nobr>Minimum distributor nodes being up for cluster to be up</nobr></td><td align=\"right\">").append(minDistributorNodesUp).append("</td></tr>"); - sb.append("<tr><td><nobr>Minimum storage nodes being up for cluster to be up</nobr></td><td align=\"right\">").append(minStorageNodesUp).append("</td></tr>"); - sb.append("<tr><td><nobr>Minimum percentage of distributor nodes being up for cluster to be up</nobr></td><td align=\"right\">").append(DecimalDot2.format(100 * minRatioOfDistributorNodesUp)).append(" %</td></tr>"); - sb.append("<tr><td><nobr>Minimum percentage of storage nodes being up for cluster to be up</nobr></td><td align=\"right\">").append(DecimalDot2.format(100 * minRatioOfStorageNodesUp)).append(" %</td></tr>"); - - sb.append("<tr><td><nobr>Show local cluster state changes</nobr></td><td align=\"right\">").append(showLocalSystemStatesInEventLog).append("</td></tr>"); - sb.append("<tr><td><nobr>Maximum event log size</nobr></td><td align=\"right\">").append(eventLogMaxSize).append("</td></tr>"); - sb.append("<tr><td><nobr>Maximum node event log size</nobr></td><td align=\"right\">").append(eventNodeLogMaxSize).append("</td></tr>"); - sb.append("<tr><td><nobr>Wanted distribution bits</nobr></td><td align=\"right\">").append(distributionBits).append("</td></tr>"); - sb.append("<tr><td><nobr>Max deferred task version wait time</nobr></td><td align=\"right\">").append(maxDeferredTaskVersionWaitTime.toMillis()).append("ms</td></tr>"); - sb.append("<tr><td><nobr>Cluster has global document types configured</nobr></td><td align=\"right\">").append(clusterHasGlobalDocumentTypes).append("</td></tr>"); - sb.append("<tr><td><nobr>Enable 2-phase cluster state activation protocol</nobr></td><td align=\"right\">").append(enableTwoPhaseClusterStateActivation).append("</td></tr>"); - sb.append("<tr><td><nobr>Cluster auto feed block on resource exhaustion enabled</nobr></td><td align=\"right\">") - .append(clusterFeedBlockEnabled).append("</td></tr>"); - sb.append("<tr><td><nobr>Feed block limits</nobr></td><td align=\"right\">") - .append(clusterFeedBlockLimit.entrySet().stream() - .map(kv -> String.format("%s: %.2f%%", kv.getKey(), kv.getValue() * 100.0)) - .collect(Collectors.joining("<br/>"))).append("</td></tr>"); - - sb.append("</table>"); - } - } diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/MasterElectionHandler.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/MasterElectionHandler.java index 637aca16ee7..dfc328346bb 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/MasterElectionHandler.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/MasterElectionHandler.java @@ -35,11 +35,9 @@ public class MasterElectionHandler implements MasterInterface { this.index = index; this.totalCount = totalCount; this.nextInLineCount = Integer.MAX_VALUE; - // Only a given set of nodes can ever become master - if (index > (totalCount - 1) / 2) { + if (cannotBecomeMaster()) context.log(logger, Level.FINE, () -> "We can never become master and will always stay a follower."); - } - // Tag current time as when we have not seen any other master. Make sure we're not taking over at once for master that is on the way down + // Tag current time as when we have not seen any other master. Make sure we're not taking over at once for master that is on the way down masterGoneFromZooKeeperTime = timer.getCurrentTimeInMillis(); } @@ -77,16 +75,15 @@ public class MasterElectionHandler implements MasterInterface { @Override public Integer getMaster() { - // If too few followers there can be no master - if (2 * followers <= totalCount) { + if (tooFewFollowersToHaveAMaster()) { return null; } - // If all are following master candidate, it is master if it exists. + // If all are following master candidate, it is master if it exists. if (followers == totalCount) { return masterCandidate; } - // If not all are following we only accept master candidate if old master - // disappeared sufficient time ago + // If not all are following we only accept master candidate if old master + // disappeared sufficient time ago if (masterGoneFromZooKeeperTime + masterZooKeeperCooldownPeriod > timer.getCurrentTimeInMillis()) { return null; } @@ -97,8 +94,7 @@ public class MasterElectionHandler implements MasterInterface { if (masterCandidate == null) { return "There is currently no master candidate."; } - // If too few followers there can be no master - if (2 * followers <= totalCount) { + if (tooFewFollowersToHaveAMaster()) { return "More than half of the nodes must agree for there to be a master. Only " + followers + " of " + totalCount + " nodes agree on current master candidate (" + masterCandidate + ")."; } @@ -118,6 +114,10 @@ public class MasterElectionHandler implements MasterInterface { return followers + " of " + totalCount + " nodes agree " + masterCandidate + " is master."; } + private boolean tooFewFollowersToHaveAMaster() { + return 2 * followers <= totalCount; + } + public boolean isAmongNthFirst(int first) { return (nextInLineCount < first); } public boolean watchMasterElection(DatabaseHandler database, @@ -131,8 +131,8 @@ public class MasterElectionHandler implements MasterInterface { } return false; // Nothing have happened since last time. } - // Move next data to temporary, such that we don't need to keep lock, and such that we don't retry - // if we happen to fail processing the data. + // Move next data to temporary, such that we don't need to keep lock, and such that we don't retry + // if we happen to fail processing the data. Map<Integer, Integer> state; context.log(logger, Level.INFO, "Handling new master election, as we have received " + nextMasterData.size() + " entries"); synchronized (monitor) { @@ -184,8 +184,7 @@ public class MasterElectionHandler implements MasterInterface { database.setMasterVote(dbContext, first.getKey()); } } - // Only a given set of nodes can ever become master - if (index <= (totalCount - 1) / 2) { + if (canBecomeMaster()) { int ourPosition = 0; for (Map.Entry<Integer, Integer> entry : state.entrySet()) { if (entry.getKey() != index) { @@ -205,6 +204,11 @@ public class MasterElectionHandler implements MasterInterface { return true; } + // Only a given set of nodes can ever become master + private boolean canBecomeMaster() {return index <= (totalCount - 1) / 2;} + + private boolean cannotBecomeMaster() {return ! canBecomeMaster();} + private static String toString(Map<Integer, Integer> data) { StringBuilder sb = new StringBuilder(); for (Map.Entry<Integer, Integer> entry : data.entrySet()) { @@ -253,10 +257,10 @@ public class MasterElectionHandler implements MasterInterface { Integer master = getMaster(); if (master != null) { sb.append("<p>Current cluster controller master is node " + master + "."); - if (master.intValue() == index) sb.append(" (This node)"); + if (master == index) sb.append(" (This node)"); sb.append("</p>"); } else { - if (2 * followers <= totalCount) { + if (tooFewFollowersToHaveAMaster()) { sb.append("<p>There is currently no master. Less than half the fleet controllers (") .append(followers).append(") are following master candidate ").append(masterCandidate) .append(".</p>"); @@ -267,19 +271,19 @@ public class MasterElectionHandler implements MasterInterface { .append(" before electing new master unless all possible master candidates are online.</p>"); } } - if ((master == null || master.intValue() != index) && nextInLineCount < stateGatherCount) { + if ((master == null || master != index) && nextInLineCount < stateGatherCount) { sb.append("<p>As we are number ").append(nextInLineCount) .append(" in line for taking over as master, we're gathering state from nodes.</p>"); sb.append("<p><font color=\"red\">As we are not the master, we don't know about nodes current system state" + " or wanted states, so some statistics below may be stale. Look at status page on master " + "for updated data.</font></p>"); } - if (index * 2 > totalCount) { + if (cannotBecomeMaster()) { sb.append("<p>As lowest index fleet controller is prioritized to become master, and more than half " + "of the fleet controllers need to be available to select a master, we can never become master.</p>"); } - // Debug data + // Debug data sb.append("<p><font size=\"-1\" color=\"grey\">Master election handler internal state:") .append("<br>Index: " + index) .append("<br>Fleet controller count: " + totalCount) diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/LegacyIndexPageRequestHandler.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/LegacyIndexPageRequestHandler.java index 4f20b3d0cdc..0611d754b69 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/LegacyIndexPageRequestHandler.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/LegacyIndexPageRequestHandler.java @@ -20,16 +20,22 @@ import com.yahoo.vespa.clustercontroller.core.status.statuspage.HtmlTable; import com.yahoo.vespa.clustercontroller.core.status.statuspage.StatusPageResponse; import com.yahoo.vespa.clustercontroller.core.status.statuspage.StatusPageServer; import com.yahoo.vespa.clustercontroller.core.status.statuspage.VdsClusterHtmlRenderer; +import java.text.DecimalFormat; +import java.text.DecimalFormatSymbols; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.TimeZone; import java.util.TreeMap; +import java.util.stream.Collectors; /** * @author Haakon Humberset */ public class LegacyIndexPageRequestHandler implements StatusPageServer.RequestHandler { + private static final DecimalFormat DecimalDot2 = new DecimalFormat("0.00", new DecimalFormatSymbols(Locale.ENGLISH)); + private final Timer timer; private final ContentCluster cluster; private final MasterElectionHandler masterElectionHandler; @@ -69,7 +75,6 @@ public class LegacyIndexPageRequestHandler implements StatusPageServer.RequestHa .append(" | <a href=\"#eventlog\">Event log</a>") .append(" ]</font></p>\n"); content.append("<table><tr><td>UTC time when creating this page:</td><td align=\"right\">").append(RealTimer.printDateNoMilliSeconds(currentTime, tz)).append("</td></tr>"); - //content.append("<tr><td>Fleetcontroller version:</td><td align=\"right\">" + Vtag.V_TAG_PKG + "</td></tr/>"); content.append("<tr><td>Cluster controller uptime:</td><td align=\"right\">" + RealTimer.printDuration(currentTime - startedTime) + "</td></tr></table>"); if (masterElectionHandler.isAmongNthFirst(options.stateGatherCount)) { // Table overview of all the nodes @@ -78,12 +83,12 @@ public class LegacyIndexPageRequestHandler implements StatusPageServer.RequestHa writeHtmlState(stateVersionTracker, content); } else { // Overview of current config - options.writeHtmlState(content); + writeHtmlState(content, options); } // State of master election masterElectionHandler.writeHtmlState(content, options.stateGatherCount); // Overview of current config - options.writeHtmlState(content); + writeHtmlState(content, options); // Event log eventLog.writeHtmlState(content, null); response.writeHtmlFooter(content, ""); @@ -200,4 +205,80 @@ public class LegacyIndexPageRequestHandler implements StatusPageServer.RequestHa nodeInfoByIndex.put(nodeIndex, nodeInfo); } + public void writeHtmlState(StringBuilder sb, FleetControllerOptions options) { + String slobrokspecs = ""; + for (int i = 0; i < options.slobrokConnectionSpecs.length; ++i) { + if (i != 0) slobrokspecs += "<br>"; + slobrokspecs += options.slobrokConnectionSpecs[i]; + } + sb.append("<h1>Current config</h1>\n") + .append("<p>Fleet controller config id: ").append(options.fleetControllerConfigId == null ? null : options.fleetControllerConfigId.replaceAll("\n", "<br>\n")).append("</p>\n") + .append("<p>Slobrok config id: ").append(options.slobrokConfigId == null ? null : options.slobrokConfigId.replaceAll("\n", "<br>\n")).append("</p>\n") + .append("<table border=\"1\" cellspacing=\"0\"><tr><th>Property</th><th>Value</th></tr>\n"); + + sb.append("<tr><td><nobr>Cluster name</nobr></td><td align=\"right\">").append(options.clusterName).append("</td></tr>"); + sb.append("<tr><td><nobr>Fleet controller index</nobr></td><td align=\"right\">").append(options.fleetControllerIndex).append("/").append(options.fleetControllerCount).append("</td></tr>"); + sb.append("<tr><td><nobr>Number of fleetcontrollers gathering states from nodes</nobr></td><td align=\"right\">").append(options.stateGatherCount).append("</td></tr>"); + + sb.append("<tr><td><nobr>Slobrok connection spec</nobr></td><td align=\"right\">").append(slobrokspecs).append("</td></tr>"); + sb.append("<tr><td><nobr>RPC port</nobr></td><td align=\"right\">").append(options.rpcPort == 0 ? "Pick random available" : options.rpcPort).append("</td></tr>"); + sb.append("<tr><td><nobr>HTTP port</nobr></td><td align=\"right\">").append(options.httpPort == 0 ? "Pick random available" : options.httpPort).append("</td></tr>"); + sb.append("<tr><td><nobr>Master cooldown period</nobr></td><td align=\"right\">").append(RealTimer.printDuration(options.masterZooKeeperCooldownPeriod)).append("</td></tr>"); + String zooKeeperAddress = (options.zooKeeperServerAddress == null ? "Not using Zookeeper" : splitZooKeeperAddress(options.zooKeeperServerAddress)); + sb.append("<tr><td><nobr>Zookeeper server address</nobr></td><td align=\"right\">").append(zooKeeperAddress).append("</td></tr>"); + sb.append("<tr><td><nobr>Zookeeper session timeout</nobr></td><td align=\"right\">").append(RealTimer.printDuration(options.zooKeeperSessionTimeout)).append("</td></tr>"); + + sb.append("<tr><td><nobr>Cycle wait time</nobr></td><td align=\"right\">").append(options.cycleWaitTime).append(" ms</td></tr>"); + sb.append("<tr><td><nobr>Minimum time before first clusterstate broadcast as master</nobr></td><td align=\"right\">").append(RealTimer.printDuration(options.minTimeBeforeFirstSystemStateBroadcast)).append("</td></tr>"); + sb.append("<tr><td><nobr>Minimum time between official cluster states</nobr></td><td align=\"right\">").append(RealTimer.printDuration(options.minTimeBetweenNewSystemStates)).append("</td></tr>"); + sb.append("<tr><td><nobr>Slobrok mirror backoff policy</nobr></td><td align=\"right\">").append(options.slobrokBackOffPolicy == null ? "default" : "overridden").append("</td></tr>"); + + sb.append("<tr><td><nobr>Node state request timeout</nobr></td><td align=\"right\">").append(RealTimer.printDuration(options.nodeStateRequestTimeoutMS)).append("</td></tr>"); + sb.append("<tr><td><nobr>VDS 4.1 node state polling frequency</nobr></td><td align=\"right\">").append(RealTimer.printDuration(options.statePollingFrequency)).append("</td></tr>"); + sb.append("<tr><td><nobr>Maximum distributor transition time</nobr></td><td align=\"right\">").append(RealTimer.printDuration(options.maxTransitionTime.get(NodeType.DISTRIBUTOR))).append("</td></tr>"); + sb.append("<tr><td><nobr>Maximum storage transition time</nobr></td><td align=\"right\">").append(RealTimer.printDuration(options.maxTransitionTime.get(NodeType.STORAGE))).append("</td></tr>"); + sb.append("<tr><td><nobr>Maximum initialize without progress time</nobr></td><td align=\"right\">").append(RealTimer.printDuration(options.maxInitProgressTime)).append("</td></tr>"); + sb.append("<tr><td><nobr>Maximum premature crashes</nobr></td><td align=\"right\">").append(options.maxPrematureCrashes).append("</td></tr>"); + sb.append("<tr><td><nobr>Stable state time period</nobr></td><td align=\"right\">").append(RealTimer.printDuration(options.stableStateTimePeriod)).append("</td></tr>"); + sb.append("<tr><td><nobr>Slobrok disconnect grace period</nobr></td><td align=\"right\">").append(RealTimer.printDuration(options.maxSlobrokDisconnectGracePeriod)).append("</td></tr>"); + + sb.append("<tr><td><nobr>Number of distributor nodes</nobr></td><td align=\"right\">").append(options.nodes == null ? "Autodetect" : options.nodes.size()).append("</td></tr>"); + sb.append("<tr><td><nobr>Number of storage nodes</nobr></td><td align=\"right\">").append(options.nodes == null ? "Autodetect" : options.nodes.size()).append("</td></tr>"); + sb.append("<tr><td><nobr>Minimum distributor nodes being up for cluster to be up</nobr></td><td align=\"right\">").append(options.minDistributorNodesUp).append("</td></tr>"); + sb.append("<tr><td><nobr>Minimum storage nodes being up for cluster to be up</nobr></td><td align=\"right\">").append(options.minStorageNodesUp).append("</td></tr>"); + sb.append("<tr><td><nobr>Minimum percentage of distributor nodes being up for cluster to be up</nobr></td><td align=\"right\">").append(DecimalDot2.format(100 * options.minRatioOfDistributorNodesUp)).append(" %</td></tr>"); + sb.append("<tr><td><nobr>Minimum percentage of storage nodes being up for cluster to be up</nobr></td><td align=\"right\">").append(DecimalDot2.format(100 * options.minRatioOfStorageNodesUp)).append(" %</td></tr>"); + + sb.append("<tr><td><nobr>Show local cluster state changes</nobr></td><td align=\"right\">").append(options.showLocalSystemStatesInEventLog).append("</td></tr>"); + sb.append("<tr><td><nobr>Maximum event log size</nobr></td><td align=\"right\">").append(options.eventLogMaxSize).append("</td></tr>"); + sb.append("<tr><td><nobr>Maximum node event log size</nobr></td><td align=\"right\">").append(options.eventNodeLogMaxSize).append("</td></tr>"); + sb.append("<tr><td><nobr>Wanted distribution bits</nobr></td><td align=\"right\">").append(options.distributionBits).append("</td></tr>"); + sb.append("<tr><td><nobr>Max deferred task version wait time</nobr></td><td align=\"right\">").append(options.maxDeferredTaskVersionWaitTime.toMillis()).append("ms</td></tr>"); + sb.append("<tr><td><nobr>Cluster has global document types configured</nobr></td><td align=\"right\">").append(options.clusterHasGlobalDocumentTypes).append("</td></tr>"); + sb.append("<tr><td><nobr>Enable 2-phase cluster state activation protocol</nobr></td><td align=\"right\">").append(options.enableTwoPhaseClusterStateActivation).append("</td></tr>"); + sb.append("<tr><td><nobr>Cluster auto feed block on resource exhaustion enabled</nobr></td><td align=\"right\">") + .append(options.clusterFeedBlockEnabled).append("</td></tr>"); + sb.append("<tr><td><nobr>Feed block limits</nobr></td><td align=\"right\">") + .append(options.clusterFeedBlockLimit.entrySet().stream() + .map(kv -> String.format("%s: %.2f%%", kv.getKey(), kv.getValue() * 100.0)) + .collect(Collectors.joining("<br/>"))).append("</td></tr>"); + + sb.append("</table>"); + } + + private static String splitZooKeeperAddress(String s) { + StringBuilder sb = new StringBuilder(); + while (true) { + int index = s.indexOf(','); + if (index > 0) { + sb.append(s.substring(0, index + 1)).append(' '); + s = s.substring(index+1); + } else { + break; + } + } + sb.append(s); + return sb.toString(); + } + } diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/StatusHandler.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/StatusHandler.java index 69144d23fe5..79e6a91f561 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/StatusHandler.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/StatusHandler.java @@ -31,7 +31,7 @@ public class StatusHandler implements HttpRequestHandler { StatusPageServer.HttpRequest request; StatusPageResponse response; - // Ensure only only one use the server at a time + // Ensure only one use the server at a time private final Object queueMonitor = new Object(); // Lock safety with fleetcontroller. Wait until completion private final Object answerMonitor = new Object(); diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterFeedBlockTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterFeedBlockTest.java index ec6255fd13b..5a9d28bc327 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterFeedBlockTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterFeedBlockTest.java @@ -9,6 +9,7 @@ import com.yahoo.vdslib.state.NodeType; import com.yahoo.vdslib.state.State; import com.yahoo.vespa.clustercontroller.core.database.DatabaseHandler; import com.yahoo.vespa.clustercontroller.core.database.ZooKeeperDatabaseFactory; +import com.yahoo.vespa.clustercontroller.core.status.StatusHandler; import com.yahoo.vespa.clustercontroller.utils.util.NoMetricReporter; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -58,7 +59,9 @@ public class ClusterFeedBlockTest extends FleetControllerTest { var stateGenerator = new StateChangeHandler(context, timer, eventLog); var stateBroadcaster = new SystemStateBroadcaster(context, timer, timer); var masterElectionHandler = new MasterElectionHandler(context, options.fleetControllerIndex, options.fleetControllerCount, timer, timer); - ctrl = new FleetController(context, timer, eventLog, cluster, stateGatherer, communicator, null, null, communicator, database, stateGenerator, stateBroadcaster, masterElectionHandler, metricUpdater, options); + var status = new StatusHandler.ContainerStatusPageServer(); + ctrl = new FleetController(context, timer, eventLog, cluster, stateGatherer, communicator, status, null, communicator, database, + stateGenerator, stateBroadcaster, masterElectionHandler, metricUpdater, options); ctrl.tick(); markAllNodesAsUp(options); diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java index 426ac42fe9e..fd6ff30a08f 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java @@ -11,6 +11,7 @@ import com.yahoo.vdslib.state.NodeType; import com.yahoo.vdslib.state.State; import com.yahoo.vespa.clustercontroller.core.database.DatabaseHandler; import com.yahoo.vespa.clustercontroller.core.database.ZooKeeperDatabaseFactory; +import com.yahoo.vespa.clustercontroller.core.status.StatusHandler; import com.yahoo.vespa.clustercontroller.core.testutils.StateWaiter; import com.yahoo.vespa.clustercontroller.utils.util.NoMetricReporter; import org.junit.jupiter.api.BeforeEach; @@ -57,7 +58,9 @@ public class StateChangeTest extends FleetControllerTest { var stateGenerator = new StateChangeHandler(context, timer, eventLog); var stateBroadcaster = new SystemStateBroadcaster(context, timer, timer); var masterElectionHandler = new MasterElectionHandler(context, options.fleetControllerIndex, options.fleetControllerCount, timer, timer); - ctrl = new FleetController(context, timer, eventLog, cluster, stateGatherer, communicator, null, null, communicator, database, stateGenerator, stateBroadcaster, masterElectionHandler, metricUpdater, options); + var status = new StatusHandler.ContainerStatusPageServer(); + ctrl = new FleetController(context, timer, eventLog, cluster, stateGatherer, communicator, status, null, communicator, database, + stateGenerator, stateBroadcaster, masterElectionHandler, metricUpdater, options); ctrl.tick(); if (options.fleetControllerCount == 1) { diff --git a/config-model/src/main/java/com/yahoo/vespa/model/content/cluster/ContentCluster.java b/config-model/src/main/java/com/yahoo/vespa/model/content/cluster/ContentCluster.java index 74017b6b821..bf5770681ef 100644 --- a/config-model/src/main/java/com/yahoo/vespa/model/content/cluster/ContentCluster.java +++ b/config-model/src/main/java/com/yahoo/vespa/model/content/cluster/ContentCluster.java @@ -117,20 +117,23 @@ public class ContentCluster extends AbstractConfigProducer<AbstractConfigProduce RedundancyBuilder redundancyBuilder = new RedundancyBuilder(contentElement); Set<NewDocumentType> globallyDistributedDocuments = new GlobalDistributionBuilder(documentDefinitions).build(documentsElement); - ContentCluster c = new ContentCluster(context.getParentProducer(), getClusterId(contentElement), documentDefinitions, + String clusterId = getClusterId(contentElement); + ContentCluster c = new ContentCluster(context.getParentProducer(), clusterId, documentDefinitions, globallyDistributedDocuments, routingSelection, deployState.zone(), deployState.isHosted()); var resourceLimits = new ClusterResourceLimits.Builder(stateIsHosted(deployState), deployState.featureFlags().resourceLimitDisk(), deployState.featureFlags().resourceLimitMemory()) .build(contentElement); - c.clusterControllerConfig = new ClusterControllerConfig.Builder(getClusterId(contentElement), - contentElement, - resourceLimits.getClusterControllerLimits()).build(deployState, c, contentElement.getXml()); + c.clusterControllerConfig = new ClusterControllerConfig.Builder(clusterId, + contentElement, + resourceLimits.getClusterControllerLimits()) + .build(deployState, c, contentElement.getXml()); c.search = new ContentSearchCluster.Builder(documentDefinitions, - globallyDistributedDocuments, - fractionOfMemoryReserved(getClusterId(contentElement), containers), - resourceLimits.getContentNodeLimits()).build(deployState, c, contentElement.getXml()); + globallyDistributedDocuments, + fractionOfMemoryReserved(clusterId, containers), + resourceLimits.getContentNodeLimits()) + .build(deployState, c, contentElement.getXml()); c.persistenceFactory = new EngineFactoryBuilder().build(contentElement, c); c.storageNodes = new StorageCluster.Builder().build(deployState, c, w3cContentElement); c.distributorNodes = new DistributorCluster.Builder(c).build(deployState, c, w3cContentElement); @@ -173,9 +176,8 @@ public class ContentCluster extends AbstractConfigProducer<AbstractConfigProduce if (csc.hasIndexedCluster()) { setupIndexedCluster(csc.getIndexed(), search, element, logger); } - - } + private void setupIndexedCluster(IndexedSearchCluster index, ContentSearch search, ModelElement element, DeployLogger logger) { Double queryTimeout = search.getQueryTimeout(); if (queryTimeout != null) { @@ -267,9 +269,8 @@ public class ContentCluster extends AbstractConfigProducer<AbstractConfigProduce } private void validateThatGroupSiblingsAreUnique(String cluster, StorageGroup group) { - if (group == null) { - return; // Unit testing case - } + if (group == null) return; // Unit testing case + validateGroupSiblings(cluster, group); for (StorageGroup g : group.getSubgroups()) { validateThatGroupSiblingsAreUnique(cluster, g); @@ -301,7 +302,7 @@ public class ContentCluster extends AbstractConfigProducer<AbstractConfigProduce } clusterControllers = admin.getClusterControllers(); } - else { // self-hosted: Put cluser controller on config servers or use explicit cluster controllers + else { // self-hosted: Put cluster controller on config servers or use explicit cluster controllers if (admin.getClusterControllers() == null) { var hosts = admin.getConfigservers().stream().map(s -> s.getHostResource()).collect(toList()); if (hosts.size() > 1) { diff --git a/config-model/src/main/java/com/yahoo/vespa/model/content/storagecluster/FileStorProducer.java b/config-model/src/main/java/com/yahoo/vespa/model/content/storagecluster/FileStorProducer.java index fb4016f4cf4..ff905187969 100644 --- a/config-model/src/main/java/com/yahoo/vespa/model/content/storagecluster/FileStorProducer.java +++ b/config-model/src/main/java/com/yahoo/vespa/model/content/storagecluster/FileStorProducer.java @@ -44,11 +44,11 @@ public class FileStorProducer implements StorFilestorConfig.Producer { private final Integer numThreads; private final ContentCluster cluster; - private final int reponseNumThreads; + private final int responseNumThreads; private final StorFilestorConfig.Response_sequencer_type.Enum responseSequencerType; private final double persistenceThrottlingWsDecrementFactor; private final double persistenceThrottlingWsBackoff; - private final int persistenceThrottingWindowSize; + private final int persistenceThrottlingWindowSize; private final double persistenceThrottlingWsResizeRate; private final boolean persistenceThrottlingOfMergeFeedOps; private final boolean useAsyncMessageHandlingOnSchedule; @@ -64,11 +64,11 @@ public class FileStorProducer implements StorFilestorConfig.Producer { public FileStorProducer(ModelContext.FeatureFlags featureFlags, ContentCluster parent, Integer numThreads) { this.numThreads = numThreads; this.cluster = parent; - this.reponseNumThreads = featureFlags.defaultNumResponseThreads(); + this.responseNumThreads = featureFlags.defaultNumResponseThreads(); this.responseSequencerType = convertResponseSequencerType(featureFlags.responseSequencerType()); this.persistenceThrottlingWsDecrementFactor = featureFlags.persistenceThrottlingWsDecrementFactor(); this.persistenceThrottlingWsBackoff = featureFlags.persistenceThrottlingWsBackoff(); - this.persistenceThrottingWindowSize = featureFlags.persistenceThrottlingWindowSize(); + this.persistenceThrottlingWindowSize = featureFlags.persistenceThrottlingWindowSize(); this.persistenceThrottlingWsResizeRate = featureFlags.persistenceThrottlingWsResizeRate(); this.persistenceThrottlingOfMergeFeedOps = featureFlags.persistenceThrottlingOfMergeFeedOps(); this.useAsyncMessageHandlingOnSchedule = featureFlags.useAsyncMessageHandlingOnSchedule(); @@ -80,15 +80,15 @@ public class FileStorProducer implements StorFilestorConfig.Producer { builder.num_threads(numThreads); } builder.enable_multibit_split_optimalization(cluster.getPersistence().enableMultiLevelSplitting()); - builder.num_response_threads(reponseNumThreads); + builder.num_response_threads(responseNumThreads); builder.response_sequencer_type(responseSequencerType); builder.use_async_message_handling_on_schedule(useAsyncMessageHandlingOnSchedule); var throttleBuilder = new StorFilestorConfig.Async_operation_throttler.Builder(); throttleBuilder.window_size_decrement_factor(persistenceThrottlingWsDecrementFactor); throttleBuilder.window_size_backoff(persistenceThrottlingWsBackoff); - if (persistenceThrottingWindowSize > 0) { - throttleBuilder.min_window_size(persistenceThrottingWindowSize); - throttleBuilder.max_window_size(persistenceThrottingWindowSize); + if (persistenceThrottlingWindowSize > 0) { + throttleBuilder.min_window_size(persistenceThrottlingWindowSize); + throttleBuilder.max_window_size(persistenceThrottlingWindowSize); } throttleBuilder.resize_rate(persistenceThrottlingWsResizeRate); throttleBuilder.throttle_individual_merge_feed_ops(persistenceThrottlingOfMergeFeedOps); diff --git a/config-model/src/main/java/com/yahoo/vespa/model/content/storagecluster/IntegrityCheckerProducer.java b/config-model/src/main/java/com/yahoo/vespa/model/content/storagecluster/IntegrityCheckerProducer.java index 588e7c55ab9..4f81bbf165f 100644 --- a/config-model/src/main/java/com/yahoo/vespa/model/content/storagecluster/IntegrityCheckerProducer.java +++ b/config-model/src/main/java/com/yahoo/vespa/model/content/storagecluster/IntegrityCheckerProducer.java @@ -16,9 +16,9 @@ public class IntegrityCheckerProducer implements StorIntegritycheckerConfig.Prod } } - private Integer startTime; - private Integer stopTime; - private String weeklyCycle; + private final Integer startTime; + private final Integer stopTime; + private final String weeklyCycle; IntegrityCheckerProducer(Integer startTime, Integer stopTime, String weeklyCycle) { this.startTime = startTime; diff --git a/config-model/src/main/java/com/yahoo/vespa/model/content/storagecluster/StorServerProducer.java b/config-model/src/main/java/com/yahoo/vespa/model/content/storagecluster/StorServerProducer.java index 2fca964a995..e66f2c48f26 100644 --- a/config-model/src/main/java/com/yahoo/vespa/model/content/storagecluster/StorServerProducer.java +++ b/config-model/src/main/java/com/yahoo/vespa/model/content/storagecluster/StorServerProducer.java @@ -3,10 +3,8 @@ package com.yahoo.vespa.model.content.storagecluster; import com.yahoo.config.model.api.ModelContext; import com.yahoo.vespa.config.content.core.StorServerConfig; -import com.yahoo.vespa.model.content.cluster.ContentCluster; import com.yahoo.vespa.model.builder.xml.dom.ModelElement; - -import java.util.Optional; +import com.yahoo.vespa.model.content.cluster.ContentCluster; /** * Serves config for stor-server for storage clusters (clusters of storage nodes). @@ -28,11 +26,10 @@ public class StorServerProducer implements StorServerConfig.Producer { } } - private String clusterName; + private final String clusterName; private Integer maxMergesPerNode; private Integer queueSize; - private Integer bucketDBStripeBits; - private StorServerConfig.Merge_throttling_policy.Type.Enum mergeThrottlingPolicyType; + private final StorServerConfig.Merge_throttling_policy.Type.Enum mergeThrottlingPolicyType; private StorServerProducer setMaxMergesPerNode(Integer value) { if (value != null) { @@ -46,10 +43,6 @@ public class StorServerProducer implements StorServerConfig.Producer { } return this; } - private StorServerProducer setBucketDBStripeBits(Integer value) { - bucketDBStripeBits = value; - return this; - } private static StorServerConfig.Merge_throttling_policy.Type.Enum toThrottlePolicyType(String policyType) { try { @@ -80,9 +73,6 @@ public class StorServerProducer implements StorServerConfig.Producer { if (queueSize != null) { builder.max_merge_queue_size(queueSize); } - if (bucketDBStripeBits != null) { - builder.content_node_bucket_db_stripe_bits(bucketDBStripeBits); - } // TODO set throttle policy params based on existing or separate flags builder.merge_throttling_policy(new StorServerConfig.Merge_throttling_policy.Builder().type(mergeThrottlingPolicyType)); } diff --git a/config-model/src/main/java/com/yahoo/vespa/model/content/storagecluster/StorageCluster.java b/config-model/src/main/java/com/yahoo/vespa/model/content/storagecluster/StorageCluster.java index a3f32fcb44b..da82a69842a 100644 --- a/config-model/src/main/java/com/yahoo/vespa/model/content/storagecluster/StorageCluster.java +++ b/config-model/src/main/java/com/yahoo/vespa/model/content/storagecluster/StorageCluster.java @@ -30,7 +30,7 @@ public class StorageCluster extends AbstractConfigProducer<StorageNode> { public static class Builder extends VespaDomBuilder.DomConfigProducerBuilder<StorageCluster> { @Override - protected StorageCluster doBuild(DeployState deployState, AbstractConfigProducer ancestor, Element producerSpec) { + protected StorageCluster doBuild(DeployState deployState, AbstractConfigProducer<?> ancestor, Element producerSpec) { final ModelElement clusterElem = new ModelElement(producerSpec); final ContentCluster cluster = (ContentCluster)ancestor; @@ -51,7 +51,7 @@ public class StorageCluster extends AbstractConfigProducer<StorageNode> private final StorVisitorProducer storVisitorProducer; private final PersistenceProducer persistenceProducer; - StorageCluster(AbstractConfigProducer parent, + StorageCluster(AbstractConfigProducer<?> parent, String clusterName, FileStorProducer fileStorProducer, IntegrityCheckerProducer integrityCheckerProducer, |