aboutsummaryrefslogtreecommitdiffstats
path: root/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java
blob: e63531229d604fee4e1c67710fd96af15a2f180d (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.clustercontroller.core;

import com.yahoo.jrt.slobrok.api.BackOffPolicy;
import com.yahoo.vdslib.distribution.ConfiguredNode;
import com.yahoo.vdslib.distribution.Distribution;
import com.yahoo.vdslib.state.NodeType;

import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.time.Duration;
import java.util.Collection;
import java.util.Collections;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.stream.Collectors;

/**
 * This class represents all the options that can be set in the fleetcontroller.
 * Tests typically just generate an instance of this object to use in fleet controller for testing.
 * A real application generates this object from config, and on config updates, posts new options to the
 * fleet controller.
 *
 * Almost all options are exposed as public mutable fields. {@link #clone()} copies the mutable containers
 * (node set, transition time map, slobrok spec array, feed block limits) so that a clone can be modified
 * without affecting the instance it was cloned from.
 */
public class FleetControllerOptions implements Cloneable {

    // TODO: Make fields private

    public String fleetControllerConfigId;
    public String slobrokConfigId;

    public String clusterName;
    public int fleetControllerIndex = 0;
    public int fleetControllerCount = 1;
    public int stateGatherCount = 2;

    // TODO: This cannot be null but nonnull is not verified
    public String[] slobrokConnectionSpecs;
    public int rpcPort = 0;
    public int httpPort = 0;
    public int distributionBits = 16;

    /** Timeout before breaking zookeeper session (in milliseconds) */
    public int zooKeeperSessionTimeout = 5 * 60 * 1000;
    /**
     * Timeout between master disappearing before new master will take over.
     * (Grace period to allow old master to detect that it is disconnected from zookeeper)
     */
    public int masterZooKeeperCooldownPeriod = 15 * 1000;

    public String zooKeeperServerAddress = null;

    public int statePollingFrequency = 5000;
    /**
     * Max amount of time to keep a node, that has previously been available
     * in steady state, in maintenance mode, while node is unreachable, before setting it down.
     */
    public Map<NodeType, Integer> maxTransitionTime = new TreeMap<>();

    /**
     * Max amount of time to keep a storage node, that is initializing, in maintenance mode, without any further
     * initializing progress being received, before setting it down.
     */
    public int maxInitProgressTime = 5000;

    public int maxPrematureCrashes = 4;
    public long stableStateTimePeriod = 2 * 60 * 60 * 1000;

    public int eventLogMaxSize = 1024;
    public int eventNodeLogMaxSize = 1024;

    public int minDistributorNodesUp = 1;
    public int minStorageNodesUp = 1;
    public double minRatioOfDistributorNodesUp = 0.50;
    public double minRatioOfStorageNodesUp = 0.50;

    /**
     * Minimum ratio of nodes in an "available" state (up, initializing or maintenance)
     * that shall be present in a group for the group itself to be considered available.
     * If the ratio of available nodes drops under this limit, the group's nodes will be
     * implicitly taken down.
     *
     * A value of 0.0 implies group auto-takedown feature is effectively disabled.
     */
    public double minNodeRatioPerGroup = 0.0;

    /**
     * Milliseconds to sleep after doing a work cycle where we did no work. Some events do not interrupt the sleeping,
     * such as slobrok changes, so shouldn't set this too high.
     */
    public int cycleWaitTime = 100;
    /**
     * Minimum time to pass (in milliseconds) before broadcasting our first systemstate. Set small in unit tests,
     * but should be a few seconds in a real system to prevent new nodes taking over from disturbing the system by
     * putting out a different systemstate just because all nodes don't answer within a single cycle.
     * If all nodes have reported before this time, the min time is ignored and system state is broadcasted.
     */
    public long minTimeBeforeFirstSystemStateBroadcast = 0;

    /**
     * StateRequestTimeout for the request are randomized a bit to avoid congestion on replies. The effective
     * interval is
     * [nodeStateRequestTimeoutEarliestPercentage * nodeStateRequestTimeoutMS / 100,
     *                          nodeStateRequestTimeoutLatestPercentage * nodeStateRequestTimeoutMS / 100].
     */
    public int nodeStateRequestTimeoutMS = 5 * 60 * 1000;
    public int nodeStateRequestTimeoutEarliestPercentage = 80;
    public int nodeStateRequestTimeoutLatestPercentage = 95;
    public int nodeStateRequestRoundTripTimeMaxSeconds = 5;

    public int minTimeBetweenNewSystemStates = 0;
    public boolean showLocalSystemStatesInEventLog = true;

    /** Maximum time a node can be missing from slobrok before it is tagged down. */
    public int maxSlobrokDisconnectGracePeriod = 1000;

    /** Set by tests to retry often. */
    public BackOffPolicy slobrokBackOffPolicy = null;

    public Distribution storageDistribution;

    // TODO: Get rid of this by always getting nodes by distribution.getNodes()
    public Set<ConfiguredNode> nodes;

    private Duration maxDeferredTaskVersionWaitTime = Duration.ofSeconds(30);

    public boolean clusterHasGlobalDocumentTypes = false;

    public boolean enableTwoPhaseClusterStateActivation = false;

    // TODO: Choose a default value
    public double minMergeCompletionRatio = 1.0;

    public int maxDivergentNodesPrintedInTaskErrorMessages = 10;

    public boolean clusterFeedBlockEnabled = false;
    // Resource type -> limit in [0, 1]
    public Map<String, Double> clusterFeedBlockLimit = Collections.emptyMap();

    public double clusterFeedBlockNoiseLevel = 0.01;

    /**
     * Creates options for the named cluster with default values for everything else.
     * The max transition time is initialized to 0 for distributors and 5000 for storage nodes.
     *
     * @param clusterName name of the cluster these options apply to
     * @param nodes the configured nodes; copied into a new sorted set, so later changes to the given
     *              collection are not reflected in these options
     */
    public FleetControllerOptions(String clusterName, Collection<ConfiguredNode> nodes) {
        this.clusterName = clusterName;
        maxTransitionTime.put(NodeType.DISTRIBUTOR, 0);
        maxTransitionTime.put(NodeType.STORAGE, 5000);
        this.nodes = new TreeSet<>(nodes);
    }

    /** Called on reconfiguration of this cluster */
    public void setStorageDistribution(Distribution distribution) {
        this.storageDistribution = distribution;
    }

    public Duration getMaxDeferredTaskVersionWaitTime() {
        return maxDeferredTaskVersionWaitTime;
    }

    public void setMaxDeferredTaskVersionWaitTime(Duration maxDeferredTaskVersionWaitTime) {
        this.maxDeferredTaskVersionWaitTime = maxDeferredTaskVersionWaitTime;
    }

    /** Returns the configured max transition time for storage nodes, or 10000 if none is set in the map. */
    public long storageNodeMaxTransitionTimeMs() {
        return maxTransitionTime.getOrDefault(NodeType.STORAGE, 10_000);
    }

    /**
     * Returns a copy of these options which can be modified independently of this instance.
     *
     * The mutable containers held directly by this class (slobrok connection specs, max transition times,
     * nodes and feed block limits) are copied. The storage distribution and the slobrok back-off policy
     * are still shared with the original, as this class has no way of copying them.
     */
    @Override
    public FleetControllerOptions clone() {
        try {
            FleetControllerOptions copy = (FleetControllerOptions) super.clone();
            if (slobrokConnectionSpecs != null) {
                copy.slobrokConnectionSpecs = slobrokConnectionSpecs.clone();
            }
            if (maxTransitionTime != null) {
                copy.maxTransitionTime = new TreeMap<>(maxTransitionTime);
            }
            if (nodes != null) {
                copy.nodes = new TreeSet<>(nodes);
            }
            if (clusterFeedBlockLimit != null) {
                // Insertion-ordered copy, so the clone lists the limits in the same order as the original
                copy.clusterFeedBlockLimit = new java.util.LinkedHashMap<>(clusterFeedBlockLimit);
            }
            return copy;
        } catch (CloneNotSupportedException e) {
            // This class implements Cloneable, so super.clone() is not expected to throw
            throw new RuntimeException("Will not happen", e);
        }
    }

    /**
     * Returns the given string with a single space inserted after every comma,
     * including leading, trailing and consecutive commas.
     *
     * @param s the comma separated string to reformat; must not be null
     * @return s with every "," replaced by ", "
     */
    public static String splitZooKeeperAddress(String s) {
        return s.replace(",", ", ");
    }

    // NOTE: a single shared instance; java.text.DecimalFormat is documented as not synchronized.
    static DecimalFormat DecimalDot2 = new DecimalFormat("0.00", new DecimalFormatSymbols(Locale.ENGLISH));

    /**
     * Appends an HTML rendering of these options to the given builder: a heading, the two config ids,
     * and a two-column table with one row per option.
     *
     * Values are appended as-is, without HTML escaping.
     *
     * @param sb the builder to append the HTML to
     */
    public void writeHtmlState(StringBuilder sb) {
        // The field is documented as non-null, but that is not enforced; render nothing rather than fail
        String slobrokSpecs = slobrokConnectionSpecs == null ? "" : String.join("<br>", slobrokConnectionSpecs);
        String zooKeeperAddress = (zooKeeperServerAddress == null ? "Not using Zookeeper" : splitZooKeeperAddress(zooKeeperServerAddress));

        sb.append("<h1>Current config</h1>\n")
          .append("<p>Fleet controller config id: ").append(fleetControllerConfigId == null ? null : fleetControllerConfigId.replace("\n", "<br>\n")).append("</p>\n")
          .append("<p>Slobrok config id: ").append(slobrokConfigId == null ? null : slobrokConfigId.replace("\n", "<br>\n")).append("</p>\n")
          .append("<table border=\"1\" cellspacing=\"0\"><tr><th>Property</th><th>Value</th></tr>\n");

        appendRow(sb, "Cluster name", clusterName);
        appendRow(sb, "Fleet controller index", fleetControllerIndex + "/" + fleetControllerCount);
        appendRow(sb, "Number of fleetcontrollers gathering states from nodes", stateGatherCount);

        appendRow(sb, "Slobrok connection spec", slobrokSpecs);
        appendRow(sb, "RPC port", rpcPort == 0 ? "Pick random available" : rpcPort);
        appendRow(sb, "HTTP port", httpPort == 0 ? "Pick random available" : httpPort);
        appendRow(sb, "Master cooldown period", RealTimer.printDuration(masterZooKeeperCooldownPeriod));
        appendRow(sb, "Zookeeper server address", zooKeeperAddress);
        appendRow(sb, "Zookeeper session timeout", RealTimer.printDuration(zooKeeperSessionTimeout));

        appendRow(sb, "Cycle wait time", cycleWaitTime + " ms");
        appendRow(sb, "Minimum time before first clusterstate broadcast as master", RealTimer.printDuration(minTimeBeforeFirstSystemStateBroadcast));
        appendRow(sb, "Minimum time between official cluster states", RealTimer.printDuration(minTimeBetweenNewSystemStates));
        appendRow(sb, "Slobrok mirror backoff policy", slobrokBackOffPolicy == null ? "default" : "overridden");

        appendRow(sb, "Node state request timeout", RealTimer.printDuration(nodeStateRequestTimeoutMS));
        appendRow(sb, "VDS 4.1 node state polling frequency", RealTimer.printDuration(statePollingFrequency));
        // Fall back to defaults instead of failing on unboxing null if the map lacks an entry:
        // 0 is the distributor value set by the constructor, storage uses the accessor's own default
        appendRow(sb, "Maximum distributor transition time", RealTimer.printDuration(maxTransitionTime.getOrDefault(NodeType.DISTRIBUTOR, 0)));
        appendRow(sb, "Maximum storage transition time", RealTimer.printDuration(storageNodeMaxTransitionTimeMs()));
        appendRow(sb, "Maximum initialize without progress time", RealTimer.printDuration(maxInitProgressTime));
        appendRow(sb, "Maximum premature crashes", maxPrematureCrashes);
        appendRow(sb, "Stable state time period", RealTimer.printDuration(stableStateTimePeriod));
        appendRow(sb, "Slobrok disconnect grace period", RealTimer.printDuration(maxSlobrokDisconnectGracePeriod));

        // Both node counts are read from the same configured node set
        appendRow(sb, "Number of distributor nodes", nodes == null ? "Autodetect" : nodes.size());
        appendRow(sb, "Number of storage nodes", nodes == null ? "Autodetect" : nodes.size());
        appendRow(sb, "Minimum distributor nodes being up for cluster to be up", minDistributorNodesUp);
        appendRow(sb, "Minimum storage nodes being up for cluster to be up", minStorageNodesUp);
        appendRow(sb, "Minimum percentage of distributor nodes being up for cluster to be up", DecimalDot2.format(100 * minRatioOfDistributorNodesUp) + " %");
        appendRow(sb, "Minimum percentage of storage nodes being up for cluster to be up", DecimalDot2.format(100 * minRatioOfStorageNodesUp) + " %");

        appendRow(sb, "Show local cluster state changes", showLocalSystemStatesInEventLog);
        appendRow(sb, "Maximum event log size", eventLogMaxSize);
        appendRow(sb, "Maximum node event log size", eventNodeLogMaxSize);
        appendRow(sb, "Wanted distribution bits", distributionBits);
        appendRow(sb, "Max deferred task version wait time", maxDeferredTaskVersionWaitTime.toMillis() + "ms");
        appendRow(sb, "Cluster has global document types configured", clusterHasGlobalDocumentTypes);
        appendRow(sb, "Enable 2-phase cluster state activation protocol", enableTwoPhaseClusterStateActivation);
        appendRow(sb, "Cluster auto feed block on resource exhaustion enabled", clusterFeedBlockEnabled);
        // Explicit locale, so the decimal separator matches the other percentages regardless of JVM default
        appendRow(sb, "Feed block limits",
                  clusterFeedBlockLimit.entrySet().stream()
                          .map(kv -> String.format(Locale.ENGLISH, "%s: %.2f%%", kv.getKey(), kv.getValue() * 100.0))
                          .collect(Collectors.joining("<br/>")));

        sb.append("</table>");
    }

    /** Appends one table row with the property name in the first cell and the right-aligned value in the second. */
    private static void appendRow(StringBuilder sb, String property, Object value) {
        sb.append("<tr><td><nobr>").append(property).append("</nobr></td><td align=\"right\">")
          .append(value).append("</td></tr>");
    }

}