aboutsummaryrefslogtreecommitdiffstats
path: root/configdefinitions/src/vespa/fleetcontroller.def
blob: c3e161eb038dcf9c20fdd71f4ab7e1113d52d9b3 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
namespace=vespa.config.content

## Name of VDS cluster
cluster_name string restart

## The fleetcontroller index. Each fleetcontroller should have a unique index
## which can be used to identify them.
index int restart

## Number of fleet controllers. If more than one, the fleetcontroller needs to
## do master election in order to know which one is active. It then needs to
## know how many exist.
fleet_controller_count int default=1

## The number of seconds we can attempt to talk to zookeeper before our session
## times out and we lose our connection.
zookeeper_session_timeout double default=30.0

## When a master candidate sees the master disappearing from ZooKeeper, it wants
## to take over as master. But, before taking over, the master should be aware
## that it has lost its zookeeper connection, so it will stop reacting as
## master. Setting this to 2-3 times the zookeeper session timeout is suggested.
## (Set in number of seconds)
master_zookeeper_cooldown_period double default=60.0

## Sets how many fleetcontrollers will gather state. A fleetcontroller
## gathering state can take over quicker should the master fail.
## If set to 1, only master will gather state. If set higher, others will
## also do so, prioritizing those fleetcontrollers likely to be the ones to
## take over if the master fails.
# TODO: Deprecated, not used anymore, remove in Vespa 9
state_gather_count int default=1

## Location of ZooKeeper servers
zookeeper_server string restart

## RPC Port used by fleetcontroller
rpc_port int default=6500 restart

## Port where fleetcontroller listens for HTTP status request
http_port int default=0 restart

## Maximum number of milliseconds a storage node will be automatically reported
## in maintenance due to the node recently having been available. (If 0, no time at all)
storage_transition_time int default=30000

## Maximum number of milliseconds a distributor will be automatically reported
## in maintenance due to the node recently having been available. (If 0, no time at all)
##
## The current default is 0. It should probably be higher once we know it is
## working correctly.
distributor_transition_time int default=0

## Maximum number of milliseconds allowed between progress increases during
## initialization. If no progress has been made during this time period, the
## node will be considered down.
##
## Currently disabled, as 5.0 nodes get load while initializing, which may have
## higher priority than initializing, so nodes can stay in init for a long time.
init_progress_time int default=0

## Minimum time (in ms) between system state updates. To limit updates in a
## system where a lot is happening at the same time, this value will make sure
## we don't change the state too often.
min_time_between_new_systemstates int default=10000

## The maximum amount of premature crashes a node is allowed to have in a row
## before the fleetcontroller disables that node.
max_premature_crashes int default=100000

## If a node has been down or up this many milliseconds, clear the premature
## crash count of a node and consider the node as stable
stable_state_time_period int default=7200000

## The maximum number of events to keep in the event log
event_log_max_size int default=1024

## The maximum number of node events to keep in the node event log per node
event_node_log_max_size int default=1024

## The total number of distributor nodes that can exist. If 0, we don't know and
## will use the highest distributor index number we have ever seen + 1.
total_distributor_count int default=0

## The total number of storage nodes that can exist. If 0, we don't know and
## will use the highest storage index number we have ever seen + 1.
total_storage_count int default=0

## The minimum number of distributor nodes that should be up for the cluster
## state to be up. (Retired nodes count as up in this case)
min_distributors_up_count int default=1

## The minimum number of storage nodes that should be up for the cluster state
## to be up (Retired nodes count as up in this case)
min_storage_up_count int default=1

## The minimum ratio of known distributor nodes that should be up (or retired)
## for the cluster state to stay up.
min_distributor_up_ratio double default=0.01

## The minimum ratio of known storage nodes that should be up (or retired) for
## the cluster state to stay up.
min_storage_up_ratio double default=0.01

## Seconds to sleep after doing a work cycle where we did no work. Some
## events do not interrupt the sleeping, such as slobrok changes, so this
## shouldn't be set too high.
cycle_wait_time double default=0.1

## Minimum time to pass in seconds before broadcasting our first systemstate as
## a new fleetcontroller. Will broadcast earlier than this if we have gathered
## state from all before this. To prevent disturbance when taking over as
## fleetcontroller, give nodes a bit of time to answer so we don't temporarily
## report nodes as down.  See also max_slobrok_disconnect_grace_period and
## get_node_state_request_timeout.
min_time_before_first_system_state_broadcast double default=30.0

## Request timeout of node state requests. Keeping a high timeout allows us to
## always have a pending operation with very low cost. Keeping a low timeout is
## good to detect issues like packet loss. The default tries to balance the two
## by not resending too often, but detecting packet loss within a minute at
## least. If we can guarantee RPC layer to fail on packet loss within
## reasonable time we should increase this default.
get_node_state_request_timeout double default=120.0

## If a node is out of slobrok longer than this time period, assume the node
## is down, even if we have a pending node state request to it. Slobrok does
## a bit more keep alive checking than fleetcontroller, so it is possible that
## the node disappears from slobrok while it still looks ok in fleetcontroller.
max_slobrok_disconnect_grace_period double default=60.0

## Whether to show system states that have never been sent to storage nodes in
## the event log.
show_local_systemstates_in_event_log bool default=true

## The ideal number of distribution bits this system should have
ideal_distribution_bits int default=16

## Minimum ratio of nodes that have to be available (i.e. not Down) in any
## hierarchic content cluster group. If a higher ratio than this is Down at
## any point, the remaining nodes in the group will be automatically marked
## as down. Group nodes will automatically be taken back up as soon as node
## availability has been restored above the given threshold.
## Default is 0, i.e. functionality is for all intents and purposes disabled.
min_node_ratio_per_group double default=0.0

## If a cluster controller task has a dependency on a given cluster state
## version being published and ACKed by the cluster, it will be put on a wait
## queue while holding up the container thread associated with the task.
## This config specifies the maximum time a task can be held in this queue
## before being automatically failed out, if a version has not been ACKed
## within this duration.
max_deferred_task_version_wait_time_sec double default=30.0

## Whether or not the content cluster the controller has responsibility for
## contains any document types that are tagged as global. If this is true,
## global document-specific behavior is enabled that marks nodes down in the
## default space if they have merges pending in the global bucket space.
cluster_has_global_document_types bool default=false

## The minimum merge completion ratio of buckets in a bucket space before it is considered complete.
##
## Bucket merges are considered complete when:
## ((buckets_total - buckets_pending) / buckets_total) >= min_merge_completion_ratio
min_merge_completion_ratio double default=1.0

## If enabled, cluster state transitions are performed as two distinct phases:
##
##  1) state bundle propagation and bucket info gathering phase
##  2) state activation phase, which is not performed until all nodes have completed phase 1
##
## This is to enable read-only operations to pass through the system during phase 1
## while nodes await phase 2. If this feature is disabled, nodes will implicitly do
## phase 2 as part of phase 1 at their own leisure, which means that actual state
## activation may happen at wildly different times throughout the cluster. The 2 phase
## transition logic aims to minimize the window of time where active states diverge.
enable_two_phase_cluster_state_transitions bool default=false

# If enabled, the cluster controller observes reported (categorized) resource usage from content nodes (via host info),
# and decides whether external feed should be blocked (or unblocked) in the entire cluster.
#
# Each resource category has a limit, which is specified in cluster_feed_block_limit.
# If one resource category from one content node is above the configured limit, feed is blocked.
# This information is pushed to all distributor nodes via a new cluster state bundle.
# The distributor nodes handle the actual feed blocking based on this information.
enable_cluster_feed_block bool default=true

# Contains all resource categories (key) with its corresponding feed block limit (value)
# used when the cluster controller decides whether external feed should be blocked (or unblocked) in the entire cluster.
#
# The keys used must match the similar keys in the host info JSON structure.
# All limits are numbers between 0.0 and 1.0.
cluster_feed_block_limit{} double

# To avoid having the cluster feed block state flip-flop from nodes that are hovering
# just around the feed block limits, this noise threshold implicitly makes the
# feed block limit value _lower_ for a resource that is already exhausted. I.e. the
# node must reach a lower resource usage than the limit for feed to be unblocked.
# This is in absolute numbers, so 0.01 implies that a block limit of 0.8 effectively
# becomes 0.79 for an already blocked node.
cluster_feed_block_noise_level double default=0.0

# For apps that have several groups this controls how many groups are allowed to
# be down simultaneously in this cluster.
max_number_of_groups_allowed_to_be_down int default=-1