storage/src/vespa/storage/config/stor-distributormanager.def


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320

# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
namespace=vespa.config.content.core

## Maximum number of ideal state operations scheduled by a distributor.
maxpendingidealstateoperations int default=100

## The total size of unique documents in a bucket before we split it due to
## being too big. The default is approximately 16 MB. Should be kept in sync
## with stor-filestor.def:bucket_merge_chunk_size.
splitsize int default=16772216

## The maximum number of entries in a file before we should attempt to split it.
## A meta data entry in a slotfile currently uses 40 bytes. It is probably
## good to have the split size, such that all meta data entries are normally
## read when you do the initial read. With the default of 1024, meta data will
## take up around 40 kB, and the default initial read is 64 kB, allowing the
## file to grow a bit above max and still all be read in initial read.
splitcount int default=1024

## The maximum size of unique documents that allows the system to reduce
## the number of split bits on the bucket, or join two buckets together.
## The size must be lower than this number, and the count must be lower than
## joincount.
joinsize int default=16000000

## The maximum number of unique documents that allows for joining (see
## joinsize).
joincount int default=512

## Minimum level of splitting for buckets
minsplitcount int default=16

## If non-empty, continuously delete all the documents matching this selection.
garbagecollection.selectiontoremove string default=""

## The interval with which each bucket is purged using the selection above.
## If 0, no garbage collection is done.
garbagecollection.interval int default=0

## If false, don't do splits inline with feeding.
inlinebucketsplitting bool default=true

## List of state checkers (ideal state generators) that should be ignored in the cluster.
## One or more of the following (case insensitive):
##
## SynchronizeAndMove
## DeleteExtraCopies
## JoinBuckets
## SplitBucket
## SplitInconsistentBuckets
## SetBucketState
## GarbageCollection
blockedstatecheckers[] string restart

## Whether or not distributor should issue reverts when operations partially
## fail.
enable_revert bool default=true

## Maximum nodes involved in a merge operation. Currently, this cannot be more
## than 16 nodes due to protocol limitations. However, decreasing the max may
## be useful if 16-node merges end up being too expensive.
maximum_nodes_per_merge int default=16

## For internal in-process debugging, it may be useful to not start the
## distributor thread, to be able to call tick() manually and run single-threaded.
start_distributor_thread bool default=true restart

## The number of tick calls done before a wait is done. This can be
## set higher than 10 for the distributor to improve speed of bucket iterations
## while still keeping CPU load low/moderate.
ticks_before_wait int default=10

## The sleep time between ticks if there are no more queued tasks.
ticks_wait_time_ms int default=1

## Max processing time used by deadlock detector.
max_process_time_ms int default=5000

## Allow overriding default priorities of certain maintenance operations.
## This is an advanced feature, do not touch this unless you have a very good
## reason to do so! Configuring these values wrongly may cause starvation of
## important operations, leading to unpredictable behavior and/or data loss.
##
## Merge used to move data to ideal location
priority_merge_move_to_ideal_node int default=165

## Merge for copies that have gotten out of sync with each other
priority_merge_out_of_sync_copies int default=120

## Merge for restoring redundancy of copies
priority_merge_too_few_copies int default=120

## Merge that involves a global bucket. There are generally significantly fewer such
## buckets than default-space buckets, and searches to documents in the default space
## may depend on the presence of (all) global documents. Consequently, this priority
## should be higher (i.e. numerically smaller) than that of regular merges.
priority_merge_global_buckets int default=115

## Copy activation when there are no other active copies (likely causing
## lack of search coverage for that bucket)
priority_activate_no_existing_active int default=100

## Copy activation when there is already an active copy for the bucket.
priority_activate_with_existing_active int default=100

## Deletion of bucket copy.
priority_delete_bucket_copy int default=120

## Joining caused by bucket siblings getting sufficiently small to fit into a
## single bucket.
priority_join_buckets int default=155

## Splitting caused by system increasing its minimum distribution bit count.
priority_split_distribution_bits int default=200

## Splitting due to bucket exceeding max document count or byte size (see
## splitcount and splitsize config values)
priority_split_large_bucket int default=175

## Splitting due to buckets being inconsistently split. Should be higher
## priority than the vast majority of external load.
priority_split_inconsistent_bucket int default=110

## Background garbage collection. Should be lower priority than external load
## and other ideal state operations (aside from perhaps minimum bit splitting).
priority_garbage_collection int default=200

## The distributor can send joins that "lift" a bucket without any siblings
## higher up in the bucket tree hierarchy. The assumption is that if this
## is done for all sibling-less buckets, they will all eventually reach a
## level in the tree where they do in fact have a sibling and may (if their
## sizes allow) be joined into a single bucket.
enable_join_for_sibling_less_buckets bool default=false

## There exists a distribution edge case where bucket siblings end up having
## non-equal ideal locations. This will normally inhibit join operations, as
## these are only allowed when all nodes have all source buckets involved in
## the join operation. Setting this property to true means such buckets may
## still be joined at the cost of transient inconsistencies for the buckets
## being joined into.
enable_inconsistent_join bool default=false

## The distributor host info reporter may be disabled entirely, in which case
## no per-node statistics for merges, latencies or bucket replication factors
## will be reported back to the cluster controller. Disabling this may make
## sense in large clusters that do not make use of these reports directly or
## indirectly, as it causes potentially significant processing overhead on the
## cluster controller.
## This host reporter must never be disabled on a Hosted Vespa system, or
## automatic upgrades will stall.
enable_host_info_reporting bool default=true

## For each available node, the distributor will report back to the cluster
## controller a value which indicates the minimum replication factor for any
## bucket contained on said node. This config exposes a way to alter how this
## replication factor is computed.
##
## Valid enum values and their semantics:
##
## TRUSTED - only trusted replicas are counted.
## ANY - any replica present is counted. This may return an overly optimistic
##       view of the system. E.g. if there are 3 replicas, 1 having 1000 docs
##       and 2 having 1 doc, all being out of sync, counting with ANY will still
##       treat this as a minimum replication factor of 3. Conversely, with
##       TRUSTED such a bucket would most likely have a factor of 0 (or 1 iff
##       the trusted status for the replica with 1000 docs is known).
minimum_replica_counting_mode enum { TRUSTED, ANY } default=TRUSTED

## Bucket activation only makes sense for indexed search clusters, but Proton
## may also be run in store-only or streaming mode, in which case it does not
## actually require any activations. If the model infers that Proton is running
## in such a mode, activation will be explicitly disabled.
##
## Activation is always disabled entirely for clusters using VDS as their
## engine, regardless of the value of this setting.
disable_bucket_activation bool default=false


## Maximum clock skew across nodes in the cluster, in whole seconds.
## Used to prevent timestamp collisions during distributor bucket ownership
## transfers.
## Zero means this mechanism is disabled.
max_cluster_clock_skew_sec int default=1

## If set, a distributor will only allow one active operation per document ID
## for puts, updates and removes. This helps prevent issues caused by concurrent
## modifications to documents when sent from multiple feed clients.
sequence_mutating_operations bool default=true

## Number of seconds that scheduling of new merge operations should be inhibited
## towards a node if it has indicated that its merge queues are full or it is
## suffering from resource exhaustion.
inhibit_merge_sending_on_busy_node_duration_sec int default=1

## If set, enables potentially stale reads during cluster state transitions where
## buckets change ownership. This also implicitly enables support for two-phase
## cluster state transitions on the distributor.
## For this option to take effect, the cluster controller must also have two-phase
## states enabled.
allow_stale_reads_during_cluster_state_transitions bool default=false

## If greater than zero, injects a thread sleep into certain parts of the bucket
## processing logic. This allows for easier testing of racing edge cases where the
## main distributor thread is CPU-blocked processing large amounts of buckets, but
## without actually needing to use a lot of buckets in the test itself.
## Setting any of these values only makes sense for testing!
simulated_db_pruning_latency_msec int default=0
simulated_db_merging_latency_msec int default=0

## Whether to use a B-tree data structure for the distributor bucket database instead
## of the legacy database. Setting this option may trigger alternate code paths for
## read only operations, as the B-tree database is thread safe for concurrent reads.
use_btree_database bool default=true restart

## If a bucket is inconsistent and an Update operation is received, a two-phase
## write-repair path is triggered in which a Get is sent to all diverging replicas.
## Once received, the update is applied on the distributor and pushed out to the
## content nodes as Puts.
## Iff this config is set to true AND all Gets return the same timestamp from all
## content nodes, the two-phase update path reverts back to the regular fast path.
## Since all replicas of the document were in sync, applying the update in-place
## shall be considered safe.
## DEPRECATED -- always enabled with 3-phase updates.
restart_with_fast_update_path_if_all_get_timestamps_are_consistent bool default=true

## If set, no merge operations may be generated for any reason by a distributor.
## This is ONLY intended for system testing of certain transient edge cases and
## MUST NOT be set to true in a production environment.
merge_operations_disabled bool default=false

## If set, Get operations that are initiated by the client (i.e. _not_ Get operations
## that are initiated by the distributor) will be forwarded to the backend with
## a flag signalling that weak read consistency may be used. This allows the
## backend to minimize internal locking. The downside is that it's not guaranteed
## to observe the most recent writes to the document, nor to observe an atomically
## consistent view of fields across document versions.
## This is mostly useful in a system that is effectively read-only.
use_weak_internal_read_consistency_for_client_gets bool default=false

## If true, adds an initial metadata-only fetch phase to updates that touch buckets
## with inconsistent replicas. Metadata timestamps are compared and a single full Get
## is sent _only_ to one node with the highest timestamp. Without a metadata phase,
## full gets would be sent to _all_ nodes.
## Setting this option to true always implicitly enables the fast update restart
## feature, so it's not required to set that config to true, nor will setting it
## to false actually disable the feature.
enable_metadata_only_fetch_phase_for_inconsistent_updates bool default=true

## If a distributor main thread tick is constantly processing requests or responses
## originating from other nodes, setting this value above zero will prevent implicit
## maintenance scans from being done as part of the tick for up to N rounds of ticking.
## This is to reduce the amount of CPU spent on ideal state calculations and bucket DB
## accesses when the distributor is heavily loaded with feed operations.
max_consecutively_inhibited_maintenance_ticks int default=20

## If set, pending merges to buckets in the global bucket space will be prioritized
## higher than merges to buckets in the default bucket space. This ensures that global
## documents will be kept in sync without being starved by non-global documents.
## Note that enabling this feature risks starving default bucket space merges if a
## resource exhaustion case prevents global merges from completing.
## This is a live config for that reason, i.e. it can be disabled in an emergency
## situation if needed.
prioritize_global_bucket_merges bool default=true

## If set, activation of bucket replicas is limited to only those replicas that have
## bucket info consistent with a majority of the other replicas for that bucket.
## Multiple active replicas is only a feature that is enabled for grouped clusters,
## and this feature is intended to prevent nodes in stale groups (whose buckets are
## likely to be out of sync) from serving query traffic until they have gotten back
## into sync.
## Up to the given number of groups can have their replica activation inhibited
## by this feature. If zero, the feature is functionally disabled.
## If more groups are out of sync than the configured number N, the inhibited groups
## will be the N first groups present in the distribution config.
## Note: this feature only kicks in if the number of groups in the cluster is greater
## than 1.
max_activation_inhibited_out_of_sync_groups int default=0

## Specifies the number of stripes over which a distributor internally distributes
## its buckets and operation processing. Every stripe receives its own thread.
## If <= 0, the number of stripes is inferred automatically based on the number of
## CPU cores available. If > 0, the number of stripes is explicitly overridden.
## Stripe counts must be a power of two.
num_distributor_stripes int default=0 restart

## If set, the maintenance scheduler will implicitly clear entries from its internal
## bucket maintenance priority database even when no operation can be started for the
## bucket due to being blocked by concurrent operations. This avoids potential head-of-line
## blocking of later buckets in the priority database.
implicitly_clear_bucket_priority_on_schedule bool default=true

## Enables sending merges that are forwarded between content nodes in ideal state node key
## order, instead of strictly increasing node key order (which is the default).
## Even if this config is set to true, unordered merges will only be sent if _all_ nodes
## involved in a given merge have previously reported (as part of bucket info fetching)
## that they support the unordered merge feature.
use_unordered_merge_chaining bool default=true

## If true, inhibits _all_ merges to buckets in the default bucket space if the current
## cluster state bundle indicates that global merges are pending in the cluster, i.e.
## one or more nodes are in maintenance mode in the default bucket space but marked up in
## the global bucket space.
inhibit_default_merges_when_global_merges_pending bool default=true

## If true, garbage collection is performed in two phases (metadata gathering and deletion)
## instead of just a single phase. Two-phase GC allows for ensuring the same set of documents
## is deleted across all nodes and explicitly takes write locks on the distributor to prevent
## concurrent feed ops to GC'd documents from potentially creating inconsistencies.
## Two-phase GC is used iff all replica content nodes support the feature AND it's enabled
## by this config.
enable_two_phase_garbage_collection bool default=true

## If true, a conditional Put or Remove operation received for a bucket with inconsistent
## replicas will trigger an implicit distributed condition probe to resolve the outcome of
## the condition across all divergent replicas.
enable_condition_probing bool default=true

## If true, changes in the cluster where a subset of the nodes become unavailable or buckets
## change ownership between distributors will trigger an explicit cancellation of all pending
## requests partially or fully "invalidated" by such a change.
enable_operation_cancellation bool default=false