path: root/storage/src/vespa/storage/config/stor-distributormanager.def
# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
namespace=vespa.config.content.core

## Maximum number of ideal state operations scheduled by a distributor.
maxpendingidealstateoperations int default=100
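
## The values in this file can be overridden per application via the generic
## config override mechanism in services.xml. A minimal, hypothetical sketch
## (assuming element names follow the parameter names in this file):
##
##   <config name="vespa.config.content.core.stor-distributormanager">
##     <maxpendingidealstateoperations>200</maxpendingidealstateoperations>
##   </config>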

## The total size of unique documents in a bucket before we split it due to
## being too big. The default is approximately 16 MB. Should be kept in sync
## with stor-filestor.def:bucket_merge_chunk_size.
splitsize int default=16772216

## The maximum number of entries in a file before we should attempt to split
## it. A metadata entry in a slotfile currently uses 40 bytes. It is best to
## choose the split count such that all metadata entries are normally read in
## the initial read. With the default of 1024, metadata will take up around
## 40 kB, and the default initial read is 64 kB, allowing the file to grow a
## bit above the maximum and still have all metadata read in the initial read.
splitcount int default=1024

## The maximum total size of unique documents that allows the system to reduce
## the number of split bits on a bucket, or to join two buckets together. The
## size must be lower than this number, and the document count must be lower
## than joincount.
joinsize int default=16000000

## The maximum number of unique documents that allows for joining (see
## joinsize).
joincount int default=512
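
## A worked example with the defaults above (illustrative only): two sibling
## buckets holding 6 MB / 200 documents each may be joined, since the combined
## 12 MB is below joinsize and the combined 400 documents is below joincount.
## The joined bucket would not immediately re-split either, as 12 MB is also
## below splitsize and 400 is below splitcount, so the default thresholds do
## not cause split/join oscillation.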

## Minimum level of splitting for buckets.
minsplitcount int default=16

## If non-empty, continuously delete all the documents matching this selection.
garbagecollection.selectiontoremove string default=""

## The interval, in seconds, with which each bucket is purged using the
## selection above. If 0, no garbage collection is done.
garbagecollection.interval int default=0
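
## A hypothetical example, assuming a document type named "music" with a long
## field "expire_at" holding an expiry time in epoch seconds: the following
## services.xml override sketch removes all expired documents, revisiting each
## bucket once per hour (note the XML-escaped '<'):
##
##   <config name="vespa.config.content.core.stor-distributormanager">
##     <garbagecollection>
##       <selectiontoremove>music.expire_at &lt; now()</selectiontoremove>
##       <interval>3600</interval>
##     </garbagecollection>
##   </config>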

## If false, don't do splits inline with feeding.
inlinebucketsplitting bool default=true

## Maximum number of nodes involved in a merge operation. Currently, this
## cannot be more than 16 nodes due to protocol limitations. However,
## decreasing the max may be useful if 16-node merges end up too expensive.
maximum_nodes_per_merge int default=16

## For internal in-process debugging it may be useful not to start the
## distributor thread, making it possible to call tick() manually and run
## single-threaded.
start_distributor_thread bool default=true restart

## The distributor can send joins that "lift" a bucket without any siblings
## higher up in the bucket tree hierarchy. The assumption is that if this
## is done for all sibling-less buckets, they will all eventually reach a
## level in the tree where they do in fact have a sibling and may (if their
## sizes allow) be joined into a single bucket.
enable_join_for_sibling_less_buckets bool default=false

## There exists a distribution edge case where bucket siblings end up having
## non-equal ideal locations. This will normally inhibit join operations, as
## these are only allowed when all nodes have all source buckets involved in
## the join operation. Setting this property to true means such buckets may
## still be joined at the cost of transient inconsistencies for the buckets
## being joined into.
enable_inconsistent_join bool default=false

## For each available node, the distributor will report back to the cluster
## controller a value which indicates the minimum replication factor for any
## bucket contained on said node. This config exposes a way to alter how this
## replication factor is computed.
##
## Valid enum values and their semantics:
##
## TRUSTED - only trusted replicas are counted.
## ANY - any replica present is counted. This may return an overly optimistic
##       view of the system. E.g. if there are 3 replicas, 1 having 1000 docs
##       and 2 having 1 doc, all being out of sync, counting with ANY will still
##       treat this as a minimum replication factor of 3. Conversely, with
##       TRUSTED such a bucket would most likely have a factor of 0 (or 1 iff
##       the trusted status for the replica with 1000 docs is known).
minimum_replica_counting_mode enum { TRUSTED, ANY } default=TRUSTED

## Bucket activation only makes sense for indexed search clusters, but Proton
## may also be run in store-only or streaming mode, in which case it does not
## actually require any activations. If the model infers that Proton is running
## in such a mode, activation will be explicitly disabled.
##
## Activation is always disabled entirely for clusters using VDS as their
## engine, regardless of the value of this setting.
disable_bucket_activation bool default=false

## Maximum clock skew across nodes in the cluster, in whole seconds.
## Used to prevent timestamp collisions during distributor bucket ownership
## transfers.
## Zero means this mechanism is disabled.
max_cluster_clock_skew_sec int default=1

## If set, a distributor will only allow one active operation per document ID
## for puts, updates and removes. This helps prevent issues caused by concurrent
## modifications to documents sent from multiple feed clients.
sequence_mutating_operations bool default=true

## Number of seconds that scheduling of new merge operations should be inhibited
## towards a node if it has indicated that its merge queues are full or it is
## suffering from resource exhaustion.
inhibit_merge_sending_on_busy_node_duration_sec int default=1

## If set, enables potentially stale reads during cluster state transitions where
## buckets change ownership. This also implicitly enables support for two-phase
## cluster state transitions on the distributor.
## For this option to take effect, the cluster controller must also have two-phase
## states enabled.
allow_stale_reads_during_cluster_state_transitions bool default=false

## If greater than zero, injects a thread sleep into certain parts of the bucket
## processing logic. This allows for easier testing of racing edge cases where the
## main distributor thread is CPU-blocked processing large amounts of buckets, but
## without actually needing to use a lot of buckets in the test itself.
## Setting any of these values only makes sense for testing!
simulated_db_pruning_latency_msec int default=0
simulated_db_merging_latency_msec int default=0

## If a bucket is inconsistent and an Update operation is received, a two-phase
## write-repair path is triggered in which a Get is sent to all diverging replicas.
## Once received, the update is applied on the distributor and pushed out to the
## content nodes as Puts.
## Iff this config is set to true AND all Gets return the same timestamp from
## all content nodes, the two-phase update path reverts to the regular fast
## path. Since all replicas of the document were in sync, applying the update
## in-place is considered safe.
## DEPRECATED -- always enabled with 3-phase updates.
restart_with_fast_update_path_if_all_get_timestamps_are_consistent bool default=true

## If set, no merge operations may be generated for any reason by a distributor.
## This is ONLY intended for system testing of certain transient edge cases and
## MUST NOT be set to true in a production environment.
merge_operations_disabled bool default=false

## If set, Get operations that are initiated by the client (i.e. _not_ Get operations
## that are initiated by the distributor) will be forwarded to the backend with
## a flag signalling that weak read consistency may be used. This allows the
## backend to minimize internal locking. The downside is that it's not guaranteed
## to observe the most recent writes to the document, nor to observe an atomically
## consistent view of fields across document versions.
## This is mostly useful in a system that is effectively read-only.
use_weak_internal_read_consistency_for_client_gets bool default=false

## If true, adds an initial metadata-only fetch phase to updates that touch buckets
## with inconsistent replicas. Metadata timestamps are compared and a single full Get
## is sent _only_ to one node with the highest timestamp. Without a metadata phase,
## full Gets would be sent to _all_ nodes.
## Setting this option to true always implicitly enables the fast update restart
## feature, so it's not required to set that config to true, nor will setting it
## to false actually disable the feature.
enable_metadata_only_fetch_phase_for_inconsistent_updates bool default=true

## If a distributor main thread tick is constantly processing requests or responses
## originating from other nodes, setting this value above zero will prevent implicit
## maintenance scans from being done as part of the tick for up to N rounds of ticking.
## This is to reduce the amount of CPU spent on ideal state calculations and bucket DB
## accesses when the distributor is heavily loaded with feed operations.
max_consecutively_inhibited_maintenance_ticks int default=20

## If set, activation of bucket replicas is limited to only those replicas that have
## bucket info consistent with a majority of the other replicas for that bucket.
## Multiple active replicas are a feature only enabled for grouped clusters,
## and this feature is intended to prevent nodes in stale groups (whose buckets are
## likely to be out of sync) from serving query traffic until they have gotten back
## into sync.
## Up to the given number of groups can have their replica activation inhibited
## by this feature. If zero, the feature is functionally disabled.
## If more groups are out of sync than the configured number N, the inhibited groups
## will be the N first groups present in the distribution config.
## Note: this feature only kicks in if the number of groups in the cluster is greater
## than 1.
max_activation_inhibited_out_of_sync_groups int default=0
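
## A worked example (hypothetical): with this value set to 1 in a cluster with
## groups g0, g1 and g2 where both g1 and g2 are out of sync, only the first of
## these in the distribution config (g1) has its replica activation inhibited;
## g2 may still activate its replicas.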

## Specifies the number of stripes over which a distributor internally distributes
## its buckets and operation processing. Every stripe receives its own thread.
## If <= 0, the number of stripes is inferred automatically based on the number of
## CPU cores available. If > 0, the number of stripes is explicitly overridden.
## The stripe count must be a power of two.
num_distributor_stripes int default=0 restart
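
## A hypothetical sketch of an explicit override (pinning four stripes; the
## count must be a power of two, and since this value is flagged 'restart',
## changing it only takes effect after a node restart):
##
##   <config name="vespa.config.content.core.stor-distributormanager">
##     <num_distributor_stripes>4</num_distributor_stripes>
##   </config>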

## If set, the maintenance scheduler will implicitly clear entries from its internal
## bucket maintenance priority database even when no operation can be started for the
## bucket due to being blocked by concurrent operations. This avoids potential head-of-line
## blocking of later buckets in the priority database.
implicitly_clear_bucket_priority_on_schedule bool default=true

## Enables sending merges that are forwarded between content nodes in ideal state node key
## order, instead of strictly increasing node key order (which is the default).
## Even if this config is set to true, unordered merges will only be sent if _all_ nodes
## involved in a given merge have previously reported (as part of bucket info fetching)
## that they support the unordered merge feature.
use_unordered_merge_chaining bool default=true

## If true, inhibits _all_ merges to buckets in the default bucket space if the current
## cluster state bundle indicates that global merges are pending in the cluster, i.e.
## one or more nodes are in maintenance mode in the default bucket space but marked up in
## the global bucket space.
inhibit_default_merges_when_global_merges_pending bool default=true

## If true, garbage collection is performed in two phases (metadata gathering and deletion)
## instead of just a single phase. Two-phase GC allows for ensuring the same set of documents
## is deleted across all nodes and explicitly takes write locks on the distributor to prevent
## concurrent feed ops to GC'd documents from potentially creating inconsistencies.
## Two-phase GC is used iff all replica content nodes support the feature AND it's enabled
## by this config.
enable_two_phase_garbage_collection bool default=true

## If true, a conditional Put or Remove operation received for a bucket with inconsistent
## replicas will trigger an implicit distributed condition probe to resolve the outcome of
## the condition across all divergent replicas.
enable_condition_probing bool default=true

## If true, changes in the cluster where a subset of the nodes become unavailable or buckets
## change ownership between distributors will trigger an explicit cancellation of all pending
## requests partially or fully "invalidated" by such a change.
enable_operation_cancellation bool default=false