# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
namespace=vespa.config.content.core

## Maximum number of ideal state operations scheduled by a distributor.
maxpendingidealstateoperations int default=100

## The total size of unique documents in a bucket before we split it due to
## being too big. By default this is now 16 MB. Should be kept in sync with
## stor-filestor.def:bucket_merge_chunk_size.
splitsize int default=16772216

## The maximum number of entries in a file before we should attempt to split it.
## A meta data entry in a slotfile currently uses 40 bytes. It is probably
## good to choose the split size such that all meta data entries are normally
## read when you do the initial read. With the default of 1024, meta data will
## take up around 40 kB, and the default initial read is 64 kB, allowing the
## file to grow a bit above max and still all be read in the initial read.
splitcount int default=1024

## The maximum size of unique documents that allows the system to reduce
## the number of split bits on the bucket, or join two buckets together.
## The size must be lower than this number, and the count must be lower than
## joincount.
joinsize int default=16000000

## The maximum number of unique documents that allows for joining (see
## joinsize).
joincount int default=512

## Minimum level of splitting for buckets.
minsplitcount int default=16

## If non-empty, continuously delete all the documents matching this selection.
## (See the example below this group of settings.)
garbagecollection.selectiontoremove string default=""

## The interval with which each bucket is purged using the selection above.
## If 0, no garbage collection is done.
garbagecollection.interval int default=0

## If false, don't do splits inline with feeding.
inlinebucketsplitting bool default=true

## List of state checkers (ideal state generators) that should be ignored in the cluster.
## One or more of the following (case insensitive):
##
##   SynchronizeAndMove
##   DeleteExtraCopies
##   JoinBuckets
##   SplitBucket
##   SplitInconsistentBuckets
##   SetBucketState
##   GarbageCollection
blockedstatecheckers[] string restart

## Whether or not the distributor should issue reverts when operations partially
## fail.
enable_revert bool default=true

## Maximum number of nodes involved in a merge operation. Currently, this cannot
## be more than 16 nodes due to protocol limitations. However, decreasing the max
## may be useful if 16-node merges end up too expensive.
maximum_nodes_per_merge int default=16

## For internal in-process debugging, it may be useful not to start the
## distributor thread, to be able to call tick() manually and run single-threaded.
start_distributor_thread bool default=true restart

## The number of tick() calls done before a wait is done. This can be
## set higher than 10 for the distributor to improve the speed of bucket
## iterations while still keeping CPU load low/moderate.
ticks_before_wait int default=10

## The sleep time between ticks if there are no more queued tasks.
ticks_wait_time_ms int default=1

## Max processing time used by the deadlock detector.
max_process_time_ms int default=5000
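## Example for garbagecollection.selectiontoremove above. This is a sketch
## only: the document type "music" and its "timestamp" field are hypothetical
## and must be adapted to the actual schema. The following values would
## continuously remove documents older than 90 days (7776000 seconds),
## checking each bucket once per hour:
##
##   garbagecollection.selectiontoremove "music.timestamp < now() - 7776000"
##   garbagecollection.interval 3600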
## Allow overriding default priorities of certain maintenance operations.
## This is an advanced feature; do not touch this unless you have a very good
## reason to do so! Configuring these values wrongly may cause starvation of
## important operations, leading to unpredictable behavior and/or data loss.
##
## Merge used to move data to its ideal location.
priority_merge_move_to_ideal_node int default=165

## Merge for copies that have gotten out of sync with each other.
priority_merge_out_of_sync_copies int default=120

## Merge for restoring redundancy of copies.
priority_merge_too_few_copies int default=120

## Merge that involves a global bucket. There are generally significantly fewer such
## buckets than default-space buckets, and searches for documents in the default space
## may depend on the presence of (all) global documents. Consequently, this priority
## should be higher (i.e. numerically smaller) than that of regular merges.
priority_merge_global_buckets int default=115

## Copy activation when there are no other active copies (likely causing
## lack of search coverage for that bucket).
priority_activate_no_existing_active int default=100

## Copy activation when there is already an active copy for the bucket.
priority_activate_with_existing_active int default=100

## Deletion of a bucket copy.
priority_delete_bucket_copy int default=120

## Joining caused by bucket siblings getting sufficiently small to fit into a
## single bucket.
priority_join_buckets int default=155

## Splitting caused by the system increasing its minimum distribution bit count.
priority_split_distribution_bits int default=200

## Splitting due to a bucket exceeding the max document count or byte size (see
## the splitcount and splitsize config values).
priority_split_large_bucket int default=175

## Splitting due to buckets being inconsistently split. Should be higher
## priority than the vast majority of external load.
priority_split_inconsistent_bucket int default=110

## Background garbage collection. Should be lower priority than external load
## and other ideal state operations (aside from perhaps minimum bit splitting).
priority_garbage_collection int default=200

## The distributor can send joins that "lift" a bucket without any siblings
## higher up in the bucket tree hierarchy. The assumption is that if this
## is done for all sibling-less buckets, they will all eventually reach a
## level in the tree where they do in fact have a sibling and may (if their
## sizes allow) be joined into a single bucket.
enable_join_for_sibling_less_buckets bool default=false

## There exists a distribution edge case where bucket siblings end up having
## non-equal ideal locations. This will normally inhibit join operations, as
## these are only allowed when all nodes have all source buckets involved in
## the join operation. Setting this property to true means such buckets may
## still be joined at the cost of transient inconsistencies for the buckets
## being joined into.
enable_inconsistent_join bool default=false

## The distributor host info reporter may be disabled entirely, in which case
## no per-node statistics for merges, latencies or bucket replication factors
## will be reported back to the cluster controller. Disabling this may make
## sense in large clusters that do not make use of these reports directly or
## indirectly, as it causes potentially significant processing overhead on the
## cluster controller.
## This host reporter must never be disabled on a Hosted Vespa system, or
## automatic upgrades will stall.
enable_host_info_reporting bool default=true
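## The priority_* values above can be overridden per content cluster from
## services.xml. A minimal sketch, assuming this definition file is named
## stor-distributormanager.def (the enclosing <content> element is omitted):
##
##   <config name="vespa.config.content.core.stor-distributormanager">
##     <priority_merge_out_of_sync_copies>110</priority_merge_out_of_sync_copies>
##   </config>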
## For each available node, the distributor will report back to the cluster
## controller a value which indicates the minimum replication factor for any
## bucket contained on said node. This config exposes a way to alter how this
## replication factor is computed.
##
## Valid enum values and their semantics:
##
##   TRUSTED - only trusted replicas are counted.
##   ANY - any replica present is counted. This may return an overly optimistic
##         view of the system. E.g. if there are 3 replicas, 1 having 1000 docs
##         and 2 having 1 doc, all being out of sync, counting with ANY will
##         still treat this as a minimum replication factor of 3. Conversely,
##         with TRUSTED such a bucket would most likely have a factor of 0
##         (or 1 iff the trusted status for the replica with 1000 docs is known).
minimum_replica_counting_mode enum { TRUSTED, ANY } default=TRUSTED

## Bucket activation only makes sense for indexed search clusters, but Proton
## may also be run in store-only or streaming mode, in which case it does not
## actually require any activations. If the model infers that Proton is running
## in such a mode, activation will be explicitly disabled.
##
## Activation is always disabled entirely for clusters using VDS as their
## engine, regardless of the value of this setting.
disable_bucket_activation bool default=false

## Maximum clock skew across nodes in the cluster, in whole seconds.
## Used to prevent timestamp collisions during distributor bucket ownership
## transfers.
## Zero means this mechanism is disabled.
max_cluster_clock_skew_sec int default=1

## If set, a distributor will only allow one active operation per document ID
## for puts, updates and removes. This helps prevent issues caused by concurrent
## modifications to documents when they are sent from multiple feed clients.
## (See the illustration below this group of settings.)
sequence_mutating_operations bool default=true

## Number of seconds that scheduling of new merge operations should be inhibited
## towards a node if it has indicated that its merge queues are full or it is
## suffering from resource exhaustion.
inhibit_merge_sending_on_busy_node_duration_sec int default=1

## If set, enables potentially stale reads during cluster state transitions where
## buckets change ownership. This also implicitly enables support for two-phase
## cluster state transitions on the distributor.
## For this option to take effect, the cluster controller must also have two-phase
## states enabled.
allow_stale_reads_during_cluster_state_transitions bool default=false

## If greater than zero, injects a thread sleep into certain parts of the bucket
## processing logic. This allows for easier testing of racing edge cases where the
## main distributor thread is CPU-blocked processing large amounts of buckets, but
## without actually needing to use a lot of buckets in the test itself.
## Setting any of these values only makes sense for testing!
simulated_db_pruning_latency_msec int default=0
simulated_db_merging_latency_msec int default=0

## Whether to use a B-tree data structure for the distributor bucket database instead
## of the legacy database. Setting this option may trigger alternate code paths for
## read-only operations, as the B-tree database is thread safe for concurrent reads.
use_btree_database bool default=true restart
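## Illustration of operation sequencing (sequence_mutating_operations=true);
## the document ID below is hypothetical. If client A's update to
## id:mynamespace:music::doc1 is still pending when client B's update to the
## same document arrives, the distributor holds B's update back until A's has
## completed, so the two mutations cannot interleave across replicas.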
## If a bucket is inconsistent and an Update operation is received, a two-phase
## write-repair path is triggered in which a Get is sent to all diverging replicas.
## Once received, the update is applied on the distributor and pushed out to the
## content nodes as Puts.
## Iff this config is set to true AND all Gets return the same timestamp from all
## content nodes, the two-phase update path reverts back to the regular fast path.
## Since all replicas of the document were in sync, applying the update in-place
## shall be considered safe.
## DEPRECATED -- always enabled with 3-phase updates.
restart_with_fast_update_path_if_all_get_timestamps_are_consistent bool default=true

## If set, no merge operations may be generated for any reason by a distributor.
## This is ONLY intended for system testing of certain transient edge cases and
## MUST NOT be set to true in a production environment.
merge_operations_disabled bool default=false

## If set, Get operations that are initiated by the client (i.e. _not_ Get operations
## that are initiated by the distributor) will be forwarded to the backend with
## a flag signalling that weak read consistency may be used. This allows the
## backend to minimize internal locking. The downside is that it's not guaranteed
## to observe the most recent writes to the document, nor to observe an atomically
## consistent view of fields across document versions.
## This is mostly useful in a system that is effectively read-only.
use_weak_internal_read_consistency_for_client_gets bool default=false

## If true, adds an initial metadata-only fetch phase to updates that touch buckets
## with inconsistent replicas. Metadata timestamps are compared and a single full Get
## is sent _only_ to the one node with the highest timestamp. Without a metadata
## phase, full Gets would be sent to _all_ nodes.
## Setting this option to true always implicitly enables the fast update restart
## feature, so it's not required to set that config to true, nor will setting it
## to false actually disable the feature.
## (See the illustration below this group of settings.)
enable_metadata_only_fetch_phase_for_inconsistent_updates bool default=true

## If a distributor main thread tick is constantly processing requests or responses
## originating from other nodes, setting this value above zero will prevent implicit
## maintenance scans from being done as part of the tick for up to N rounds of ticking.
## This is to reduce the amount of CPU spent on ideal state calculations and bucket DB
## accesses when the distributor is heavily loaded with feed operations.
max_consecutively_inhibited_maintenance_ticks int default=20

## If set, pending merges to buckets in the global bucket space will be prioritized
## higher than merges to buckets in the default bucket space. This ensures that global
## documents will be kept in sync without being starved by non-global documents.
## Note that enabling this feature risks starving default bucket space merges if a
## resource exhaustion case prevents global merges from completing.
## This is a live config for that reason, i.e. it can be disabled in an emergency
## situation if needed.
prioritize_global_bucket_merges bool default=true
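## Illustration of the metadata-only fetch phase above, for a hypothetical
## bucket with divergent replicas on content nodes 0, 1 and 2:
##   1. The distributor sends metadata-only Gets to nodes 0, 1 and 2.
##   2. Node 1 reports the highest timestamp, so a single full Get is sent to
##      node 1 only.
##   3. The update is applied to the returned document and pushed out to all
##      replicas as Puts (write repair, as described further up).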
## If set, activation of bucket replicas is limited to only those replicas that have
## bucket info consistent with a majority of the other replicas for that bucket.
## Multiple active replicas is a feature that is only enabled for grouped clusters,
## and this feature is intended to prevent nodes in stale groups (whose buckets are
## likely to be out of sync) from serving query traffic until they have gotten back
## into sync.
## Up to the given number of groups can have their replica activation inhibited
## by this feature. If zero, the feature is functionally disabled.
## If more groups are out of sync than the configured number N, the inhibited groups
## will be the N first groups present in the distribution config.
## Note: this feature only kicks in if the number of groups in the cluster is greater
## than 1.
max_activation_inhibited_out_of_sync_groups int default=0

## Specifies the number of stripes over which a distributor internally distributes
## its buckets and operation processing. Every stripe receives its own thread.
## If <= 0, the number of stripes is inferred automatically based on the number of
## CPU cores available. If > 0, the number of stripes is explicitly overridden.
## The stripe count must be a power of two.
num_distributor_stripes int default=0 restart

## If set, the maintenance scheduler will implicitly clear entries from its internal
## bucket maintenance priority database even when no operation can be started for the
## bucket due to being blocked by concurrent operations. This avoids potential
## head-of-line blocking of later buckets in the priority database.
implicitly_clear_bucket_priority_on_schedule bool default=true

## Enables sending merges that are forwarded between content nodes in ideal state
## node key order, instead of strictly increasing node key order (which is the
## default).
## Even if this config is set to true, unordered merges will only be sent if _all_
## nodes involved in a given merge have previously reported (as part of bucket info
## fetching) that they support the unordered merge feature.
use_unordered_merge_chaining bool default=true

## If true, inhibits _all_ merges to buckets in the default bucket space if the
## current cluster state bundle indicates that global merges are pending in the
## cluster, i.e. one or more nodes are in maintenance mode in the default bucket
## space but marked up in the global bucket space.
inhibit_default_merges_when_global_merges_pending bool default=true

## If true, garbage collection is performed in two phases (metadata gathering and
## deletion) instead of just a single phase. Two-phase GC ensures that the same set
## of documents is deleted across all nodes and explicitly takes write locks on the
## distributor to prevent concurrent feed ops to GC'd documents from potentially
## creating inconsistencies.
## Two-phase GC is only used iff all replica content nodes support the feature AND
## it's enabled by this config.
## (See the illustration at the end of this file.)
enable_two_phase_garbage_collection bool default=true

## If true, a conditional Put or Remove operation received for a bucket with
## inconsistent replicas will trigger an implicit distributed condition probe to
## resolve the outcome of the condition across all divergent replicas.
enable_condition_probing bool default=true

## If true, changes in the cluster where a subset of the nodes become unavailable or
## buckets change ownership between distributors will trigger an explicit cancellation
## of all pending requests partially or fully "invalidated" by such a change.
enable_operation_cancellation bool default=false
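## Illustration of two-phase garbage collection (see
## enable_two_phase_garbage_collection above), sketched from the description
## in this file:
##   Phase 1: metadata for the documents matching the GC selection is gathered
##            from the replica content nodes.
##   Phase 2: the distributor takes write locks for the affected documents and
##            issues the same set of removes to every replica, so concurrent
##            feed operations cannot race the deletion into an inconsistent
##            state.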