# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
namespace=vespa.config.content

## Name of VDS cluster
cluster_name string restart

## The fleetcontroller index. Each fleetcontroller should have a unique index
## which can be used to identify them.
index int restart

## Number of fleet controllers. If more than one, the fleetcontroller needs to
## do master election in order to know which one is active. It then needs to
## know how many exist.
fleet_controller_count int default=1

## The number of seconds we can attempt to talk to zookeeper before our session
## times out and we lose our connection.
zookeeper_session_timeout double default=30.0

## When a master candidate sees the master disappearing from ZooKeeper, it wants
## to take over as master. But, before taking over, the master should be aware
## that it has lost its zookeeper connection, so it will stop reacting as
## master. It is suggested to set this to 2-3 times the zookeeper session
## timeout. (Set in number of seconds)
master_zookeeper_cooldown_period double default=60.0

## Sets how many fleetcontrollers will gather state. A fleetcontroller
## gathering state can take over quicker should the master fail.
## If set to 1, only master will gather state. If set higher, others will
## also do so, prioritizing those fleetcontrollers likely to be the ones to
## take over if the master fails.
# TODO: Deprecated, not used anymore, remove in Vespa 9
state_gather_count int default=1

## Location of ZooKeeper servers
zookeeper_server string restart

## RPC Port used by fleetcontroller
rpc_port int default=6500 restart

## Port where fleetcontroller listens for HTTP status request
http_port int default=0 restart

## Maximum number of milliseconds a storage node will be automatically reported
## in maintenance due to node recently being available. (If 0, no time at all)
storage_transition_time int default=30000

## Maximum number of milliseconds a distributor will be automatically reported
## in maintenance due to node recently being available. (If 0, no time at all)
##
## Currently the default is 0. It should probably be higher once we know it is
## working correctly.
distributor_transition_time int default=0

## Maximum number of milliseconds allowed between progress increase during
## initialization. If no progress has been made during this time period, the
## node will be considered down.
##
## Currently disabled as 5.0 nodes get load while initializing which may be
## higher priority than initializing, so nodes can stay in init for a long time.
init_progress_time int default=0

## Minimum time (in ms) between system state updates. To limit updates in a
## system where a lot is happening at the same time, this value will make sure
## we don't change the state too often.
min_time_between_new_systemstates int default=10000

## The maximum number of premature crashes a node is allowed to have in a row
## before the fleetcontroller disables that node.
max_premature_crashes int default=100000

## If a node has been down or up this many milliseconds, clear the premature
## crash count of a node and consider the node as stable
stable_state_time_period int default=7200000

## The maximum number of events to keep in the event log
event_log_max_size int default=1024

## The maximum number of node events to keep in the node event log per node
event_node_log_max_size int default=1024

## The total number of distributor nodes that can exist. If 0, we don't know and
## will use the highest distributor index number we have ever seen + 1.
total_distributor_count int default=0

## The total number of storage nodes that can exist. If 0, we don't know and
## will use the highest storage index number we have ever seen + 1.
total_storage_count int default=0

## The minimum number of distributor nodes that should be up for the cluster
## state to be up. (Retired nodes count as up in this case)
min_distributors_up_count int default=1

## The minimum number of storage nodes that should be up for the cluster state
## to be up. (Retired nodes count as up in this case)
min_storage_up_count int default=1

## The minimum ratio of known distributor nodes that should be up (or retired)
## for the cluster state to stay up.
min_distributor_up_ratio double default=0.01

## The minimum ratio of known storage nodes that should be up (or retired) for
## the cluster state to stay up.
min_storage_up_ratio double default=0.01

## Seconds to sleep after doing a work cycle where we did no work. Some
## events do not interrupt the sleeping, such as slobrok changes, so this
## should not be set too high.
cycle_wait_time double default=0.1

## Minimum time to pass in seconds before broadcasting our first systemstate as
## a new fleetcontroller. Will broadcast earlier than this if we have gathered
## state from all before this. To prevent disturbance when taking over as
## fleetcontroller, give nodes a bit of time to answer so we don't temporarily
## report nodes as down. See also max_slobrok_disconnect_grace_period and
## get_node_state_request_timeout.
min_time_before_first_system_state_broadcast double default=30.0

## Request timeout of node state requests. Keeping a high timeout allows us to
## always have a pending operation with very low cost. Keeping a low timeout is
## good to detect issues like packet loss. The default tries to balance the two
## by not resending too often, but detecting packet loss within a minute at
## least. If we can guarantee RPC layer to fail on packet loss within
## reasonable time we should increase this default.
get_node_state_request_timeout double default=120.0

## If a node is out of slobrok longer than this time period, assume the node
## is down, even if we have a pending node state request to it. Slobrok does
## a bit more keep alive checking than fleetcontroller, so it is possible that
## the node disappears from slobrok while it still looks ok in fleetcontroller.
max_slobrok_disconnect_grace_period double default=60.0

## Whether to show system states that have never been sent to storage nodes in
## the event log.
show_local_systemstates_in_event_log bool default=true

## The ideal number of distribution bits this system should have
ideal_distribution_bits int default=16

## Minimum ratio of nodes that have to be available (i.e. not Down) in any
## hierarchic content cluster group. If a higher ratio than this is Down at
## any point, the remaining nodes in the group will be automatically marked
## as down. Group nodes will automatically be taken back up as soon as node
## availability has been restored above the given threshold.
## Default is 0, i.e. functionality is for all intents and purposes disabled.
min_node_ratio_per_group double default=0.0

## If a cluster controller task has a dependency on a given cluster state
## version being published and ACKed by the cluster, it will be put on a wait
## queue while holding up the container thread associated with the task.
## This config specifies the maximum time a task can be held in this queue
## before being automatically failed out, if a version has not been ACKed
## within this duration.
max_deferred_task_version_wait_time_sec double default=30.0

## Whether or not the content cluster the controller has responsibility for
## contains any document types that are tagged as global. If this is true,
## global document-specific behavior is enabled that marks nodes down in the
## default space if they have merges pending in the global bucket space.
cluster_has_global_document_types bool default=false

## The minimum merge completion ratio of buckets in a bucket space before it is considered complete.
##
## Bucket merges are considered complete when:
## ((buckets_total - buckets_pending) / buckets_total) >= min_merge_completion_ratio
min_merge_completion_ratio double default=1.0

## If enabled, cluster state transitions are performed as two distinct phases:
##
##  1) state bundle propagation and bucket info gathering phase
##  2) state activation phase, which is not performed until all nodes have completed phase 1
##
## This is to enable read-only operations to pass through the system during phase 1
## while nodes await phase 2. If this feature is disabled, nodes will implicitly do
## phase 2 as part of phase 1 at their own leisure, which means that actual state
## activation may happen at wildly different times throughout the cluster. The 2 phase
## transition logic aims to minimize the window of time where active states diverge.
enable_two_phase_cluster_state_transitions bool default=false

# If enabled, the cluster controller observes reported (categorized) resource usage from content nodes (via host info),
# and decides whether external feed should be blocked (or unblocked) in the entire cluster.
#
# Each resource category has a limit, which is specified in cluster_feed_block_limit.
# If one resource category from one content node is above the configured limit, feed is blocked.
# This information is pushed to all distributor nodes via a new cluster state bundle.
# The distributor nodes handle the actual feed blocking based on this information.
enable_cluster_feed_block bool default=true

# Contains all resource categories (key) with its corresponding feed block limit (value)
# used when the cluster controller decides whether external feed should be blocked (or unblocked) in the entire cluster.
#
# The keys used must match the similar keys in the host info JSON structure.
# All limits are numbers between 0.0 and 1.0.
cluster_feed_block_limit{} double

# To avoid having the cluster feed block state flip-flop from nodes that are hovering
# just around the feed block limits, this noise threshold implicitly makes the
# feed block limit value _lower_ for a resource that is already exhausted. I.e. the
# node must reach a lower resource usage than the limit for feed to be unblocked.
# This is in absolute numbers, so 0.01 implies that a block limit of 0.8 effectively
# becomes 0.79 for an already blocked node.
cluster_feed_block_noise_level double default=0.0

# For apps that have several groups this controls how many groups are allowed to
# be down simultaneously in this cluster.
max_number_of_groups_allowed_to_be_down int default=-1