aboutsummaryrefslogtreecommitdiffstats
path: root/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculator.java
blob: 549b82edf6c2b16f725fb718f65ac88df82027ff (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.clustercontroller.core;

import com.yahoo.vdslib.distribution.ConfiguredNode;
import com.yahoo.vdslib.distribution.Distribution;
import com.yahoo.vdslib.distribution.Group;
import com.yahoo.vdslib.distribution.GroupVisitor;
import com.yahoo.vdslib.state.ClusterState;
import com.yahoo.vdslib.state.Node;
import com.yahoo.vdslib.state.NodeType;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;

class GroupAvailabilityCalculator {

    /** Group topology whose leaf groups are inspected; never null. */
    private final Distribution distribution;
    /** A leaf group whose ratio of available nodes is below this value gets its up/initializing nodes taken down. */
    private final double minNodeRatioPerGroup;
    /**
     * Number of safely maintained nodes within a single leaf group that triggers maintenance
     * for the remaining nodes of that group. The check is skipped for values that are not positive.
     */
    private final int safeMaintenanceGroupThreshold;
    /** Immutable snapshot of the node indices that have been safely set to maintenance. */
    private final List<Integer> nodesSafelySetToMaintenance;

    /**
     * Creates a calculator from fully resolved settings.
     *
     * @param distribution                  group topology to evaluate, must be non-null
     * @param minNodeRatioPerGroup          minimum ratio of available nodes a leaf group needs to stay up
     * @param safeMaintenanceGroupThreshold count of safely maintained nodes in a group that puts the rest
     *                                      of the group in maintenance
     * @param nodesSafelySetToMaintenance   indices of nodes safely set to maintenance; copied, so later
     *                                      changes to the passed list do not affect this instance
     * @throws NullPointerException if {@code distribution} or {@code nodesSafelySetToMaintenance} is null,
     *                              or if the list contains a null element
     */
    private GroupAvailabilityCalculator(Distribution distribution,
                                        double minNodeRatioPerGroup,
                                        int safeMaintenanceGroupThreshold,
                                        List<Integer> nodesSafelySetToMaintenance) {
        this.distribution = Objects.requireNonNull(distribution, "distribution must be non-null");
        this.minNodeRatioPerGroup = minNodeRatioPerGroup;
        this.safeMaintenanceGroupThreshold = safeMaintenanceGroupThreshold;
        // Defensive copy: the caller (the builder) keeps a mutable list that it may continue to append to,
        // and that must not leak into an already constructed calculator.
        this.nodesSafelySetToMaintenance = List.copyOf(nodesSafelySetToMaintenance);
    }

    /**
     * Fluent builder for {@link GroupAvailabilityCalculator}.
     *
     * <p>Unless overridden, the minimum node ratio per group is 1.0, the safe maintenance
     * group threshold is 2 and no nodes are registered as safely set to maintenance.</p>
     */
    public static class Builder {

        private final List<Integer> nodesSafelySetToMaintenance = new ArrayList<>();
        private Distribution distribution;
        private double minNodeRatioPerGroup = 1.0;
        private int safeMaintenanceGroupThreshold = 2;

        /** Sets the distribution (group topology) the calculator will operate on. */
        Builder withDistribution(Distribution distribution) {
            this.distribution = distribution;
            return this;
        }

        /** Sets the minimum ratio of available nodes a leaf group must have to avoid being taken down. */
        Builder withMinNodeRatioPerGroup(double minRatio) {
            this.minNodeRatioPerGroup = minRatio;
            return this;
        }

        /**
         * If the number of nodes safely set to maintenance is at least this number, the remaining
         * nodes in the group will be set to maintenance (storage nodes) or down (distributors).
         *
         * <p>This feature is disabled if safeMaintenanceGroupThreshold is 0 (not default).</p>
         */
        Builder withSafeMaintenanceGroupThreshold(int safeMaintenanceGroupThreshold) {
            this.safeMaintenanceGroupThreshold = safeMaintenanceGroupThreshold;
            return this;
        }

        /**
         * Registers node indices that have been safely set to maintenance. The given indices are
         * appended to those already registered, so repeated calls accumulate.
         */
        Builder withNodesSafelySetToMaintenance(List<Integer> nodesSafelySetToMaintenance) {
            this.nodesSafelySetToMaintenance.addAll(nodesSafelySetToMaintenance);
            return this;
        }

        /** Creates a calculator from the values collected so far. */
        GroupAvailabilityCalculator build() {
            return new GroupAvailabilityCalculator(
                    distribution,
                    minNodeRatioPerGroup,
                    safeMaintenanceGroupThreshold,
                    nodesSafelySetToMaintenance);
        }
    }

    /** Returns a new {@link Builder} initialized with its default settings. */
    public static Builder builder() {
        final Builder freshBuilder = new Builder();
        return freshBuilder;
    }

    /**
     * Visits the groups of a distribution and records, per leaf group, which storage node indices
     * should implicitly be set to maintenance or taken down given a cluster state.
     *
     * <p>This is an inner (non-static) class because it reads {@code minNodeRatioPerGroup}
     * from the enclosing calculator.</p>
     */
    private class InsufficientAvailabilityGroupVisitor implements GroupVisitor {

        private final ClusterState clusterState;
        private final Set<Integer> nodesSafelySetToMaintenance;
        private final int safeMaintenanceGroupThreshold;
        /** Indices collected for groups that reached the safe maintenance threshold. */
        private final Set<Integer> implicitlyMaintained = new HashSet<>();
        /** Indices collected for groups with too low availability. */
        private final Set<Integer> implicitlyDown = new HashSet<>();

        public InsufficientAvailabilityGroupVisitor(ClusterState clusterState,
                                                    List<Integer> nodesSafelySetToMaintenance,
                                                    int safeMaintenanceGroupThreshold) {
            this.clusterState = clusterState;
            this.nodesSafelySetToMaintenance = Set.copyOf(nodesSafelySetToMaintenance);
            this.safeMaintenanceGroupThreshold = safeMaintenanceGroupThreshold;
        }

        /** Returns whether the storage node with the given index is in one of the given states. */
        private boolean storageNodeStateIsOneOf(int index, String states) {
            Node storageNode = new Node(NodeType.STORAGE, index);
            return clusterState.getNodeState(storageNode).getState().oneOf(states);
        }

        /** Streams the configured nodes of the group whose storage node state is one of {@code states}. */
        private Stream<ConfiguredNode> nodesInStates(Group group, String states) {
            return group.getNodes().stream()
                    .filter(node -> storageNodeStateIsOneOf(node.index(), states));
        }

        /** Ratio of available nodes to configured nodes in the group. */
        private double computeGroupAvailability(Group group) {
            // TODO also look at distributors
            // Nodes in states (u)p, (i)nitializing and (m)aintenance count as available when deciding
            // whether to take an entire group down (even though maintenance mode is a half-truth here).
            long availableCount = nodesInStates(group, "uim").count();
            // Model should make it impossible to deploy with zero nodes in a group,
            // so no div by zero risk.
            double configuredCount = group.getNodes().size();
            return availableCount / configuredCount;
        }

        /** Counts the distinct configured nodes of the group that are safely set to maintenance. */
        private int computeNodesSafelySetToMaintenance(Group group) {
            Set<ConfiguredNode> safelyMaintained = new HashSet<>();
            for (ConfiguredNode node : group.getNodes()) {
                if (nodesSafelySetToMaintenance.contains(node.index())) {
                    safelyMaintained.add(node);
                }
            }
            return safelyMaintained.size();
        }

        /** Records the group's takedown candidates as implicitly down. */
        private void markAllAvailableGroupNodeIndicesAsDown(Group group) {
            // (m)aintenance nodes are not implicitly set down, as they are usually in maintenance
            // for a good reason (e.g. orchestration or manual reboot). (r)etired nodes are left
            // alone too, since they may contain data that the rest of the cluster needs.
            nodesInStates(group, "ui")
                    .forEach(node -> implicitlyDown.add(node.index()));
        }

        /** Records the group's maintenance candidates as implicitly maintained. */
        private void markAllAvailableGroupNodeIndicesAsMaintained(Group group) {
            // Most states should be set in maintenance, e.g. retirement may take a long time,
            // so force maintenance to allow upgrades. "m" is NOT included since that would be a no-op.
            nodesInStates(group, "uird")
                    .forEach(node -> implicitlyMaintained.add(node.index()));
        }

        /**
         * Evaluates a single group; only leaf groups are acted upon. The safe maintenance rule
         * takes precedence over the availability rule.
         *
         * @return always {@code true}
         */
        @Override
        public boolean visitGroup(Group group) {
            if (!group.isLeafGroup()) {
                return true;
            }
            boolean maintenanceThresholdReached = safeMaintenanceGroupThreshold > 0
                    && computeNodesSafelySetToMaintenance(group) >= safeMaintenanceGroupThreshold;
            if (maintenanceThresholdReached) {
                markAllAvailableGroupNodeIndicesAsMaintained(group);
            } else if (computeGroupAvailability(group) < minNodeRatioPerGroup) {
                markAllAvailableGroupNodeIndicesAsDown(group);
            }
            return true;
        }

        /**
         * Returns the indices collected so far.
         *
         * @throws IllegalStateException if any index was recorded as both maintained and down
         */
        Result result() {
            Set<Integer> conflicting = new HashSet<>(implicitlyMaintained);
            conflicting.retainAll(implicitlyDown);
            if (!conflicting.isEmpty()) {
                throw new IllegalStateException("Nodes implicitly both maintenance and down: " + conflicting);
            }
            return new Result(implicitlyMaintained, implicitlyDown);
        }
    }

    /** Returns whether the cluster is flat, i.e. the root group is itself a leaf group. */
    private static boolean isFlatCluster(Group root) {
        final boolean rootIsLeaf = root.isLeafGroup();
        return rootIsLeaf;
    }

    /**
     * Immutable outcome of a calculation: the node indices that should be set in maintenance
     * and the node indices that should be taken down.
     */
    public static class Result {

        private final Set<Integer> maintenanceIndices;
        private final Set<Integer> downIndices;

        /** Creates a result with no nodes in either set. */
        public Result() {
            this(Set.of(), Set.of());
        }

        /**
         * Creates a result holding unmodifiable copies of the given sets, so later changes
         * to the arguments are not reflected in this result.
         */
        public Result(Set<Integer> shouldBeMaintained, Set<Integer> shouldBeDown) {
            this.maintenanceIndices = Set.copyOf(shouldBeMaintained);
            this.downIndices = Set.copyOf(shouldBeDown);
        }

        /** Returns the unmodifiable set of node indices that should be set in maintenance. */
        public Set<Integer> nodesThatShouldBeMaintained() {
            return maintenanceIndices;
        }

        /** Returns the unmodifiable set of node indices that should be taken down. */
        public Set<Integer> nodesThatShouldBeDown() {
            return downIndices;
        }
    }

    /**
     * Computes which nodes should implicitly be set in maintenance or taken down for the given
     * cluster state. Flat clusters (root group is a leaf) always yield an empty result.
     *
     * @param state the cluster state whose storage node states are inspected
     * @return the node indices to set in maintenance and to take down
     * @throws IllegalStateException if a node ends up in both sets
     */
    public Result calculate(ClusterState state) {
        final boolean hierarchic = !isFlatCluster(distribution.getRootGroup());
        if (hierarchic) {
            var groupVisitor = new InsufficientAvailabilityGroupVisitor(
                    state, nodesSafelySetToMaintenance, safeMaintenanceGroupThreshold);
            distribution.visitGroups(groupVisitor);
            return groupVisitor.result();
        }
        // Implicit group takedown only applies to hierarchic cluster setups.
        return new Result();
    }

    /**
     * Convenience accessor that runs {@link #calculate(ClusterState)} and returns only the
     * node indices that should be taken down.
     */
    public Set<Integer> nodesThatShouldBeDown(ClusterState state) {
        Result outcome = calculate(state);
        return outcome.nodesThatShouldBeDown();
    }

}