aboutsummaryrefslogtreecommitdiffstats
path: root/clustercontroller-core/src/main
diff options
context:
space:
mode:
authorHarald Musum <musum@yahooinc.com>2023-07-16 22:46:09 +0200
committerHarald Musum <musum@yahooinc.com>2023-07-16 22:46:09 +0200
commit1375d80d4949799cb37d68c8f2b90a7a4f331b57 (patch)
tree17ce8affd385db9cc61daf298bc1e81575059083 /clustercontroller-core/src/main
parent9f56b9f995e2297f60b02fc06af5a8f0897612bf (diff)
Check redundancy also for groups that are up
When we allow several groups to go down for maintenance we should check nodes in the groups that are up if they have the required redundancy. They might be up but have not yet synced all buckets after coming up. We want to wait with allowing more nodes to be taken down until that is done.
Diffstat (limited to 'clustercontroller-core/src/main')
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java79
1 files changed, 79 insertions, 0 deletions
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java
index a36bd187a47..31bd9c59992 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java
@@ -13,10 +13,12 @@ import com.yahoo.vdslib.state.State;
import com.yahoo.vespa.clustercontroller.core.hostinfo.HostInfo;
import com.yahoo.vespa.clustercontroller.core.hostinfo.StorageNode;
import com.yahoo.vespa.clustercontroller.utils.staterestapi.requests.SetUnitStateRequest;
+
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
+import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
@@ -262,6 +264,11 @@ public class NodeStateChangeChecker {
if (aGroupContainsNode(retiredAndNotUpGroups, node)) {
numberOfGroupsToConsider = retiredAndNotUpGroups.size() - 1;
}
+
+ var result = checkRedundancyForGroupsThatAreUp(retiredAndNotUpGroups);
+ if (result.isPresent() && result.get().notAllowed())
+ return result;
+
if (numberOfGroupsToConsider < maxNumberOfGroupsAllowedToBeDown) {
log.log(FINE, "Allow, retiredAndNotUpGroups=" + retiredAndNotUpGroups);
return Optional.of(allow());
@@ -272,6 +279,28 @@ public class NodeStateChangeChecker {
sortSetIntoList(retiredAndNotUpGroups))));
}
+ private Optional<Result> checkRedundancyForGroupsThatAreUp(Set<Integer> retiredAndNotUpGroups) {
+ Set<Integer> groupsThatAreUp = groupsThatAreUp(retiredAndNotUpGroups);
+ log.log(FINE, "Check min replication for groups " + groupsThatAreUp);
+ for (int group : groupsThatAreUp) {
+ List<ConfiguredNode> nodesInGroup = getNodesInGroup(group);
+ for (var n : nodesInGroup) {
+ log.log(FINE, "Check min replication for index " + n.index() + " in group " + group);
+ Set<Integer> indexesToCheck = nodesInGroup.stream().map(ConfiguredNode::index).collect(Collectors.toSet());
+ var r = checkRedundancySeenFromDistributor(clusterInfo.getDistributorNodeInfo(n.index()), indexesToCheck);
+ if (r.notAllowed())
+ return Optional.of(r);
+ }
+ }
+ return Optional.empty();
+ }
+
+ private Set<Integer> groupsThatAreUp(Set<Integer> retiredAndNotUpGroups) {
+ var allGroups = allGroupIndexes();
+ allGroups.removeAll(retiredAndNotUpGroups);
+ return allGroups;
+ }
+
private static boolean nodeIsDown(ClusterState clusterState, NodeInfo nodeInfo) {
return clusterState.getNodeState(nodeInfo.getNode()).getState() == DOWN;
}
@@ -404,6 +433,33 @@ public class NodeStateChangeChecker {
return allow();
}
+ private Result checkRedundancySeenFromDistributor(DistributorNodeInfo distributorNodeInfo, Set<Integer> indexesToCheck) {
+ Map<Integer, Integer> replication = new LinkedHashMap<>(minReplication(distributorNodeInfo));
+
+ Integer minReplication = null;
+ Integer minReplicationIndex = null;
+ for (var entry : replication.entrySet()) {
+ Integer value = entry.getValue();
+ Integer nodeIndex = entry.getKey();
+ if ( ! indexesToCheck.contains(nodeIndex)) continue;
+ if (minReplication == null || (value != null && value < minReplication)) {
+ minReplication = value;
+ minReplicationIndex = nodeIndex;
+ if (minReplication < requiredRedundancy) break;
+ }
+ }
+
+ // Why test on != null? Missing min-replication is OK (indicate empty/few buckets on system).
+ if (minReplication != null && minReplication < requiredRedundancy) {
+ return disallow("Distributor " + distributorNodeInfo.getNodeIndex()
+ + " says storage node " + minReplicationIndex
+ + " has buckets with redundancy as low as "
+ + minReplication + ", but we require at least " + requiredRedundancy);
+ }
+
+ return allow();
+ }
+
private Map<Integer, Integer> minReplication(DistributorNodeInfo distributorNodeInfo) {
Map<Integer, Integer> replicationPerNodeIndex = new HashMap<>();
for (StorageNode storageNode : distributorNodeInfo.getHostInfo().getDistributor().getStorageNodes()) {
@@ -456,6 +512,25 @@ public class NodeStateChangeChecker {
.collect(Collectors.toSet());
}
+ private Set<Integer> allGroupIndexes() {
+ return clusterInfo.getAllNodeInfos().stream()
+ .map(NodeInfo::getGroup)
+ .filter(Objects::nonNull)
+ .filter(Group::isLeafGroup)
+ .map(Group::getIndex)
+ .collect(Collectors.toSet());
+ }
+
+ private Group groupForThisIndex(int groupIndex) {
+ return clusterInfo.getAllNodeInfos().stream()
+ .map(NodeInfo::getGroup)
+ .filter(Objects::nonNull)
+ .filter(Group::isLeafGroup)
+ .filter(group -> group.getIndex() == groupIndex)
+ .findFirst()
+ .orElseThrow();
+ }
+
// groups with at least one node with the same state & description
private Set<Integer> groupsWithSameStateAndDescription(State state, String newDescription) {
return clusterInfo.getAllNodeInfos().stream()
@@ -485,6 +560,10 @@ public class NodeStateChangeChecker {
.collect(Collectors.toSet());
}
+ private List<ConfiguredNode> getNodesInGroup(int groupIndex) {
+ return groupForThisIndex(groupIndex).getNodes();
+ }
+
public static class Result {
public enum Action {