summaryrefslogtreecommitdiffstats
path: root/clustercontroller-core
diff options
context:
space:
mode:
authorHarald Musum <musum@yahooinc.com>2023-07-17 21:24:11 +0200
committerHarald Musum <musum@yahooinc.com>2023-07-17 21:24:11 +0200
commit715a4b3e10738eec80446dd06cb8c8e9911cec50 (patch)
tree8a561d13bfa7e0f759b570a571484bd9c9b05ca5 /clustercontroller-core
parent4de39b50afecc59d1e7cc05b658200979016c5ef (diff)
Check min replication seen from all distributors that are UP
This means we will also check distributors that are on same node as a retired storage node
Diffstat (limited to 'clustercontroller-core')
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java44
-rw-r--r--clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java2
2 files changed, 17 insertions, 29 deletions
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java
index 01d9e73c7bc..c6fc9c8c184 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java
@@ -18,6 +18,7 @@ import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
@@ -265,7 +266,7 @@ public class NodeStateChangeChecker {
numberOfGroupsToConsider = retiredAndNotUpGroups.size() - 1;
}
- var result = checkRedundancyForGroupsThatAreUp(retiredAndNotUpGroups);
+ var result = checkRedundancy(retiredAndNotUpGroups);
if (result.isPresent() && result.get().notAllowed())
return result;
@@ -279,28 +280,23 @@ public class NodeStateChangeChecker {
sortSetIntoList(retiredAndNotUpGroups))));
}
- private Optional<Result> checkRedundancyForGroupsThatAreUp(Set<Integer> retiredAndNotUpGroups) {
- Set<Integer> groupsThatAreUp = groupsThatAreUp(retiredAndNotUpGroups);
- log.log(FINE, "Check min replication for groups " + groupsThatAreUp);
- for (int group : groupsThatAreUp) {
- List<ConfiguredNode> nodesInGroup = getNodesInGroup(group);
- for (var n : nodesInGroup) {
- log.log(FINE, "Check min replication for index " + n.index() + " in group " + group);
- Set<Integer> indexesToCheck = nodesInGroup.stream().map(ConfiguredNode::index).collect(Collectors.toSet());
- var r = checkRedundancySeenFromDistributor(clusterInfo.getDistributorNodeInfo(n.index()), indexesToCheck);
- if (r.notAllowed())
- return Optional.of(r);
- }
+ // Check redundancy for nodes seen from all distributors that are UP for
+ // storage nodes that are in groups that should be UP
+ private Optional<Result> checkRedundancy(Set<Integer> retiredAndNotUpGroups) {
+ Set<Integer> indexesToCheck = new HashSet<>();
+ retiredAndNotUpGroups.forEach(index -> getNodesInGroup(index).forEach(node -> indexesToCheck.add(node.index())));
+
+ for (var distributorNodeInfo : clusterInfo.getDistributorNodeInfos()) {
+ // Skip distributors that are DOWN (otherwise they will be UP and should be checked)
+ if (distributorNodeInfo.getUserWantedState().getState() != UP) continue;
+
+ var r = checkRedundancySeenFromDistributor(distributorNodeInfo, indexesToCheck);
+ if (r.notAllowed())
+ return Optional.of(r);
}
return Optional.empty();
}
- private Set<Integer> groupsThatAreUp(Set<Integer> retiredAndNotUpGroups) {
- var allGroups = allGroupIndexes();
- allGroups.removeAll(retiredAndNotUpGroups);
- return allGroups;
- }
-
private static boolean nodeIsDown(ClusterState clusterState, NodeInfo nodeInfo) {
return clusterState.getNodeState(nodeInfo.getNode()).getState() == DOWN;
}
@@ -458,6 +454,7 @@ public class NodeStateChangeChecker {
return allow();
}
+ // Replication per storage node index
private Map<Integer, Integer> minReplication(DistributorNodeInfo distributorNodeInfo) {
Map<Integer, Integer> replicationPerNodeIndex = new HashMap<>();
for (StorageNode storageNode : distributorNodeInfo.getHostInfo().getDistributor().getStorageNodes()) {
@@ -510,15 +507,6 @@ public class NodeStateChangeChecker {
.collect(Collectors.toSet());
}
- private Set<Integer> allGroupIndexes() {
- return clusterInfo.getAllNodeInfos().stream()
- .map(NodeInfo::getGroup)
- .filter(Objects::nonNull)
- .filter(Group::isLeafGroup)
- .map(Group::getIndex)
- .collect(Collectors.toSet());
- }
-
private Group groupForThisIndex(int groupIndex) {
return clusterInfo.getAllNodeInfos().stream()
.map(NodeInfo::getGroup)
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java
index 9cb589636da..b73ee86251f 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java
@@ -309,7 +309,7 @@ public class NodeStateChangeCheckerTest {
Result result = nodeStateChangeChecker.evaluateTransition(node, clusterState, SAFE, UP_NODE_STATE, MAINTENANCE_NODE_STATE);
assertFalse(result.allowed(), result.toString());
assertFalse(result.isAlreadySet());
- assertEquals("Distributor 2 says storage node 2 has buckets with redundancy as low as 1, but we require at least 4", result.reason());
+ assertEquals("Distributor 0 says storage node 0 has buckets with redundancy as low as 1, but we require at least 4", result.reason());
}
}