summaryrefslogtreecommitdiffstats
path: root/config-model
diff options
context:
space:
mode:
authorJon Bratseth <bratseth@verizonmedia.com>2019-10-24 15:20:28 +0200
committerJon Bratseth <bratseth@verizonmedia.com>2019-10-24 15:20:28 +0200
commit9e5551f62ebabad3d9e9b53da0dba3d4f2068702 (patch)
treea9e62d9e3fa4588c26e73886ec437f1891f6e07b /config-model
parent453525e3992472544e8b27627e7253d367b19c0f (diff)
Include retired cluster controllers
When all nodes are replaced we lose the cluster state if we only allocate cluster controllers on non-retired nodes. This does not work with global documents as it leads all nodes to enter the maintenance state where buckets are deactivated until global documents are completely migrated to new nodes. By including the retired cluster controllers in the cluster controller cluster we preserve the existing ZooKeeper state which prevents this situation.
Diffstat (limited to 'config-model')
-rw-r--r--config-model/src/main/java/com/yahoo/vespa/model/content/cluster/ContentCluster.java24
-rw-r--r--config-model/src/test/java/com/yahoo/config/model/provision/ModelProvisioningTest.java13
2 files changed, 25 insertions, 12 deletions
diff --git a/config-model/src/main/java/com/yahoo/vespa/model/content/cluster/ContentCluster.java b/config-model/src/main/java/com/yahoo/vespa/model/content/cluster/ContentCluster.java
index fdf88124012..270b425fda9 100644
--- a/config-model/src/main/java/com/yahoo/vespa/model/content/cluster/ContentCluster.java
+++ b/config-model/src/main/java/com/yahoo/vespa/model/content/cluster/ContentCluster.java
@@ -50,7 +50,6 @@ import com.yahoo.vespa.model.content.engines.PersistenceEngine;
import com.yahoo.vespa.model.content.engines.ProtonEngine;
import com.yahoo.vespa.model.content.storagecluster.StorageCluster;
import com.yahoo.vespa.model.search.IndexedSearchCluster;
-import com.yahoo.vespa.model.search.MultilevelDispatchValidator;
import com.yahoo.vespa.model.search.Tuning;
import org.w3c.dom.Element;
@@ -318,7 +317,8 @@ public class ContentCluster extends AbstractConfigProducer implements
if (clusterControllers == null) {
List<HostResource> hosts = admin.getClusterControllerHosts();
if (hosts.size() > 1) {
- context.getDeployState().getDeployLogger().log(Level.INFO, "When having content cluster(s) and more than 1 config server it is recommended to configure cluster controllers explicitly.");
+ context.getDeployState().getDeployLogger().log(Level.INFO,
+ "When having content cluster(s) and more than 1 config server it is recommended to configure cluster controllers explicitly.");
}
clusterControllers = createClusterControllers(admin, hosts, "cluster-controllers", false, context.getDeployState());
admin.setClusterControllers(clusterControllers);
@@ -348,10 +348,20 @@ public class ContentCluster extends AbstractConfigProducer implements
}
private List<HostResource> drawControllerHosts(int count, StorageGroup rootGroup, Collection<ContainerModel> containers) {
- List<HostResource> hosts = drawContentHostsRecursively(count, rootGroup);
+ List<HostResource> hosts = drawControllerHosts(count, false, rootGroup, containers);
+ List<HostResource> retiredHosts = drawControllerHosts(count, true, rootGroup, containers);
+
+ // preserve the cluster state in case all pre-existing controllers are on retired nodes
+ List<HostResource> all = new ArrayList<>(hosts);
+ all.addAll(retiredHosts);
+ return all;
+ }
+
+ private List<HostResource> drawControllerHosts(int count, boolean retired, StorageGroup rootGroup, Collection<ContainerModel> containers) {
+ List<HostResource> hosts = drawContentHostsRecursively(count, retired, rootGroup);
// if (hosts.size() < count) // supply with containers TODO: Currently disabled due to leading to topology change problems
// hosts.addAll(drawContainerHosts(count - hosts.size(), containers, new HashSet<>(hosts)));
- if (hosts.size() % 2 == 0) // ZK clusters of even sizes are less available (even in the size=2 case)
+ if (hosts.size() % 2 == 0 && ! hosts.isEmpty()) // ZK clusters of even sizes are less available (even in the size=2 case)
hosts = hosts.subList(0, hosts.size()-1);
return hosts;
}
@@ -425,16 +435,16 @@ public class ContentCluster extends AbstractConfigProducer implements
*/
// Note: This method cannot be changed to draw different nodes without ensuring that it will draw nodes
// which overlaps with previously drawn nodes as that will prevent rolling upgrade
- private List<HostResource> drawContentHostsRecursively(int count, StorageGroup group) {
+ private List<HostResource> drawContentHostsRecursively(int count, boolean retired, StorageGroup group) {
Set<HostResource> hosts = new HashSet<>();
if (group.getNodes().isEmpty()) {
int hostsPerSubgroup = (int)Math.ceil((double)count / group.getSubgroups().size());
for (StorageGroup subgroup : group.getSubgroups())
- hosts.addAll(drawContentHostsRecursively(hostsPerSubgroup, subgroup));
+ hosts.addAll(drawContentHostsRecursively(hostsPerSubgroup, retired, subgroup));
}
else {
hosts.addAll(group.getNodes().stream()
- .filter(node -> ! node.isRetired()) // Avoid retired controllers to avoid surprises on expiry
+ .filter(node -> node.isRetired() == retired)
.map(StorageNode::getHostResource).collect(Collectors.toList()));
}
diff --git a/config-model/src/test/java/com/yahoo/config/model/provision/ModelProvisioningTest.java b/config-model/src/test/java/com/yahoo/config/model/provision/ModelProvisioningTest.java
index 7c71c399f35..d4457fe59cd 100644
--- a/config-model/src/test/java/com/yahoo/config/model/provision/ModelProvisioningTest.java
+++ b/config-model/src/test/java/com/yahoo/config/model/provision/ModelProvisioningTest.java
@@ -636,7 +636,7 @@ public class ModelProvisioningTest {
}
@Test
- public void testClusterControllersAreNotPlacedOnRetiredNodes() {
+ public void testClusterControllersIncludeNonRetiredNodes() {
String services =
"<?xml version='1.0' encoding='utf-8' ?>\n" +
"<services>" +
@@ -662,11 +662,14 @@ public class ModelProvisioningTest {
// Check content clusters
ContentCluster cluster = model.getContentClusters().get("bar");
ClusterControllerContainerCluster clusterControllers = cluster.getClusterControllers();
- assertEquals(3, clusterControllers.getContainers().size());
+ assertEquals(3 + 3, clusterControllers.getContainers().size()); // 3 new + 3 retired
assertEquals("bar-controllers", clusterControllers.getName());
- assertEquals("Skipping retired default09", "node-1-3-9-08", clusterControllers.getContainers().get(0).getHostName());
- assertEquals("Skipping retired default06", "node-1-3-9-05", clusterControllers.getContainers().get(1).getHostName());
- assertEquals("Skipping retired default03", "node-1-3-9-02", clusterControllers.getContainers().get(2).getHostName());
+ assertEquals("Non-retired", "node-1-3-9-08", clusterControllers.getContainers().get(0).getHostName());
+ assertEquals("Non-retired", "node-1-3-9-05", clusterControllers.getContainers().get(1).getHostName());
+ assertEquals("Non-retired", "node-1-3-9-02", clusterControllers.getContainers().get(2).getHostName());
+ assertEquals("Retired", "node-1-3-9-09", clusterControllers.getContainers().get(3).getHostName());
+ assertEquals("Retired", "node-1-3-9-06", clusterControllers.getContainers().get(4).getHostName());
+ assertEquals("Retired", "node-1-3-9-03", clusterControllers.getContainers().get(5).getHostName());
}
@Test