diff options
author | Håkon Hallingstad <hakon@verizonmedia.com> | 2021-03-23 11:56:24 +0100 |
---|---|---|
committer | Håkon Hallingstad <hakon@verizonmedia.com> | 2021-03-23 11:56:24 +0100 |
commit | 44f64849fdd2a0dc62e84eb701bae1216d0b2215 (patch) | |
tree | eb8700943c722c4e5bfc62714580025e01f49a5d /orchestrator | |
parent | 81871d3f99f2155b0d81b347b61ac685a7bbc13e (diff) |
Require 3 config server (and controller) hosts
We already require 3 config server (and controller) nodes, but it is not
sufficient to protect the hosts from being left with only 1 healthy host: Say
the config server host application contains 2 nodes. An upgrade of host-admin
on one of those nodes is allowed, since only the host is suspended and none of
the 2 nodes are down. This is fixed by handling config server hosts similar to
config servers: assume 3 nodes.
Diffstat (limited to 'orchestrator')
3 files changed, 68 insertions, 10 deletions
diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterApiImpl.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterApiImpl.java index 7417318c572..57dcb5f3069 100644 --- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterApiImpl.java +++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterApiImpl.java @@ -81,9 +81,10 @@ class ClusterApiImpl implements ClusterApi { servicesDownAndNotInGroup = servicesNotInGroup.stream().filter(this::serviceEffectivelyDown).collect(Collectors.toSet()); int serviceInstances = serviceCluster.serviceInstances().size(); - if (serviceCluster.isConfigServerLike() && serviceInstances < numberOfConfigServers) { + if ((serviceCluster.isConfigServerLike() || serviceCluster.isConfigServerHostLike()) && + serviceInstances < numberOfConfigServers) { missingServices = numberOfConfigServers - serviceInstances; - descriptionOfMissingServices = missingServices + " missing config server" + (missingServices > 1 ? "s" : ""); + descriptionOfMissingServices = missingServices + " missing " + serviceCluster.nodeDescription(missingServices > 1); } else { missingServices = 0; descriptionOfMissingServices = "NA"; diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/policy/HostedVespaClusterPolicy.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/policy/HostedVespaClusterPolicy.java index ff1c56f6b2f..47586425c1d 100644 --- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/policy/HostedVespaClusterPolicy.java +++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/policy/HostedVespaClusterPolicy.java @@ -1,6 +1,7 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.orchestrator.policy; +import com.yahoo.vespa.applicationmodel.ClusterId; import com.yahoo.vespa.applicationmodel.ServiceType; import com.yahoo.vespa.flags.BooleanFlag; import com.yahoo.vespa.flags.FetchVector; @@ -149,7 +150,11 @@ public class HostedVespaClusterPolicy implements ClusterPolicy { return ConcurrentSuspensionLimitForCluster.ONE_NODE; } - if (clusterApi.serviceType().equals(ServiceType.HOST_ADMIN)) { + if (clusterApi.serviceType() == ServiceType.HOST_ADMIN) { + if (Set.of(ClusterId.CONFIG_SERVER_HOST, ClusterId.CONTROLLER_HOST).contains(clusterApi.clusterId())) { + return ConcurrentSuspensionLimitForCluster.ONE_NODE; + } + return ConcurrentSuspensionLimitForCluster.TWENTY_PERCENT; } diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/model/ClusterApiImplTest.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/model/ClusterApiImplTest.java index 412a08a8305..6d7c79176fb 100644 --- a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/model/ClusterApiImplTest.java +++ b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/model/ClusterApiImplTest.java @@ -1,17 +1,15 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.orchestrator.model; +import com.yahoo.config.provision.Zone; import com.yahoo.test.ManualClock; import com.yahoo.vespa.applicationmodel.ApplicationInstance; -import com.yahoo.vespa.applicationmodel.ApplicationInstanceId; -import com.yahoo.vespa.applicationmodel.ClusterId; import com.yahoo.vespa.applicationmodel.HostName; import com.yahoo.vespa.applicationmodel.ServiceCluster; import com.yahoo.vespa.applicationmodel.ServiceInstance; import com.yahoo.vespa.applicationmodel.ServiceStatus; import com.yahoo.vespa.applicationmodel.ServiceStatusInfo; import com.yahoo.vespa.applicationmodel.ServiceType; -import com.yahoo.vespa.applicationmodel.TenantId; import com.yahoo.vespa.flags.Flags; import com.yahoo.vespa.flags.InMemoryFlagSource; import com.yahoo.vespa.orchestrator.OrchestratorUtil; @@ -20,6 +18,9 @@ import com.yahoo.vespa.orchestrator.policy.HostedVespaClusterPolicy; import com.yahoo.vespa.orchestrator.policy.SuspensionReasons; import com.yahoo.vespa.orchestrator.status.HostInfos; import com.yahoo.vespa.orchestrator.status.HostStatus; +import com.yahoo.vespa.service.duper.ConfigServerApplication; +import com.yahoo.vespa.service.duper.ConfigServerHostApplication; +import com.yahoo.vespa.service.duper.InfraApplication; import org.junit.Test; import java.time.Duration; @@ -131,6 +132,38 @@ public class ClusterApiImplTest { } @Test + public void testCfghost1SuspensionFailsWithMissingCfghost3() { + ClusterApiImpl clusterApi = makeConfigClusterApi( + ModelTestUtils.NUMBER_OF_CONFIG_SERVERS, + new ConfigServerHostApplication(), + ServiceStatus.UP, + ServiceStatus.UP); + + HostedVespaClusterPolicy policy = new HostedVespaClusterPolicy(flagSource); + + try { + policy.verifyGroupGoingDownIsFine(clusterApi); + fail(); + } catch (HostStateChangeDeniedException e) { + assertThat(e.getMessage(), + containsString("Changing the state of cfg1 would violate enough-services-up: " + + "Suspension of service with type 'hostadmin' not allowed: 33% are suspended already. " + + "Services down on resumed hosts: [1 missing config server host].")); + } + + flagSource.withBooleanFlag(Flags.GROUP_SUSPENSION.id(), true); + + try { + policy.verifyGroupGoingDownIsFine(clusterApi); + fail(); + } catch (HostStateChangeDeniedException e) { + assertThat(e.getMessage(), + containsString("Suspension of service with type 'hostadmin' not allowed: 33% are suspended already. " + + "Services down on resumed hosts: [1 missing config server host].")); + } + } + + @Test public void testCfg1SuspendsIfDownWithMissingCfg3() throws HostStateChangeDeniedException { ClusterApiImpl clusterApi = makeCfg1ClusterApi(ServiceStatus.DOWN, ServiceStatus.UP); @@ -140,6 +173,19 @@ public class ClusterApiImplTest { } @Test + public void testCfghost1SuspendsIfDownWithMissingCfghost3() throws HostStateChangeDeniedException { + ClusterApiImpl clusterApi = makeConfigClusterApi( + ModelTestUtils.NUMBER_OF_CONFIG_SERVERS, + new ConfigServerHostApplication(), + ServiceStatus.DOWN, + ServiceStatus.UP); + + HostedVespaClusterPolicy policy = new HostedVespaClusterPolicy(flagSource); + + policy.verifyGroupGoingDownIsFine(clusterApi); + } + + @Test public void testSingleConfigServerCanSuspend() { for (var status : EnumSet.of(ServiceStatus.UP, ServiceStatus.DOWN)) { var clusterApi = makeConfigClusterApi(1, status); @@ -265,6 +311,11 @@ public class ClusterApiImplTest { } private ClusterApiImpl makeConfigClusterApi(int clusterSize, ServiceStatus first, ServiceStatus... rest) { + return makeConfigClusterApi(clusterSize, new ConfigServerApplication(), first, rest); + } + + private ClusterApiImpl makeConfigClusterApi(int clusterSize, InfraApplication infraApplication, + ServiceStatus first, ServiceStatus... rest) { var serviceStatusList = new ArrayList<ServiceStatus>(); serviceStatusList.add(first); serviceStatusList.addAll(List.of(rest)); @@ -275,9 +326,10 @@ public class ClusterApiImplTest { for (int i = 0; i < hostnames.size(); i++) { instances.add(modelUtils.createServiceInstance("cs" + i + 1, hostnames.get(i), serviceStatusList.get(i))); } + ServiceCluster serviceCluster = modelUtils.createServiceCluster( - ClusterId.CONFIG_SERVER.s(), - ServiceType.CONFIG_SERVER, + infraApplication.getClusterId().s(), + infraApplication.getServiceType(), instances ); for (var instance : instances) { @@ -287,8 +339,8 @@ public class ClusterApiImplTest { Set<ServiceCluster> serviceClusterSet = Set.of(serviceCluster); ApplicationInstance application = new ApplicationInstance( - TenantId.HOSTED_VESPA, - ApplicationInstanceId.CONFIG_SERVER, + infraApplication.getTenantId(), + infraApplication.getApplicationInstanceId(Zone.defaultZone()), serviceClusterSet); serviceCluster.setApplicationInstance(application); |