aboutsummaryrefslogtreecommitdiffstats
path: root/orchestrator
diff options
context:
space:
mode:
authorHåkon Hallingstad <hakon@yahooinc.com>2021-09-23 17:03:58 +0200
committerHåkon Hallingstad <hakon@yahooinc.com>2021-09-23 17:03:58 +0200
commitb229fb52a962c71fc74a1998c31a3ddcc868e991 (patch)
tree38d23ac3f9c1d838521b299672b8b3d2d1a68c53 /orchestrator
parent7b5054f54ae6768387a4806103d3d908a6c52f02 (diff)
Disallow cfg suspension based solely on being down
Diffstat (limited to 'orchestrator')
-rw-r--r--orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterApi.java13
-rw-r--r--orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterApiImpl.java27
-rw-r--r--orchestrator/src/main/java/com/yahoo/vespa/orchestrator/policy/HostedVespaClusterPolicy.java58
-rw-r--r--orchestrator/src/test/java/com/yahoo/vespa/orchestrator/OrchestratorTest.java6
-rw-r--r--orchestrator/src/test/java/com/yahoo/vespa/orchestrator/model/ClusterApiImplTest.java29
-rw-r--r--orchestrator/src/test/java/com/yahoo/vespa/orchestrator/policy/HostedVespaClusterPolicyTest.java12
6 files changed, 80 insertions, 65 deletions
diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterApi.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterApi.java
index fd115702588..47ba6dedd84 100644
--- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterApi.java
+++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterApi.java
@@ -17,13 +17,20 @@ public interface ClusterApi {
String clusterInfo();
ClusterId clusterId();
ServiceType serviceType();
+
+ /** Some human-readable string naming the service(s) to a human reader. */
+ String serviceDescription(boolean plural);
+
boolean isStorageCluster();
- /** Returns the reasons no services are up in the implied group, or empty if some services are up. */
- Optional<SuspensionReasons> reasonsForNoServicesInGroupIsUp();
+ boolean isConfigServerLike();
+
+ /** Returns the non-empty reasons for why all services are down, or otherwise empty. */
+ Optional<SuspensionReasons> allServicesDown();
+
boolean noServicesOutsideGroupIsDown() throws HostStateChangeDeniedException;
- int percentageOfServicesDown();
+ int percentageOfServicesDownOutsideGroup();
int percentageOfServicesDownIfGroupIsAllowedToBeDown();
Optional<StorageNode> storageNodeInGroup();
diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterApiImpl.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterApiImpl.java
index 42f8a187e98..6880700e4a9 100644
--- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterApiImpl.java
+++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterApiImpl.java
@@ -107,6 +107,11 @@ class ClusterApiImpl implements ClusterApi {
}
@Override
+ public String serviceDescription(boolean plural) {
+ return serviceCluster.serviceDescription(plural);
+ }
+
+ @Override
public boolean isStorageCluster() {
return VespaModelUtil.isStorage(serviceCluster);
}
@@ -117,7 +122,12 @@ class ClusterApiImpl implements ClusterApi {
}
@Override
- public Optional<SuspensionReasons> reasonsForNoServicesInGroupIsUp() {
+ public boolean isConfigServerLike() {
+ return serviceCluster.isConfigServerLike();
+ }
+
+ @Override
+ public Optional<SuspensionReasons> allServicesDown() {
SuspensionReasons reasons = new SuspensionReasons();
for (ServiceInstance service : servicesInGroup) {
@@ -157,9 +167,8 @@ class ClusterApiImpl implements ClusterApi {
}
@Override
- public int percentageOfServicesDown() {
- int servicesDownInGroupCount = (int) servicesInGroup.stream().filter(this::serviceEffectivelyDown).count();
- int numberOfServicesDown = servicesDownAndNotInGroup().size() + missingServices + servicesDownInGroupCount;
+ public int percentageOfServicesDownOutsideGroup() {
+ int numberOfServicesDown = servicesDownAndNotInGroup().size() + missingServices;
return numberOfServicesDown * 100 / (serviceCluster.serviceInstances().size() + missingServices);
}
@@ -187,12 +196,11 @@ class ClusterApiImpl implements ClusterApi {
description.append(" ");
final int nodeLimit = 3;
- description.append("Suspended hosts: ");
description.append(suspended.stream().sorted().distinct().limit(nodeLimit).collect(Collectors.toList()).toString());
if (suspended.size() > nodeLimit) {
- description.append(", and " + (suspended.size() - nodeLimit) + " more");
+ description.append(" and " + (suspended.size() - nodeLimit) + " others");
}
- description.append(".");
+ description.append(" are suspended.");
}
Set<ServiceInstance> downElsewhere = servicesDownAndNotInGroup().stream()
@@ -204,7 +212,6 @@ class ClusterApiImpl implements ClusterApi {
description.append(" ");
final int serviceLimit = 2; // services info is verbose
- description.append("Services down on resumed hosts: ");
description.append(Stream.concat(
downElsewhere.stream().map(ServiceInstance::toString).sorted(),
missingServices > 0 ? Stream.of(descriptionOfMissingServices) : Stream.of())
@@ -213,9 +220,9 @@ class ClusterApiImpl implements ClusterApi {
.toString());
if (downElsewhereTotal > serviceLimit) {
- description.append(", and " + (downElsewhereTotal - serviceLimit) + " more");
+ description.append(" and " + (downElsewhereTotal - serviceLimit) + " others");
}
- description.append(".");
+ description.append(" are down.");
}
return description.toString();
diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/policy/HostedVespaClusterPolicy.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/policy/HostedVespaClusterPolicy.java
index 3a6c24a05e3..8b8ab7986d8 100644
--- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/policy/HostedVespaClusterPolicy.java
+++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/policy/HostedVespaClusterPolicy.java
@@ -23,30 +23,7 @@ public class HostedVespaClusterPolicy implements ClusterPolicy {
@Override
public SuspensionReasons verifyGroupGoingDownIsFine(ClusterApi clusterApi) throws HostStateChangeDeniedException {
- if (clusterApi.noServicesOutsideGroupIsDown()) {
- return SuspensionReasons.nothingNoteworthy();
- }
-
- int percentageOfServicesAllowedToBeDown = getConcurrentSuspensionLimit(clusterApi).asPercentage();
- if (clusterApi.percentageOfServicesDownIfGroupIsAllowedToBeDown() <= percentageOfServicesAllowedToBeDown) {
- return SuspensionReasons.nothingNoteworthy();
- }
-
- Optional<SuspensionReasons> suspensionReasons = clusterApi.reasonsForNoServicesInGroupIsUp();
- if (suspensionReasons.isPresent()) {
- return suspensionReasons.get();
- }
-
- String message = percentageOfServicesAllowedToBeDown <= 0
- ? "Suspension of service with type '" + clusterApi.serviceType() + "' not allowed: "
- + clusterApi.percentageOfServicesDown() + "% are suspended already." + clusterApi.downDescription()
- : "Suspension of service with type '" + clusterApi.serviceType()
- + "' would increase from " + clusterApi.percentageOfServicesDown()
- + "% to " + clusterApi.percentageOfServicesDownIfGroupIsAllowedToBeDown()
- + "%, over the limit of " + percentageOfServicesAllowedToBeDown + "%."
- + clusterApi.downDescription();
-
- throw new HostStateChangeDeniedException(clusterApi.getNodeGroup(), ENOUGH_SERVICES_UP_CONSTRAINT, message);
+ return verifyGroupGoingDownIsFine(clusterApi, true);
}
@Override
@@ -54,22 +31,37 @@ public class HostedVespaClusterPolicy implements ClusterPolicy {
// This policy is similar to verifyGroupGoingDownIsFine, except that having no services up in the group will
// not allow the suspension: We are a bit more cautious when removing nodes.
+ verifyGroupGoingDownIsFine(clusterApi, false);
+ }
+
+ private SuspensionReasons verifyGroupGoingDownIsFine(ClusterApi clusterApi, boolean allowIfAllServicesAreDown)
+ throws HostStateChangeDeniedException {
if (clusterApi.noServicesOutsideGroupIsDown()) {
- return;
+ return SuspensionReasons.nothingNoteworthy();
}
int percentageOfServicesAllowedToBeDown = getConcurrentSuspensionLimit(clusterApi).asPercentage();
if (clusterApi.percentageOfServicesDownIfGroupIsAllowedToBeDown() <= percentageOfServicesAllowedToBeDown) {
- return;
+ return SuspensionReasons.nothingNoteworthy();
}
- throw new HostStateChangeDeniedException(
- clusterApi.getNodeGroup(),
- ENOUGH_SERVICES_UP_CONSTRAINT,
- "Down percentage for service type " + clusterApi.serviceType()
- + " would increase to " + clusterApi.percentageOfServicesDownIfGroupIsAllowedToBeDown()
- + "%, over the limit of " + percentageOfServicesAllowedToBeDown + "%."
- + clusterApi.downDescription());
+ // Disallow suspending a 2nd and downed config server to avoid losing ZK quorum.
+ if (!clusterApi.isConfigServerLike()) {
+ Optional<SuspensionReasons> suspensionReasons = clusterApi.allServicesDown();
+ if (suspensionReasons.isPresent()) {
+ return suspensionReasons.get();
+ }
+ }
+
+ String message = percentageOfServicesAllowedToBeDown <= 0
+ ? clusterApi.percentageOfServicesDownOutsideGroup() + "% of the " + clusterApi.serviceDescription(true)
+ + " are down or suspended already:" + clusterApi.downDescription()
+ : "The percentage of downed or suspended " + clusterApi.serviceDescription(true)
+ + " would increase from " + clusterApi.percentageOfServicesDownOutsideGroup() + "% to "
+ + clusterApi.percentageOfServicesDownIfGroupIsAllowedToBeDown() + "% (limit is "
+ + percentageOfServicesAllowedToBeDown + "%):" + clusterApi.downDescription();
+
+ throw new HostStateChangeDeniedException(clusterApi.getNodeGroup(), ENOUGH_SERVICES_UP_CONSTRAINT, message);
}
// Non-private for testing purposes
diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/OrchestratorTest.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/OrchestratorTest.java
index 83c902ed981..f885d5301de 100644
--- a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/OrchestratorTest.java
+++ b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/OrchestratorTest.java
@@ -102,7 +102,7 @@ public class OrchestratorTest {
fail();
} catch (HostStateChangeDeniedException e) {
assertThat(e.getMessage(), containsString("Changing the state of cfg2 would violate enough-services-up"));
- assertThat(e.getMessage(), containsString("Suspended hosts: [cfg1]"));
+ assertThat(e.getMessage(), containsString("[cfg1] are suspended."));
}
// cfg1 is removed from the application
@@ -114,7 +114,7 @@ public class OrchestratorTest {
fail();
} catch (HostStateChangeDeniedException e) {
assertThat(e.getMessage(), containsString("Changing the state of cfg2 would violate enough-services-up"));
- assertThat(e.getMessage(), containsString("Services down on resumed hosts: [1 missing config server]"));
+ assertThat(e.getMessage(), containsString("[1 missing config server] are down."));
}
// cfg1 is reprovisioned, added to the node repo, and activated
@@ -129,7 +129,7 @@ public class OrchestratorTest {
fail();
} catch (HostStateChangeDeniedException e) {
assertThat(e.getMessage(), containsString("Changing the state of cfg1 would violate enough-services-up"));
- assertThat(e.getMessage(), containsString("Suspended hosts: [cfg2]"));
+ assertThat(e.getMessage(), containsString("[cfg2] are suspended"));
}
// etc (should be the same as for cfg1)
diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/model/ClusterApiImplTest.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/model/ClusterApiImplTest.java
index da8591c6631..3ea739b385e 100644
--- a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/model/ClusterApiImplTest.java
+++ b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/model/ClusterApiImplTest.java
@@ -95,11 +95,11 @@ public class ClusterApiImplTest {
assertEquals("{ clusterId=cluster, serviceType=service-type }", clusterApi.clusterInfo());
assertFalse(clusterApi.isStorageCluster());
- assertEquals(" Suspended hosts: [host3, host4]. Services down on resumed hosts: [" +
- "ServiceInstance{configId=service-2, hostName=host2, serviceStatus=" +
- "ServiceStatusInfo{status=DOWN, since=Optional.empty, lastChecked=Optional.empty}}].",
+ assertEquals(" [host3, host4] are suspended. [ServiceInstance{configId=service-2, hostName=host2, " +
+ "serviceStatus=ServiceStatusInfo{status=DOWN, since=Optional.empty, lastChecked=Optional.empty}}] " +
+ "are down.",
clusterApi.downDescription());
- assertEquals(60, clusterApi.percentageOfServicesDown());
+ assertEquals(60, clusterApi.percentageOfServicesDownOutsideGroup());
assertEquals(80, clusterApi.percentageOfServicesDownIfGroupIsAllowedToBeDown());
}
@@ -181,8 +181,8 @@ public class ClusterApiImplTest {
fail();
} catch (HostStateChangeDeniedException e) {
assertThat(e.getMessage(),
- containsString("Suspension of service with type 'configserver' not allowed: 33% are suspended already. " +
- "Services down on resumed hosts: [1 missing config server]."));
+ containsString("Changing the state of cfg1 would violate enough-services-up: 33% of the config " +
+ "servers are down or suspended already: [1 missing config server] are down."));
}
}
@@ -201,18 +201,25 @@ public class ClusterApiImplTest {
fail();
} catch (HostStateChangeDeniedException e) {
assertThat(e.getMessage(),
- containsString("Suspension of service with type 'hostadmin' not allowed: 33% are suspended already. " +
- "Services down on resumed hosts: [1 missing config server host]."));
+ containsString("Changing the state of cfg1 would violate enough-services-up: 33% of the config " +
+ "server hosts are down or suspended already: [1 missing config server host] are down."));
}
}
@Test
- public void testCfg1SuspendsIfDownWithMissingCfg3() throws HostStateChangeDeniedException {
+ public void testCfg1DoesNotSuspendIfDownWithMissingCfg3() throws HostStateChangeDeniedException {
ClusterApiImpl clusterApi = makeCfg1ClusterApi(ServiceStatus.DOWN, ServiceStatus.UP);
HostedVespaClusterPolicy policy = new HostedVespaClusterPolicy(flagSource, zone);
- policy.verifyGroupGoingDownIsFine(clusterApi);
+ try {
+ policy.verifyGroupGoingDownIsFine(clusterApi);
+ fail();
+ } catch (HostStateChangeDeniedException e) {
+ assertThat(e.getMessage(),
+ containsString("Changing the state of cfg1 would violate enough-services-up: 33% of the config " +
+ "servers are down or suspended already: [1 missing config server] are down."));
+ }
}
@Test
@@ -320,7 +327,7 @@ public class ClusterApiImplTest {
clock);
assertEquals(expectedNoServicesInGroupIsUp.map(SuspensionReasons::getMessagesInOrder),
- clusterApi.reasonsForNoServicesInGroupIsUp().map(SuspensionReasons::getMessagesInOrder));
+ clusterApi.allServicesDown().map(SuspensionReasons::getMessagesInOrder));
assertEquals(expectedNoServicesOutsideGroupIsDown, clusterApi.noServicesOutsideGroupIsDown());
}
diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/policy/HostedVespaClusterPolicyTest.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/policy/HostedVespaClusterPolicyTest.java
index 303dabebba8..8f47051621f 100644
--- a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/policy/HostedVespaClusterPolicyTest.java
+++ b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/policy/HostedVespaClusterPolicyTest.java
@@ -131,13 +131,14 @@ public class HostedVespaClusterPolicyTest {
int percentageOfServicesDownIfGroupIsAllowedToBeDown,
boolean expectSuccess) throws HostStateChangeDeniedException {
when(clusterApi.noServicesOutsideGroupIsDown()).thenReturn(noServicesOutsideGroupIsDown);
- when(clusterApi.reasonsForNoServicesInGroupIsUp()).thenReturn(noServicesInGroupIsUp);
+ when(clusterApi.allServicesDown()).thenReturn(noServicesInGroupIsUp);
when(clusterApi.percentageOfServicesDownIfGroupIsAllowedToBeDown()).thenReturn(20);
doReturn(ConcurrentSuspensionLimitForCluster.TEN_PERCENT).when(policy).getConcurrentSuspensionLimit(clusterApi);
when(applicationApi.applicationId()).thenReturn(ApplicationId.fromSerializedForm("a:b:c"));
when(clusterApi.serviceType()).thenReturn(new ServiceType("service-type"));
- when(clusterApi.percentageOfServicesDown()).thenReturn(5);
+ when(clusterApi.serviceDescription(true)).thenReturn("services of {service-type,cluster-id}");
+ when(clusterApi.percentageOfServicesDownOutsideGroup()).thenReturn(5);
when(clusterApi.percentageOfServicesDownIfGroupIsAllowedToBeDown()).thenReturn(percentageOfServicesDownIfGroupIsAllowedToBeDown);
when(clusterApi.downDescription()).thenReturn(" Down description");
@@ -152,9 +153,10 @@ public class HostedVespaClusterPolicyTest {
}
} catch (HostStateChangeDeniedException e) {
if (!expectSuccess) {
- assertEquals("Changing the state of node-group would violate enough-services-up: " +
- "Suspension of service with type 'service-type' would increase from 5% to 13%, " +
- "over the limit of 10%. Down description", e.getMessage());
+ assertEquals("Changing the state of node-group would violate enough-services-up: The percentage of downed " +
+ "or suspended services of {service-type,cluster-id} would increase from 5% to 13% (limit is 10%): " +
+ "Down description",
+ e.getMessage());
assertEquals("enough-services-up", e.getConstraintName());
}
}