From b229fb52a962c71fc74a1998c31a3ddcc868e991 Mon Sep 17 00:00:00 2001 From: HÃ¥kon Hallingstad Date: Thu, 23 Sep 2021 17:03:58 +0200 Subject: Disallow cfg suspension based solely on being down --- .../yahoo/vespa/orchestrator/model/ClusterApi.java | 13 +++-- .../vespa/orchestrator/model/ClusterApiImpl.java | 27 ++++++---- .../policy/HostedVespaClusterPolicy.java | 58 ++++++++++------------ .../yahoo/vespa/orchestrator/OrchestratorTest.java | 6 +-- .../orchestrator/model/ClusterApiImplTest.java | 29 +++++++---- .../policy/HostedVespaClusterPolicyTest.java | 12 +++-- 6 files changed, 80 insertions(+), 65 deletions(-) (limited to 'orchestrator') diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterApi.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterApi.java index fd115702588..47ba6dedd84 100644 --- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterApi.java +++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterApi.java @@ -17,13 +17,20 @@ public interface ClusterApi { String clusterInfo(); ClusterId clusterId(); ServiceType serviceType(); + + /** Some human-readable string naming the service(s) to a human reader. */ + String serviceDescription(boolean plural); + boolean isStorageCluster(); - /** Returns the reasons no services are up in the implied group, or empty if some services are up. */ - Optional reasonsForNoServicesInGroupIsUp(); + boolean isConfigServerLike(); + + /** Returns the non-empty reasons for why all services are down, or otherwise empty. */ + Optional allServicesDown(); + boolean noServicesOutsideGroupIsDown() throws HostStateChangeDeniedException; - int percentageOfServicesDown(); + int percentageOfServicesDownOutsideGroup(); int percentageOfServicesDownIfGroupIsAllowedToBeDown(); Optional storageNodeInGroup(); diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterApiImpl.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterApiImpl.java index 42f8a187e98..6880700e4a9 100644 --- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterApiImpl.java +++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterApiImpl.java @@ -106,6 +106,11 @@ class ClusterApiImpl implements ClusterApi { return serviceCluster.serviceType(); } + @Override + public String serviceDescription(boolean plural) { + return serviceCluster.serviceDescription(plural); + } + @Override public boolean isStorageCluster() { return VespaModelUtil.isStorage(serviceCluster); @@ -117,7 +122,12 @@ class ClusterApiImpl implements ClusterApi { } @Override - public Optional reasonsForNoServicesInGroupIsUp() { + public boolean isConfigServerLike() { + return serviceCluster.isConfigServerLike(); + } + + @Override + public Optional allServicesDown() { SuspensionReasons reasons = new SuspensionReasons(); for (ServiceInstance service : servicesInGroup) { @@ -157,9 +167,8 @@ class ClusterApiImpl implements ClusterApi { } @Override - public int percentageOfServicesDown() { - int servicesDownInGroupCount = (int) servicesInGroup.stream().filter(this::serviceEffectivelyDown).count(); - int numberOfServicesDown = servicesDownAndNotInGroup().size() + missingServices + servicesDownInGroupCount; + public int percentageOfServicesDownOutsideGroup() { + int numberOfServicesDown = servicesDownAndNotInGroup().size() + missingServices; return numberOfServicesDown * 100 / (serviceCluster.serviceInstances().size() + missingServices); } @@ -187,12 +196,11 @@ class ClusterApiImpl implements ClusterApi { description.append(" "); final int nodeLimit = 3; - description.append("Suspended hosts: "); description.append(suspended.stream().sorted().distinct().limit(nodeLimit).collect(Collectors.toList()).toString()); if (suspended.size() > nodeLimit) { - description.append(", and " + (suspended.size() - nodeLimit) + " more"); + description.append(" and " + (suspended.size() - nodeLimit) + " others"); } - description.append("."); + description.append(" are suspended."); } Set downElsewhere = servicesDownAndNotInGroup().stream() @@ -204,7 +212,6 @@ class ClusterApiImpl implements ClusterApi { description.append(" "); final int serviceLimit = 2; // services info is verbose - description.append("Services down on resumed hosts: "); description.append(Stream.concat( downElsewhere.stream().map(ServiceInstance::toString).sorted(), missingServices > 0 ? Stream.of(descriptionOfMissingServices) : Stream.of()) @@ -213,9 +220,9 @@ class ClusterApiImpl implements ClusterApi { .toString()); if (downElsewhereTotal > serviceLimit) { - description.append(", and " + (downElsewhereTotal - serviceLimit) + " more"); + description.append(" and " + (downElsewhereTotal - serviceLimit) + " others"); } - description.append("."); + description.append(" are down."); } return description.toString(); diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/policy/HostedVespaClusterPolicy.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/policy/HostedVespaClusterPolicy.java index 3a6c24a05e3..8b8ab7986d8 100644 --- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/policy/HostedVespaClusterPolicy.java +++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/policy/HostedVespaClusterPolicy.java @@ -23,30 +23,7 @@ public class HostedVespaClusterPolicy implements ClusterPolicy { @Override public SuspensionReasons verifyGroupGoingDownIsFine(ClusterApi clusterApi) throws HostStateChangeDeniedException { - if (clusterApi.noServicesOutsideGroupIsDown()) { - return SuspensionReasons.nothingNoteworthy(); - } - - int percentageOfServicesAllowedToBeDown = getConcurrentSuspensionLimit(clusterApi).asPercentage(); - if (clusterApi.percentageOfServicesDownIfGroupIsAllowedToBeDown() <= percentageOfServicesAllowedToBeDown) { - return SuspensionReasons.nothingNoteworthy(); - } - - Optional suspensionReasons = clusterApi.reasonsForNoServicesInGroupIsUp(); - if (suspensionReasons.isPresent()) { - return suspensionReasons.get(); - } - - String message = percentageOfServicesAllowedToBeDown <= 0 - ? "Suspension of service with type '" + clusterApi.serviceType() + "' not allowed: " - + clusterApi.percentageOfServicesDown() + "% are suspended already." + clusterApi.downDescription() - : "Suspension of service with type '" + clusterApi.serviceType() - + "' would increase from " + clusterApi.percentageOfServicesDown() - + "% to " + clusterApi.percentageOfServicesDownIfGroupIsAllowedToBeDown() - + "%, over the limit of " + percentageOfServicesAllowedToBeDown + "%." - + clusterApi.downDescription(); - - throw new HostStateChangeDeniedException(clusterApi.getNodeGroup(), ENOUGH_SERVICES_UP_CONSTRAINT, message); + return verifyGroupGoingDownIsFine(clusterApi, true); } @Override @@ -54,22 +31,37 @@ public class HostedVespaClusterPolicy implements ClusterPolicy { // This policy is similar to verifyGroupGoingDownIsFine, except that having no services up in the group will // not allow the suspension: We are a bit more cautious when removing nodes. + verifyGroupGoingDownIsFine(clusterApi, false); + } + + private SuspensionReasons verifyGroupGoingDownIsFine(ClusterApi clusterApi, boolean allowIfAllServicesAreDown) + throws HostStateChangeDeniedException { if (clusterApi.noServicesOutsideGroupIsDown()) { - return; + return SuspensionReasons.nothingNoteworthy(); } int percentageOfServicesAllowedToBeDown = getConcurrentSuspensionLimit(clusterApi).asPercentage(); if (clusterApi.percentageOfServicesDownIfGroupIsAllowedToBeDown() <= percentageOfServicesAllowedToBeDown) { - return; + return SuspensionReasons.nothingNoteworthy(); } - throw new HostStateChangeDeniedException( - clusterApi.getNodeGroup(), - ENOUGH_SERVICES_UP_CONSTRAINT, - "Down percentage for service type " + clusterApi.serviceType() - + " would increase to " + clusterApi.percentageOfServicesDownIfGroupIsAllowedToBeDown() - + "%, over the limit of " + percentageOfServicesAllowedToBeDown + "%." - + clusterApi.downDescription()); + // Disallow suspending a 2nd and downed config server to avoid losing ZK quorum. + if (!clusterApi.isConfigServerLike()) { + Optional suspensionReasons = clusterApi.allServicesDown(); + if (suspensionReasons.isPresent()) { + return suspensionReasons.get(); + } + } + + String message = percentageOfServicesAllowedToBeDown <= 0 + ? clusterApi.percentageOfServicesDownOutsideGroup() + "% of the " + clusterApi.serviceDescription(true) + + " are down or suspended already:" + clusterApi.downDescription() + : "The percentage of downed or suspended " + clusterApi.serviceDescription(true) + + " would increase from " + clusterApi.percentageOfServicesDownOutsideGroup() + "% to " + + clusterApi.percentageOfServicesDownIfGroupIsAllowedToBeDown() + "% (limit is " + + percentageOfServicesAllowedToBeDown + "%):" + clusterApi.downDescription(); + + throw new HostStateChangeDeniedException(clusterApi.getNodeGroup(), ENOUGH_SERVICES_UP_CONSTRAINT, message); } // Non-private for testing purposes diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/OrchestratorTest.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/OrchestratorTest.java index 83c902ed981..f885d5301de 100644 --- a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/OrchestratorTest.java +++ b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/OrchestratorTest.java @@ -102,7 +102,7 @@ public class OrchestratorTest { fail(); } catch (HostStateChangeDeniedException e) { assertThat(e.getMessage(), containsString("Changing the state of cfg2 would violate enough-services-up")); - assertThat(e.getMessage(), containsString("Suspended hosts: [cfg1]")); + assertThat(e.getMessage(), containsString("[cfg1] are suspended.")); } // cfg1 is removed from the application @@ -114,7 +114,7 @@ public class OrchestratorTest { fail(); } catch (HostStateChangeDeniedException e) { assertThat(e.getMessage(), containsString("Changing the state of cfg2 would violate enough-services-up")); - assertThat(e.getMessage(), containsString("Services down on resumed hosts: [1 missing config server]")); + assertThat(e.getMessage(), containsString("[1 missing config server] are down.")); } // cfg1 is reprovisioned, added to the node repo, and activated @@ -129,7 +129,7 @@ public class OrchestratorTest { fail(); } catch (HostStateChangeDeniedException e) { assertThat(e.getMessage(), containsString("Changing the state of cfg1 would violate enough-services-up")); - assertThat(e.getMessage(), containsString("Suspended hosts: [cfg2]")); + assertThat(e.getMessage(), containsString("[cfg2] are suspended")); } // etc (should be the same as for cfg1) diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/model/ClusterApiImplTest.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/model/ClusterApiImplTest.java index da8591c6631..3ea739b385e 100644 --- a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/model/ClusterApiImplTest.java +++ b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/model/ClusterApiImplTest.java @@ -95,11 +95,11 @@ public class ClusterApiImplTest { assertEquals("{ clusterId=cluster, serviceType=service-type }", clusterApi.clusterInfo()); assertFalse(clusterApi.isStorageCluster()); - assertEquals(" Suspended hosts: [host3, host4]. Services down on resumed hosts: [" + - "ServiceInstance{configId=service-2, hostName=host2, serviceStatus=" + - "ServiceStatusInfo{status=DOWN, since=Optional.empty, lastChecked=Optional.empty}}].", + assertEquals(" [host3, host4] are suspended. [ServiceInstance{configId=service-2, hostName=host2, " + + "serviceStatus=ServiceStatusInfo{status=DOWN, since=Optional.empty, lastChecked=Optional.empty}}] " + + "are down.", clusterApi.downDescription()); - assertEquals(60, clusterApi.percentageOfServicesDown()); + assertEquals(60, clusterApi.percentageOfServicesDownOutsideGroup()); assertEquals(80, clusterApi.percentageOfServicesDownIfGroupIsAllowedToBeDown()); } @@ -181,8 +181,8 @@ public class ClusterApiImplTest { fail(); } catch (HostStateChangeDeniedException e) { assertThat(e.getMessage(), - containsString("Suspension of service with type 'configserver' not allowed: 33% are suspended already. " + - "Services down on resumed hosts: [1 missing config server].")); + containsString("Changing the state of cfg1 would violate enough-services-up: 33% of the config " + + "servers are down or suspended already: [1 missing config server] are down.")); } } @@ -201,18 +201,25 @@ public class ClusterApiImplTest { fail(); } catch (HostStateChangeDeniedException e) { assertThat(e.getMessage(), - containsString("Suspension of service with type 'hostadmin' not allowed: 33% are suspended already. " + - "Services down on resumed hosts: [1 missing config server host].")); + containsString("Changing the state of cfg1 would violate enough-services-up: 33% of the config " + + "server hosts are down or suspended already: [1 missing config server host] are down.")); } } @Test - public void testCfg1SuspendsIfDownWithMissingCfg3() throws HostStateChangeDeniedException { + public void testCfg1DoesNotSuspendIfDownWithMissingCfg3() throws HostStateChangeDeniedException { ClusterApiImpl clusterApi = makeCfg1ClusterApi(ServiceStatus.DOWN, ServiceStatus.UP); HostedVespaClusterPolicy policy = new HostedVespaClusterPolicy(flagSource, zone); - policy.verifyGroupGoingDownIsFine(clusterApi); + try { + policy.verifyGroupGoingDownIsFine(clusterApi); + fail(); + } catch (HostStateChangeDeniedException e) { + assertThat(e.getMessage(), + containsString("Changing the state of cfg1 would violate enough-services-up: 33% of the config " + + "servers are down or suspended already: [1 missing config server] are down.")); + } } @Test @@ -320,7 +327,7 @@ public class ClusterApiImplTest { clock); assertEquals(expectedNoServicesInGroupIsUp.map(SuspensionReasons::getMessagesInOrder), - clusterApi.reasonsForNoServicesInGroupIsUp().map(SuspensionReasons::getMessagesInOrder)); + clusterApi.allServicesDown().map(SuspensionReasons::getMessagesInOrder)); assertEquals(expectedNoServicesOutsideGroupIsDown, clusterApi.noServicesOutsideGroupIsDown()); } diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/policy/HostedVespaClusterPolicyTest.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/policy/HostedVespaClusterPolicyTest.java index 303dabebba8..8f47051621f 100644 --- a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/policy/HostedVespaClusterPolicyTest.java +++ b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/policy/HostedVespaClusterPolicyTest.java @@ -131,13 +131,14 @@ public class HostedVespaClusterPolicyTest { int percentageOfServicesDownIfGroupIsAllowedToBeDown, boolean expectSuccess) throws HostStateChangeDeniedException { when(clusterApi.noServicesOutsideGroupIsDown()).thenReturn(noServicesOutsideGroupIsDown); - when(clusterApi.reasonsForNoServicesInGroupIsUp()).thenReturn(noServicesInGroupIsUp); + when(clusterApi.allServicesDown()).thenReturn(noServicesInGroupIsUp); when(clusterApi.percentageOfServicesDownIfGroupIsAllowedToBeDown()).thenReturn(20); doReturn(ConcurrentSuspensionLimitForCluster.TEN_PERCENT).when(policy).getConcurrentSuspensionLimit(clusterApi); when(applicationApi.applicationId()).thenReturn(ApplicationId.fromSerializedForm("a:b:c")); when(clusterApi.serviceType()).thenReturn(new ServiceType("service-type")); - when(clusterApi.percentageOfServicesDown()).thenReturn(5); + when(clusterApi.serviceDescription(true)).thenReturn("services of {service-type,cluster-id}"); + when(clusterApi.percentageOfServicesDownOutsideGroup()).thenReturn(5); when(clusterApi.percentageOfServicesDownIfGroupIsAllowedToBeDown()).thenReturn(percentageOfServicesDownIfGroupIsAllowedToBeDown); when(clusterApi.downDescription()).thenReturn(" Down description"); @@ -152,9 +153,10 @@ public class HostedVespaClusterPolicyTest { } } catch (HostStateChangeDeniedException e) { if (!expectSuccess) { - assertEquals("Changing the state of node-group would violate enough-services-up: " + - "Suspension of service with type 'service-type' would increase from 5% to 13%, " + - "over the limit of 10%. Down description", e.getMessage()); + assertEquals("Changing the state of node-group would violate enough-services-up: The percentage of downed " + + "or suspended services of {service-type,cluster-id} would increase from 5% to 13% (limit is 10%): " + + "Down description", + e.getMessage()); assertEquals("enough-services-up", e.getConstraintName()); } } -- cgit v1.2.3