diff options
author | Håkon Hallingstad <hakon@yahooinc.com> | 2023-08-01 15:16:57 +0200 |
---|---|---|
committer | Håkon Hallingstad <hakon@yahooinc.com> | 2023-08-01 15:16:57 +0200 |
commit | 5c7ee97241489c69858ca813d164a3d4309409ab (patch) | |
tree | 213f8e5b34b61a04b89b75ae5ab8a69727f23e49 /orchestrator/src/main | |
parent | ad484e51eb9d86bb47288aa742ac06ad82f1a354 (diff) |
Use orchestration override if present
Diffstat (limited to 'orchestrator/src/main')
5 files changed, 118 insertions, 33 deletions
diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterApi.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterApi.java index cb2a2fe5f62..7fa3bd45b4c 100644 --- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterApi.java +++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterApi.java @@ -30,10 +30,19 @@ public interface ClusterApi { boolean noServicesOutsideGroupIsDown() throws HostStateChangeDeniedException; - int percentageOfServicesDownOutsideGroup(); - int percentageOfServicesDownIfGroupIsAllowedToBeDown(); + /** Returns the number of services currently in the cluster, plus the number of missing services. */ + int size(); + + int servicesDownOutsideGroup(); + default int percentageOfServicesDownOutsideGroup() { return sizePercentageOf(servicesDownOutsideGroup()); } + int servicesDownIfGroupIsAllowedToBeDown(); + default int percentageOfServicesDownIfGroupIsAllowedToBeDown() { return sizePercentageOf(servicesDownIfGroupIsAllowedToBeDown()); } + + ClusterPolicyOverride clusterPolicyOverride(); Optional<StorageNode> storageNodeInGroup(); String downDescription(); + + private int sizePercentageOf(int count) { return (int) Math.round(100.0 * count / size()); } } diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterApiImpl.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterApiImpl.java index 736b909a82f..6240761dd6b 100644 --- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterApiImpl.java +++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterApiImpl.java @@ -173,15 +173,21 @@ class ClusterApiImpl implements ClusterApi { } @Override - public int percentageOfServicesDownOutsideGroup() { - int numberOfServicesDown = servicesDownAndNotInGroup().size() + missingServices; - return numberOfServicesDown * 100 / (serviceCluster.serviceInstances().size() + missingServices); + public int size() { return serviceCluster.serviceInstances().size() + missingServices; } + + @Override + public int servicesDownOutsideGroup() { + return servicesDownAndNotInGroup().size() + missingServices; + } + + @Override + public int servicesDownIfGroupIsAllowedToBeDown() { + return servicesDownAndNotInGroup().size() + servicesInGroup.size() + missingServices; } @Override - public int percentageOfServicesDownIfGroupIsAllowedToBeDown() { - int numberOfServicesDown = servicesDownAndNotInGroup().size() + missingServices + servicesInGroup.size(); - return numberOfServicesDown * 100 / (serviceCluster.serviceInstances().size() + missingServices); + public ClusterPolicyOverride clusterPolicyOverride() { + return clusterPolicyOverride; } /** @@ -206,7 +212,7 @@ class ClusterApiImpl implements ClusterApi { if (suspended.size() > nodeLimit) { description.append(" and " + (suspended.size() - nodeLimit) + " others"); } - description.append(" are suspended."); + description.append(" " + isOrAre(suspended.size()) + " suspended."); } Set<ServiceInstance> downElsewhere = servicesDownAndNotInGroup().stream() @@ -228,12 +234,14 @@ class ClusterApiImpl implements ClusterApi { if (downElsewhereTotal > serviceLimit) { description.append(" and " + (downElsewhereTotal - serviceLimit) + " others"); } - description.append(" are down."); + description.append(" " + isOrAre(downElsewhereTotal) + " down."); } return description.toString(); } + private static String isOrAre(int count) { return count == 1 ? "is" : "are"; } + private Optional<StorageNode> storageNodeInGroup(Predicate<ServiceInstance> storageServicePredicate) { if (!VespaModelUtil.isStorage(serviceCluster)) { return Optional.empty(); diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterPolicyOverride.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterPolicyOverride.java index 2bdfa1a6659..f724a4da9cb 100644 --- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterPolicyOverride.java +++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterPolicyOverride.java @@ -1,6 +1,9 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.orchestrator.model; +import com.yahoo.vespa.orchestrator.policy.SuspensionLimit; + +import java.util.Optional; import java.util.OptionalDouble; import java.util.OptionalInt; @@ -28,6 +31,16 @@ public record ClusterPolicyOverride(int deployedSize, OptionalInt expectedSize, } + public static ClusterPolicyOverride fromDeployedSize(int deployedSize) { + return new ClusterPolicyOverride(deployedSize, OptionalInt.empty(), OptionalInt.empty(), OptionalDouble.empty()); + } + + public Optional<SuspensionLimit> getSuspensionLimit() { + return allowedDown.isPresent() || allowedDownRatio.isPresent() ? + Optional.of(new SuspensionLimit(allowedDown.orElse(0), allowedDownRatio.orElse(0.0))) : + Optional.empty(); + } + public OptionalInt allowedDownPercentage() { return allowedDownRatio.isPresent() ? OptionalInt.of((int) Math.round(allowedDownRatio.getAsDouble() * 100.0)) : diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/policy/HostedVespaClusterPolicy.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/policy/HostedVespaClusterPolicy.java index 5d553c86c50..88b339e15f3 100644 --- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/policy/HostedVespaClusterPolicy.java +++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/policy/HostedVespaClusterPolicy.java @@ -37,10 +37,11 @@ public class HostedVespaClusterPolicy implements ClusterPolicy { return SuspensionReasons.nothingNoteworthy(); } - int percentageOfServicesAllowedToBeDown = getConcurrentSuspensionLimit(clusterApi).asPercentage(); - if (clusterApi.percentageOfServicesDownIfGroupIsAllowedToBeDown() <= percentageOfServicesAllowedToBeDown) { + SuspensionLimit limit = getConcurrentSuspensionLimit(clusterApi); + if (clusterApi.servicesDownIfGroupIsAllowedToBeDown() <= limit.allowedDown()) + return SuspensionReasons.nothingNoteworthy(); + if (clusterApi.percentageOfServicesDownIfGroupIsAllowedToBeDown() <= limit.allowedDownPercentage()) return SuspensionReasons.nothingNoteworthy(); - } // Be a bit more cautious when removing nodes permanently if (!permanent) { @@ -50,19 +51,39 @@ public class HostedVespaClusterPolicy implements ClusterPolicy { } } - String message = percentageOfServicesAllowedToBeDown <= 0 - ? clusterApi.percentageOfServicesDownOutsideGroup() + "% of the " + clusterApi.serviceDescription(true) - + " are down or suspended already:" + clusterApi.downDescription() - : "The percentage of downed or suspended " + clusterApi.serviceDescription(true) - + " would increase from " + clusterApi.percentageOfServicesDownOutsideGroup() + "% to " - + clusterApi.percentageOfServicesDownIfGroupIsAllowedToBeDown() + "% (limit is " - + percentageOfServicesAllowedToBeDown + "%):" + clusterApi.downDescription(); + final String message; + if (limit.allowedDownPercentage() > 0) { + final String numberDescription; + final String fromDescription; + final String toDescription; + final String limitDescription; + if (limit.allowedDown() > 1) { + numberDescription = "number (percentage)"; + fromDescription = clusterApi.servicesDownOutsideGroup() + " (" + clusterApi.percentageOfServicesDownOutsideGroup() + "%)"; + toDescription = clusterApi.servicesDownIfGroupIsAllowedToBeDown() + " (" + clusterApi.percentageOfServicesDownIfGroupIsAllowedToBeDown() + "%)"; + limitDescription = limit.allowedDown() + " (" + limit.allowedDownPercentage() + "%)"; + } else { + numberDescription = "percentage"; + fromDescription = clusterApi.percentageOfServicesDownOutsideGroup() + "%"; + toDescription = clusterApi.percentageOfServicesDownIfGroupIsAllowedToBeDown() + "%"; + limitDescription = limit.allowedDownPercentage() + "%"; + } - throw new HostStateChangeDeniedException(clusterApi.getNodeGroup(), ENOUGH_SERVICES_UP_CONSTRAINT, message); + message = "The %s of %s that are down would increase from %s to %s which is beyond the limit of %s" + .formatted(numberDescription, clusterApi.serviceDescription(true), fromDescription, toDescription, limitDescription); + } else { + message = "%d %s %s already down".formatted(clusterApi.servicesDownOutsideGroup(), + clusterApi.serviceDescription(false), + clusterApi.servicesDownOutsideGroup() == 1 ? "is" : "are"); + } + + throw new HostStateChangeDeniedException(clusterApi.getNodeGroup(), + ENOUGH_SERVICES_UP_CONSTRAINT, + message + ":" + clusterApi.downDescription()); } // Non-private for testing purposes - ConcurrentSuspensionLimitForCluster getConcurrentSuspensionLimit(ClusterApi clusterApi) { + SuspensionLimit getConcurrentSuspensionLimit(ClusterApi clusterApi) { // Possible service clusters on a node as of 2021-01-22: // // CLUSTER ID SERVICE TYPE HEALTH ASSOCIATION @@ -102,45 +123,50 @@ public class HostedVespaClusterPolicy implements ClusterPolicy { // H proxy (same as B) // I proxy host + Optional<SuspensionLimit> override = clusterApi.clusterPolicyOverride().getSuspensionLimit(); + if (override.isPresent()) { + return override.get(); + } + if (clusterApi.serviceType().equals(ServiceType.CLUSTER_CONTROLLER)) { - return ConcurrentSuspensionLimitForCluster.ONE_NODE; + return SuspensionLimit.fromAllowedDown(1); } if (Set.of(ServiceType.STORAGE, ServiceType.SEARCH, ServiceType.DISTRIBUTOR, ServiceType.TRANSACTION_LOG_SERVER) .contains(clusterApi.serviceType())) { // Delegate to the cluster controller - return ConcurrentSuspensionLimitForCluster.ALL_NODES; + return SuspensionLimit.fromAllowedDownRatio(1); } if (clusterApi.serviceType().equals(ServiceType.CONTAINER)) { - return ConcurrentSuspensionLimitForCluster.TEN_PERCENT; + return SuspensionLimit.fromAllowedDownRatio(0.1); } if (VespaModelUtil.ADMIN_CLUSTER_ID.equals(clusterApi.clusterId())) { if (ServiceType.SLOBROK.equals(clusterApi.serviceType())) { - return ConcurrentSuspensionLimitForCluster.ONE_NODE; + return SuspensionLimit.fromAllowedDown(1); } - return ConcurrentSuspensionLimitForCluster.ALL_NODES; + return SuspensionLimit.fromAllowedDownRatio(1); } else if (ServiceType.METRICS_PROXY.equals(clusterApi.serviceType())) { - return ConcurrentSuspensionLimitForCluster.ALL_NODES; + return SuspensionLimit.fromAllowedDownRatio(1); } if (Set.of(ServiceType.CONFIG_SERVER, ServiceType.CONTROLLER).contains(clusterApi.serviceType())) { - return ConcurrentSuspensionLimitForCluster.ONE_NODE; + return SuspensionLimit.fromAllowedDown(1); } if (clusterApi.serviceType().equals(ServiceType.HOST_ADMIN)) { if (Set.of(ClusterId.CONFIG_SERVER_HOST, ClusterId.CONTROLLER_HOST).contains(clusterApi.clusterId())) { - return ConcurrentSuspensionLimitForCluster.ONE_NODE; + return SuspensionLimit.fromAllowedDown(1); } return zone.system().isCd() - ? ConcurrentSuspensionLimitForCluster.FIFTY_PERCENT - : ConcurrentSuspensionLimitForCluster.TWENTY_PERCENT; + ? SuspensionLimit.fromAllowedDownRatio(0.5) + : SuspensionLimit.fromAllowedDownRatio(0.2); } // The above should cover all cases, but if not we'll return a reasonable default: - return ConcurrentSuspensionLimitForCluster.TEN_PERCENT; + return SuspensionLimit.fromAllowedDownRatio(0.1); } } diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/policy/SuspensionLimit.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/policy/SuspensionLimit.java new file mode 100644 index 00000000000..8a3d62dcc9c --- /dev/null +++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/policy/SuspensionLimit.java @@ -0,0 +1,29 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.orchestrator.policy; + +/** + * @author hakonhall + * + * @param allowedDown the maximum number of services (nodes) that are allowed to be down. + * @param allowedDownRatio the maximum ratio of services (nodes) that are allowed to be down. + */ +public record SuspensionLimit(int allowedDown, double allowedDownRatio) { + public SuspensionLimit { + if (allowedDown < 0) + throw new IllegalArgumentException("allowedDown cannot be negative: " + allowedDown); + if (allowedDownRatio < 0.0 || allowedDownRatio > 1.0) + throw new IllegalArgumentException("allowedDownRatio must be between 0.0 and 1.0: " + allowedDownRatio); + } + + public static SuspensionLimit fromAllowedDown(int allowedDown) { + return new SuspensionLimit(allowedDown, 0); + } + + public static SuspensionLimit fromAllowedDownRatio(double allowedDownRatio) { + return new SuspensionLimit(0, allowedDownRatio); + } + + public int allowedDownPercentage() { + return (int) Math.round(allowedDownRatio * 100.0); + } +} |