aboutsummaryrefslogtreecommitdiffstats
path: root/orchestrator/src/main
diff options
context:
space:
mode:
author Håkon Hallingstad <hakon@yahooinc.com> 2023-08-01 15:16:57 +0200
committer Håkon Hallingstad <hakon@yahooinc.com> 2023-08-01 15:16:57 +0200
commit 5c7ee97241489c69858ca813d164a3d4309409ab (patch)
tree 213f8e5b34b61a04b89b75ae5ab8a69727f23e49 /orchestrator/src/main
parent ad484e51eb9d86bb47288aa742ac06ad82f1a354 (diff)
Use orchestration override if present
Diffstat (limited to 'orchestrator/src/main')
-rw-r--r-- orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterApi.java 13
-rw-r--r-- orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterApiImpl.java 24
-rw-r--r-- orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterPolicyOverride.java 13
-rw-r--r-- orchestrator/src/main/java/com/yahoo/vespa/orchestrator/policy/HostedVespaClusterPolicy.java 72
-rw-r--r-- orchestrator/src/main/java/com/yahoo/vespa/orchestrator/policy/SuspensionLimit.java 29
5 files changed, 118 insertions, 33 deletions
diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterApi.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterApi.java
index cb2a2fe5f62..7fa3bd45b4c 100644
--- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterApi.java
+++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterApi.java
@@ -30,10 +30,19 @@ public interface ClusterApi {
boolean noServicesOutsideGroupIsDown() throws HostStateChangeDeniedException;
- int percentageOfServicesDownOutsideGroup();
- int percentageOfServicesDownIfGroupIsAllowedToBeDown();
+ /** Returns the number of services currently in the cluster, plus the number of missing services. */
+ int size();
+
+ int servicesDownOutsideGroup();
+ default int percentageOfServicesDownOutsideGroup() { return sizePercentageOf(servicesDownOutsideGroup()); }
+ int servicesDownIfGroupIsAllowedToBeDown();
+ default int percentageOfServicesDownIfGroupIsAllowedToBeDown() { return sizePercentageOf(servicesDownIfGroupIsAllowedToBeDown()); }
+
+ ClusterPolicyOverride clusterPolicyOverride();
Optional<StorageNode> storageNodeInGroup();
String downDescription();
+
+ /**
+ * Returns {@code count} as a percentage of {@link #size()}, rounded to the nearest integer.
+ * NOTE(review): the division is floating-point, so a size() of 0 does not throw here - confirm size() is always positive.
+ */
+ private int sizePercentageOf(int count) { return (int) Math.round(100.0 * count / size()); }
}
diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterApiImpl.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterApiImpl.java
index 736b909a82f..6240761dd6b 100644
--- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterApiImpl.java
+++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterApiImpl.java
@@ -173,15 +173,21 @@ class ClusterApiImpl implements ClusterApi {
}
@Override
- public int percentageOfServicesDownOutsideGroup() {
- int numberOfServicesDown = servicesDownAndNotInGroup().size() + missingServices;
- return numberOfServicesDown * 100 / (serviceCluster.serviceInstances().size() + missingServices);
+ public int size() { return serviceCluster.serviceInstances().size() + missingServices; }
+
+ @Override
+ public int servicesDownOutsideGroup() {
+ return servicesDownAndNotInGroup().size() + missingServices;
+ }
+
+ @Override
+ public int servicesDownIfGroupIsAllowedToBeDown() {
+ return servicesDownAndNotInGroup().size() + servicesInGroup.size() + missingServices;
}
@Override
- public int percentageOfServicesDownIfGroupIsAllowedToBeDown() {
- int numberOfServicesDown = servicesDownAndNotInGroup().size() + missingServices + servicesInGroup.size();
- return numberOfServicesDown * 100 / (serviceCluster.serviceInstances().size() + missingServices);
+ public ClusterPolicyOverride clusterPolicyOverride() {
+ return clusterPolicyOverride;
}
/**
@@ -206,7 +212,7 @@ class ClusterApiImpl implements ClusterApi {
if (suspended.size() > nodeLimit) {
description.append(" and " + (suspended.size() - nodeLimit) + " others");
}
- description.append(" are suspended.");
+ description.append(" " + isOrAre(suspended.size()) + " suspended.");
}
Set<ServiceInstance> downElsewhere = servicesDownAndNotInGroup().stream()
@@ -228,12 +234,14 @@ class ClusterApiImpl implements ClusterApi {
if (downElsewhereTotal > serviceLimit) {
description.append(" and " + (downElsewhereTotal - serviceLimit) + " others");
}
- description.append(" are down.");
+ description.append(" " + isOrAre(downElsewhereTotal) + " down.");
}
return description.toString();
}
+ /** Returns "is" when count is exactly 1 and "are" otherwise, so the verb in a description agrees with the count. */
+ private static String isOrAre(int count) { return count == 1 ? "is" : "are"; }
+
private Optional<StorageNode> storageNodeInGroup(Predicate<ServiceInstance> storageServicePredicate) {
if (!VespaModelUtil.isStorage(serviceCluster)) {
return Optional.empty();
diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterPolicyOverride.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterPolicyOverride.java
index 2bdfa1a6659..f724a4da9cb 100644
--- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterPolicyOverride.java
+++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/ClusterPolicyOverride.java
@@ -1,6 +1,9 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.orchestrator.model;
+import com.yahoo.vespa.orchestrator.policy.SuspensionLimit;
+
+import java.util.Optional;
import java.util.OptionalDouble;
import java.util.OptionalInt;
@@ -28,6 +31,16 @@ public record ClusterPolicyOverride(int deployedSize, OptionalInt expectedSize,
}
+ /** Returns an override that carries only the given deployed size; every optional component is left empty. */
+ public static ClusterPolicyOverride fromDeployedSize(int deployedSize) {
+ return new ClusterPolicyOverride(deployedSize, OptionalInt.empty(), OptionalInt.empty(), OptionalDouble.empty());
+ }
+
+    /**
+     * Returns the suspension limit described by this override, or empty if neither
+     * {@code allowedDown} nor {@code allowedDownRatio} is set. When only one of the two is set,
+     * the other is taken as 0.
+     */
+    public Optional<SuspensionLimit> getSuspensionLimit() {
+        // Nothing overridden: let the caller fall back to its own limit.
+        if (allowedDown.isEmpty() && allowedDownRatio.isEmpty()) {
+            return Optional.empty();
+        }
+
+        int maxDown = allowedDown.orElse(0);
+        double maxDownRatio = allowedDownRatio.orElse(0.0);
+        return Optional.of(new SuspensionLimit(maxDown, maxDownRatio));
+    }
+
public OptionalInt allowedDownPercentage() {
return allowedDownRatio.isPresent() ?
OptionalInt.of((int) Math.round(allowedDownRatio.getAsDouble() * 100.0)) :
diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/policy/HostedVespaClusterPolicy.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/policy/HostedVespaClusterPolicy.java
index 5d553c86c50..88b339e15f3 100644
--- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/policy/HostedVespaClusterPolicy.java
+++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/policy/HostedVespaClusterPolicy.java
@@ -37,10 +37,11 @@ public class HostedVespaClusterPolicy implements ClusterPolicy {
return SuspensionReasons.nothingNoteworthy();
}
- int percentageOfServicesAllowedToBeDown = getConcurrentSuspensionLimit(clusterApi).asPercentage();
- if (clusterApi.percentageOfServicesDownIfGroupIsAllowedToBeDown() <= percentageOfServicesAllowedToBeDown) {
+ SuspensionLimit limit = getConcurrentSuspensionLimit(clusterApi);
+ if (clusterApi.servicesDownIfGroupIsAllowedToBeDown() <= limit.allowedDown())
+ return SuspensionReasons.nothingNoteworthy();
+ if (clusterApi.percentageOfServicesDownIfGroupIsAllowedToBeDown() <= limit.allowedDownPercentage())
return SuspensionReasons.nothingNoteworthy();
- }
// Be a bit more cautious when removing nodes permanently
if (!permanent) {
@@ -50,19 +51,39 @@ public class HostedVespaClusterPolicy implements ClusterPolicy {
}
}
- String message = percentageOfServicesAllowedToBeDown <= 0
- ? clusterApi.percentageOfServicesDownOutsideGroup() + "% of the " + clusterApi.serviceDescription(true)
- + " are down or suspended already:" + clusterApi.downDescription()
- : "The percentage of downed or suspended " + clusterApi.serviceDescription(true)
- + " would increase from " + clusterApi.percentageOfServicesDownOutsideGroup() + "% to "
- + clusterApi.percentageOfServicesDownIfGroupIsAllowedToBeDown() + "% (limit is "
- + percentageOfServicesAllowedToBeDown + "%):" + clusterApi.downDescription();
+ final String message;
+ if (limit.allowedDownPercentage() > 0) {
+ final String numberDescription;
+ final String fromDescription;
+ final String toDescription;
+ final String limitDescription;
+ if (limit.allowedDown() > 1) {
+ numberDescription = "number (percentage)";
+ fromDescription = clusterApi.servicesDownOutsideGroup() + " (" + clusterApi.percentageOfServicesDownOutsideGroup() + "%)";
+ toDescription = clusterApi.servicesDownIfGroupIsAllowedToBeDown() + " (" + clusterApi.percentageOfServicesDownIfGroupIsAllowedToBeDown() + "%)";
+ limitDescription = limit.allowedDown() + " (" + limit.allowedDownPercentage() + "%)";
+ } else {
+ numberDescription = "percentage";
+ fromDescription = clusterApi.percentageOfServicesDownOutsideGroup() + "%";
+ toDescription = clusterApi.percentageOfServicesDownIfGroupIsAllowedToBeDown() + "%";
+ limitDescription = limit.allowedDownPercentage() + "%";
+ }
- throw new HostStateChangeDeniedException(clusterApi.getNodeGroup(), ENOUGH_SERVICES_UP_CONSTRAINT, message);
+ message = "The %s of %s that are down would increase from %s to %s which is beyond the limit of %s"
+ .formatted(numberDescription, clusterApi.serviceDescription(true), fromDescription, toDescription, limitDescription);
+ } else {
+ message = "%d %s %s already down".formatted(clusterApi.servicesDownOutsideGroup(),
+ clusterApi.serviceDescription(false),
+ clusterApi.servicesDownOutsideGroup() == 1 ? "is" : "are");
+ }
+
+ throw new HostStateChangeDeniedException(clusterApi.getNodeGroup(),
+ ENOUGH_SERVICES_UP_CONSTRAINT,
+ message + ":" + clusterApi.downDescription());
}
// Non-private for testing purposes
- ConcurrentSuspensionLimitForCluster getConcurrentSuspensionLimit(ClusterApi clusterApi) {
+ SuspensionLimit getConcurrentSuspensionLimit(ClusterApi clusterApi) {
// Possible service clusters on a node as of 2021-01-22:
//
// CLUSTER ID SERVICE TYPE HEALTH ASSOCIATION
@@ -102,45 +123,50 @@ public class HostedVespaClusterPolicy implements ClusterPolicy {
// H proxy (same as B)
// I proxy host
+ Optional<SuspensionLimit> override = clusterApi.clusterPolicyOverride().getSuspensionLimit();
+ if (override.isPresent()) {
+ return override.get();
+ }
+
if (clusterApi.serviceType().equals(ServiceType.CLUSTER_CONTROLLER)) {
- return ConcurrentSuspensionLimitForCluster.ONE_NODE;
+ return SuspensionLimit.fromAllowedDown(1);
}
if (Set.of(ServiceType.STORAGE, ServiceType.SEARCH, ServiceType.DISTRIBUTOR, ServiceType.TRANSACTION_LOG_SERVER)
.contains(clusterApi.serviceType())) {
// Delegate to the cluster controller
- return ConcurrentSuspensionLimitForCluster.ALL_NODES;
+ return SuspensionLimit.fromAllowedDownRatio(1);
}
if (clusterApi.serviceType().equals(ServiceType.CONTAINER)) {
- return ConcurrentSuspensionLimitForCluster.TEN_PERCENT;
+ return SuspensionLimit.fromAllowedDownRatio(0.1);
}
if (VespaModelUtil.ADMIN_CLUSTER_ID.equals(clusterApi.clusterId())) {
if (ServiceType.SLOBROK.equals(clusterApi.serviceType())) {
- return ConcurrentSuspensionLimitForCluster.ONE_NODE;
+ return SuspensionLimit.fromAllowedDown(1);
}
- return ConcurrentSuspensionLimitForCluster.ALL_NODES;
+ return SuspensionLimit.fromAllowedDownRatio(1);
} else if (ServiceType.METRICS_PROXY.equals(clusterApi.serviceType())) {
- return ConcurrentSuspensionLimitForCluster.ALL_NODES;
+ return SuspensionLimit.fromAllowedDownRatio(1);
}
if (Set.of(ServiceType.CONFIG_SERVER, ServiceType.CONTROLLER).contains(clusterApi.serviceType())) {
- return ConcurrentSuspensionLimitForCluster.ONE_NODE;
+ return SuspensionLimit.fromAllowedDown(1);
}
if (clusterApi.serviceType().equals(ServiceType.HOST_ADMIN)) {
if (Set.of(ClusterId.CONFIG_SERVER_HOST, ClusterId.CONTROLLER_HOST).contains(clusterApi.clusterId())) {
- return ConcurrentSuspensionLimitForCluster.ONE_NODE;
+ return SuspensionLimit.fromAllowedDown(1);
}
return zone.system().isCd()
- ? ConcurrentSuspensionLimitForCluster.FIFTY_PERCENT
- : ConcurrentSuspensionLimitForCluster.TWENTY_PERCENT;
+ ? SuspensionLimit.fromAllowedDownRatio(0.5)
+ : SuspensionLimit.fromAllowedDownRatio(0.2);
}
// The above should cover all cases, but if not we'll return a reasonable default:
- return ConcurrentSuspensionLimitForCluster.TEN_PERCENT;
+ return SuspensionLimit.fromAllowedDownRatio(0.1);
}
}
diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/policy/SuspensionLimit.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/policy/SuspensionLimit.java
new file mode 100644
index 00000000000..8a3d62dcc9c
--- /dev/null
+++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/policy/SuspensionLimit.java
@@ -0,0 +1,29 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.orchestrator.policy;
+
+/**
+ * The limit on how many services (nodes) are allowed to be down, as an absolute number and as a ratio.
+ *
+ * @param allowedDown the maximum number of services (nodes) that are allowed to be down.
+ * @param allowedDownRatio the maximum ratio of services (nodes) that are allowed to be down.
+ * @author hakonhall
+ */
+public record SuspensionLimit(int allowedDown, double allowedDownRatio) {
+    /** @throws IllegalArgumentException if allowedDown is negative, or allowedDownRatio is NaN or outside [0.0, 1.0] */
+    public SuspensionLimit {
+        if (allowedDown < 0)
+            throw new IllegalArgumentException("allowedDown cannot be negative: " + allowedDown);
+        // Negated in-range test: every comparison with NaN is false, so NaN is rejected here too.
+        if (!(allowedDownRatio >= 0.0 && allowedDownRatio <= 1.0))
+            throw new IllegalArgumentException("allowedDownRatio must be between 0.0 and 1.0: " + allowedDownRatio);
+    }
+
+    /** Returns a limit that allows the given number of services to be down, with the ratio set to 0. */
+    public static SuspensionLimit fromAllowedDown(int allowedDown) { return new SuspensionLimit(allowedDown, 0); }
+
+    /** Returns a limit that allows the given ratio of services to be down, with the number set to 0. */
+    public static SuspensionLimit fromAllowedDownRatio(double allowedDownRatio) { return new SuspensionLimit(0, allowedDownRatio); }
+
+    /** Returns allowedDownRatio as a percentage, rounded to the nearest integer. */
+    public int allowedDownPercentage() { return (int) Math.round(allowedDownRatio * 100.0); }
+}