diff options
author | Jon Bratseth <bratseth@gmail.com> | 2023-01-30 22:38:42 +0100 |
---|---|---|
committer | Jon Bratseth <bratseth@gmail.com> | 2023-01-30 22:38:42 +0100 |
commit | 3fece131449e5d21c5de12e44c2450afa74c9bae (patch) | |
tree | d631411500f9573783c53306c6fb77b60ace3552 /node-repository | |
parent | 4737f68f2ca971b8eed75f08bf91d0ccd4e086b6 (diff) |
Various improvements to avoid excessive scaling
- Don't scale if there are provisioned nodes
- Don't scale if less than 5 minutes have passed since the last autoscaling decision
- Metrics are considered stable unless the cluster has retired nodes
Diffstat (limited to 'node-repository')
4 files changed, 33 insertions, 29 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java index 53dc0aa7999..410a4fcb773 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java @@ -67,7 +67,7 @@ public class Autoscaler { if (! limits.isEmpty() && cluster.minResources().equals(cluster.maxResources())) return Autoscaling.dontScale(Autoscaling.Status.unavailable, "Autoscaling is not enabled", clusterModel); - if ( ! clusterIsStable(clusterNodes, nodeRepository)) + if ( ! clusterModel.isStable(nodeRepository)) return Autoscaling.dontScale(Status.waiting, "Cluster change in progress", clusterModel); var currentAllocation = new AllocatableClusterResources(clusterNodes, nodeRepository); @@ -78,30 +78,16 @@ public class Autoscaler { if (! worthRescaling(currentAllocation.realResources(), bestAllocation.get().realResources())) { if (bestAllocation.get().fulfilment() < 0.9999999) - return Autoscaling.dontScale(Status.insufficient, "Configured limits prevents better scaling of this cluster", clusterModel); + return Autoscaling.dontScale(Status.insufficient, "Configured limits prevents ideal scaling of this cluster", clusterModel); else if ( ! 
clusterModel.safeToScaleDown() && clusterModel.idealLoad().any(v -> v < 1.0)) - return Autoscaling.dontScale(Status.ideal, "Cooling down before considering to scale down", clusterModel); + return Autoscaling.dontScale(Status.ideal, "Cooling off before considering to scale down", clusterModel); else - return Autoscaling.dontScale(Status.ideal, "Cluster is ideally scaled", clusterModel); + return Autoscaling.dontScale(Status.ideal, "Cluster is ideally scaled (within limits)", clusterModel); } return Autoscaling.scaleTo(bestAllocation.get().advertisedResources(), clusterModel); } - public static boolean clusterIsStable(NodeList clusterNodes, NodeRepository nodeRepository) { - // The cluster is processing recent changes - if (clusterNodes.stream().anyMatch(node -> node.status().wantToRetire() || - node.allocation().get().membership().retired() || - node.allocation().get().removable())) - return false; - - // A deployment is ongoing - if (nodeRepository.nodes().list(Node.State.reserved).owner(clusterNodes.first().get().allocation().get().owner()).size() > 0) - return false; - - return true; - } - /** Returns true if it is worthwhile to make the given resource change, false if it is too insignificant */ public static boolean worthRescaling(ClusterResources from, ClusterResources to) { // *Increase* if needed with no regard for cost difference to prevent running out of a resource diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java index fc9d77d7486..da74ad0b63b 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java @@ -3,10 +3,11 @@ package com.yahoo.vespa.hosted.provision.autoscale; import com.yahoo.config.provision.ClusterSpec; import com.yahoo.config.provision.Zone; +import 
com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeList; +import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.applications.Application; import com.yahoo.vespa.hosted.provision.applications.Cluster; -import com.yahoo.vespa.hosted.provision.applications.ScalingEvent; import java.time.Clock; import java.time.Duration; @@ -117,6 +118,24 @@ public class ClusterModel { return adjustment; } + public boolean isStable(NodeRepository nodeRepository) { + // An autoscaling decision was recently made + if (hasScaledIn(Duration.ofMinutes(5))) + return false; + + // The cluster is processing recent changes + if (nodes.stream().anyMatch(node -> node.status().wantToRetire() || + node.allocation().get().membership().retired() || + node.allocation().get().removable())) + return false; + + // A deployment is ongoing + if ( ! nodeRepository.nodes().list(Node.State.reserved, Node.State.provisioned).owner(application.id()).isEmpty()) + return false; + + return true; + } + /** Are we in a position to make decisions to scale down at this point? 
*/ public boolean safeToScaleDown() { if (hasScaledIn(scalingDuration().multipliedBy(3))) return false; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java index 9e28a9d85bb..dc0327c9537 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java @@ -11,7 +11,6 @@ import com.yahoo.slime.Slime; import com.yahoo.slime.SlimeUtils; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeList; -import com.yahoo.vespa.hosted.provision.NodeRepository; import java.time.Instant; import java.util.ArrayList; @@ -38,26 +37,26 @@ public class MetricsResponse { private final Map<ClusterSpec.Id, ClusterMetricSnapshot> clusterMetrics = new HashMap<>(); /** Creates this from a metrics/V2 response */ - public MetricsResponse(String response, NodeList applicationNodes, NodeRepository nodeRepository) { - this(SlimeUtils.jsonToSlime(response), applicationNodes, nodeRepository); + public MetricsResponse(String response, NodeList applicationNodes) { + this(SlimeUtils.jsonToSlime(response), applicationNodes); } public MetricsResponse(Collection<Pair<String, NodeMetricSnapshot>> metrics) { this.nodeMetrics = metrics; } - private MetricsResponse(Slime response, NodeList applicationNodes, NodeRepository nodeRepository) { + private MetricsResponse(Slime response, NodeList applicationNodes) { nodeMetrics = new ArrayList<>(); Inspector root = response.get(); Inspector nodes = root.field("nodes"); - nodes.traverse((ArrayTraverser)(__, node) -> consumeNode(node, applicationNodes, nodeRepository)); + nodes.traverse((ArrayTraverser)(__, node) -> consumeNode(node, applicationNodes)); } public Collection<Pair<String, NodeMetricSnapshot>> nodeMetrics() { return nodeMetrics; } 
public Map<ClusterSpec.Id, ClusterMetricSnapshot> clusterMetrics() { return clusterMetrics; } - private void consumeNode(Inspector nodeObject, NodeList applicationNodes, NodeRepository nodeRepository) { + private void consumeNode(Inspector nodeObject, NodeList applicationNodes) { String hostname = nodeObject.field("hostname").asString(); Optional<Node> node = applicationNodes.node(hostname); if (node.isEmpty()) return; // Node is not part of this cluster any longer @@ -72,7 +71,7 @@ public class MetricsResponse { Metric.disk.from(nodeValues)), (long)Metric.generation.from(nodeValues), Metric.inService.from(nodeValues) > 0, - clusterIsStable(node.get(), applicationNodes, nodeRepository), + clusterIsStable(node.get(), applicationNodes), Metric.queryRate.from(nodeValues)))); var cluster = node.get().allocation().get().membership().cluster().id(); @@ -101,9 +100,9 @@ public class MetricsResponse { item.field("values").traverse((ObjectTraverser)(name, value) -> values.put(name, value.asDouble())); } - private boolean clusterIsStable(Node node, NodeList applicationNodes, NodeRepository nodeRepository) { + private boolean clusterIsStable(Node node, NodeList applicationNodes) { ClusterSpec cluster = node.allocation().get().membership().cluster(); - return Autoscaler.clusterIsStable(applicationNodes.cluster(cluster.id()), nodeRepository); + return applicationNodes.cluster(cluster.id()).retired().isEmpty(); } public static MetricsResponse empty() { return new MetricsResponse(List.of()); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsV2MetricsFetcher.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsV2MetricsFetcher.java index 5d3d8d40ad4..11f245ee332 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsV2MetricsFetcher.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsV2MetricsFetcher.java @@ -63,7 +63,7 @@ 
public class MetricsV2MetricsFetcher extends AbstractComponent implements Metric // Collector 'autoscaling' defined in com.yahoo.vespa.model.admin.monitoring.MetricConsumer String url = "http://" + metricsV2Container.get().hostname() + ":" + 4080 + apiPath + "?consumer=autoscaling"; return httpClient.get(url) - .thenApply(response -> new MetricsResponse(response, applicationNodes, nodeRepository)); + .thenApply(response -> new MetricsResponse(response, applicationNodes)); } } |