about | summary | refs | log | tree | commit | diff | stats
path: root/node-repository
diff options
context:
space:
mode:
author Jon Bratseth <bratseth@gmail.com> 2023-01-30 22:38:42 +0100
committer Jon Bratseth <bratseth@gmail.com> 2023-01-30 22:38:42 +0100
commit 3fece131449e5d21c5de12e44c2450afa74c9bae (patch)
tree d631411500f9573783c53306c6fb77b60ace3552 /node-repository
parent 4737f68f2ca971b8eed75f08bf91d0ccd4e086b6 (diff)
Various improvements to avoid excessive scaling
- Don't scale if there are provisioned nodes
- Don't scale if less than 5 minutes since last time
- Metrics are stable unless the cluster is actually retiring
Diffstat (limited to 'node-repository')
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java 22
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java 21
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java 17
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsV2MetricsFetcher.java 2
4 files changed, 33 insertions, 29 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
index 53dc0aa7999..410a4fcb773 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
@@ -67,7 +67,7 @@ public class Autoscaler {
if (! limits.isEmpty() && cluster.minResources().equals(cluster.maxResources()))
return Autoscaling.dontScale(Autoscaling.Status.unavailable, "Autoscaling is not enabled", clusterModel);
- if ( ! clusterIsStable(clusterNodes, nodeRepository))
+ if ( ! clusterModel.isStable(nodeRepository))
return Autoscaling.dontScale(Status.waiting, "Cluster change in progress", clusterModel);
var currentAllocation = new AllocatableClusterResources(clusterNodes, nodeRepository);
@@ -78,30 +78,16 @@ public class Autoscaler {
if (! worthRescaling(currentAllocation.realResources(), bestAllocation.get().realResources())) {
if (bestAllocation.get().fulfilment() < 0.9999999)
- return Autoscaling.dontScale(Status.insufficient, "Configured limits prevents better scaling of this cluster", clusterModel);
+ return Autoscaling.dontScale(Status.insufficient, "Configured limits prevents ideal scaling of this cluster", clusterModel);
else if ( ! clusterModel.safeToScaleDown() && clusterModel.idealLoad().any(v -> v < 1.0))
- return Autoscaling.dontScale(Status.ideal, "Cooling down before considering to scale down", clusterModel);
+ return Autoscaling.dontScale(Status.ideal, "Cooling off before considering to scale down", clusterModel);
else
- return Autoscaling.dontScale(Status.ideal, "Cluster is ideally scaled", clusterModel);
+ return Autoscaling.dontScale(Status.ideal, "Cluster is ideally scaled (within limits)", clusterModel);
}
return Autoscaling.scaleTo(bestAllocation.get().advertisedResources(), clusterModel);
}
- public static boolean clusterIsStable(NodeList clusterNodes, NodeRepository nodeRepository) {
- // The cluster is processing recent changes
- if (clusterNodes.stream().anyMatch(node -> node.status().wantToRetire() ||
- node.allocation().get().membership().retired() ||
- node.allocation().get().removable()))
- return false;
-
- // A deployment is ongoing
- if (nodeRepository.nodes().list(Node.State.reserved).owner(clusterNodes.first().get().allocation().get().owner()).size() > 0)
- return false;
-
- return true;
- }
-
/** Returns true if it is worthwhile to make the given resource change, false if it is too insignificant */
public static boolean worthRescaling(ClusterResources from, ClusterResources to) {
// *Increase* if needed with no regard for cost difference to prevent running out of a resource
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
index fc9d77d7486..da74ad0b63b 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
@@ -3,10 +3,11 @@ package com.yahoo.vespa.hosted.provision.autoscale;
import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.config.provision.Zone;
+import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeList;
+import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.applications.Application;
import com.yahoo.vespa.hosted.provision.applications.Cluster;
-import com.yahoo.vespa.hosted.provision.applications.ScalingEvent;
import java.time.Clock;
import java.time.Duration;
@@ -117,6 +118,24 @@ public class ClusterModel {
return adjustment;
}
+ public boolean isStable(NodeRepository nodeRepository) {
+ // An autoscaling decision was recently made
+ if (hasScaledIn(Duration.ofMinutes(5)))
+ return false;
+
+ // The cluster is processing recent changes
+ if (nodes.stream().anyMatch(node -> node.status().wantToRetire() ||
+ node.allocation().get().membership().retired() ||
+ node.allocation().get().removable()))
+ return false;
+
+ // A deployment is ongoing
+ if ( ! nodeRepository.nodes().list(Node.State.reserved, Node.State.provisioned).owner(application.id()).isEmpty())
+ return false;
+
+ return true;
+ }
+
/** Are we in a position to make decisions to scale down at this point? */
public boolean safeToScaleDown() {
if (hasScaledIn(scalingDuration().multipliedBy(3))) return false;
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java
index 9e28a9d85bb..dc0327c9537 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java
@@ -11,7 +11,6 @@ import com.yahoo.slime.Slime;
import com.yahoo.slime.SlimeUtils;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeList;
-import com.yahoo.vespa.hosted.provision.NodeRepository;
import java.time.Instant;
import java.util.ArrayList;
@@ -38,26 +37,26 @@ public class MetricsResponse {
private final Map<ClusterSpec.Id, ClusterMetricSnapshot> clusterMetrics = new HashMap<>();
/** Creates this from a metrics/V2 response */
- public MetricsResponse(String response, NodeList applicationNodes, NodeRepository nodeRepository) {
- this(SlimeUtils.jsonToSlime(response), applicationNodes, nodeRepository);
+ public MetricsResponse(String response, NodeList applicationNodes) {
+ this(SlimeUtils.jsonToSlime(response), applicationNodes);
}
public MetricsResponse(Collection<Pair<String, NodeMetricSnapshot>> metrics) {
this.nodeMetrics = metrics;
}
- private MetricsResponse(Slime response, NodeList applicationNodes, NodeRepository nodeRepository) {
+ private MetricsResponse(Slime response, NodeList applicationNodes) {
nodeMetrics = new ArrayList<>();
Inspector root = response.get();
Inspector nodes = root.field("nodes");
- nodes.traverse((ArrayTraverser)(__, node) -> consumeNode(node, applicationNodes, nodeRepository));
+ nodes.traverse((ArrayTraverser)(__, node) -> consumeNode(node, applicationNodes));
}
public Collection<Pair<String, NodeMetricSnapshot>> nodeMetrics() { return nodeMetrics; }
public Map<ClusterSpec.Id, ClusterMetricSnapshot> clusterMetrics() { return clusterMetrics; }
- private void consumeNode(Inspector nodeObject, NodeList applicationNodes, NodeRepository nodeRepository) {
+ private void consumeNode(Inspector nodeObject, NodeList applicationNodes) {
String hostname = nodeObject.field("hostname").asString();
Optional<Node> node = applicationNodes.node(hostname);
if (node.isEmpty()) return; // Node is not part of this cluster any longer
@@ -72,7 +71,7 @@ public class MetricsResponse {
Metric.disk.from(nodeValues)),
(long)Metric.generation.from(nodeValues),
Metric.inService.from(nodeValues) > 0,
- clusterIsStable(node.get(), applicationNodes, nodeRepository),
+ clusterIsStable(node.get(), applicationNodes),
Metric.queryRate.from(nodeValues))));
var cluster = node.get().allocation().get().membership().cluster().id();
@@ -101,9 +100,9 @@ public class MetricsResponse {
item.field("values").traverse((ObjectTraverser)(name, value) -> values.put(name, value.asDouble()));
}
- private boolean clusterIsStable(Node node, NodeList applicationNodes, NodeRepository nodeRepository) {
+ private boolean clusterIsStable(Node node, NodeList applicationNodes) {
ClusterSpec cluster = node.allocation().get().membership().cluster();
- return Autoscaler.clusterIsStable(applicationNodes.cluster(cluster.id()), nodeRepository);
+ return applicationNodes.cluster(cluster.id()).retired().isEmpty();
}
public static MetricsResponse empty() { return new MetricsResponse(List.of()); }
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsV2MetricsFetcher.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsV2MetricsFetcher.java
index 5d3d8d40ad4..11f245ee332 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsV2MetricsFetcher.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsV2MetricsFetcher.java
@@ -63,7 +63,7 @@ public class MetricsV2MetricsFetcher extends AbstractComponent implements Metric
// Collector 'autoscaling' defined in com.yahoo.vespa.model.admin.monitoring.MetricConsumer
String url = "http://" + metricsV2Container.get().hostname() + ":" + 4080 + apiPath + "?consumer=autoscaling";
return httpClient.get(url)
- .thenApply(response -> new MetricsResponse(response, applicationNodes, nodeRepository));
+ .thenApply(response -> new MetricsResponse(response, applicationNodes));
}
}