about | summary | refs | log | tree | commit | diff | stats
path: root/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
diff options
context:
space:
mode:
Diffstat (limited to 'node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java')
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java 50
1 files changed, 41 insertions, 9 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
index 2c0e0a2bdb0..2d192fae11f 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
@@ -2,12 +2,14 @@
package com.yahoo.vespa.hosted.provision.autoscale;
import com.yahoo.config.provision.ClusterResources;
+import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.config.provision.NodeResources;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.applications.Application;
import com.yahoo.vespa.hosted.provision.applications.Cluster;
+import com.yahoo.vespa.hosted.provision.applications.ScalingEvent;
import java.time.Duration;
import java.time.Instant;
@@ -21,9 +23,9 @@ import java.util.Optional;
*/
public class Autoscaler {
- /** What cost difference is worth a reallocation? */
+ /** What cost difference factor is worth a reallocation? */
private static final double costDifferenceWorthReallocation = 0.1;
- /** What resource difference is worth a reallocation? */
+ /** What difference factor for a resource is worth a reallocation? */
private static final double resourceDifferenceWorthReallocation = 0.1;
private final MetricsDb metricsDb;
@@ -62,27 +64,31 @@ public class Autoscaler {
if ( ! stable(clusterNodes, nodeRepository))
return Advice.none("Cluster change in progress");
- Duration scalingWindow = cluster.scalingDuration(clusterNodes.clusterSpec());
+ Duration scalingWindow = scalingWindow(clusterNodes.clusterSpec(), cluster);
if (scaledIn(scalingWindow, cluster))
return Advice.dontScale("Won't autoscale now: Less than " + scalingWindow + " since last rescaling");
- var clusterNodesTimeseries = new ClusterNodesTimeseries(scalingWindow, cluster, clusterNodes, metricsDb);
- var currentAllocation = new AllocatableClusterResources(clusterNodes.asList(), nodeRepository, cluster.exclusive());
+ ClusterTimeseries clusterTimeseries =
+ new ClusterTimeseries(scalingWindow, cluster, clusterNodes, metricsDb);
+ AllocatableClusterResources currentAllocation =
+ new AllocatableClusterResources(clusterNodes.asList(), nodeRepository, cluster.exclusive());
- int measurementsPerNode = clusterNodesTimeseries.measurementsPerNode();
+ int measurementsPerNode = clusterTimeseries.measurementsPerNode();
if (measurementsPerNode < minimumMeasurementsPerNode(scalingWindow))
return Advice.none("Collecting more data before making new scaling decisions: " +
"Have " + measurementsPerNode + " measurements per node but require " +
minimumMeasurementsPerNode(scalingWindow));
- int nodesMeasured = clusterNodesTimeseries.nodesMeasured();
+ int nodesMeasured = clusterTimeseries.nodesMeasured();
if (nodesMeasured != clusterNodes.size())
return Advice.none("Collecting more data before making new scaling decisions: " +
"Have measurements from " + nodesMeasured + " but require from " + clusterNodes.size());
+ double cpuLoad = clusterTimeseries.averageLoad(Resource.cpu);
+ double memoryLoad = clusterTimeseries.averageLoad(Resource.memory);
+ double diskLoad = clusterTimeseries.averageLoad(Resource.disk);
- var clusterTimeseries = metricsDb.getClusterTimeseries(application.id(), cluster.id());
- var target = ResourceTarget.idealLoad(clusterTimeseries, clusterNodesTimeseries, currentAllocation, application);
+ var target = ResourceTarget.idealLoad(cpuLoad, memoryLoad, diskLoad, currentAllocation, application);
Optional<AllocatableClusterResources> bestAllocation =
allocationOptimizer.findBestAllocation(target, currentAllocation, limits);
@@ -122,6 +128,32 @@ public class Autoscaler {
.isAfter(nodeRepository.clock().instant().minus(delay));
}
+    /**
+     * Returns the length of the time window to consider when making a scaling decision.
+     * The window is the average duration of this cluster's completed scaling events, or a fixed
+     * default when no event has completed yet. See also minimumMeasurementsPerNode.
+     *
+     * @param clusterSpec the spec of the cluster; only its statefulness is consulted
+     * @param cluster the cluster whose scaling events are averaged
+     * @return the window to use, never shorter than 5 minutes when derived from completed events
+     */
+    private Duration scalingWindow(ClusterSpec clusterSpec, Cluster cluster) {
+        Duration statefulFloor = Duration.ofHours(12);
+        Duration completedTotal = Duration.ZERO;
+        int completedCount = 0;
+        for (ScalingEvent scalingEvent : cluster.scalingEvents()) {
+            // Events without a duration have not completed and are left out of the average
+            if (scalingEvent.duration().isPresent()) {
+                completedCount++;
+                completedTotal = completedTotal.plus(scalingEvent.duration().get());
+            }
+        }
+
+        // Nothing has completed yet: use defaults
+        if (completedCount == 0)
+            return clusterSpec.isStateful() ? statefulFloor : Duration.ofMinutes(10);
+
+        Duration averageDuration = completedTotal.dividedBy(completedCount);
+
+        // TODO: Remove when we have reliable completion for content clusters
+        if (clusterSpec.isStateful() && averageDuration.minus(statefulFloor).isNegative())
+            return statefulFloor;
+
+        // Never go below the minimum window
+        Duration minimumWindow = Duration.ofMinutes(5);
+        return averageDuration.minus(minimumWindow).isNegative() ? minimumWindow : averageDuration;
+    }
+
+
/**
 * Returns a fixed duration of 48 hours.
 * NOTE(review): presumably the upper bound for a scaling window; no caller is visible in this
 * change, and the scalingWindow method above does not clamp its result to it - confirm usage.
 */
static Duration maxScalingWindow() {
return Duration.ofHours(48);
}