diff options
Diffstat (limited to 'node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java')
-rw-r--r-- | node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java | 50 |
1 files changed, 41 insertions, 9 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java index 2c0e0a2bdb0..2d192fae11f 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java @@ -2,12 +2,14 @@ package com.yahoo.vespa.hosted.provision.autoscale; import com.yahoo.config.provision.ClusterResources; +import com.yahoo.config.provision.ClusterSpec; import com.yahoo.config.provision.NodeResources; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.applications.Application; import com.yahoo.vespa.hosted.provision.applications.Cluster; +import com.yahoo.vespa.hosted.provision.applications.ScalingEvent; import java.time.Duration; import java.time.Instant; @@ -21,9 +23,9 @@ import java.util.Optional; */ public class Autoscaler { - /** What cost difference is worth a reallocation? */ + /** What cost difference factor is worth a reallocation? */ private static final double costDifferenceWorthReallocation = 0.1; - /** What resource difference is worth a reallocation? */ + /** What difference factor for a resource is worth a reallocation? */ private static final double resourceDifferenceWorthReallocation = 0.1; private final MetricsDb metricsDb; @@ -62,27 +64,31 @@ public class Autoscaler { if ( ! stable(clusterNodes, nodeRepository)) return Advice.none("Cluster change in progress"); - Duration scalingWindow = cluster.scalingDuration(clusterNodes.clusterSpec()); + Duration scalingWindow = scalingWindow(clusterNodes.clusterSpec(), cluster); if (scaledIn(scalingWindow, cluster)) return Advice.dontScale("Won't autoscale now: Less than " + scalingWindow + " since last rescaling"); - var clusterNodesTimeseries = new ClusterNodesTimeseries(scalingWindow, cluster, clusterNodes, metricsDb); - var currentAllocation = new AllocatableClusterResources(clusterNodes.asList(), nodeRepository, cluster.exclusive()); + ClusterTimeseries clusterTimeseries = + new ClusterTimeseries(scalingWindow, cluster, clusterNodes, metricsDb); + AllocatableClusterResources currentAllocation = + new AllocatableClusterResources(clusterNodes.asList(), nodeRepository, cluster.exclusive()); - int measurementsPerNode = clusterNodesTimeseries.measurementsPerNode(); + int measurementsPerNode = clusterTimeseries.measurementsPerNode(); if (measurementsPerNode < minimumMeasurementsPerNode(scalingWindow)) return Advice.none("Collecting more data before making new scaling decisions: " + "Have " + measurementsPerNode + " measurements per node but require " + minimumMeasurementsPerNode(scalingWindow)); - int nodesMeasured = clusterNodesTimeseries.nodesMeasured(); + int nodesMeasured = clusterTimeseries.nodesMeasured(); if (nodesMeasured != clusterNodes.size()) return Advice.none("Collecting more data before making new scaling decisions: " + "Have measurements from " + nodesMeasured + " but require from " + clusterNodes.size()); + double cpuLoad = clusterTimeseries.averageLoad(Resource.cpu); + double memoryLoad = clusterTimeseries.averageLoad(Resource.memory); + double diskLoad = clusterTimeseries.averageLoad(Resource.disk); - var clusterTimeseries = metricsDb.getClusterTimeseries(application.id(), cluster.id()); - var target = ResourceTarget.idealLoad(clusterTimeseries, clusterNodesTimeseries, currentAllocation, application); + var target = ResourceTarget.idealLoad(cpuLoad, memoryLoad, diskLoad, currentAllocation, application); Optional<AllocatableClusterResources> bestAllocation = allocationOptimizer.findBestAllocation(target, currentAllocation, limits); @@ -122,6 +128,32 @@ public class Autoscaler { .isAfter(nodeRepository.clock().instant().minus(delay)); } + /** The duration of the window we need to consider to make a scaling decision. See also minimumMeasurementsPerNode */ + private Duration scalingWindow(ClusterSpec clusterSpec, Cluster cluster) { + int completedEventCount = 0; + Duration totalDuration = Duration.ZERO; + for (ScalingEvent event : cluster.scalingEvents()) { + if (event.duration().isEmpty()) continue; + completedEventCount++; + totalDuration = totalDuration.plus(event.duration().get()); + } + + if (completedEventCount == 0) { // Use defaults + if (clusterSpec.isStateful()) return Duration.ofHours(12); + return Duration.ofMinutes(10); + } + else { + Duration predictedDuration = totalDuration.dividedBy(completedEventCount); + + // TODO: Remove when we have reliable completion for content clusters + if (clusterSpec.isStateful() && predictedDuration.minus(Duration.ofHours(12)).isNegative()) + return Duration.ofHours(12); + + if (predictedDuration.minus(Duration.ofMinutes(5)).isNegative()) return Duration.ofMinutes(5); // minimum + return predictedDuration; + } + } + static Duration maxScalingWindow() { return Duration.ofHours(48); } |