diff options
Diffstat (limited to 'node-repository')
3 files changed, 20 insertions, 15 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java index 92de7232786..409d7111b9d 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java @@ -63,16 +63,18 @@ public class Autoscaler { if ( ! stable(clusterNodes, nodeRepository)) return Advice.none("Cluster change in progress"); + Duration scalingWindow = scalingWindow(clusterNodes.clusterSpec(), cluster); + + ClusterTimeseries clusterTimeseries = + new ClusterTimeseries(nodeRepository.clock().instant().minus(scalingWindow), cluster, clusterNodes, metricsDb); AllocatableClusterResources currentAllocation = new AllocatableClusterResources(clusterNodes.asList(), nodeRepository, cluster.exclusive()); - ClusterTimeseries clusterTimeseries = new ClusterTimeseries(cluster, clusterNodes, metricsDb, nodeRepository); - int measurementsPerNode = clusterTimeseries.measurementsPerNode(); - if (measurementsPerNode < minimumMeasurementsPerNode(clusterNodes.clusterSpec())) + if (measurementsPerNode < minimumMeasurementsPerNode(scalingWindow)) return Advice.none("Collecting more data before making new scaling decisions" + " (has " + measurementsPerNode + " measurements per node but need " + - minimumMeasurementsPerNode(clusterNodes.clusterSpec()) + ")"); + minimumMeasurementsPerNode(scalingWindow) + ")"); int nodesMeasured = clusterTimeseries.nodesMeasured(); if (nodesMeasured != clusterNodes.size()) @@ -93,7 +95,6 @@ public class Autoscaler { if (similar(bestAllocation.get(), currentAllocation)) return Advice.dontScale("Cluster is ideally scaled (within configured limits)"); - Duration scalingWindow = scalingWindow(clusterNodes.clusterSpec(), cluster); if (scaledIn(scalingWindow, cluster)) return Advice.dontScale("Won't autoscale now: Less than " + scalingWindow + " since last rescaling"); if (isDownscaling(bestAllocation.get(), currentAllocation) && scaledIn(scalingWindow.multipliedBy(3), cluster)) @@ -130,7 +131,7 @@ public class Autoscaler { } /** The duration of the window we need to consider to make a scaling decision. See also minimumMeasurementsPerNode */ - static Duration scalingWindow(ClusterSpec clusterSpec, Cluster cluster) { + private Duration scalingWindow(ClusterSpec clusterSpec, Cluster cluster) { int completedEventCount = 0; Duration totalDuration = Duration.ZERO; for (ScalingEvent event : cluster.scalingEvents()) { @@ -159,10 +160,15 @@ public class Autoscaler { return Duration.ofHours(48); } - /** Measurements are currently taken once a minute. See also scalingWindow */ - static int minimumMeasurementsPerNode(ClusterSpec cluster) { - if (cluster.isStateful()) return 60; - return 4; + /** Returns the minimum measurements per node (average) we require to give autoscaling advice.*/ + private int minimumMeasurementsPerNode(Duration scalingWindow) { + // Measurements are ideally taken every minute, but no guarantees + // (network, nodes may be down, collecting is single threaded and may take longer than 1 minute to complete. + // Since the metric window is 5 minutes, we won't really improve from measuring more often: + long minimumMeasurements = scalingWindow.toMinutes() / 5; + minimumMeasurements = Math.round(0.8 * minimumMeasurements); // Allow 20% metrics collection blackout + if (minimumMeasurements < 1) minimumMeasurements = 1; + return (int)minimumMeasurements; } public static boolean stable(NodeList nodes, NodeRepository nodeRepository) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java index d73f7985a7b..e1a3ceca033 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java @@ -5,6 +5,7 @@ import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.applications.Cluster; +import java.time.Instant; import java.util.List; import java.util.function.Predicate; import java.util.stream.Collectors; @@ -21,10 +22,9 @@ public class ClusterTimeseries { /** The measurements for all nodes in this snapshot */ private final List<NodeTimeseries> allTimeseries; - public ClusterTimeseries(Cluster cluster, NodeList clusterNodes, MetricsDb db, NodeRepository nodeRepository) { + public ClusterTimeseries(Instant startTime, Cluster cluster, NodeList clusterNodes, MetricsDb db) { this.clusterNodes = clusterNodes; - var timeseries = db.getNodeTimeseries(nodeRepository.clock().instant().minus(Autoscaler.scalingWindow(clusterNodes.clusterSpec(), cluster)), - clusterNodes); + var timeseries = db.getNodeTimeseries(startTime, clusterNodes); if (cluster.lastScalingEvent().isPresent()) timeseries = filter(timeseries, snapshot -> snapshot.generation() < 0 || // Content nodes do not yet send generation diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java index 5694f84c555..a217d97ac27 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java @@ -52,7 +52,7 @@ public class AutoscalingTest { assertTrue("Too few measurements -> No change", tester.autoscale(application1, cluster1.id(), min, max).isEmpty()); tester.clock().advance(Duration.ofDays(1)); - tester.addCpuMeasurements(0.25f, 1f, 60, application1); + tester.addCpuMeasurements(0.25f, 1f, 120, application1); ClusterResources scaledResources = tester.assertResources("Scaling up since resource usage is too high", 15, 1, 1.3, 28.6, 28.6, tester.autoscale(application1, cluster1.id(), min, max).target()); @@ -76,7 +76,6 @@ public class AutoscalingTest { tester.autoscale(application1, cluster1.id(), min, max).target()); var events = tester.nodeRepository().applications().get(application1).get().cluster(cluster1.id()).get().scalingEvents(); - events.forEach(e -> System.out.println(e)); } /** We prefer fewer nodes for container clusters as (we assume) they all use the same disk and memory */ |