aboutsummaryrefslogtreecommitdiffstats
path: root/node-repository
diff options
context:
space:
mode:
authorJon Bratseth <bratseth@gmail.com>2020-12-10 15:46:44 +0100
committerJon Bratseth <bratseth@gmail.com>2020-12-10 15:46:44 +0100
commitc26f102ef7e0a06bab9d3daf6f99b6ef7dbbdd05 (patch)
treec10d2d7685718ccb123cfbed714c451dce640b50 /node-repository
parentfdf963af01ce963de8f02938252fa45b30f16c37 (diff)
Require minimum measurements depending on scaling window
Diffstat (limited to 'node-repository')
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java26
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java6
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java3
3 files changed, 20 insertions, 15 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
index 92de7232786..409d7111b9d 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
@@ -63,16 +63,18 @@ public class Autoscaler {
if ( ! stable(clusterNodes, nodeRepository))
return Advice.none("Cluster change in progress");
+ Duration scalingWindow = scalingWindow(clusterNodes.clusterSpec(), cluster);
+
+ ClusterTimeseries clusterTimeseries =
+ new ClusterTimeseries(nodeRepository.clock().instant().minus(scalingWindow), cluster, clusterNodes, metricsDb);
AllocatableClusterResources currentAllocation =
new AllocatableClusterResources(clusterNodes.asList(), nodeRepository, cluster.exclusive());
- ClusterTimeseries clusterTimeseries = new ClusterTimeseries(cluster, clusterNodes, metricsDb, nodeRepository);
-
int measurementsPerNode = clusterTimeseries.measurementsPerNode();
- if (measurementsPerNode < minimumMeasurementsPerNode(clusterNodes.clusterSpec()))
+ if (measurementsPerNode < minimumMeasurementsPerNode(scalingWindow))
return Advice.none("Collecting more data before making new scaling decisions" +
" (has " + measurementsPerNode + " measurements per node but need " +
- minimumMeasurementsPerNode(clusterNodes.clusterSpec()) + ")");
+ minimumMeasurementsPerNode(scalingWindow) + ")");
int nodesMeasured = clusterTimeseries.nodesMeasured();
if (nodesMeasured != clusterNodes.size())
@@ -93,7 +95,6 @@ public class Autoscaler {
if (similar(bestAllocation.get(), currentAllocation))
return Advice.dontScale("Cluster is ideally scaled (within configured limits)");
- Duration scalingWindow = scalingWindow(clusterNodes.clusterSpec(), cluster);
if (scaledIn(scalingWindow, cluster))
return Advice.dontScale("Won't autoscale now: Less than " + scalingWindow + " since last rescaling");
if (isDownscaling(bestAllocation.get(), currentAllocation) && scaledIn(scalingWindow.multipliedBy(3), cluster))
@@ -130,7 +131,7 @@ public class Autoscaler {
}
/** The duration of the window we need to consider to make a scaling decision. See also minimumMeasurementsPerNode */
- static Duration scalingWindow(ClusterSpec clusterSpec, Cluster cluster) {
+ private Duration scalingWindow(ClusterSpec clusterSpec, Cluster cluster) {
int completedEventCount = 0;
Duration totalDuration = Duration.ZERO;
for (ScalingEvent event : cluster.scalingEvents()) {
@@ -159,10 +160,15 @@ public class Autoscaler {
return Duration.ofHours(48);
}
- /** Measurements are currently taken once a minute. See also scalingWindow */
- static int minimumMeasurementsPerNode(ClusterSpec cluster) {
- if (cluster.isStateful()) return 60;
- return 4;
+ /** Returns the minimum number of measurements per node (on average) we require to give autoscaling advice. */
+ private int minimumMeasurementsPerNode(Duration scalingWindow) {
+ // Measurements are ideally taken every minute, but there are no guarantees
+ // (network issues, nodes may be down, collecting is single-threaded and may take longer than 1 minute to complete).
+ // Since the metric window is 5 minutes, we won't really improve from measuring more often than that:
+ long minimumMeasurements = scalingWindow.toMinutes() / 5;
+ minimumMeasurements = Math.round(0.8 * minimumMeasurements); // Allow 20% metrics collection blackout
+ if (minimumMeasurements < 1) minimumMeasurements = 1;
+ return (int)minimumMeasurements;
}
public static boolean stable(NodeList nodes, NodeRepository nodeRepository) {
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java
index d73f7985a7b..e1a3ceca033 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java
@@ -5,6 +5,7 @@ import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.applications.Cluster;
+import java.time.Instant;
import java.util.List;
import java.util.function.Predicate;
import java.util.stream.Collectors;
@@ -21,10 +22,9 @@ public class ClusterTimeseries {
/** The measurements for all nodes in this snapshot */
private final List<NodeTimeseries> allTimeseries;
- public ClusterTimeseries(Cluster cluster, NodeList clusterNodes, MetricsDb db, NodeRepository nodeRepository) {
+ public ClusterTimeseries(Instant startTime, Cluster cluster, NodeList clusterNodes, MetricsDb db) {
this.clusterNodes = clusterNodes;
- var timeseries = db.getNodeTimeseries(nodeRepository.clock().instant().minus(Autoscaler.scalingWindow(clusterNodes.clusterSpec(), cluster)),
- clusterNodes);
+ var timeseries = db.getNodeTimeseries(startTime, clusterNodes);
if (cluster.lastScalingEvent().isPresent())
timeseries = filter(timeseries, snapshot -> snapshot.generation() < 0 || // Content nodes do not yet send generation
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
index 5694f84c555..a217d97ac27 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
@@ -52,7 +52,7 @@ public class AutoscalingTest {
assertTrue("Too few measurements -> No change", tester.autoscale(application1, cluster1.id(), min, max).isEmpty());
tester.clock().advance(Duration.ofDays(1));
- tester.addCpuMeasurements(0.25f, 1f, 60, application1);
+ tester.addCpuMeasurements(0.25f, 1f, 120, application1);
ClusterResources scaledResources = tester.assertResources("Scaling up since resource usage is too high",
15, 1, 1.3, 28.6, 28.6,
tester.autoscale(application1, cluster1.id(), min, max).target());
@@ -76,7 +76,6 @@ public class AutoscalingTest {
tester.autoscale(application1, cluster1.id(), min, max).target());
var events = tester.nodeRepository().applications().get(application1).get().cluster(cluster1.id()).get().scalingEvents();
- events.forEach(e -> System.out.println(e));
}
/** We prefer fewer nodes for container clusters as (we assume) they all use the same disk and memory */