diff options
author | Valerij Fredriksen <freva@users.noreply.github.com> | 2021-03-17 21:40:29 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-03-17 21:40:29 +0100 |
commit | 903cfb2fe4875d8c189965fa898a30085088514c (patch) | |
tree | 31e087d9591834b9539f77e708168e26c16ce067 /node-repository | |
parent | 320330f55d4bc45a984750d2fd13965213065d74 (diff) | |
parent | dc5d6a5388052919bc9abf221a6a455a28f22574 (diff) |
Merge pull request #17019 from vespa-engine/bratseth/predict-in-scaling-window
Bratseth/predict in scaling window
Diffstat (limited to 'node-repository')
9 files changed, 222 insertions, 116 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java index c7549a5ddee..ac3430fecf9 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java @@ -86,7 +86,8 @@ public class Autoscaler { clusterTimeseries, clusterNodesTimeseries, currentAllocation, - application); + application, + nodeRepository.clock()); Optional<AllocatableClusterResources> bestAllocation = allocationOptimizer.findBestAllocation(target, currentAllocation, limits); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java index 3f5255c6618..e12e5442c52 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java @@ -3,10 +3,14 @@ package com.yahoo.vespa.hosted.provision.autoscale; import com.yahoo.config.provision.ClusterSpec; +import java.time.Clock; import java.time.Duration; +import java.time.Instant; import java.util.ArrayList; import java.util.Collections; import java.util.List; +import java.util.Optional; +import java.util.OptionalDouble; /** * A list of metric snapshots from a cluster, sorted by increasing time (newest last). 
@@ -43,16 +47,17 @@ public class ClusterTimeseries { return new ClusterTimeseries(cluster, list); } - /** The max query growth rate we can predict from this time-series as a fraction of the current traffic per minute */ - public double maxQueryGrowthRate() { + /** + * The max query growth rate we can predict from this time-series as a fraction of the average traffic in the window + */ + public double maxQueryGrowthRate(Duration window, Clock clock) { if (cachedMaxQueryGrowthRate != null) return cachedMaxQueryGrowthRate; - return cachedMaxQueryGrowthRate = computeMaxQueryGrowthRate(); + return cachedMaxQueryGrowthRate = computeMaxQueryGrowthRate(window, clock); } - private double computeMaxQueryGrowthRate() { + private double computeMaxQueryGrowthRate(Duration window, Clock clock) { if (snapshots.isEmpty()) return 0.1; - // Find the period having the highest growth rate, where total growth exceeds 30% increase double maxGrowthRate = 0; // In query rate per minute for (int start = 0; start < snapshots.size(); start++) { @@ -80,24 +85,40 @@ public class ClusterTimeseries { else return 0.0; // ... 
because load is stable } - if (currentQueryRate() == 0) return 0.1; // Growth not expressible as a fraction of the current rate - return maxGrowthRate / currentQueryRate(); + OptionalDouble queryRate = queryRate(window, clock); + if (queryRate.orElse(0) == 0) return 0.1; // Growth not expressible as a fraction of the current rate + return maxGrowthRate / queryRate.getAsDouble(); } - /** The current query rate as a fraction of the peak rate in this timeseries */ - public double currentQueryFractionOfMax() { + /** + * The current query rate, averaged over the same window we average utilization over, + * as a fraction of the peak rate in this timeseries + */ + public double queryFractionOfMax(Duration window, Clock clock) { if (snapshots.isEmpty()) return 0.5; var max = snapshots.stream().mapToDouble(ClusterMetricSnapshot::queryRate).max().getAsDouble(); if (max == 0) return 1.0; - return snapshots.get(snapshots.size() - 1).queryRate() / max; + var average = queryRate(window, clock); + if (average.isEmpty()) return 0.5; // No measurements in the relevant time period + return average.getAsDouble() / max; } - public double currentQueryRate() { - return queryRateAt(snapshots.size() - 1); + /** Returns the average query rate in the given window, or empty if there are no measurements in it */ + public OptionalDouble queryRate(Duration window, Clock clock) { + Instant oldest = clock.instant().minus(window); + return snapshots.stream() + .filter(snapshot -> ! snapshot.at().isBefore(oldest)) + .mapToDouble(snapshot -> snapshot.queryRate()) + .average(); } - public double currentWriteRate() { - return writeRateAt(snapshots.size() - 1); + /** Returns the average write rate in the given window, or empty if there are no measurements in it */ + public OptionalDouble writeRate(Duration window, Clock clock) { + Instant oldest = clock.instant().minus(window); + return snapshots.stream() + .filter(snapshot -> ! 
snapshot.at().isBefore(oldest)) + .mapToDouble(snapshot -> snapshot.writeRate()) + .average(); } private double queryRateAt(int index) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java index 35717b97cf4..9f6a4fc77cd 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java @@ -3,7 +3,9 @@ package com.yahoo.vespa.hosted.provision.autoscale; import com.yahoo.vespa.hosted.provision.applications.Application; +import java.time.Clock; import java.time.Duration; +import java.util.OptionalDouble; /** * A resource target to hit for the allocation optimizer. @@ -53,9 +55,10 @@ public class ResourceTarget { ClusterTimeseries clusterTimeseries, ClusterNodesTimeseries clusterNodesTimeseries, AllocatableClusterResources current, - Application application) { + Application application, + Clock clock) { return new ResourceTarget(nodeUsage(Resource.cpu, clusterNodesTimeseries.averageLoad(Resource.cpu), current) - / idealCpuLoad(scalingDuration, clusterTimeseries, application), + / idealCpuLoad(scalingDuration, clusterTimeseries, application, clock), nodeUsage(Resource.memory, clusterNodesTimeseries.averageLoad(Resource.memory), current) / Resource.memory.idealAverageLoad(), nodeUsage(Resource.disk, clusterNodesTimeseries.averageLoad(Resource.disk), current) @@ -74,14 +77,15 @@ public class ResourceTarget { /** Ideal cpu load must take the application traffic fraction into account */ public static double idealCpuLoad(Duration scalingDuration, ClusterTimeseries clusterTimeseries, - Application application) { - double queryCpuFraction = queryCpuFraction(clusterTimeseries); + Application application, + Clock clock) { + double queryCpuFraction = queryCpuFraction(clusterTimeseries, 
scalingDuration, clock); // What's needed to have headroom for growth during scale-up as a fraction of current resources? - double maxGrowthRate = clusterTimeseries.maxQueryGrowthRate(); // in fraction per minute of the current traffic + double maxGrowthRate = clusterTimeseries.maxQueryGrowthRate(scalingDuration, clock); // in fraction per minute of the current traffic double growthRateHeadroom = 1 + maxGrowthRate * scalingDuration.toMinutes(); // Cap headroom at 10% above the historical observed peak - double fractionOfMax = clusterTimeseries.currentQueryFractionOfMax(); + double fractionOfMax = clusterTimeseries.queryFractionOfMax(scalingDuration, clock); if (fractionOfMax != 0) growthRateHeadroom = Math.min(growthRateHeadroom, 1 / fractionOfMax + 0.1); @@ -103,11 +107,11 @@ public class ResourceTarget { (1 - queryCpuFraction) * idealWriteCpuLoad(); } - private static double queryCpuFraction(ClusterTimeseries clusterTimeseries) { - double queryRate = clusterTimeseries.currentQueryRate(); - double writeRate = clusterTimeseries.currentWriteRate(); - if (queryRate == 0 && writeRate == 0) return queryCpuFraction(0.5); - return queryCpuFraction(queryRate / (queryRate + writeRate)); + private static double queryCpuFraction(ClusterTimeseries clusterTimeseries, Duration scalingDuration, Clock clock) { + OptionalDouble queryRate = clusterTimeseries.queryRate(scalingDuration, clock); + OptionalDouble writeRate = clusterTimeseries.writeRate(scalingDuration, clock); + if (queryRate.orElse(0) == 0 && writeRate.orElse(0) == 0) return queryCpuFraction(0.5); + return queryCpuFraction(queryRate.orElse(0) / (queryRate.orElse(0) + writeRate.orElse(0))); } private static double queryCpuFraction(double queryFraction) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java index 4d1c963d8ea..8d8d7e01049 100644 --- 
a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java @@ -17,6 +17,7 @@ import com.yahoo.vespa.hosted.provision.autoscale.Resource; import com.yahoo.vespa.hosted.provision.autoscale.ResourceTarget; import java.net.URI; +import java.time.Clock; import java.time.Duration; import java.util.Collection; import java.util.List; @@ -71,12 +72,12 @@ public class ApplicationSerializer { if (cluster.shouldSuggestResources(currentResources)) cluster.suggestedResources().ifPresent(suggested -> toSlime(suggested.resources(), clusterObject.setObject("suggested"))); cluster.targetResources().ifPresent(target -> toSlime(target, clusterObject.setObject("target"))); - clusterUtilizationToSlime(application, scalingDuration, clusterTimeseries, clusterNodesTimeseries, clusterObject.setObject("utilization")); + clusterUtilizationToSlime(application, scalingDuration, clusterTimeseries, clusterNodesTimeseries, metricsDb.clock(), clusterObject.setObject("utilization")); scalingEventsToSlime(cluster.scalingEvents(), clusterObject.setArray("scalingEvents")); clusterObject.setString("autoscalingStatus", cluster.autoscalingStatus()); clusterObject.setLong("scalingDuration", scalingDuration.toMillis()); - clusterObject.setDouble("maxQueryGrowthRate", clusterTimeseries.maxQueryGrowthRate()); - clusterObject.setDouble("currentQueryFractionOfMax", clusterTimeseries.currentQueryFractionOfMax()); + clusterObject.setDouble("maxQueryGrowthRate", clusterTimeseries.maxQueryGrowthRate(scalingDuration, metricsDb.clock())); + clusterObject.setDouble("currentQueryFractionOfMax", clusterTimeseries.queryFractionOfMax(scalingDuration, metricsDb.clock())); } private static void toSlime(ClusterResources resources, Cursor clusterResourcesObject) { @@ -89,9 +90,10 @@ public class ApplicationSerializer { Duration scalingDuration, ClusterTimeseries clusterTimeseries, 
ClusterNodesTimeseries clusterNodesTimeseries, + Clock clock, Cursor utilizationObject) { utilizationObject.setDouble("cpu", clusterNodesTimeseries.averageLoad(Resource.cpu)); - utilizationObject.setDouble("idealCpu", ResourceTarget.idealCpuLoad(scalingDuration, clusterTimeseries, application)); + utilizationObject.setDouble("idealCpu", ResourceTarget.idealCpuLoad(scalingDuration, clusterTimeseries, application, clock)); utilizationObject.setDouble("memory", clusterNodesTimeseries.averageLoad(Resource.memory)); utilizationObject.setDouble("idealMemory", ResourceTarget.idealMemoryLoad()); utilizationObject.setDouble("disk", clusterNodesTimeseries.averageLoad(Resource.disk)); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java index 650bfe761b5..17ae36b3636 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java @@ -53,10 +53,11 @@ public class AutoscalingTest { assertTrue("Too few measurements -> No change", tester.autoscale(application1, cluster1.id(), min, max).isEmpty()); tester.clock().advance(Duration.ofDays(1)); - tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.addCpuMeasurements(0.25f, 1f, 120, application1); + tester.clock().advance(Duration.ofMinutes(-10 * 5)); + tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 
20.0 : 10.0); // Query traffic only ClusterResources scaledResources = tester.assertResources("Scaling up since resource usage is too high", - 14, 1, 1.4, 30.8, 30.8, + 15, 1, 1.2, 28.6, 28.6, tester.autoscale(application1, cluster1.id(), min, max).target()); tester.deploy(application1, cluster1, scaledResources); @@ -66,15 +67,21 @@ public class AutoscalingTest { tester.clock().advance(Duration.ofDays(2)); tester.addCpuMeasurements(0.8f, 1f, 3, application1); + tester.clock().advance(Duration.ofMinutes(-10 * 5)); + tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only assertTrue("Load change is large, but insufficient measurements for new config -> No change", tester.autoscale(application1, cluster1.id(), min, max).isEmpty()); tester.addCpuMeasurements(0.19f, 1f, 100, application1); + tester.clock().advance(Duration.ofMinutes(-10 * 5)); + tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only assertEquals("Load change is small -> No change", Optional.empty(), tester.autoscale(application1, cluster1.id(), min, max).target()); tester.addCpuMeasurements(0.1f, 1f, 120, application1); + tester.clock().advance(Duration.ofMinutes(-10 * 5)); + tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.assertResources("Scaling down to minimum since usage has gone down significantly", - 15, 1, 1.0, 28.6, 28.6, + 7, 1, 1.0, 66.7, 66.7, tester.autoscale(application1, cluster1.id(), min, max).target()); var events = tester.nodeRepository().applications().get(application1).get().cluster(cluster1.id()).get().scalingEvents(); @@ -93,9 +100,9 @@ public class AutoscalingTest { // deploy tester.deploy(application1, cluster1, 5, 1, resources); - tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 
20.0 : 10.0); // Query traffic only - tester.addCpuMeasurements(0.25f, 1f, 120, application1); + tester.clock().advance(Duration.ofMinutes(-10 * 5)); + tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only ClusterResources scaledResources = tester.assertResources("Scaling up since cpu usage is too high", 7, 1, 2.5, 80.0, 80.0, tester.autoscale(application1, cluster1.id(), min, max).target()); @@ -104,6 +111,8 @@ public class AutoscalingTest { tester.deactivateRetired(application1, cluster1, scaledResources); tester.addCpuMeasurements(0.1f, 1f, 120, application1); + tester.clock().advance(Duration.ofMinutes(-10 * 5)); + tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.assertResources("Scaling down since cpu usage has gone down", 4, 1, 2.5, 68.6, 68.6, tester.autoscale(application1, cluster1.id(), min, max).target()); @@ -123,7 +132,7 @@ public class AutoscalingTest { .allMatch(n -> n.allocation().get().requestedResources().diskSpeed() == NodeResources.DiskSpeed.slow); tester.clock().advance(Duration.ofDays(2)); - tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only + tester.addQueryRateMeasurements(application1, cluster1.id(), 100, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.addCpuMeasurements(0.25f, 1f, 120, application1); // Changing min and max from slow to any ClusterResources min = new ClusterResources( 2, 1, @@ -183,8 +192,9 @@ public class AutoscalingTest { // deploy tester.deploy(application1, cluster1, 5, 1, new NodeResources(1.9, 70, 70, 1)); - tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 
20.0 : 10.0); // Query traffic only tester.addMeasurements(0.25f, 0.95f, 0.95f, 0, 120, application1); + tester.clock().advance(Duration.ofMinutes(-10 * 5)); + tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.assertResources("Scaling up to limit since resource usage is too high", 6, 1, 2.4, 78.0, 79.0, tester.autoscale(application1, cluster1.id(), min, max).target()); @@ -220,8 +230,9 @@ public class AutoscalingTest { // deploy tester.deploy(application1, cluster1, 5, 5, new NodeResources(3.0, 10, 10, 1)); - tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.addCpuMeasurements( 0.3f, 1f, 240, application1); + tester.clock().advance(Duration.ofMinutes(-10 * 5)); + tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.assertResources("Scaling up since resource usage is too high", 6, 6, 3.6, 8.0, 10.0, tester.autoscale(application1, cluster1.id(), min, max).target()); @@ -256,8 +267,9 @@ public class AutoscalingTest { // deploy tester.deploy(application1, cluster1, 5, 1, resources); - tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.addCpuMeasurements(0.25f, 1f, 120, application1); + tester.clock().advance(Duration.ofMinutes(-10 * 5)); + tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.assertResources("Scaling up since resource usage is too high", 7, 1, 2.5, 80.0, 80.0, tester.suggest(application1, cluster1.id(), min, max).target()); @@ -309,8 +321,9 @@ public class AutoscalingTest { // deploy tester.deploy(application1, cluster1, 5, 5, resources); - tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 
20.0 : 10.0); // Query traffic only tester.addCpuMeasurements(0.25f, 1f, 120, application1); + tester.clock().advance(Duration.ofMinutes(-10 * 5)); + tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.assertResources("Scaling up since resource usage is too high", 7, 7, 2.5, 80.0, 80.0, tester.autoscale(application1, cluster1.id(), min, max).target()); @@ -328,8 +341,9 @@ public class AutoscalingTest { // deploy tester.deploy(application1, cluster1, 6, 2, resources); - tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.addCpuMeasurements(0.25f, 1f, 120, application1); + tester.clock().advance(Duration.ofMinutes(-10 * 5)); + tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.assertResources("Scaling up since resource usage is too high, changing to 1 group is cheaper", 8, 1, 2.7, 83.3, 83.3, tester.autoscale(application1, cluster1.id(), min, max).target()); @@ -348,10 +362,11 @@ public class AutoscalingTest { // deploy tester.deploy(application1, cluster1, 6, 2, new NodeResources(10, 100, 100, 1)); tester.clock().advance(Duration.ofDays(1)); - tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.addMemMeasurements(1.0f, 1f, 1000, application1); + tester.clock().advance(Duration.ofMinutes(-10 * 5)); + tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 
20.0 : 10.0); // Query traffic only tester.assertResources("Increase group size to reduce memory load", - 8, 2, 13.6, 89.3, 62.5, + 8, 2, 12.4, 89.3, 62.5, tester.autoscale(application1, cluster1.id(), min, max).target()); } @@ -368,7 +383,7 @@ public class AutoscalingTest { // deploy tester.deploy(application1, cluster1, 6, 1, hostResources.withVcpu(hostResources.vcpu() / 2)); tester.clock().advance(Duration.ofDays(2)); - tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only + tester.addQueryRateMeasurements(application1, cluster1.id(), 100, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.addMemMeasurements(0.02f, 0.95f, 120, application1); tester.assertResources("Scaling down", 6, 1, 2.9, 4.0, 95.0, @@ -386,17 +401,20 @@ public class AutoscalingTest { ClusterSpec cluster1 = tester.clusterSpec(ClusterSpec.Type.content, "cluster1"); tester.deploy(application1, cluster1, 6, 1, hostResources.withVcpu(hostResources.vcpu() / 2)); - tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only // No autoscaling as it is too soon to scale down after initial deploy (counting as a scaling event) tester.addMemMeasurements(0.02f, 0.95f, 120, application1); + tester.clock().advance(Duration.ofMinutes(-10 * 5)); + tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only assertTrue(tester.autoscale(application1, cluster1.id(), min, max).target().isEmpty()); // Trying the same later causes autoscaling tester.clock().advance(Duration.ofDays(2)); tester.addMemMeasurements(0.02f, 0.95f, 120, application1); + tester.clock().advance(Duration.ofMinutes(-10 * 5)); + tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 
20.0 : 10.0); // Query traffic only tester.assertResources("Scaling down", - 6, 1, 2.9, 4.0, 95.0, + 6, 1, 1.4, 4.0, 95.0, tester.autoscale(application1, cluster1.id(), min, max).target()); } @@ -413,10 +431,11 @@ public class AutoscalingTest { ClusterSpec cluster1 = tester.clusterSpec(ClusterSpec.Type.content, "cluster1"); tester.deploy(application1, cluster1, min); - tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.addMeasurements(1.0f, 1.0f, 0.7f, 0, 1000, application1); + tester.clock().advance(Duration.ofMinutes(-10 * 5)); + tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.assertResources("Scaling up", - 4, 1, 7.4, 20, 200, + 4, 1, 6.7, 20, 200, tester.autoscale(application1, cluster1.id(), min, max).target()); } @@ -427,10 +446,11 @@ public class AutoscalingTest { ClusterSpec cluster1 = tester.clusterSpec(ClusterSpec.Type.content, "cluster1"); tester.deploy(application1, cluster1, min); - tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.addMeasurements(1.0f, 1.0f, 0.7f, 0, 1000, application1); + tester.clock().advance(Duration.ofMinutes(-10 * 5)); + tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.assertResources("Scaling up", - 4, 1, 7.4, 34, 200, + 4, 1, 6.7, 34, 200, tester.autoscale(application1, cluster1.id(), min, max).target()); } } @@ -467,10 +487,11 @@ public class AutoscalingTest { tester.deactivateRetired(application1, cluster1, scaledResources); tester.clock().advance(Duration.ofDays(2)); - tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 
20.0 : 10.0); // Query traffic only tester.addMemMeasurements(0.3f, 0.6f, 1000, application1); + tester.clock().advance(Duration.ofMinutes(-10 * 5)); + tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.assertResources("Scaling down since resource usage has gone down", - 6, 1, 3, 83, 28.8, + 5, 1, 3, 83, 36.0, tester.autoscale(application1, cluster1.id(), min, max).target()); } @@ -485,8 +506,9 @@ public class AutoscalingTest { ClusterSpec cluster1 = tester.clusterSpec(ClusterSpec.Type.container, "cluster1"); tester.deploy(application1, cluster1, 5, 1, resources); - tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only - tester.addCpuMeasurements(0.25f, 1f, 120, application1); + tester.addQueryRateMeasurements(application1, cluster1.id(), 100, t -> t == 0 ? 20.0 : 10.0); // Query traffic only + tester.clock().advance(Duration.ofMinutes(-100 * 5)); + tester.addCpuMeasurements(0.25f, 1f, 100, application1); // (no read share stored) tester.assertResources("Advice to scale up since we set aside for bcp by default", @@ -507,29 +529,34 @@ public class AutoscalingTest { @Test public void test_autoscaling_considers_growth_rate() { - NodeResources resources = new NodeResources(3, 100, 100, 1); - ClusterResources min = new ClusterResources( 1, 1, resources); - ClusterResources max = new ClusterResources(10, 1, resources); - AutoscalingTester tester = new AutoscalingTester(resources.withVcpu(resources.vcpu() * 2)); + NodeResources minResources = new NodeResources( 1, 100, 100, 1); + NodeResources midResources = new NodeResources( 5, 100, 100, 1); + NodeResources maxResources = new NodeResources(10, 100, 100, 1); + ClusterResources min = new ClusterResources(5, 1, minResources); + ClusterResources max = new ClusterResources(5, 1, maxResources); + AutoscalingTester tester = new AutoscalingTester(maxResources.withVcpu(maxResources.vcpu() * 2)); 
ApplicationId application1 = tester.applicationId("application1"); ClusterSpec cluster1 = tester.clusterSpec(ClusterSpec.Type.container, "cluster1"); - tester.deploy(application1, cluster1, 5, 1, resources); - tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only - tester.addCpuMeasurements(0.25f, 1f, 120, application1); + tester.deploy(application1, cluster1, 5, 1, midResources); + tester.addQueryRateMeasurements(application1, cluster1.id(), 100, t -> t == 0 ? 20.0 : 10.0); + tester.clock().advance(Duration.ofMinutes(-100 * 5)); + tester.addCpuMeasurements(0.25f, 1f, 100, application1); // (no query rate data) - tester.assertResources("Advice to scale up since we assume we need 2x cpu for growth when no data", - 7, 1, 3, 100, 100, + tester.assertResources("Scale up since we assume we need 2x cpu for growth when no data scaling time data", + 5, 1, 6.3, 100, 100, tester.autoscale(application1, cluster1.id(), min, max).target()); tester.setScalingDuration(application1, cluster1.id(), Duration.ofMinutes(5)); tester.addQueryRateMeasurements(application1, cluster1.id(), 100, t -> 10.0 + (t < 50 ? t : 100 - t)); - tester.assertResources("Advice to scale down since observed growth is much slower than scaling time", - 4, 1, 3, 100, 100, + tester.clock().advance(Duration.ofMinutes(-100 * 5)); + tester.addCpuMeasurements(0.25f, 1f, 100, application1); + tester.assertResources("Scale down since observed growth is slower than scaling time", + 5, 1, 3.4, 100, 100, tester.autoscale(application1, cluster1.id(), min, max).target()); tester.clearQueryRateMeasurements(application1, cluster1.id()); @@ -538,8 +565,10 @@ public class AutoscalingTest { tester.addQueryRateMeasurements(application1, cluster1.id(), 100, t -> 10.0 + (t < 50 ? 
t * t * t : 125000 - (t - 49) * (t - 49) * (t - 49))); - tester.assertResources("Advice to scale up since observed growth is much faster than scaling time", - 10, 1, 3, 100, 100, + tester.clock().advance(Duration.ofMinutes(-100 * 5)); + tester.addCpuMeasurements(0.25f, 1f, 100, application1); + tester.assertResources("Scale up since observed growth is faster than scaling time", + 5, 1, 10.0, 100, 100, tester.autoscale(application1, cluster1.id(), min, max).target()); } @@ -561,27 +590,37 @@ public class AutoscalingTest { // Why twice the query rate at time = 0? // This makes headroom for queries doubling, which we want to observe the effect of here - tester.addLoadMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0, t -> 10.0); + tester.addCpuMeasurements(0.4f, 1f, 100, application1); + tester.clock().advance(Duration.ofMinutes(-100 * 5)); + tester.addLoadMeasurements(application1, cluster1.id(), 100, t -> t == 0 ? 20.0 : 10.0, t -> 10.0); tester.assertResources("Query and write load is equal -> scale up somewhat", 5, 1, 7.3, 100, 100, tester.autoscale(application1, cluster1.id(), min, max).target()); - tester.addLoadMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 100.0 : 50.0, t -> 10.0); - tester.assertResources("Query load is 5x write load -> scale up more", - 5, 1, 9.7, 100, 100, + tester.addCpuMeasurements(0.4f, 1f, 100, application1); + tester.clock().advance(Duration.ofMinutes(-100 * 5)); + tester.addLoadMeasurements(application1, cluster1.id(), 100, t -> t == 0 ? 80.0 : 40.0, t -> 10.0); + tester.assertResources("Query load is 4x write load -> scale up more", + 5, 1, 9.5, 100, 100, tester.autoscale(application1, cluster1.id(), min, max).target()); - tester.addLoadMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 
20.0 : 10.0, t -> 100.0); + tester.addCpuMeasurements(0.4f, 1f, 100, application1); + tester.clock().advance(Duration.ofMinutes(-100 * 5)); + tester.addLoadMeasurements(application1, cluster1.id(), 100, t -> t == 0 ? 20.0 : 10.0, t -> 100.0); tester.assertResources("Write load is 10x query load -> scale down", 5, 1, 3.8, 100, 100, tester.autoscale(application1, cluster1.id(), min, max).target()); - tester.addLoadMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0, t-> 0.0); + tester.addCpuMeasurements(0.4f, 1f, 100, application1); + tester.clock().advance(Duration.ofMinutes(-100 * 5)); + tester.addLoadMeasurements(application1, cluster1.id(), 100, t -> t == 0 ? 20.0 : 10.0, t-> 0.0); tester.assertResources("Query only -> largest possible", 5, 1, 10.0, 100, 100, tester.autoscale(application1, cluster1.id(), min, max).target()); - tester.addLoadMeasurements(application1, cluster1.id(), 10, t -> 0.0, t -> 10.0); + tester.addCpuMeasurements(0.4f, 1f, 100, application1); + tester.clock().advance(Duration.ofMinutes(-100 * 5)); + tester.addLoadMeasurements(application1, cluster1.id(), 100, t -> 0.0, t -> 10.0); tester.assertResources("Write only -> smallest possible", 5, 1, 2.1, 100, 100, tester.autoscale(application1, cluster1.id(), min, max).target()); @@ -597,9 +636,9 @@ public class AutoscalingTest { ClusterSpec cluster1 = tester.clusterSpec(ClusterSpec.Type.container, "cluster1"); tester.deploy(application1, cluster1, 2, 1, resources); - tester.addCpuMeasurements(0.5f, 1f, 10, application1); tester.addQueryRateMeasurements(application1, cluster1.id(), 500, t -> 0.0); + tester.addCpuMeasurements(0.5f, 1f, 10, application1); tester.assertResources("Advice to scale up since observed growth is much faster than scaling time", 3, 1, 1, 4, 50, diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java index 
e24146d4752..cc3eeb47073 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java @@ -138,7 +138,7 @@ class AutoscalingTester { NodeList nodes = nodeRepository().nodes().list(Node.State.active).owner(applicationId); float oneExtraNodeFactor = (float)(nodes.size() - 1.0) / (nodes.size()); for (int i = 0; i < count; i++) { - clock().advance(Duration.ofMinutes(1)); + clock().advance(Duration.ofMinutes(5)); for (Node node : nodes) { float cpu = value * oneExtraNodeFactor; float memory = (float) Resource.memory.idealAverageLoad() * otherResourcesLoad * oneExtraNodeFactor; @@ -241,10 +241,9 @@ class AutoscalingTester { int measurements, IntFunction<Double> queryRate, IntFunction<Double> writeRate) { - Instant time = clock().instant(); for (int i = 0; i < measurements; i++) { - db.addClusterMetrics(application, Map.of(cluster, new ClusterMetricSnapshot(time, queryRate.apply(i), writeRate.apply(i)))); - time = time.plus(Duration.ofMinutes(5)); + db.addClusterMetrics(application, Map.of(cluster, new ClusterMetricSnapshot(clock().instant(), queryRate.apply(i), writeRate.apply(i)))); + clock().advance(Duration.ofMinutes(5)); } } @@ -253,10 +252,9 @@ class AutoscalingTester { ClusterSpec.Id cluster, int measurements, IntFunction<Double> queryRate) { - Instant time = clock().instant(); for (int i = 0; i < measurements; i++) { - db.addClusterMetrics(application, Map.of(cluster, new ClusterMetricSnapshot(time, queryRate.apply(i), 0.0))); - time = time.plus(Duration.ofMinutes(5)); + db.addClusterMetrics(application, Map.of(cluster, new ClusterMetricSnapshot(clock().instant(), queryRate.apply(i), 0.0))); + clock().advance(Duration.ofMinutes(5)); } } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseriesTest.java 
b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseriesTest.java index 9a08e7b3279..c56e65ebbba 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseriesTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseriesTest.java @@ -22,22 +22,23 @@ public class ClusterTimeseriesTest { @Test public void test_empty() { + ManualClock clock = new ManualClock(); var timeseries = new ClusterTimeseries(cluster, List.of()); - assertEquals(0.1, timeseries.maxQueryGrowthRate(), delta); + assertEquals(0.1, timeseries.maxQueryGrowthRate(Duration.ofMinutes(5), clock), delta); } @Test public void test_constant_rate_short() { var clock = new ManualClock(); var timeseries = new ClusterTimeseries(cluster, queryRate(10, clock, t -> 50.0)); - assertEquals(0.1, timeseries.maxQueryGrowthRate(), delta); + assertEquals(0.1, timeseries.maxQueryGrowthRate(Duration.ofMinutes(5), clock), delta); } @Test public void test_constant_rate_long() { var clock = new ManualClock(); var timeseries = new ClusterTimeseries(cluster, queryRate(10000, clock, t -> 50.0)); - assertEquals(0.0, timeseries.maxQueryGrowthRate(), delta); + assertEquals(0.0, timeseries.maxQueryGrowthRate(Duration.ofMinutes(5), clock), delta); } @Test @@ -47,7 +48,7 @@ public class ClusterTimeseriesTest { snapshots.addAll(queryRate(1000, clock, t -> 50.0)); snapshots.addAll(queryRate(10, clock, t -> 400.0)); snapshots.addAll(queryRate(1000, clock, t -> 50.0)); - assertEquals((400-50)/5.0/50.0, new ClusterTimeseries(cluster, snapshots).maxQueryGrowthRate(), delta); + assertEquals((400-50)/5.0/50.0, new ClusterTimeseries(cluster, snapshots).maxQueryGrowthRate(Duration.ofMinutes(5), clock), delta); } @Test @@ -61,7 +62,7 @@ public class ClusterTimeseriesTest { snapshots.addAll(queryRate(1000, clock, t -> 50.0)); snapshots.addAll(queryRate(10, clock, t -> 800.0)); snapshots.addAll(queryRate(1000, clock, t -> 
50.0)); - assertEquals((800-50)/5.0/50.0, new ClusterTimeseries(cluster, snapshots).maxQueryGrowthRate(), delta); + assertEquals((800-50)/5.0/50.0, new ClusterTimeseries(cluster, snapshots).maxQueryGrowthRate(Duration.ofMinutes(5), clock), delta); } @Test @@ -70,7 +71,7 @@ public class ClusterTimeseriesTest { var snapshots = new ArrayList<ClusterMetricSnapshot>(); snapshots.addAll(queryRate(100, clock, t -> (double)t)); snapshots.addAll(queryRate(100, clock, t -> 100.0 - t)); - assertEquals(1/5.0, new ClusterTimeseries(cluster, snapshots).maxQueryGrowthRate(), delta); + assertEquals(1/5.0, new ClusterTimeseries(cluster, snapshots).maxQueryGrowthRate(Duration.ofMinutes(5), clock), delta); } @Test @@ -78,7 +79,7 @@ public class ClusterTimeseriesTest { var clock = new ManualClock(); var timeseries = new ClusterTimeseries(cluster, queryRate(10000, clock, t -> 10.0 + 100.0 * Math.sin(t))); - assertEquals(0.26, timeseries.maxQueryGrowthRate(), delta); + assertEquals(0.26, timeseries.maxQueryGrowthRate(Duration.ofMinutes(5), clock), delta); } @Test @@ -86,7 +87,7 @@ public class ClusterTimeseriesTest { var clock = new ManualClock(); var timeseries = new ClusterTimeseries(cluster, queryRate(10000, clock, t -> 1000.0 + 10.0 * Math.sin(t))); - assertEquals(0.0, timeseries.maxQueryGrowthRate(), delta); + assertEquals(0.0, timeseries.maxQueryGrowthRate(Duration.ofMinutes(5), clock), delta); } @Test @@ -94,7 +95,7 @@ public class ClusterTimeseriesTest { var clock = new ManualClock(); var timeseries = new ClusterTimeseries(cluster, queryRate(10000, clock, t -> 10.0 + 100.0 * Math.sin(t) + 80.0 * Math.sin(10 * t)) ); - assertEquals(1.765, timeseries.maxQueryGrowthRate(), delta); + assertEquals(1.765, timeseries.maxQueryGrowthRate(Duration.ofMinutes(5), clock), delta); } private List<ClusterMetricSnapshot> queryRate(int count, ManualClock clock, IntFunction<Double> rate) { diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTargetTest.java 
b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTargetTest.java index f616e3e8b9d..6f60de62f1f 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTargetTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTargetTest.java @@ -28,48 +28,79 @@ public class ResourceTargetTest { @Test public void test_traffic_headroom() { + ManualClock clock = new ManualClock(); Application application = Application.empty(ApplicationId.from("t1", "a1", "i1")); - Cluster cluster = new Cluster(ClusterSpec.Id.from("test"), - false, - new ClusterResources(5, 1, new NodeResources(1, 10, 100, 1)), - new ClusterResources(5, 1, new NodeResources(1, 10, 100, 1)), - Optional.empty(), - Optional.empty(), - List.of(), - ""); + Cluster cluster = cluster(new NodeResources(1, 10, 100, 1)); application = application.with(cluster); - // No current traffic: Ideal load is low but capped + // No current traffic share: Ideal load is low but capped application = application.with(new Status(0.0, 1.0)); assertEquals(0.131, ResourceTarget.idealCpuLoad(Duration.ofMinutes(10), - new ClusterTimeseries(cluster.id(), - loadSnapshots(100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0)), - application), + timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0, clock), + application, + clock), delta); - // Almost current traffic: Ideal load is low but capped + // Almost no current traffic share: Ideal load is low but capped application = application.with(new Status(0.0001, 1.0)); assertEquals(0.131, ResourceTarget.idealCpuLoad(Duration.ofMinutes(10), - new ClusterTimeseries(cluster.id(), - loadSnapshots(100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0)), - application), + timeseries(cluster,100, t -> t == 0 ? 
10000.0 : 0.0, t -> 0.0, clock), + application, + clock), delta); } + @Test + public void test_growth_headroom() { + ManualClock clock = new ManualClock(); + + Application application = Application.empty(ApplicationId.from("t1", "a1", "i1")); + Cluster cluster = cluster(new NodeResources(1, 10, 100, 1)); + application = application.with(cluster); + + // No current traffic: Ideal load is low but capped + assertEquals(0.275, + ResourceTarget.idealCpuLoad(Duration.ofMinutes(10), + timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0, clock), + application, + clock), + delta); + + // Almost current traffic: Ideal load is low but capped + application = application.with(new Status(0.0001, 1.0)); + assertEquals(0.04, + ResourceTarget.idealCpuLoad(Duration.ofMinutes(10), + timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0001, t -> 0.0, clock), + application, + clock), + delta); + } + + private Cluster cluster(NodeResources resources) { + return new Cluster(ClusterSpec.Id.from("test"), + false, + new ClusterResources(5, 1, resources), + new ClusterResources(5, 1, resources), + Optional.empty(), + Optional.empty(), + List.of(), + ""); + } /** Creates the given number of measurements, spaced 5 minutes between, using the given function */ - private List<ClusterMetricSnapshot> loadSnapshots(int measurements, - IntFunction<Double> queryRate, - IntFunction<Double> writeRate) { + private ClusterTimeseries timeseries(Cluster cluster, + int measurements, + IntFunction<Double> queryRate, + IntFunction<Double> writeRate, + ManualClock clock) { List<ClusterMetricSnapshot> snapshots = new ArrayList<>(measurements); - ManualClock clock = new ManualClock(); for (int i = 0; i < measurements; i++) { snapshots.add(new ClusterMetricSnapshot(clock.instant(), queryRate.apply(i), writeRate.apply(i))); clock.advance(Duration.ofMinutes(5)); } - return snapshots; + return new ClusterTimeseries(cluster.id(),snapshots); } } diff --git 
a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java index f292ab8ccf1..b0b3f8c3ed8 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java @@ -161,10 +161,19 @@ public class AutoscalingMaintainerTest { tester.clock().advance(Duration.ofDays(1)); - if (i % 2 == 0) // high load - tester.addMeasurements(0.9f, 0.9f, 0.9f, i, 200, app1); - else // low load - tester.addMeasurements(0.1f, 0.1f, 0.1f, i, 200, app1); + if (i % 2 == 0) { // high load + for (int j = 0; j < 200; j++ ) { + tester.addMeasurements(0.99f, 0.99f, 0.99f, i, 1, app1); + tester.clock().advance(Duration.ofMinutes(1)); + } + } + else { // low load + for (int j = 0; j < 200; j++ ) { + tester.addMeasurements(0.2f, 0.2f, 0.2f, i, 1, app1); + tester.clock().advance(Duration.ofMinutes(1)); + } + } + tester.addQueryRateMeasurements(app1, cluster1.id(), 2, t -> (t == 0 ? 20.0 : 10.0 )); tester.maintainer().maintain(); } |