diff options
author | Jon Bratseth <bratseth@gmail.com> | 2022-12-12 15:32:12 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-12-12 15:32:12 +0100 |
commit | 6cba335640ae35e45f87d7566bc339ef6eb2c235 (patch) | |
tree | 1e2b60412b39f9399ae5c441e938652381928a05 | |
parent | 9cb4add47c48a264b9d204f59ca73d3082c2b490 (diff) | |
parent | 2d393d038aab7b2b438f04cf01e6202e0090a4ea (diff) |
Merge pull request #25216 from vespa-engine/bratseth/autoscaling-noise
Downweight traffic headroom when traffic is low
10 files changed, 106 insertions, 59 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java index 0facc6d37ea..1928a784763 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java @@ -57,6 +57,7 @@ public class ClusterModel { // Lazily initialized members private Double queryFractionOfMax = null; private Double maxQueryGrowthRate = null; + private OptionalDouble averageQueryRate = null; public ClusterModel(Zone zone, Application application, @@ -131,19 +132,25 @@ public class ClusterModel { /** * Returns the predicted max query growth rate per minute as a fraction of the average traffic - * in the scaling window + * in the scaling window. */ public double maxQueryGrowthRate() { if (maxQueryGrowthRate != null) return maxQueryGrowthRate; return maxQueryGrowthRate = clusterTimeseries().maxQueryGrowthRate(scalingDuration(), clock); } - /** Returns the average query rate in the scaling window as a fraction of the max observed query rate */ + /** Returns the average query rate in the scaling window as a fraction of the max observed query rate. */ public double queryFractionOfMax() { if (queryFractionOfMax != null) return queryFractionOfMax; return queryFractionOfMax = clusterTimeseries().queryFractionOfMax(scalingDuration(), clock); } + /** Returns the average query rate in the scaling window. */ + public OptionalDouble averageQueryRate() { + if (averageQueryRate != null) return averageQueryRate; + return averageQueryRate = clusterTimeseries().queryRate(scalingDuration(), clock); + } + /** Returns the average of the last load measurement from each node. */ public Load currentLoad() { return nodeTimeseries().currentLoad(); } @@ -239,7 +246,8 @@ public class ClusterModel { // Cap headroom at 10% above the historical observed peak if (queryFractionOfMax() != 0) growthRateHeadroom = Math.min(growthRateHeadroom, 1 / queryFractionOfMax() + 0.1); - return growthRateHeadroom; + + return adjustByConfidence(growthRateHeadroom); } /** @@ -255,15 +263,23 @@ public class ClusterModel { trafficShiftHeadroom = 1/application.status().maxReadShare(); else trafficShiftHeadroom = application.status().maxReadShare() / application.status().currentReadShare(); - return Math.min(trafficShiftHeadroom, 1/application.status().maxReadShare()); + return adjustByConfidence(Math.min(trafficShiftHeadroom, 1/application.status().maxReadShare())); + } + + /** + * Headroom values are a multiplier of the current query rate. + * Adjust this value closer to 1 if the query rate is too low to derive statistical conclusions + * with high confidence to avoid large adjustments caused by random noise due to low traffic numbers. + */ + private double adjustByConfidence(double headroom) { + return ( (headroom -1 ) * Math.min(1, averageQueryRate().orElse(0) / 100.0) ) + 1; } /** The estimated fraction of cpu usage which goes to processing queries vs. writes */ public double queryCpuFraction() { - OptionalDouble queryRate = clusterTimeseries().queryRate(scalingDuration(), clock); OptionalDouble writeRate = clusterTimeseries().writeRate(scalingDuration(), clock); - if (queryRate.orElse(0) == 0 && writeRate.orElse(0) == 0) return queryCpuFraction(0.5); - return queryCpuFraction(queryRate.orElse(0) / (queryRate.orElse(0) + writeRate.orElse(0))); + if (averageQueryRate().orElse(0) == 0 && writeRate.orElse(0) == 0) return queryCpuFraction(0.5); + return queryCpuFraction(averageQueryRate().orElse(0) / (averageQueryRate().orElse(0) + writeRate.orElse(0))); } private double queryCpuFraction(double queryRateFraction) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java index 40bad7022d6..1a3ac17c7ef 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java @@ -74,7 +74,8 @@ public class GroupPreparer { public PrepareResult prepare(ApplicationId application, ClusterSpec cluster, NodeSpec requestedNodes, List<Node> surplusActiveNodes, NodeIndices indices, int wantedGroups, NodesAndHosts<LockedNodeList> allNodesAndHosts) { - log.log(Level.FINE, () -> "Preparing " + cluster.type().name() + " " + cluster.id() + " with requested resources " + requestedNodes.resources().orElse(NodeResources.unspecified())); + log.log(Level.FINE, () -> "Preparing " + cluster.type().name() + " " + cluster.id() + " with requested resources " + + requestedNodes.resources().orElse(NodeResources.unspecified())); // Try preparing in memory without global unallocated lock. Most of the time there should be no changes, // and we can return nodes previously allocated. NodeAllocation probeAllocation = prepareAllocation(application, cluster, requestedNodes, surplusActiveNodes, diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java index eda677c6e59..f6c393a6f4d 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java @@ -457,7 +457,7 @@ public class AutoscalingTest { fixture.tester().clock().advance(Duration.ofDays(2)); Duration timePassed = fixture.loader().addCpuMeasurements(0.25, 120); fixture.tester().clock().advance(timePassed.negated()); - fixture.loader().addLoadMeasurements(10, t -> t == 0 ? 20.0 : 10.0, t -> 1.0); + fixture.loader().addLoadMeasurements(10, t -> t == 0 ? 200.0 : 100.0, t -> 10.0); fixture.tester().assertResources("Scaling up cpu, others down, changing to 1 group is cheaper", 8, 1, 2.8, 36.2, 56.4, fixture.autoscale()); @@ -496,7 +496,7 @@ public class AutoscalingTest { fixture.tester().clock().advance(Duration.ofDays(1)); fixture.loader().applyMemLoad(1.0, 1000); fixture.tester().assertResources("Increase group size to reduce memory load", - 8, 2, 4.5, 97.1, 74.7, + 8, 2, 13.9, 97.1, 66.6, fixture.autoscale()); } @@ -564,7 +564,7 @@ public class AutoscalingTest { var fixture = AutoscalingTester.fixture().awsProdSetup(true).build(); fixture.tester().clock().advance(Duration.ofDays(2)); - Duration timeAdded = fixture.loader().addLoadMeasurements(100, t -> t == 0 ? 20.0 : 10.0, t -> 0.0); + Duration timeAdded = fixture.loader().addLoadMeasurements(100, t -> t == 0 ? 200.0 : 100.0, t -> 0.0); fixture.tester.clock().advance(timeAdded.negated()); fixture.loader().addCpuMeasurements(0.25, 200); @@ -574,17 +574,17 @@ public class AutoscalingTest { fixture.setScalingDuration(Duration.ofMinutes(5)); fixture.tester().clock().advance(Duration.ofDays(2)); - timeAdded = fixture.loader().addLoadMeasurements(100, t -> 10.0 + (t < 50 ? t : 100 - t), t -> 0.0); + timeAdded = fixture.loader().addLoadMeasurements(100, t -> 100.0 + (t < 50 ? t : 100 - t), t -> 0.0); fixture.tester.clock().advance(timeAdded.negated()); fixture.loader().addCpuMeasurements(0.25, 200); fixture.tester().assertResources("Scale down since observed growth is slower than scaling time", - 5, 1, 2.2, 13.3, 83.2, + 5, 1, 2.1, 13.3, 83.2, fixture.autoscale()); fixture.setScalingDuration(Duration.ofMinutes(60)); fixture.tester().clock().advance(Duration.ofDays(2)); timeAdded = fixture.loader().addLoadMeasurements(100, - t -> 10.0 + (t < 50 ? t * t * t : 125000 - (t - 49) * (t - 49) * (t - 49)), + t -> 100.0 + (t < 50 ? t * t * t : 125000 - (t - 49) * (t - 49) * (t - 49)), t -> 0.0); fixture.tester.clock().advance(timeAdded.negated()); fixture.loader().addCpuMeasurements(0.25, 200); @@ -594,6 +594,23 @@ public class AutoscalingTest { } @Test + public void test_autoscaling_weights_growth_rate_by_confidence() { + var fixture = AutoscalingTester.fixture().awsProdSetup(true).build(); + + double scalingFactor = 1.0/6000; // To make the average query rate low + fixture.setScalingDuration(Duration.ofMinutes(60)); + fixture.tester().clock().advance(Duration.ofDays(2)); + Duration timeAdded = fixture.loader().addLoadMeasurements(100, + t -> scalingFactor * (100.0 + (t < 50 ? t * t * t : 125000 - (t - 49) * (t - 49) * (t - 49))), + t -> 0.0); + fixture.tester.clock().advance(timeAdded.negated()); + fixture.loader().addCpuMeasurements(0.7, 200); + fixture.tester().assertResources("Scale up slightly since observed growth is faster than scaling time, but we are not confident", + 5, 1, 2.1, 13.3, 83.2, + fixture.autoscale()); + } + + @Test public void test_autoscaling_considers_query_vs_write_rate() { var fixture = AutoscalingTester.fixture().awsProdSetup(true).build(); @@ -603,7 +620,7 @@ public class AutoscalingTest { // This makes headroom for queries doubling, which we want to observe the effect of here fixture.tester().clock().advance(Duration.ofDays(2)); - var timeAdded = fixture.loader().addLoadMeasurements(100, t -> t == 0 ? 20.0 : 10.0, t -> 10.0); + var timeAdded = fixture.loader().addLoadMeasurements(100, t -> t == 0 ? 200.0 : 100.0, t -> 100.0); fixture.tester.clock().advance(timeAdded.negated()); fixture.loader().addCpuMeasurements(0.4, 200); fixture.tester.assertResources("Query and write load is equal -> scale up somewhat", @@ -611,7 +628,7 @@ public class AutoscalingTest { fixture.autoscale()); fixture.tester().clock().advance(Duration.ofDays(2)); - timeAdded = fixture.loader().addLoadMeasurements(100, t -> t == 0 ? 80.0 : 40.0, t -> 10.0); + timeAdded = fixture.loader().addLoadMeasurements(100, t -> t == 0 ? 800.0 : 400.0, t -> 100.0); fixture.tester.clock().advance(timeAdded.negated()); fixture.loader().addCpuMeasurements(0.4, 200); // TODO: Ackhually, we scale down here - why? @@ -620,7 +637,7 @@ public class AutoscalingTest { fixture.autoscale()); fixture.tester().clock().advance(Duration.ofDays(2)); - timeAdded = fixture.loader().addLoadMeasurements(100, t -> t == 0 ? 20.0 : 10.0, t -> 100.0); + timeAdded = fixture.loader().addLoadMeasurements(100, t -> t == 0 ? 200.0 : 100.0, t -> 1000.0); fixture.tester.clock().advance(timeAdded.negated()); fixture.loader().addCpuMeasurements(0.4, 200); fixture.tester().assertResources("Write load is 10x query load -> scale down", @@ -628,7 +645,7 @@ public class AutoscalingTest { fixture.autoscale()); fixture.tester().clock().advance(Duration.ofDays(2)); - timeAdded = fixture.loader().addLoadMeasurements(100, t -> t == 0 ? 20.0 : 10.0, t-> 0.0); + timeAdded = fixture.loader().addLoadMeasurements(100, t -> t == 0 ? 200.0 : 100.0, t-> 0.0); fixture.tester.clock().advance(timeAdded.negated()); fixture.loader().addCpuMeasurements(0.4, 200); fixture.tester().assertResources("Query only -> largest possible", @@ -636,7 +653,7 @@ public class AutoscalingTest { fixture.autoscale()); fixture.tester().clock().advance(Duration.ofDays(2)); - timeAdded = fixture.loader().addLoadMeasurements(100, t -> 0.0, t -> 10.0); + timeAdded = fixture.loader().addLoadMeasurements(100, t -> 0.0, t -> 100.0); fixture.tester.clock().advance(timeAdded.negated()); fixture.loader().addCpuMeasurements(0.4, 200); fixture.tester().assertResources("Write only -> smallest possible", diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java index b38dbfc55ae..ed00134af55 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java @@ -41,31 +41,41 @@ public class ClusterModelTest { public void test_traffic_headroom() { // No current traffic share: Ideal load is low but capped var model1 = clusterModel(new Status(0.0, 1.0), - t -> t == 0 ? 10000.0 : 0.0, t -> 0.0); - assertEquals(0.37067209775967414, model1.idealLoad().cpu(), delta); + t -> t == 0 ? 10000.0 : 100.0, t -> 0.0); + assertEquals(0.32653061224489793, model1.idealLoad().cpu(), delta); // Almost no current traffic share: Ideal load is low but capped var model2 = clusterModel(new Status(0.0001, 1.0), - t -> t == 0 ? 10000.0 : 0.0, t -> 0.0); - assertEquals(0.37067209775967414, model2.idealLoad().cpu(), delta); + t -> t == 0 ? 10000.0 : 100.0, t -> 0.0); + assertEquals(0.32653061224489793, model2.idealLoad().cpu(), delta); + + // Almost no traffic: Headroom impact is reduced due to uncertainty + var model3 = clusterModel(new Status(0.0001, 1.0), + t -> t == 0 ? 10000.0 : 1.0, t -> 0.0); + assertEquals(0.6465952717720751, model3.idealLoad().cpu(), delta); } @Test public void test_growth_headroom() { // No traffic data: Ideal load assumes 2 regions var model1 = clusterModel(new Status(0.0, 0.0), - t -> t == 0 ? 10000.0 : 0.0, t -> 0.0); - assertEquals(0.2240325865580448, model1.idealLoad().cpu(), delta); + t -> t == 0 ? 10000.0 : 100.0, t -> 0.0); + assertEquals(0.16326530612244897, model1.idealLoad().cpu(), delta); // No traffic: Ideal load is higher since we now know there is only one zone var model2 = clusterModel(new Status(0.0, 1.0), - t -> t == 0 ? 10000.0 : 0.0, t -> 0.0); - assertEquals(0.37067209775967414, model2.idealLoad().cpu(), delta); + t -> t == 0 ? 10000.0 : 100.0, t -> 0.0); + assertEquals(0.32653061224489793, model2.idealLoad().cpu(), delta); // Almost no current traffic: Similar number as above var model3 = clusterModel(new Status(0.0001, 1.0), - t -> t == 0 ? 10000.0 : 0.0001, t -> 0.0); + t -> t == 0 ? 10000.0 : 100.0, t -> 0.0); assertEquals(0.32653061224489793, model3.idealLoad().cpu(), delta); + + // Low query rate: Impact of growth headroom is reduced due to uncertainty + var model4 = clusterModel(new Status(0.0001, 1.0), + t -> t == 0 ? 100.0 : 1.0, t -> 0.0); + assertEquals(0.6465952717720751, model4.idealLoad().cpu(), delta); } private ClusterModel clusterModelWithNoData() { diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Loader.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Loader.java index 9158262b134..10c8c7434b1 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Loader.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Loader.java @@ -82,13 +82,13 @@ public class Loader { public void applyCpuLoad(double cpuLoad, int measurements) { addCpuMeasurements((float)cpuLoad, measurements); fixture.tester().clock().advance(samplingInterval.negated().multipliedBy(measurements)); - addQueryRateMeasurements(measurements, t -> t == 0 ? 20.0 : 10.0); // Query traffic only + addQueryRateMeasurements(measurements, t -> t == 0 ? 200.0 : 100.0); // Query traffic only } public void applyMemLoad(double memLoad, int measurements) { addMemMeasurements(memLoad, measurements); fixture.tester().clock().advance(samplingInterval.negated().multipliedBy(measurements)); - addQueryRateMeasurements(measurements, t -> t == 0 ? 20.0 : 10.0); // Query traffic only + addQueryRateMeasurements(measurements, t -> t == 0 ? 200.0 : 100.0); // Query traffic only } /** @@ -140,13 +140,13 @@ public class Loader { public void applyLoad(Load load, int measurements) { addMeasurements(load, measurements); fixture.tester().clock().advance(samplingInterval.negated().multipliedBy(measurements)); - addQueryRateMeasurements(measurements, t -> t == 0 ? 20.0 : 10.0); // Query traffic only + addQueryRateMeasurements(measurements, t -> t == 0 ? 200.0 : 100.0); // Query traffic only } public void applyLoad(Load load, int generation, boolean inService, boolean stable, int measurements) { addMeasurements(load, generation, inService, stable, measurements); fixture.tester().clock().advance(samplingInterval.negated().multipliedBy(measurements)); - addQueryRateMeasurements(measurements, t -> t == 0 ? 20.0 : 10.0); // Query traffic only + addQueryRateMeasurements(measurements, t -> t == 0 ? 200.0 : 100.0); // Query traffic only } public Duration addQueryRateMeasurements(int measurements, IntFunction<Double> queryRate) { diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java index 5ceb28d3fed..214d842e4bb 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java @@ -70,8 +70,8 @@ public class AutoscalingMaintainerTest { assertTrue(tester.deployer().lastDeployTime(app1).isEmpty()); assertTrue(tester.deployer().lastDeployTime(app2).isEmpty()); - tester.addMeasurements(0.9f, 0.9f, 0.9f, 0, 500, app1); - tester.addMeasurements(0.9f, 0.9f, 0.9f, 0, 500, app2); + tester.addMeasurements(0.9f, 0.9f, 0.9f, 0, 500, app1, cluster1.id()); + tester.addMeasurements(0.9f, 0.9f, 0.9f, 0, 500, app2, cluster2.id()); tester.clock().advance(Duration.ofMinutes(10)); tester.maintainer().maintain(); @@ -93,7 +93,7 @@ public class AutoscalingMaintainerTest { tester.deploy(app1, cluster1, app1Capacity); // Measure overload - tester.addMeasurements(0.9f, 0.9f, 0.9f, 0, 500, app1); + tester.addMeasurements(0.9f, 0.9f, 0.9f, 0, 500, app1, cluster1.id()); // Causes autoscaling tester.clock().advance(Duration.ofMinutes(10)); @@ -110,24 +110,24 @@ public class AutoscalingMaintainerTest { assertEquals(firstMaintenanceTime.toEpochMilli(), events.get(1).at().toEpochMilli()); // Measure overload still, since change is not applied, but metrics are discarded - tester.addMeasurements(0.9f, 0.9f, 0.9f, 0, 500, app1); + tester.addMeasurements(0.9f, 0.9f, 0.9f, 0, 500, app1, cluster1.id()); tester.maintainer().maintain(); assertEquals(firstMaintenanceTime.toEpochMilli(), tester.deployer().lastDeployTime(app1).get().toEpochMilli()); // Measure underload, but no autoscaling since we still haven't measured we're on the new config generation - tester.addMeasurements(0.1f, 0.1f, 0.1f, 0, 500, app1); + tester.addMeasurements(0.1f, 0.1f, 0.1f, 0, 500, app1, cluster1.id()); tester.maintainer().maintain(); assertEquals(firstMaintenanceTime.toEpochMilli(), tester.deployer().lastDeployTime(app1).get().toEpochMilli()); // Add measurement of the expected generation, leading to rescaling // - record scaling completion tester.clock().advance(Duration.ofMinutes(5)); - tester.addMeasurements(0.1f, 0.1f, 0.1f, 1, 1, app1); + tester.addMeasurements(0.1f, 0.1f, 0.1f, 1, 1, app1, cluster1.id()); tester.maintainer().maintain(); assertEquals(firstMaintenanceTime.toEpochMilli(), tester.deployer().lastDeployTime(app1).get().toEpochMilli()); // - measure underload tester.clock().advance(Duration.ofDays(4)); // Exit cooling period - tester.addMeasurements(0.1f, 0.1f, 0.1f, 1, 500, app1); + tester.addMeasurements(0.1f, 0.1f, 0.1f, 1, 500, app1, cluster1.id()); Instant lastMaintenanceTime = tester.clock().instant(); tester.maintainer().maintain(); assertEquals(lastMaintenanceTime.toEpochMilli(), tester.deployer().lastDeployTime(app1).get().toEpochMilli()); @@ -161,16 +161,16 @@ public class AutoscalingMaintainerTest { Duration samplePeriod = Duration.ofSeconds(150); for (int i = 0; i < 20; i++) { // Record completion to keep scaling window at minimum - tester.addMeasurements(0.1f, 0.1f, 0.1f, i, 1, app1); + tester.addMeasurements(0.1f, 0.1f, 0.1f, i, 1, app1, cluster1.id()); tester.maintainer().maintain(); tester.clock().advance(Duration.ofDays(1)); if (i % 2 == 0) { // high load - tester.addMeasurements(0.99f, 0.99f, 0.99f, i, measurements, app1); + tester.addMeasurements(0.99f, 0.99f, 0.99f, i, measurements, app1, cluster1.id()); } else { // low load - tester.addMeasurements(0.2f, 0.2f, 0.2f, i, measurements, app1); + tester.addMeasurements(0.2f, 0.2f, 0.2f, i, measurements, app1, cluster1.id()); } tester.clock().advance(samplePeriod.negated().multipliedBy(measurements)); tester.addQueryRateMeasurements(app1, cluster1.id(), measurements, t -> (t == 0 ? 20.0 : 10.0 )); @@ -180,7 +180,7 @@ public class AutoscalingMaintainerTest { assertEquals(Cluster.maxScalingEvents, tester.cluster(app1, cluster1).scalingEvents().size()); // Complete last event - tester.addMeasurements(0.1f, 0.1f, 0.1f, 20, 1, app1); + tester.addMeasurements(0.1f, 0.1f, 0.1f, 20, 1, app1, cluster1.id()); tester.maintainer().maintain(); assertEquals("Last event is completed", tester.clock().instant(), @@ -202,7 +202,6 @@ public class AutoscalingMaintainerTest { autoscale(false, Duration.ofMinutes( 1), Duration.ofMinutes( 5), clock, app1, cluster1, tester); autoscale( true, Duration.ofMinutes(19), Duration.ofMinutes(10), clock, app1, cluster1, tester); - autoscale( true, Duration.ofMinutes(40), Duration.ofMinutes(20), clock, app1, cluster1, tester); } @Test @@ -217,21 +216,21 @@ public class AutoscalingMaintainerTest { // Add a scaling event tester.deploy(app1, cluster1, capacity); - tester.addMeasurements(1.0f, 0.3f, 0.3f, 0, 4, app1); + tester.addMeasurements(1.0f, 0.3f, 0.3f, 0, 4, app1, cluster1.id()); tester.maintainer().maintain(); assertEquals("Scale up: " + tester.cluster(app1, cluster1).autoscalingStatus(), 1, tester.cluster(app1, cluster1).lastScalingEvent().get().generation()); // measurements with outdated generation are ignored -> no autoscaling - var duration = tester.addMeasurements(3.0f, 0.3f, 0.3f, 0, 2, app1); + var duration = tester.addMeasurements(3.0f, 0.3f, 0.3f, 0, 2, app1, cluster1.id()); tester.maintainer().maintain(); assertEquals("Measurements with outdated generation are ignored -> no autoscaling", 1, tester.cluster(app1, cluster1).lastScalingEvent().get().generation()); tester.clock().advance(duration.negated()); - duration = tester.addMeasurements(3.0f, 0.3f, 0.3f, 1, 2, app1); + duration = tester.addMeasurements(3.0f, 0.3f, 0.3f, 1, 2, app1, cluster1.id()); tester.maintainer().maintain(); assertEquals("Measurements right after generation change are ignored -> no autoscaling", 1, @@ -242,7 +241,7 @@ public class AutoscalingMaintainerTest { tester.clock().advance(ClusterModel.warmupDuration.plus(Duration.ofMinutes(1))); tester.nodeRepository().nodes().list().owner(app1).asList().forEach(node -> recordRestart(node, tester.nodeRepository())); - duration = tester.addMeasurements(3.0f, 0.3f, 0.3f, 1, 2, app1); + duration = tester.addMeasurements(3.0f, 0.3f, 0.3f, 1, 2, app1, cluster1.id()); tester.maintainer().maintain(); assertEquals("Measurements right after restart are ignored -> no autoscaling", 1, @@ -250,7 +249,7 @@ public class AutoscalingMaintainerTest { tester.clock().advance(duration.negated()); tester.clock().advance(ClusterModel.warmupDuration.plus(Duration.ofMinutes(1))); - tester.addMeasurements(3.0f, 0.3f, 0.3f, 1, 2, app1); + tester.addMeasurements(3.0f, 0.3f, 0.3f, 1, 2, app1, cluster1.id()); tester.maintainer().maintain(); assertEquals("We have valid measurements -> scale up", 2, @@ -310,7 +309,7 @@ public class AutoscalingMaintainerTest { clock.advance(completionTime); float load = down ? 0.1f : 1.0f; - tester.addMeasurements(load, load, load, generation, 1, application); + tester.addMeasurements(load, load, load, generation, 1, application, cluster.id()); tester.maintainer().maintain(); assertEvent("Measured completion of the last scaling event, but no new autoscaling yet", generation, Optional.of(clock.instant()), @@ -320,7 +319,7 @@ public class AutoscalingMaintainerTest { else clock.advance(expectedWindow.minus(completionTime)); - tester.addMeasurements(load, load, load, generation, 200, application); + tester.addMeasurements(load, load, load, generation, 200, application, cluster.id()); tester.maintainer().maintain(); assertEquals("We passed window duration so a new autoscaling is started: " + tester.cluster(application, cluster).autoscalingStatus(), diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTester.java index d921af9543e..95e36787219 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTester.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTester.java @@ -71,7 +71,8 @@ public class AutoscalingMaintainerTester { return provisioningTester.deploy(application, cluster, capacity); } - public Duration addMeasurements(float cpu, float mem, float disk, long generation, int count, ApplicationId applicationId) { + public Duration addMeasurements(float cpu, float mem, float disk, long generation, int count, + ApplicationId applicationId, ClusterSpec.Id clusterId) { NodeList nodes = nodeRepository().nodes().list(Node.State.active).owner(applicationId); Instant startTime = clock().instant(); for (int i = 0; i < count; i++) { @@ -85,7 +86,10 @@ public class AutoscalingMaintainerTester { 0.0)))); clock().advance(Duration.ofSeconds(150)); } - return Duration.between(startTime, clock().instant()); + var totalDuration = Duration.between(startTime, clock().instant()); + clock().advance(totalDuration.negated()); + addQueryRateMeasurements(applicationId, clusterId, count, t -> 100.0); + return totalDuration; } /** Creates the given number of measurements, spaced 5 minutes between, using the given function */ diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java index b43baf444c8..f5ab822721f 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java @@ -70,9 +70,9 @@ public class ScalingSuggestionsMaintainerTest { new TestMetric()); maintainer.maintain(); - assertEquals("13 nodes with [vcpu: 5.5, memory: 4.5 Gb, disk 10.0 Gb, bandwidth: 0.1 Gbps, architecture: x86_64]", + assertEquals("8 nodes with [vcpu: 3.2, memory: 4.5 Gb, disk 10.0 Gb, bandwidth: 0.1 Gbps, architecture: x86_64]", suggestionOf(app1, cluster1, tester).get().resources().toString()); - assertEquals("8 nodes with [vcpu: 11.0, memory: 4.4 Gb, disk 11.8 Gb, bandwidth: 0.1 Gbps, architecture: x86_64]", + assertEquals("8 nodes with [vcpu: 3.6, memory: 4.4 Gb, disk 11.8 Gb, bandwidth: 0.1 Gbps, architecture: x86_64]", suggestionOf(app2, cluster2, tester).get().resources().toString()); // Utilization goes way down @@ -80,14 +80,14 @@ public class ScalingSuggestionsMaintainerTest { addMeasurements(0.10f, 0.10f, 0.10f, 0, 500, app1, tester.nodeRepository()); maintainer.maintain(); assertEquals("Suggestion stays at the peak value observed", - "13 nodes with [vcpu: 5.5, memory: 4.5 Gb, disk 10.0 Gb, bandwidth: 0.1 Gbps, architecture: x86_64]", + "8 nodes with [vcpu: 3.2, memory: 4.5 Gb, disk 10.0 Gb, bandwidth: 0.1 Gbps, architecture: x86_64]", suggestionOf(app1, cluster1, tester).get().resources().toString()); // Utilization is still way down and a week has passed tester.clock().advance(Duration.ofDays(7)); addMeasurements(0.10f, 0.10f, 0.10f, 0, 500, app1, tester.nodeRepository()); maintainer.maintain(); assertEquals("Peak suggestion has been outdated", - "5 nodes with [vcpu: 1.8, memory: 4.0 Gb, disk 10.0 Gb, bandwidth: 0.1 Gbps, architecture: x86_64]", + "3 nodes with [vcpu: 1.2, memory: 4.0 Gb, disk 10.0 Gb, bandwidth: 0.1 Gbps, architecture: x86_64]", suggestionOf(app1, cluster1, tester).get().resources().toString()); assertTrue(shouldSuggest(app1, cluster1, tester)); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application1.json b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application1.json index 6adcb1199eb..0d640f7e3b2 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application1.json +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application1.json @@ -71,7 +71,7 @@ }, "utilization" : { "cpu" : 0.0, - "idealCpu": 0.1375, + "idealCpu": 0.40750000000000003, "currentCpu": 0.0, "peakCpu": 0.0, "memory" : 0.0, diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application2.json b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application2.json index 5babf5fc843..80da118f620 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application2.json +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application2.json @@ -45,7 +45,7 @@ }, "utilization" : { "cpu" : 0.0, - "idealCpu": 0.1394913986537023, + "idealCpu": 0.42670157068062825, "currentCpu": 0.0, "peakCpu": 0.0, "memory" : 0.0, |