Merge pull request #25216 from vespa-engine/bratseth/autoscaling-noise

Downweight traffic headroom when traffic is low
author: Jon Bratseth <bratseth@gmail.com> 2022-12-12 15:32:12 +0100
committer: GitHub <noreply@github.com> 2022-12-12 15:32:12 +0100
commit: 6cba335640ae35e45f87d7566bc339ef6eb2c235 (patch)
tree: 1e2b60412b39f9399ae5c441e938652381928a05
parent: 9cb4add47c48a264b9d204f59ca73d3082c2b490 (diff)
parent: 2d393d038aab7b2b438f04cf01e6202e0090a4ea (diff)
10 files changed, 106 insertions, 59 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
index 0facc6d37ea..1928a784763 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
@@ -57,6 +57,7 @@ public class ClusterModel {
     // Lazily initialized members
     private Double queryFractionOfMax = null;
     private Double maxQueryGrowthRate = null;
+    private OptionalDouble averageQueryRate = null;
 
     public ClusterModel(Zone zone,
                         Application application,
@@ -131,19 +132,25 @@ public class ClusterModel {
 
     /**
      * Returns the predicted max query growth rate per minute as a fraction of the average traffic
-     * in the scaling window
+     * in the scaling window.
      */
     public double maxQueryGrowthRate() {
         if (maxQueryGrowthRate != null) return maxQueryGrowthRate;
         return maxQueryGrowthRate = clusterTimeseries().maxQueryGrowthRate(scalingDuration(), clock);
     }
 
-    /** Returns the average query rate in the scaling window as a fraction of the max observed query rate */
+    /** Returns the average query rate in the scaling window as a fraction of the max observed query rate. */
     public double queryFractionOfMax() {
         if (queryFractionOfMax != null) return queryFractionOfMax;
         return queryFractionOfMax = clusterTimeseries().queryFractionOfMax(scalingDuration(), clock);
     }
 
+    /** Returns the average query rate in the scaling window. */
+    public OptionalDouble averageQueryRate() {
+        if (averageQueryRate != null) return averageQueryRate;
+        return averageQueryRate = clusterTimeseries().queryRate(scalingDuration(), clock);
+    }
+
     /** Returns the average of the last load measurement from each node. */
     public Load currentLoad() { return nodeTimeseries().currentLoad(); }
 
@@ -239,7 +246,8 @@ public class ClusterModel {
         // Cap headroom at 10% above the historical observed peak
         if (queryFractionOfMax() != 0)
             growthRateHeadroom = Math.min(growthRateHeadroom, 1 / queryFractionOfMax() + 0.1);
-        return growthRateHeadroom;
+
+        return adjustByConfidence(growthRateHeadroom);
     }
 
     /**
@@ -255,15 +263,23 @@ public class ClusterModel {
             trafficShiftHeadroom = 1/application.status().maxReadShare();
         else
             trafficShiftHeadroom = application.status().maxReadShare() / application.status().currentReadShare();
-        return Math.min(trafficShiftHeadroom, 1/application.status().maxReadShare());
+        return adjustByConfidence(Math.min(trafficShiftHeadroom, 1/application.status().maxReadShare()));
+    }
+
+    /**
+     * Headroom values are a multiplier of the current query rate.
+     * Adjust this value closer to 1 if the query rate is too low to derive statistical conclusions
+     * with high confidence to avoid large adjustments caused by random noise due to low traffic numbers.
+     */
+    private double adjustByConfidence(double headroom) {
+        return ( (headroom -1 ) * Math.min(1, averageQueryRate().orElse(0) / 100.0) ) + 1;
     }
 
     /** The estimated fraction of cpu usage which goes to processing queries vs. writes */
     public double queryCpuFraction() {
-        OptionalDouble queryRate = clusterTimeseries().queryRate(scalingDuration(), clock);
         OptionalDouble writeRate = clusterTimeseries().writeRate(scalingDuration(), clock);
-        if (queryRate.orElse(0) == 0 && writeRate.orElse(0) == 0) return queryCpuFraction(0.5);
-        return queryCpuFraction(queryRate.orElse(0) / (queryRate.orElse(0) + writeRate.orElse(0)));
+        if (averageQueryRate().orElse(0) == 0 && writeRate.orElse(0) == 0) return queryCpuFraction(0.5);
+        return queryCpuFraction(averageQueryRate().orElse(0) / (averageQueryRate().orElse(0) + writeRate.orElse(0)));
     }
 
     private double queryCpuFraction(double queryRateFraction) {
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java
index 40bad7022d6..1a3ac17c7ef 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java
@@ -74,7 +74,8 @@ public class GroupPreparer {
     public PrepareResult prepare(ApplicationId application, ClusterSpec cluster, NodeSpec requestedNodes,
                                  List<Node> surplusActiveNodes, NodeIndices indices, int wantedGroups,
                                  NodesAndHosts<LockedNodeList> allNodesAndHosts) {
-        log.log(Level.FINE, () -> "Preparing " + cluster.type().name() + " " + cluster.id() + " with requested resources " + requestedNodes.resources().orElse(NodeResources.unspecified()));
+        log.log(Level.FINE, () -> "Preparing " + cluster.type().name() + " " + cluster.id() + " with requested resources " +
+                                  requestedNodes.resources().orElse(NodeResources.unspecified()));
         // Try preparing in memory without global unallocated lock. Most of the time there should be no changes,
         // and we can return nodes previously allocated.
         NodeAllocation probeAllocation = prepareAllocation(application, cluster, requestedNodes, surplusActiveNodes,
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
index eda677c6e59..f6c393a6f4d 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
@@ -457,7 +457,7 @@ public class AutoscalingTest {
         fixture.tester().clock().advance(Duration.ofDays(2));
         Duration timePassed = fixture.loader().addCpuMeasurements(0.25, 120);
         fixture.tester().clock().advance(timePassed.negated());
-        fixture.loader().addLoadMeasurements(10, t -> t == 0 ? 20.0 : 10.0, t -> 1.0);
+        fixture.loader().addLoadMeasurements(10, t -> t == 0 ? 200.0 : 100.0, t -> 10.0);
         fixture.tester().assertResources("Scaling up cpu, others down, changing to 1 group is cheaper",
                                          8, 1, 2.8, 36.2, 56.4,
                                          fixture.autoscale());
@@ -496,7 +496,7 @@ public class AutoscalingTest {
         fixture.tester().clock().advance(Duration.ofDays(1));
         fixture.loader().applyMemLoad(1.0, 1000);
         fixture.tester().assertResources("Increase group size to reduce memory load",
-                                         8, 2, 4.5,  97.1, 74.7,
+                                         8, 2, 13.9,  97.1, 66.6,
                                          fixture.autoscale());
     }
 
@@ -564,7 +564,7 @@ public class AutoscalingTest {
         var fixture = AutoscalingTester.fixture().awsProdSetup(true).build();
 
         fixture.tester().clock().advance(Duration.ofDays(2));
-        Duration timeAdded = fixture.loader().addLoadMeasurements(100, t -> t == 0 ? 20.0 : 10.0, t -> 0.0);
+        Duration timeAdded = fixture.loader().addLoadMeasurements(100, t -> t == 0 ? 200.0 : 100.0, t -> 0.0);
         fixture.tester.clock().advance(timeAdded.negated());
         fixture.loader().addCpuMeasurements(0.25, 200);
 
@@ -574,17 +574,17 @@ public class AutoscalingTest {
 
         fixture.setScalingDuration(Duration.ofMinutes(5));
         fixture.tester().clock().advance(Duration.ofDays(2));
-        timeAdded = fixture.loader().addLoadMeasurements(100, t -> 10.0 + (t < 50 ? t : 100 - t), t -> 0.0);
+        timeAdded = fixture.loader().addLoadMeasurements(100, t -> 100.0 + (t < 50 ? t : 100 - t), t -> 0.0);
         fixture.tester.clock().advance(timeAdded.negated());
         fixture.loader().addCpuMeasurements(0.25, 200);
         fixture.tester().assertResources("Scale down since observed growth is slower than scaling time",
-                                         5, 1, 2.2,   13.3, 83.2,
+                                         5, 1, 2.1,   13.3, 83.2,
                                          fixture.autoscale());
 
         fixture.setScalingDuration(Duration.ofMinutes(60));
         fixture.tester().clock().advance(Duration.ofDays(2));
         timeAdded = fixture.loader().addLoadMeasurements(100,
-                                                         t -> 10.0 + (t < 50 ? t * t * t : 125000 - (t - 49) * (t - 49) * (t - 49)),
+                                                         t -> 100.0 + (t < 50 ? t * t * t : 125000 - (t - 49) * (t - 49) * (t - 49)),
                                                          t -> 0.0);
         fixture.tester.clock().advance(timeAdded.negated());
         fixture.loader().addCpuMeasurements(0.25, 200);
@@ -594,6 +594,23 @@ public class AutoscalingTest {
     }
 
     @Test
+    public void test_autoscaling_weights_growth_rate_by_confidence() {
+        var fixture = AutoscalingTester.fixture().awsProdSetup(true).build();
+
+        double scalingFactor = 1.0/6000; // To make the average query rate low
+        fixture.setScalingDuration(Duration.ofMinutes(60));
+        fixture.tester().clock().advance(Duration.ofDays(2));
+        Duration timeAdded = fixture.loader().addLoadMeasurements(100,
+                                                         t -> scalingFactor * (100.0 + (t < 50 ? t * t * t : 125000 - (t - 49) * (t - 49) * (t - 49))),
+                                                         t -> 0.0);
+        fixture.tester.clock().advance(timeAdded.negated());
+        fixture.loader().addCpuMeasurements(0.7, 200);
+        fixture.tester().assertResources("Scale up slightly since observed growth is faster than scaling time, but we are not confident",
+                                         5, 1, 2.1,  13.3, 83.2,
+                                         fixture.autoscale());
+    }
+
+    @Test
     public void test_autoscaling_considers_query_vs_write_rate() {
         var fixture = AutoscalingTester.fixture().awsProdSetup(true).build();
 
@@ -603,7 +620,7 @@ public class AutoscalingTest {
         // This makes headroom for queries doubling, which we want to observe the effect of here
 
         fixture.tester().clock().advance(Duration.ofDays(2));
-        var timeAdded = fixture.loader().addLoadMeasurements(100, t -> t == 0 ? 20.0 : 10.0, t -> 10.0);
+        var timeAdded = fixture.loader().addLoadMeasurements(100, t -> t == 0 ? 200.0 : 100.0, t -> 100.0);
         fixture.tester.clock().advance(timeAdded.negated());
         fixture.loader().addCpuMeasurements(0.4, 200);
         fixture.tester.assertResources("Query and write load is equal -> scale up somewhat",
@@ -611,7 +628,7 @@ public class AutoscalingTest {
                                        fixture.autoscale());
 
         fixture.tester().clock().advance(Duration.ofDays(2));
-        timeAdded = fixture.loader().addLoadMeasurements(100, t -> t == 0 ? 80.0 : 40.0, t -> 10.0);
+        timeAdded = fixture.loader().addLoadMeasurements(100, t -> t == 0 ? 800.0 : 400.0, t -> 100.0);
         fixture.tester.clock().advance(timeAdded.negated());
         fixture.loader().addCpuMeasurements(0.4, 200);
         // TODO: Ackhually, we scale down here - why?
@@ -620,7 +637,7 @@ public class AutoscalingTest {
                                          fixture.autoscale());
 
         fixture.tester().clock().advance(Duration.ofDays(2));
-        timeAdded = fixture.loader().addLoadMeasurements(100, t -> t == 0 ? 20.0 : 10.0, t -> 100.0);
+        timeAdded = fixture.loader().addLoadMeasurements(100, t -> t == 0 ? 200.0 : 100.0, t -> 1000.0);
         fixture.tester.clock().advance(timeAdded.negated());
         fixture.loader().addCpuMeasurements(0.4, 200);
         fixture.tester().assertResources("Write load is 10x query load -> scale down",
@@ -628,7 +645,7 @@ public class AutoscalingTest {
                                          fixture.autoscale());
 
         fixture.tester().clock().advance(Duration.ofDays(2));
-        timeAdded = fixture.loader().addLoadMeasurements(100, t -> t == 0 ? 20.0 : 10.0, t-> 0.0);
+        timeAdded = fixture.loader().addLoadMeasurements(100, t -> t == 0 ? 200.0 : 100.0, t-> 0.0);
         fixture.tester.clock().advance(timeAdded.negated());
         fixture.loader().addCpuMeasurements(0.4, 200);
         fixture.tester().assertResources("Query only -> largest possible",
@@ -636,7 +653,7 @@ public class AutoscalingTest {
                                          fixture.autoscale());
 
         fixture.tester().clock().advance(Duration.ofDays(2));
-        timeAdded = fixture.loader().addLoadMeasurements(100, t ->  0.0, t -> 10.0);
+        timeAdded = fixture.loader().addLoadMeasurements(100, t ->  0.0, t -> 100.0);
         fixture.tester.clock().advance(timeAdded.negated());
         fixture.loader().addCpuMeasurements(0.4, 200);
         fixture.tester().assertResources("Write only -> smallest possible",
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java
index b38dbfc55ae..ed00134af55 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java
@@ -41,31 +41,41 @@ public class ClusterModelTest {
     public void test_traffic_headroom() {
         // No current traffic share: Ideal load is low but capped
         var model1 = clusterModel(new Status(0.0, 1.0),
-                                  t -> t == 0 ? 10000.0 : 0.0, t -> 0.0);
-        assertEquals(0.37067209775967414, model1.idealLoad().cpu(), delta);
+                                  t -> t == 0 ? 10000.0 : 100.0, t -> 0.0);
+        assertEquals(0.32653061224489793, model1.idealLoad().cpu(), delta);
 
         // Almost no current traffic share: Ideal load is low but capped
         var model2 = clusterModel(new Status(0.0001, 1.0),
-                                  t -> t == 0 ? 10000.0 : 0.0, t -> 0.0);
-        assertEquals(0.37067209775967414, model2.idealLoad().cpu(), delta);
+                                  t -> t == 0 ? 10000.0 : 100.0, t -> 0.0);
+        assertEquals(0.32653061224489793, model2.idealLoad().cpu(), delta);
+
+        // Almost no traffic: Headroom impact is reduced due to uncertainty
+        var model3 = clusterModel(new Status(0.0001, 1.0),
+                                  t -> t == 0 ? 10000.0 : 1.0, t -> 0.0);
+        assertEquals(0.6465952717720751, model3.idealLoad().cpu(), delta);
     }
 
     @Test
     public void test_growth_headroom() {
         // No traffic data: Ideal load assumes 2 regions
         var model1 = clusterModel(new Status(0.0, 0.0),
-                                  t -> t == 0 ? 10000.0 : 0.0, t -> 0.0);
-        assertEquals(0.2240325865580448, model1.idealLoad().cpu(), delta);
+                                  t -> t == 0 ? 10000.0 : 100.0, t -> 0.0);
+        assertEquals(0.16326530612244897, model1.idealLoad().cpu(), delta);
 
         // No traffic: Ideal load is higher since we now know there is only one zone
         var model2 = clusterModel(new Status(0.0, 1.0),
-                                  t -> t == 0 ? 10000.0 : 0.0, t -> 0.0);
-        assertEquals(0.37067209775967414, model2.idealLoad().cpu(), delta);
+                                  t -> t == 0 ? 10000.0 : 100.0, t -> 0.0);
+        assertEquals(0.32653061224489793, model2.idealLoad().cpu(), delta);
 
         // Almost no current traffic: Similar number as above
         var model3 = clusterModel(new Status(0.0001, 1.0),
-                                  t -> t == 0 ? 10000.0 : 0.0001, t -> 0.0);
+                                  t -> t == 0 ? 10000.0 : 100.0, t -> 0.0);
         assertEquals(0.32653061224489793, model3.idealLoad().cpu(), delta);
+
+        // Low query rate: Impact of growth headroom is reduced due to uncertainty
+        var model4 = clusterModel(new Status(0.0001, 1.0),
+                                  t -> t == 0 ? 100.0 : 1.0, t -> 0.0);
+        assertEquals(0.6465952717720751, model4.idealLoad().cpu(), delta);
     }
 
     private ClusterModel clusterModelWithNoData() {
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Loader.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Loader.java
index 9158262b134..10c8c7434b1 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Loader.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Loader.java
@@ -82,13 +82,13 @@ public class Loader {
     public void applyCpuLoad(double cpuLoad, int measurements) {
         addCpuMeasurements((float)cpuLoad, measurements);
         fixture.tester().clock().advance(samplingInterval.negated().multipliedBy(measurements));
-        addQueryRateMeasurements(measurements, t -> t == 0 ? 20.0 : 10.0); // Query traffic only
+        addQueryRateMeasurements(measurements, t -> t == 0 ? 200.0 : 100.0); // Query traffic only
     }
 
     public void applyMemLoad(double memLoad, int measurements) {
         addMemMeasurements(memLoad, measurements);
         fixture.tester().clock().advance(samplingInterval.negated().multipliedBy(measurements));
-        addQueryRateMeasurements(measurements, t -> t == 0 ? 20.0 : 10.0); // Query traffic only
+        addQueryRateMeasurements(measurements, t -> t == 0 ? 200.0 : 100.0); // Query traffic only
     }
 
     /**
@@ -140,13 +140,13 @@ public class Loader {
     public void applyLoad(Load load, int measurements) {
         addMeasurements(load, measurements);
         fixture.tester().clock().advance(samplingInterval.negated().multipliedBy(measurements));
-        addQueryRateMeasurements(measurements, t -> t == 0 ? 20.0 : 10.0); // Query traffic only
+        addQueryRateMeasurements(measurements, t -> t == 0 ? 200.0 : 100.0); // Query traffic only
     }
 
     public void applyLoad(Load load, int generation, boolean inService, boolean stable, int measurements) {
         addMeasurements(load, generation, inService, stable, measurements);
         fixture.tester().clock().advance(samplingInterval.negated().multipliedBy(measurements));
-        addQueryRateMeasurements(measurements, t -> t == 0 ? 20.0 : 10.0); // Query traffic only
+        addQueryRateMeasurements(measurements, t -> t == 0 ? 200.0 : 100.0); // Query traffic only
     }
 
     public Duration addQueryRateMeasurements(int measurements, IntFunction<Double> queryRate) {
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java
index 5ceb28d3fed..214d842e4bb 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java
@@ -70,8 +70,8 @@ public class AutoscalingMaintainerTest {
         assertTrue(tester.deployer().lastDeployTime(app1).isEmpty());
         assertTrue(tester.deployer().lastDeployTime(app2).isEmpty());
 
-        tester.addMeasurements(0.9f, 0.9f, 0.9f, 0, 500, app1);
-        tester.addMeasurements(0.9f, 0.9f, 0.9f, 0, 500, app2);
+        tester.addMeasurements(0.9f, 0.9f, 0.9f, 0, 500, app1, cluster1.id());
+        tester.addMeasurements(0.9f, 0.9f, 0.9f, 0, 500, app2, cluster2.id());
 
         tester.clock().advance(Duration.ofMinutes(10));
         tester.maintainer().maintain();
@@ -93,7 +93,7 @@ public class AutoscalingMaintainerTest {
         tester.deploy(app1, cluster1, app1Capacity);
 
         // Measure overload
-        tester.addMeasurements(0.9f, 0.9f, 0.9f, 0, 500, app1);
+        tester.addMeasurements(0.9f, 0.9f, 0.9f, 0, 500, app1, cluster1.id());
 
         // Causes autoscaling
         tester.clock().advance(Duration.ofMinutes(10));
@@ -110,24 +110,24 @@ public class AutoscalingMaintainerTest {
         assertEquals(firstMaintenanceTime.toEpochMilli(), events.get(1).at().toEpochMilli());
 
         // Measure overload still, since change is not applied, but metrics are discarded
-        tester.addMeasurements(0.9f, 0.9f, 0.9f, 0, 500, app1);
+        tester.addMeasurements(0.9f, 0.9f, 0.9f, 0, 500, app1, cluster1.id());
         tester.maintainer().maintain();
         assertEquals(firstMaintenanceTime.toEpochMilli(), tester.deployer().lastDeployTime(app1).get().toEpochMilli());
 
         // Measure underload, but no autoscaling since we still haven't measured we're on the new config generation
-        tester.addMeasurements(0.1f, 0.1f, 0.1f, 0, 500, app1);
+        tester.addMeasurements(0.1f, 0.1f, 0.1f, 0, 500, app1, cluster1.id());
         tester.maintainer().maintain();
         assertEquals(firstMaintenanceTime.toEpochMilli(), tester.deployer().lastDeployTime(app1).get().toEpochMilli());
 
         // Add measurement of the expected generation, leading to rescaling
         // - record scaling completion
         tester.clock().advance(Duration.ofMinutes(5));
-        tester.addMeasurements(0.1f, 0.1f, 0.1f, 1, 1, app1);
+        tester.addMeasurements(0.1f, 0.1f, 0.1f, 1, 1, app1, cluster1.id());
         tester.maintainer().maintain();
         assertEquals(firstMaintenanceTime.toEpochMilli(), tester.deployer().lastDeployTime(app1).get().toEpochMilli());
         // - measure underload
         tester.clock().advance(Duration.ofDays(4)); // Exit cooling period
-        tester.addMeasurements(0.1f, 0.1f, 0.1f, 1, 500, app1);
+        tester.addMeasurements(0.1f, 0.1f, 0.1f, 1, 500, app1, cluster1.id());
         Instant lastMaintenanceTime = tester.clock().instant();
         tester.maintainer().maintain();
         assertEquals(lastMaintenanceTime.toEpochMilli(), tester.deployer().lastDeployTime(app1).get().toEpochMilli());
@@ -161,16 +161,16 @@ public class AutoscalingMaintainerTest {
         Duration samplePeriod = Duration.ofSeconds(150);
         for (int i = 0; i < 20; i++) {
             // Record completion to keep scaling window at minimum
-            tester.addMeasurements(0.1f, 0.1f, 0.1f, i, 1, app1);
+            tester.addMeasurements(0.1f, 0.1f, 0.1f, i, 1, app1, cluster1.id());
             tester.maintainer().maintain();
 
             tester.clock().advance(Duration.ofDays(1));
 
             if (i % 2 == 0) { // high load
-                tester.addMeasurements(0.99f, 0.99f, 0.99f, i, measurements, app1);
+                tester.addMeasurements(0.99f, 0.99f, 0.99f, i, measurements, app1, cluster1.id());
             }
             else { // low load
-                tester.addMeasurements(0.2f, 0.2f, 0.2f, i, measurements, app1);
+                tester.addMeasurements(0.2f, 0.2f, 0.2f, i, measurements, app1, cluster1.id());
             }
             tester.clock().advance(samplePeriod.negated().multipliedBy(measurements));
             tester.addQueryRateMeasurements(app1, cluster1.id(), measurements, t -> (t == 0 ? 20.0 : 10.0 ));
@@ -180,7 +180,7 @@ public class AutoscalingMaintainerTest {
         assertEquals(Cluster.maxScalingEvents, tester.cluster(app1, cluster1).scalingEvents().size());
 
         // Complete last event
-        tester.addMeasurements(0.1f, 0.1f, 0.1f, 20, 1, app1);
+        tester.addMeasurements(0.1f, 0.1f, 0.1f, 20, 1, app1, cluster1.id());
         tester.maintainer().maintain();
         assertEquals("Last event is completed",
                      tester.clock().instant(),
@@ -202,7 +202,6 @@ public class AutoscalingMaintainerTest {
 
         autoscale(false, Duration.ofMinutes( 1), Duration.ofMinutes( 5), clock, app1, cluster1, tester);
         autoscale( true, Duration.ofMinutes(19), Duration.ofMinutes(10), clock, app1, cluster1, tester);
-        autoscale( true, Duration.ofMinutes(40), Duration.ofMinutes(20), clock, app1, cluster1, tester);
     }
 
     @Test
@@ -217,21 +216,21 @@ public class AutoscalingMaintainerTest {
 
         // Add a scaling event
         tester.deploy(app1, cluster1, capacity);
-        tester.addMeasurements(1.0f, 0.3f, 0.3f, 0, 4, app1);
+        tester.addMeasurements(1.0f, 0.3f, 0.3f, 0, 4, app1, cluster1.id());
         tester.maintainer().maintain();
         assertEquals("Scale up: " + tester.cluster(app1, cluster1).autoscalingStatus(),
                      1,
                      tester.cluster(app1, cluster1).lastScalingEvent().get().generation());
 
         // measurements with outdated generation are ignored -> no autoscaling
-        var duration = tester.addMeasurements(3.0f, 0.3f, 0.3f, 0, 2, app1);
+        var duration = tester.addMeasurements(3.0f, 0.3f, 0.3f, 0, 2, app1, cluster1.id());
         tester.maintainer().maintain();
         assertEquals("Measurements with outdated generation are ignored -> no autoscaling",
                      1,
                      tester.cluster(app1, cluster1).lastScalingEvent().get().generation());
         tester.clock().advance(duration.negated());
 
-        duration = tester.addMeasurements(3.0f, 0.3f, 0.3f, 1, 2, app1);
+        duration = tester.addMeasurements(3.0f, 0.3f, 0.3f, 1, 2, app1, cluster1.id());
         tester.maintainer().maintain();
         assertEquals("Measurements right after generation change are ignored -> no autoscaling",
                      1,
@@ -242,7 +241,7 @@ public class AutoscalingMaintainerTest {
         tester.clock().advance(ClusterModel.warmupDuration.plus(Duration.ofMinutes(1)));
         tester.nodeRepository().nodes().list().owner(app1).asList().forEach(node -> recordRestart(node, tester.nodeRepository()));
 
-        duration = tester.addMeasurements(3.0f, 0.3f, 0.3f, 1, 2, app1);
+        duration = tester.addMeasurements(3.0f, 0.3f, 0.3f, 1, 2, app1, cluster1.id());
         tester.maintainer().maintain();
         assertEquals("Measurements right after restart are ignored -> no autoscaling",
                      1,
@@ -250,7 +249,7 @@ public class AutoscalingMaintainerTest {
         tester.clock().advance(duration.negated());
 
         tester.clock().advance(ClusterModel.warmupDuration.plus(Duration.ofMinutes(1)));
-        tester.addMeasurements(3.0f, 0.3f, 0.3f, 1, 2, app1);
+        tester.addMeasurements(3.0f, 0.3f, 0.3f, 1, 2, app1, cluster1.id());
         tester.maintainer().maintain();
         assertEquals("We have valid measurements -> scale up",
                      2,
@@ -310,7 +309,7 @@ public class AutoscalingMaintainerTest {
 
         clock.advance(completionTime);
         float load = down ? 0.1f : 1.0f;
-        tester.addMeasurements(load, load, load, generation, 1, application);
+        tester.addMeasurements(load, load, load, generation, 1, application, cluster.id());
         tester.maintainer().maintain();
         assertEvent("Measured completion of the last scaling event, but no new autoscaling yet",
                     generation, Optional.of(clock.instant()),
@@ -320,7 +319,7 @@ public class AutoscalingMaintainerTest {
         else
             clock.advance(expectedWindow.minus(completionTime));
 
-        tester.addMeasurements(load, load, load, generation, 200, application);
+        tester.addMeasurements(load, load, load, generation, 200, application, cluster.id());
         tester.maintainer().maintain();
         assertEquals("We passed window duration so a new autoscaling is started: " +
                      tester.cluster(application, cluster).autoscalingStatus(),
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTester.java
index d921af9543e..95e36787219 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTester.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTester.java
@@ -71,7 +71,8 @@ public class AutoscalingMaintainerTester {
         return provisioningTester.deploy(application, cluster, capacity);
     }
 
-    public Duration addMeasurements(float cpu, float mem, float disk, long generation, int count, ApplicationId applicationId) {
+    public Duration addMeasurements(float cpu, float mem, float disk, long generation, int count,
+                                    ApplicationId applicationId, ClusterSpec.Id clusterId) {
         NodeList nodes = nodeRepository().nodes().list(Node.State.active).owner(applicationId);
         Instant startTime = clock().instant();
         for (int i = 0; i < count; i++) {
@@ -85,7 +86,10 @@ public class AutoscalingMaintainerTester {
                                                                                                       0.0))));
             clock().advance(Duration.ofSeconds(150));
         }
-        return Duration.between(startTime, clock().instant());
+        var totalDuration = Duration.between(startTime, clock().instant());
+        clock().advance(totalDuration.negated());
+        addQueryRateMeasurements(applicationId, clusterId, count, t -> 100.0);
+        return totalDuration;
     }
 
     /** Creates the given number of measurements, spaced 5 minutes between, using the given function */
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java
index b43baf444c8..f5ab822721f 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java
@@ -70,9 +70,9 @@ public class ScalingSuggestionsMaintainerTest {
                                                                                    new TestMetric());
         maintainer.maintain();
 
-        assertEquals("13 nodes with [vcpu: 5.5, memory: 4.5 Gb, disk 10.0 Gb, bandwidth: 0.1 Gbps, architecture: x86_64]",
+        assertEquals("8 nodes with [vcpu: 3.2, memory: 4.5 Gb, disk 10.0 Gb, bandwidth: 0.1 Gbps, architecture: x86_64]",
                      suggestionOf(app1, cluster1, tester).get().resources().toString());
-        assertEquals("8 nodes with [vcpu: 11.0, memory: 4.4 Gb, disk 11.8 Gb, bandwidth: 0.1 Gbps, architecture: x86_64]",
+        assertEquals("8 nodes with [vcpu: 3.6, memory: 4.4 Gb, disk 11.8 Gb, bandwidth: 0.1 Gbps, architecture: x86_64]",
                      suggestionOf(app2, cluster2, tester).get().resources().toString());
 
         // Utilization goes way down
@@ -80,14 +80,14 @@ public class ScalingSuggestionsMaintainerTest {
         addMeasurements(0.10f, 0.10f, 0.10f, 0, 500, app1, tester.nodeRepository());
         maintainer.maintain();
         assertEquals("Suggestion stays at the peak value observed",
-                     "13 nodes with [vcpu: 5.5, memory: 4.5 Gb, disk 10.0 Gb, bandwidth: 0.1 Gbps, architecture: x86_64]",
+                     "8 nodes with [vcpu: 3.2, memory: 4.5 Gb, disk 10.0 Gb, bandwidth: 0.1 Gbps, architecture: x86_64]",
                      suggestionOf(app1, cluster1, tester).get().resources().toString());
         // Utilization is still way down and a week has passed
         tester.clock().advance(Duration.ofDays(7));
         addMeasurements(0.10f, 0.10f, 0.10f, 0, 500, app1, tester.nodeRepository());
         maintainer.maintain();
         assertEquals("Peak suggestion has been  outdated",
-                     "5 nodes with [vcpu: 1.8, memory: 4.0 Gb, disk 10.0 Gb, bandwidth: 0.1 Gbps, architecture: x86_64]",
+                     "3 nodes with [vcpu: 1.2, memory: 4.0 Gb, disk 10.0 Gb, bandwidth: 0.1 Gbps, architecture: x86_64]",
                      suggestionOf(app1, cluster1, tester).get().resources().toString());
         assertTrue(shouldSuggest(app1, cluster1, tester));
 
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application1.json b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application1.json
index 6adcb1199eb..0d640f7e3b2 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application1.json
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application1.json
@@ -71,7 +71,7 @@
       },
       "utilization" : {
           "cpu" : 0.0,
-          "idealCpu": 0.1375,
+          "idealCpu": 0.40750000000000003,
           "currentCpu": 0.0,
           "peakCpu": 0.0,
           "memory" : 0.0,
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application2.json b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application2.json
index 5babf5fc843..80da118f620 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application2.json
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application2.json
@@ -45,7 +45,7 @@
       },
       "utilization" : {
         "cpu" : 0.0,
-        "idealCpu": 0.1394913986537023,
+        "idealCpu": 0.42670157068062825,
         "currentCpu": 0.0,
         "peakCpu": 0.0,
         "memory" : 0.0,
author	Jon Bratseth <bratseth@gmail.com>	2022-12-12 15:32:12 +0100
committer	GitHub <noreply@github.com>	2022-12-12 15:32:12 +0100
commit	6cba335640ae35e45f87d7566bc339ef6eb2c235 (patch)
tree	1e2b60412b39f9399ae5c441e938652381928a05
parent	9cb4add47c48a264b9d204f59ca73d3082c2b490 (diff)
parent	2d393d038aab7b2b438f04cf01e6202e0090a4ea (diff)