author    Martin Polden <mpolden@mpolden.no>  2023-07-21 09:11:53 +0200
committer GitHub <noreply@github.com>  2023-07-21 09:11:53 +0200
commit    8d315ba956eb0dc814e92e180e3b8533b81c6e61 (patch)
tree      b5e62cdc21315b84e92214858b0e193a46d88468 /node-repository
parent    3e40e5363a3b76a72910dbe701ec05294d17ec30 (diff)
parent    97bd65b51e942fb81eeb43b14b03cad8d2474c6d (diff)
Merge pull request #27854 from vespa-engine/bratseth/scale-down-less
Don't scale down if we are likely to scale back up
Diffstat (limited to 'node-repository')
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java                       | 59
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java                     | 15
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java                  | 28
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingUsingBcpGroupInfoTest.java |  8
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java                          | 15
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicProvisioningTester.java     | 11
6 files changed, 105 insertions, 31 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
index a7d5cc50828..795cbd59c4b 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
@@ -2,7 +2,6 @@
package com.yahoo.vespa.hosted.provision.autoscale;
import com.yahoo.config.provision.ClusterResources;
-import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.applications.Application;
@@ -10,7 +9,6 @@ import com.yahoo.vespa.hosted.provision.applications.Cluster;
import com.yahoo.vespa.hosted.provision.autoscale.Autoscaling.Status;
import java.time.Duration;
-import java.time.Instant;
import java.util.Optional;
/**
@@ -23,7 +21,9 @@ public class Autoscaler {
/** What cost difference is worth a reallocation? */
private static final double costDifferenceWorthReallocation = 0.1;
/** What resource difference is worth a reallocation? */
- private static final double resourceDifferenceWorthReallocation = 0.03;
+ private static final double resourceIncreaseWorthReallocation = 0.03;
+ /** The load increase headroom (as a fraction) we must have before needing to scale up, in order to decide to scale down */
+ private static final double headroomRequiredToScaleDown = 0.1;
private final NodeRepository nodeRepository;
private final AllocationOptimizer allocationOptimizer;
@@ -70,22 +70,53 @@ public class Autoscaler {
if ( ! clusterModel.isStable(nodeRepository))
return Autoscaling.dontScale(Status.waiting, "Cluster change in progress", clusterModel);
- var currentAllocation = new AllocatableClusterResources(clusterNodes.not().retired(), nodeRepository);
- Optional<AllocatableClusterResources> bestAllocation =
- allocationOptimizer.findBestAllocation(clusterModel.loadAdjustment(), currentAllocation, clusterModel, limits);
- if (bestAllocation.isEmpty())
+ var current = new AllocatableClusterResources(clusterNodes.not().retired(), nodeRepository);
+ var loadAdjustment = clusterModel.loadAdjustment();
+
+ // Ensure we only scale down if we'll have enough headroom to not scale up again given a small load increase
+ var target = allocationOptimizer.findBestAllocation(loadAdjustment, current, clusterModel, limits);
+ var headroomAdjustedLoadAdjustment = adjustForHeadroom(loadAdjustment, clusterModel, target);
+ if ( ! headroomAdjustedLoadAdjustment.equals(loadAdjustment)) {
+ loadAdjustment = headroomAdjustedLoadAdjustment;
+ target = allocationOptimizer.findBestAllocation(loadAdjustment, current, clusterModel, limits);
+ }
+
+ if (target.isEmpty())
return Autoscaling.dontScale(Status.insufficient, "No allocations are possible within configured limits", clusterModel);
- if (! worthRescaling(currentAllocation.realResources(), bestAllocation.get().realResources())) {
- if (bestAllocation.get().fulfilment() < 0.9999999)
+ if (! worthRescaling(current.realResources(), target.get().realResources())) {
+ if (target.get().fulfilment() < 0.9999999)
return Autoscaling.dontScale(Status.insufficient, "Configured limits prevent ideal scaling of this cluster", clusterModel);
else if ( ! clusterModel.safeToScaleDown() && clusterModel.idealLoad().any(v -> v < 1.0))
return Autoscaling.dontScale(Status.ideal, "Cooling off before considering to scale down", clusterModel);
else
- return Autoscaling.dontScale(Status.ideal, "Cluster is ideally scaled (within limits)", clusterModel);
+ return Autoscaling.dontScale(Status.ideal, "Cluster is ideally scaled (within configured limits)", clusterModel);
}
- return Autoscaling.scaleTo(bestAllocation.get().advertisedResources(), clusterModel);
+ return Autoscaling.scaleTo(target.get().advertisedResources(), clusterModel);
+ }
+
+ /**
+ * When scaling down we may end up with resources that are just barely below the new ideal with the new number
+ * of nodes, as fewer nodes lead to a lower ideal load (due to redundancy).
+ * If that headroom is too small, do not scale down, as doing so would likely lead to scaling back up again soon.
+ */
+ private Load adjustForHeadroom(Load loadAdjustment, ClusterModel clusterModel,
+ Optional<AllocatableClusterResources> target) {
+ if (target.isEmpty()) return loadAdjustment;
+
+ // If we change to this target, what would our current peak load be compared to the ideal?
+ var relativeLoadWithTarget =
+ loadAdjustment // redundancy aware target relative to current load
+ .multiply(clusterModel.loadWith(target.get().nodes(), target.get().groups())) // redundancy aware adjustment with target
+ .divide(clusterModel.redundancyAdjustment()); // correct for double redundancy adjustment
+ if (loadAdjustment.cpu() < 1 && (1.0 - relativeLoadWithTarget.cpu()) < headroomRequiredToScaleDown)
+ loadAdjustment = loadAdjustment.withCpu(1.0);
+ if (loadAdjustment.memory() < 1 && (1.0 - relativeLoadWithTarget.memory()) < headroomRequiredToScaleDown)
+ loadAdjustment = loadAdjustment.withMemory(1.0);
+ if (loadAdjustment.disk() < 1 && (1.0 - relativeLoadWithTarget.disk()) < headroomRequiredToScaleDown)
+ loadAdjustment = loadAdjustment.withDisk(1.0);
+ return loadAdjustment;
}
/** Returns true if it is worthwhile to make the given resource change, false if it is too insignificant */
@@ -95,12 +126,14 @@ public class Autoscaler {
if (meaningfulIncrease(from.totalResources().memoryGb(), to.totalResources().memoryGb())) return true;
if (meaningfulIncrease(from.totalResources().diskGb(), to.totalResources().diskGb())) return true;
- // Otherwise, only *decrease* if it reduces cost meaningfully
+ // Otherwise, only *decrease* if
+ // - cost is reduced meaningfully
+ // - the new resources won't be so much smaller that a small fluctuation in load will cause an increase
return ! similar(from.cost(), to.cost(), costDifferenceWorthReallocation);
}
public static boolean meaningfulIncrease(double from, double to) {
- return from < to && ! similar(from, to, resourceDifferenceWorthReallocation);
+ return from < to && ! similar(from, to, resourceIncreaseWorthReallocation);
}
private static boolean similar(double r1, double r2, double threshold) {
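
The adjustForHeadroom step added above reads as: estimate the peak-to-ideal load ratio the cluster would have after moving to the candidate target, and cancel the scale-down in any dimension where the remaining headroom is below headroomRequiredToScaleDown (10%). A minimal standalone sketch of that rule, with a simplified Load record standing in for the real class (illustrative only, not the commit's code):

    // Illustrative sketch; this Load record is a simplified stand-in for the real class.
    class HeadroomRule {
        record Load(double cpu, double memory, double disk) {
            Load withCpu(double v)    { return new Load(v, memory, disk); }
            Load withMemory(double v) { return new Load(cpu, v, disk); }
            Load withDisk(double v)   { return new Load(cpu, memory, v); }
        }

        static final double headroomRequiredToScaleDown = 0.1; // threshold from the commit

        // loadAdjustment < 1 in a dimension means "scale that dimension down";
        // relativeLoadWithTarget is the expected peak load relative to ideal on the target.
        static Load cancelTightScaleDowns(Load loadAdjustment, Load relativeLoadWithTarget) {
            if (loadAdjustment.cpu() < 1 && (1.0 - relativeLoadWithTarget.cpu()) < headroomRequiredToScaleDown)
                loadAdjustment = loadAdjustment.withCpu(1.0);    // too tight: keep current cpu
            if (loadAdjustment.memory() < 1 && (1.0 - relativeLoadWithTarget.memory()) < headroomRequiredToScaleDown)
                loadAdjustment = loadAdjustment.withMemory(1.0); // too tight: keep current memory
            if (loadAdjustment.disk() < 1 && (1.0 - relativeLoadWithTarget.disk()) < headroomRequiredToScaleDown)
                loadAdjustment = loadAdjustment.withDisk(1.0);   // too tight: keep current disk
            return loadAdjustment;
        }
    }

For example, a cpu scale-down that would land the target at 0.96 of its new ideal (4% headroom) is cancelled, while one landing at 0.85 (15% headroom) goes ahead.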
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
index 289025f9d21..a5490996a2c 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
@@ -175,9 +175,9 @@ public class ClusterModel {
* Returns the relative load adjustment accounting for redundancy given these nodes+groups
* relative to the nodes+groups in this.
*/
- public Load loadWith(int trueNodes, int trueGroups) {
- int nodes = nodesAdjustedForRedundancy(trueNodes, trueGroups);
- int groups = groupsAdjustedForRedundancy(trueNodes, trueGroups);
+ public Load loadWith(int givenNodes, int givenGroups) {
+ int nodes = nodesAdjustedForRedundancy(givenNodes, givenGroups);
+ int groups = groupsAdjustedForRedundancy(givenNodes, givenGroups);
if (clusterSpec().type() == ClusterSpec.Type.content) { // load scales with node share of content
int groupSize = nodes / groups;
@@ -272,7 +272,7 @@ public class ClusterModel {
/** The number of nodes this cluster has, or will have if not deployed yet. */
// TODO: Make this the deployed, not current count
- private int nodeCount() {
+ public int nodeCount() {
if ( ! nodes.isEmpty()) return (int)nodes.not().retired().stream().count();
return cluster.minResources().nodes();
}
@@ -289,12 +289,12 @@ public class ClusterModel {
return (int)Math.ceil((double)nodeCount() / groupCount());
}
- private int nodesAdjustedForRedundancy(int nodes, int groups) {
+ private static int nodesAdjustedForRedundancy(int nodes, int groups) {
int groupSize = (int)Math.ceil((double)nodes / groups);
return nodes > 1 ? (groups == 1 ? nodes - 1 : nodes - groupSize) : nodes;
}
- private int groupsAdjustedForRedundancy(int nodes, int groups) {
+ private static int groupsAdjustedForRedundancy(int nodes, int groups) {
return nodes > 1 ? (groups == 1 ? 1 : groups - 1) : groups;
}
@@ -340,8 +340,7 @@ public class ClusterModel {
/** Ideal cpu load must take the application traffic fraction into account. */
double idealCpuLoad() {
double queryCpuFraction = queryFraction();
-
- // Assumptions: 1) Write load is not organic so we should not grow to handle more.
+ // Assumptions: 1) Write load is not organic so we should not increase to handle potential future growth.
// (TODO: But allow applications to set their target write rate and size for that)
// 2) Write load does not change in BCP scenarios.
return queryCpuFraction * 1/growthRateHeadroom() * 1/trafficShiftHeadroom() * idealQueryCpuLoad +
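
The two redundancy helpers made static above compute the capacity that remains after losing one group (or one node, in single-group clusters), which is why the ideal load depends on the node count. They are self-contained, so a worked example is easy (the example values are illustrative):

    class RedundancyMath {
        // Copied from the diff above.
        static int nodesAdjustedForRedundancy(int nodes, int groups) {
            int groupSize = (int) Math.ceil((double) nodes / groups);
            return nodes > 1 ? (groups == 1 ? nodes - 1 : nodes - groupSize) : nodes;
        }

        static int groupsAdjustedForRedundancy(int nodes, int groups) {
            return nodes > 1 ? (groups == 1 ? 1 : groups - 1) : groups;
        }

        public static void main(String[] args) {
            // 6 nodes in 2 groups: losing a whole group of 3 leaves 3 nodes in 1 group
            System.out.println(nodesAdjustedForRedundancy(6, 2));  // 3
            System.out.println(groupsAdjustedForRedundancy(6, 2)); // 1
            // 4 nodes in 4 one-node groups (the 'now' of the new test): one group lost leaves 3/3
            System.out.println(nodesAdjustedForRedundancy(4, 4));  // 3
            System.out.println(groupsAdjustedForRedundancy(4, 4)); // 3
        }
    }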
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
index 47206265c68..54178865693 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
@@ -12,6 +12,7 @@ import com.yahoo.config.provision.NodeResources.DiskSpeed;
import com.yahoo.config.provision.NodeResources.StorageType;
import com.yahoo.config.provision.RegionName;
import com.yahoo.config.provision.Zone;
+import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.provisioning.CapacityPolicies;
import com.yahoo.vespa.hosted.provision.provisioning.DynamicProvisioningTester;
import org.junit.Test;
@@ -87,7 +88,7 @@ public class AutoscalingTest {
fixture.tester().clock().advance(Duration.ofDays(7));
fixture.loader().applyCpuLoad(0.1f, 10);
fixture.tester().assertResources("Scaling cpu down since usage has gone down significantly",
- 6, 1, 1.1, 8.8, 346.8,
+ 6, 1, 1.1, 9.8, 390.2,
fixture.autoscale());
}
@@ -585,7 +586,7 @@ public class AutoscalingTest {
@Test
public void test_autoscaling_groupsize_by_cpu_read_dominated() {
var min = new ClusterResources( 3, 1, new NodeResources(1, 1, 1, 1));
- var now = new ClusterResources(6, 2, new NodeResources(3, 100, 100, 1));
+ var now = new ClusterResources( 6, 2, new NodeResources(3, 100, 100, 1));
var max = new ClusterResources(21, 7, new NodeResources(100, 1000, 1000, 1));
var fixture = DynamicProvisioningTester.fixture()
.awsProdSetup(true)
@@ -665,7 +666,7 @@ public class AutoscalingTest {
fixture.tester().clock().advance(Duration.ofHours(12 * 3 + 1));
fixture.loader().applyCpuLoad(0.02, 5);
fixture.tester().assertResources("Scaling down since enough time has passed",
- 3, 1, 1.0, 23.6, 101.4,
+ 3, 1, 1.0, 29.5, 126.7,
fixture.autoscale());
}
@@ -798,7 +799,7 @@ public class AutoscalingTest {
fixture.tester.clock().advance(timeAdded.negated());
fixture.loader().addCpuMeasurements(0.4, 200);
fixture.tester().assertResources("Write only -> smallest possible",
- 4, 1, 1.1, 16.1, 67.6,
+ 4, 1, 1.1, 20.1, 84.5,
fixture.autoscale());
}
@@ -881,6 +882,23 @@ public class AutoscalingTest {
}
@Test
+ public void test_scaling_down_leaves_too_little_headroom() {
+ var r = new NodeResources(16, 32, 100, 1, NodeResources.DiskSpeed.any);
+ var min = new ClusterResources( 3, 3, r);
+ var now = new ClusterResources( 4, 4, r);
+ var max = new ClusterResources( 5, 5, r);
+ var fixture = DynamicProvisioningTester.fixture()
+ .awsProdSetup(false)
+ .capacity(Capacity.from(min, max, IntRange.from(1)))
+ .clusterType(ClusterSpec.Type.content)
+ .initialResources(Optional.of(now))
+ .build();
+ fixture.loader().applyCpuLoad(0.17, 10);
+ assertTrue("Not scaling down as that would leave just 4.5% headroom before needing to scale up again",
+ fixture.autoscale().resources().isEmpty());
+ }
+
+ @Test
public void test_changing_exclusivity() {
var min = new ClusterResources( 2, 1, new NodeResources( 3, 4, 100, 1));
var max = new ClusterResources(20, 1, new NodeResources(100, 1000, 1000, 1));
@@ -923,7 +941,7 @@ public class AutoscalingTest {
fixture.loader().applyLoad(new Load(0.06, 0.52, 0.27), 100);
var autoscaling = fixture.autoscale();
fixture.tester().assertResources("Scaling down",
- 7, 1, 2, 14.5, 384.0,
+ 7, 1, 2, 15.8, 384.0,
autoscaling);
fixture.deploy(Capacity.from(autoscaling.resources().get()));
assertEquals("Initial nodes are kept", initialNodes, fixture.nodes().asList());
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingUsingBcpGroupInfoTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingUsingBcpGroupInfoTest.java
index 637932681ee..379dbb27d87 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingUsingBcpGroupInfoTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingUsingBcpGroupInfoTest.java
@@ -85,7 +85,7 @@ public class AutoscalingUsingBcpGroupInfoTest {
fixture.store(new BcpGroupInfo(100, 1.1, 0.3));
fixture.loader().addCpuMeasurements(0.7f, 10);
fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
- 3, 3, 10.5, 38.4, 168.9,
+ 3, 3, 10.5, 43.2, 190.0,
fixture.autoscale());
// Higher query rate
@@ -93,7 +93,7 @@ public class AutoscalingUsingBcpGroupInfoTest {
fixture.store(new BcpGroupInfo(200, 1.1, 0.3));
fixture.loader().addCpuMeasurements(0.7f, 10);
fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
- 3, 3, 20.9, 38.4, 168.9,
+ 3, 3, 20.9, 43.2, 190.0,
fixture.autoscale());
// Higher headroom
@@ -101,7 +101,7 @@ public class AutoscalingUsingBcpGroupInfoTest {
fixture.store(new BcpGroupInfo(100, 1.3, 0.3));
fixture.loader().addCpuMeasurements(0.7f, 10);
fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
- 3, 3, 12.4, 38.4, 168.9,
+ 3, 3, 12.4, 43.2, 190.0,
fixture.autoscale());
// Higher per query cost
@@ -109,7 +109,7 @@ public class AutoscalingUsingBcpGroupInfoTest {
fixture.store(new BcpGroupInfo(100, 1.1, 0.45));
fixture.loader().addCpuMeasurements(0.7f, 10);
fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
- 3, 3, 15.7, 38.4, 168.9,
+ 3, 3, 15.7, 43.2, 190.0,
fixture.autoscale());
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java
index b150b372fe8..33d3d3d50dc 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java
@@ -49,6 +49,8 @@ public class Fixture {
final Capacity capacity;
final Loader loader;
+ Autoscaling lastAutoscaling = Autoscaling.empty();
+
public Fixture(Fixture.Builder builder, Optional<ClusterResources> initialResources, int hostCount) {
applicationId = builder.application;
clusterSpec = builder.cluster;
@@ -105,7 +107,7 @@ public class Fixture {
/** Autoscale within the given capacity. */
public Autoscaling autoscale(Capacity capacity) {
- return tester().autoscale(applicationId, clusterSpec, capacity);
+ return lastAutoscaling = tester().autoscale(applicationId, clusterSpec, capacity);
}
/** Compute an autoscaling suggestion for this. */
@@ -123,6 +125,17 @@ public class Fixture {
tester().deploy(applicationId, clusterSpec, capacity);
}
+ public void deployTarget() {
+ if (lastAutoscaling.isEmpty()) throw new IllegalStateException("Autoscaling is empty");
+ if (lastAutoscaling.resources().isEmpty()) throw new IllegalStateException("Autoscaling target is empty: " + lastAutoscaling);
+ try (var lock = tester().nodeRepository().applications().lock(applicationId)) {
+ var updated = tester().nodeRepository().applications().require(applicationId).with(cluster().withTarget(lastAutoscaling));
+ tester().nodeRepository().applications().put(updated, lock);
+ }
+ deploy(capacity);
+ deactivateRetired(capacity);
+ }
+
public void deactivateRetired(Capacity capacity) {
tester().deactivateRetired(applicationId, clusterSpec, capacity);
}
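
The new deployTarget helper lets a test commit the most recent autoscaling decision back into the node repository and redeploy, so follow-up autoscaling runs see the new allocation. A hypothetical usage sketch (fixture setup elided):

    fixture.loader().applyCpuLoad(0.7, 100);  // drive cpu load up
    var autoscaling = fixture.autoscale();    // also recorded as lastAutoscaling
    if (autoscaling.resources().isPresent())
        fixture.deployTarget();               // store the target, deploy, deactivate retired nodes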
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicProvisioningTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicProvisioningTester.java
index 4799d3b5577..c982b195787 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicProvisioningTester.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicProvisioningTester.java
@@ -186,6 +186,17 @@ public class DynamicProvisioningTester {
resources);
}
+ public void assertResources(String message,
+ int nodeCount, int groupCount,
+ NodeResources expectedResources,
+ Autoscaling autoscaling) {
+ assertTrue("Resources are present: " + message + " (" + autoscaling + ": " + autoscaling.status() + ")",
+ autoscaling.resources().isPresent());
+ assertResources(message, nodeCount, groupCount,
+ expectedResources.vcpu(), expectedResources.memoryGb(), expectedResources.diskGb(),
+ autoscaling.resources().get());
+ }
+
public ClusterResources assertResources(String message,
int nodeCount, int groupCount,
double approxCpu, double approxMemory, double approxDisk,
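
The new overload accepts a whole NodeResources instead of three separate doubles. A hypothetical call site (the bandwidth value 0.1 is illustrative; the other numbers match the updated AutoscalingTest expectation above):

    fixture.tester().assertResources("Scaling cpu down since usage has gone down significantly",
                                     6, 1,
                                     new NodeResources(1.1, 9.8, 390.2, 0.1),
                                     fixture.autoscale());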