More accurate accounting of redistribution cost

author: Jon Bratseth <bratseth@vespa.ai> 2023-08-16 11:30:17 +0200
committer: Jon Bratseth <bratseth@vespa.ai> 2023-08-16 11:30:17 +0200
commit: f5d7c5b0662efd35113cd0ec76531bb4a46e505b (patch)
tree: 118a40cf8c565bf78e6ca464b0d6a50a7e683c4d /node-repository
parent: 59eaaa1824ec2f686e9763ed3ab420944112b5c2 (diff)
4 files changed, 24 insertions, 35 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
index 7c2f3a563fb..40b0bd8d88b 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
@@ -143,9 +143,13 @@ public class ClusterModel {
      */
     public Duration allocationDuration() { return allocationDuration; }
 
+    public boolean isContent() {
+        return clusterSpec.type().isContent();
+    }
+
     /** Returns the predicted duration of data redistribution in this cluster. */
     public Duration redistributionDuration() {
-        if (! clusterSpec.type().isContent()) return Duration.ofMinutes(0);
+        if (! isContent()) return Duration.ofMinutes(0);
         return scalingDuration(); // TODO: Estimate separately
     }
 
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceChange.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceChange.java
index c0cbb40d992..7a26a217e61 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceChange.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceChange.java
@@ -24,9 +24,14 @@ public class ResourceChange {
 
     /** Returns the estimated total cost of this resource change (coming in addition to the "to" resource cost). */
     public double cost() {
-        if (requiresRedistribution()) return toHours(model.redistributionDuration()) * from.cost();
-        if (requiresNodeReplacement()) return toHours(model.nodeReplacementDuration()) * from.cost();
-        return 0;
+        if (model.isContent()) {
+            if (requiresNodeReplacement()) return toHours(model.redistributionDuration()) * from.cost();
+            return toHours(model.redistributionDuration()) * from.advertisedResources().cost() * nodesToRetire();
+        }
+        else {
+            if (requiresNodeReplacement()) return toHours(model.nodeReplacementDuration()) * from.cost();
+            return 0;
+        }
     }
 
     private boolean requiresRedistribution() {
@@ -37,6 +42,15 @@ public class ResourceChange {
         return false;
     }
 
+    /**
+     * Returns the estimated number of nodes that will be retired by this change,
+     * given that it is a content cluster and no node replacement is necessary.
+     * This estimate is not always exact if the change also alters the group layout.
+     */
+    private int nodesToRetire() {
+        return Math.max(0, from.nodes() - to.nodes());
+    }
+
     /** Returns true if the *existing* nodes of this needs to be replaced in this change. */
     private boolean requiresNodeReplacement() {
         var fromNodes = from.advertisedResources().nodeResources();
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
index c7171b6b478..4e19d04ffac 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
@@ -521,7 +521,7 @@ public class AutoscalingTest {
         fixture.tester().clock().advance(Duration.ofDays(2));
         fixture.loader().applyLoad(new Load(0.5, 0.8, 0.1), 120);
         fixture.tester().assertResources("Suggesting resources where disk is 3x memory (this is a content cluster)",
-                                         10, 1, 14.3,  66.2,  198.6,
+                                         11, 1, 13.0,  60.0,  179.9,
                                          fixture.tester().suggest(fixture.applicationId, fixture.clusterSpec.id(), min, min));
         fixture.tester().assertResources("Autoscaling to resources where disk is 3x memory (this is a content cluster)",
                                          10, 1, 10.0,  66.2,  198.6,
@@ -981,33 +981,4 @@ public class AutoscalingTest {
                      NodeResources.Architecture.x86_64, nodes.get(0).resources().architecture());
     }
 
-    // Verify that we choose not to increase to 3 nodes even though that is cheaper (dure to redundancy),
-    // due to considering the cost of redistribution. This depends quite finely on the parameters,
-    // and the easiest way to move it back if there is a change is to increase the scaling duration,
-    // as that is a redistribution cost multiplier (until redistribution is measured separately).
-    @Test
-    public void change_not_causing_redistribution_is_preferred() {
-        var min = new ClusterResources(2, 1, new NodeResources(  16, 64, 200, 1, DiskSpeed.fast, StorageType.remote));
-        var max = new ClusterResources(4, 1, new NodeResources(  32, 64, 200, 1, DiskSpeed.fast, StorageType.remote));
-
-        var fixture = DynamicProvisioningTester.fixture()
-                                               .clusterType(ClusterSpec.Type.content)
-                                               .awsSetup(true, Environment.prod)
-                                               .capacity(Capacity.from(min, max))
-                                               .initialResources(Optional.of(min))
-                                               .build();
-        fixture.setScalingDuration(Duration.ofMinutes(35));
-        var nodes = fixture.nodes().not().retired().asList();
-        assertEquals(2, nodes.size());
-        assertEquals(16.0, nodes.get(0).resources().vcpu(), 0.000001);
-
-        fixture.tester().clock().advance(Duration.ofHours(5));
-        fixture.loader().applyCpuLoad(0.75, 700); // trigger rescaling, but don't cause fulfilment < 1
-        var autoscaling = fixture.autoscale();
-        fixture.deploy(Capacity.from(autoscaling.resources().get()));
-        nodes = fixture.nodes().not().retired().asList();
-        assertEquals("Increasing cpu is preferred to adding nodes to avoid redistribution", 2, nodes.size());
-        assertEquals(28.5, nodes.get(0).resources().vcpu(), 0.000001);
-    }
-
 }
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java
index 8aaf0eb20e7..3145675325b 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java
@@ -75,7 +75,7 @@ public class ScalingSuggestionsMaintainerTest {
 
         assertEquals("8 nodes with [vcpu: 3.2, memory: 4.5 Gb, disk: 10.0 Gb, bandwidth: 0.1 Gbps, architecture: any]",
                      suggestionOf(app1, cluster1, tester).resources().get().toString());
-        assertEquals("8 nodes with [vcpu: 3.6, memory: 4.7 Gb, disk: 14.2 Gb, bandwidth: 0.1 Gbps, architecture: any]",
+        assertEquals("7 nodes with [vcpu: 4.1, memory: 5.3 Gb, disk: 16.5 Gb, bandwidth: 0.1 Gbps, architecture: any]",
                      suggestionOf(app2, cluster2, tester).resources().get().toString());
 
         // Utilization goes way down
author	Jon Bratseth <bratseth@vespa.ai>	2023-08-16 11:30:17 +0200
committer	Jon Bratseth <bratseth@vespa.ai>	2023-08-16 11:30:17 +0200
commit	f5d7c5b0662efd35113cd0ec76531bb4a46e505b (patch)
tree	118a40cf8c565bf78e6ca464b0d6a50a7e683c4d /node-repository
parent	59eaaa1824ec2f686e9763ed3ab420944112b5c2 (diff)