diff options
author | Jon Bratseth <bratseth@vespa.ai> | 2023-08-16 11:30:17 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@vespa.ai> | 2023-08-16 11:30:17 +0200 |
commit | f5d7c5b0662efd35113cd0ec76531bb4a46e505b (patch) | |
tree | 118a40cf8c565bf78e6ca464b0d6a50a7e683c4d | |
parent | 59eaaa1824ec2f686e9763ed3ab420944112b5c2 (diff) |
More accurate accounting of redistribution cost
4 files changed, 24 insertions, 35 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java index 7c2f3a563fb..40b0bd8d88b 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java @@ -143,9 +143,13 @@ public class ClusterModel { */ public Duration allocationDuration() { return allocationDuration; } + public boolean isContent() { + return clusterSpec.type().isContent(); + } + /** Returns the predicted duration of data redistribution in this cluster. */ public Duration redistributionDuration() { - if (! clusterSpec.type().isContent()) return Duration.ofMinutes(0); + if (! isContent()) return Duration.ofMinutes(0); return scalingDuration(); // TODO: Estimate separately } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceChange.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceChange.java index c0cbb40d992..7a26a217e61 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceChange.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceChange.java @@ -24,9 +24,14 @@ public class ResourceChange { /** Returns the estimated total cost of this resource change (coming in addition to the "to" resource cost). */ public double cost() { - if (requiresRedistribution()) return toHours(model.redistributionDuration()) * from.cost(); - if (requiresNodeReplacement()) return toHours(model.nodeReplacementDuration()) * from.cost(); - return 0; + if (model.isContent()) { + if (requiresNodeReplacement()) return toHours(model.redistributionDuration()) * from.cost(); + return toHours(model.redistributionDuration()) * from.advertisedResources().cost() * nodesToRetire(); + } + else { + if (requiresNodeReplacement()) return toHours(model.nodeReplacementDuration()) * from.cost(); + return 0; + } } private boolean requiresRedistribution() { @@ -37,6 +42,15 @@ public class ResourceChange { return false; } + /** + * Returns the estimated number of nodes that will be retired by this change, + * given that it is a content cluster and no node replacement is necessary. + * This is not necessarily always perfectly correct if this changes group layout. + */ + private int nodesToRetire() { + return Math.max(0, from.nodes() - to.nodes()); + } + /** Returns true if the *existing* nodes of this needs to be replaced in this change. */ private boolean requiresNodeReplacement() { var fromNodes = from.advertisedResources().nodeResources(); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java index c7171b6b478..4e19d04ffac 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java @@ -521,7 +521,7 @@ public class AutoscalingTest { fixture.tester().clock().advance(Duration.ofDays(2)); fixture.loader().applyLoad(new Load(0.5, 0.8, 0.1), 120); fixture.tester().assertResources("Suggesting resources where disk is 3x memory (this is a content cluster)", - 10, 1, 14.3, 66.2, 198.6, + 11, 1, 13.0, 60.0, 179.9, fixture.tester().suggest(fixture.applicationId, fixture.clusterSpec.id(), min, min)); fixture.tester().assertResources("Autoscaling to resources where disk is 3x memory (this is a content cluster)", 10, 1, 10.0, 66.2, 198.6, @@ -981,33 +981,4 @@ public class AutoscalingTest { NodeResources.Architecture.x86_64, nodes.get(0).resources().architecture()); } - // Verify that we choose not to increase to 3 nodes even though that is cheaper (dure to redundancy), - // due to considering the cost of redistribution. This depends quite finely on the parameters, - // and the easiest way to move it back if there is a change is to increase the scaling duration, - // as that is a redistribution cost multiplier (until redistribution is measured separately). - @Test - public void change_not_causing_redistribution_is_preferred() { - var min = new ClusterResources(2, 1, new NodeResources( 16, 64, 200, 1, DiskSpeed.fast, StorageType.remote)); - var max = new ClusterResources(4, 1, new NodeResources( 32, 64, 200, 1, DiskSpeed.fast, StorageType.remote)); - - var fixture = DynamicProvisioningTester.fixture() - .clusterType(ClusterSpec.Type.content) - .awsSetup(true, Environment.prod) - .capacity(Capacity.from(min, max)) - .initialResources(Optional.of(min)) - .build(); - fixture.setScalingDuration(Duration.ofMinutes(35)); - var nodes = fixture.nodes().not().retired().asList(); - assertEquals(2, nodes.size()); - assertEquals(16.0, nodes.get(0).resources().vcpu(), 0.000001); - - fixture.tester().clock().advance(Duration.ofHours(5)); - fixture.loader().applyCpuLoad(0.75, 700); // trigger rescaling, but don't cause fulfilment < 1 - var autoscaling = fixture.autoscale(); - fixture.deploy(Capacity.from(autoscaling.resources().get())); - nodes = fixture.nodes().not().retired().asList(); - assertEquals("Increasing cpu is preferred to adding nodes to avoid redistribution", 2, nodes.size()); - assertEquals(28.5, nodes.get(0).resources().vcpu(), 0.000001); - } - } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java index 8aaf0eb20e7..3145675325b 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java @@ -75,7 +75,7 @@ public class ScalingSuggestionsMaintainerTest { assertEquals("8 nodes with [vcpu: 3.2, memory: 4.5 Gb, disk: 10.0 Gb, bandwidth: 0.1 Gbps, architecture: any]", suggestionOf(app1, cluster1, tester).resources().get().toString()); - assertEquals("8 nodes with [vcpu: 3.6, memory: 4.7 Gb, disk: 14.2 Gb, bandwidth: 0.1 Gbps, architecture: any]", + assertEquals("7 nodes with [vcpu: 4.1, memory: 5.3 Gb, disk: 16.5 Gb, bandwidth: 0.1 Gbps, architecture: any]", suggestionOf(app2, cluster2, tester).resources().get().toString()); // Utilization goes way down |