summary | refs | log | tree | commit | diff | stats
path: root/node-repository
diff options
context:
space:
mode:
author: Jon Bratseth <bratseth@vespa.ai> 2023-08-16 11:30:17 +0200
committer: Jon Bratseth <bratseth@vespa.ai> 2023-08-16 11:30:17 +0200
commit: f5d7c5b0662efd35113cd0ec76531bb4a46e505b (patch)
tree: 118a40cf8c565bf78e6ca464b0d6a50a7e683c4d /node-repository
parent: 59eaaa1824ec2f686e9763ed3ab420944112b5c2 (diff)
More accurate accounting of redistribution cost
Diffstat (limited to 'node-repository')
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java | 6
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceChange.java | 20
-rw-r--r-- node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java | 31
-rw-r--r-- node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java | 2
4 files changed, 24 insertions, 35 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
index 7c2f3a563fb..40b0bd8d88b 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
@@ -143,9 +143,13 @@ public class ClusterModel {
*/
public Duration allocationDuration() { return allocationDuration; }
+ public boolean isContent() {
+ return clusterSpec.type().isContent();
+ }
+
/** Returns the predicted duration of data redistribution in this cluster. */
public Duration redistributionDuration() {
- if (! clusterSpec.type().isContent()) return Duration.ofMinutes(0);
+ if (! isContent()) return Duration.ofMinutes(0);
return scalingDuration(); // TODO: Estimate separately
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceChange.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceChange.java
index c0cbb40d992..7a26a217e61 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceChange.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceChange.java
@@ -24,9 +24,14 @@ public class ResourceChange {
/** Returns the estimated total cost of this resource change (coming in addition to the "to" resource cost). */
public double cost() {
- if (requiresRedistribution()) return toHours(model.redistributionDuration()) * from.cost();
- if (requiresNodeReplacement()) return toHours(model.nodeReplacementDuration()) * from.cost();
- return 0;
+ if (model.isContent()) {
+ if (requiresNodeReplacement()) return toHours(model.redistributionDuration()) * from.cost();
+ return toHours(model.redistributionDuration()) * from.advertisedResources().cost() * nodesToRetire();
+ }
+ else {
+ if (requiresNodeReplacement()) return toHours(model.nodeReplacementDuration()) * from.cost();
+ return 0;
+ }
}
private boolean requiresRedistribution() {
@@ -37,6 +42,15 @@ public class ResourceChange {
return false;
}
+ /**
+ * Returns the estimated number of nodes that will be retired by this change,
+ * given that it is a content cluster and no node replacement is necessary.
+ * This is not necessarily always perfectly correct if this changes group layout.
+ */
+ private int nodesToRetire() {
+ return Math.max(0, from.nodes() - to.nodes());
+ }
+
/** Returns true if the *existing* nodes of this needs to be replaced in this change. */
private boolean requiresNodeReplacement() {
var fromNodes = from.advertisedResources().nodeResources();
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
index c7171b6b478..4e19d04ffac 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
@@ -521,7 +521,7 @@ public class AutoscalingTest {
fixture.tester().clock().advance(Duration.ofDays(2));
fixture.loader().applyLoad(new Load(0.5, 0.8, 0.1), 120);
fixture.tester().assertResources("Suggesting resources where disk is 3x memory (this is a content cluster)",
- 10, 1, 14.3, 66.2, 198.6,
+ 11, 1, 13.0, 60.0, 179.9,
fixture.tester().suggest(fixture.applicationId, fixture.clusterSpec.id(), min, min));
fixture.tester().assertResources("Autoscaling to resources where disk is 3x memory (this is a content cluster)",
10, 1, 10.0, 66.2, 198.6,
@@ -981,33 +981,4 @@ public class AutoscalingTest {
NodeResources.Architecture.x86_64, nodes.get(0).resources().architecture());
}
- // Verify that we choose not to increase to 3 nodes even though that is cheaper (due to redundancy),
- // due to considering the cost of redistribution. This depends quite finely on the parameters,
- // and the easiest way to move it back if there is a change is to increase the scaling duration,
- // as that is a redistribution cost multiplier (until redistribution is measured separately).
- @Test
- public void change_not_causing_redistribution_is_preferred() {
- var min = new ClusterResources(2, 1, new NodeResources( 16, 64, 200, 1, DiskSpeed.fast, StorageType.remote));
- var max = new ClusterResources(4, 1, new NodeResources( 32, 64, 200, 1, DiskSpeed.fast, StorageType.remote));
-
- var fixture = DynamicProvisioningTester.fixture()
- .clusterType(ClusterSpec.Type.content)
- .awsSetup(true, Environment.prod)
- .capacity(Capacity.from(min, max))
- .initialResources(Optional.of(min))
- .build();
- fixture.setScalingDuration(Duration.ofMinutes(35));
- var nodes = fixture.nodes().not().retired().asList();
- assertEquals(2, nodes.size());
- assertEquals(16.0, nodes.get(0).resources().vcpu(), 0.000001);
-
- fixture.tester().clock().advance(Duration.ofHours(5));
- fixture.loader().applyCpuLoad(0.75, 700); // trigger rescaling, but don't cause fulfilment < 1
- var autoscaling = fixture.autoscale();
- fixture.deploy(Capacity.from(autoscaling.resources().get()));
- nodes = fixture.nodes().not().retired().asList();
- assertEquals("Increasing cpu is preferred to adding nodes to avoid redistribution", 2, nodes.size());
- assertEquals(28.5, nodes.get(0).resources().vcpu(), 0.000001);
- }
-
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java
index 8aaf0eb20e7..3145675325b 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java
@@ -75,7 +75,7 @@ public class ScalingSuggestionsMaintainerTest {
assertEquals("8 nodes with [vcpu: 3.2, memory: 4.5 Gb, disk: 10.0 Gb, bandwidth: 0.1 Gbps, architecture: any]",
suggestionOf(app1, cluster1, tester).resources().get().toString());
- assertEquals("8 nodes with [vcpu: 3.6, memory: 4.7 Gb, disk: 14.2 Gb, bandwidth: 0.1 Gbps, architecture: any]",
+ assertEquals("7 nodes with [vcpu: 4.1, memory: 5.3 Gb, disk: 16.5 Gb, bandwidth: 0.1 Gbps, architecture: any]",
suggestionOf(app2, cluster2, tester).resources().get().toString());
// Utilization goes way down