aboutsummaryrefslogtreecommitdiffstats
path: root/node-repository
diff options
context:
space:
mode:
author: Jon Bratseth <bratseth@gmail.com> 2021-12-09 07:30:00 +0100
committer: Jon Bratseth <bratseth@gmail.com> 2021-12-09 07:30:00 +0100
commit: ccc1235730c483c7367f3ee7a02aa2a24d0804b5 (patch)
tree: c73ab3707e69010bf9dadf015d9a066e501fd32b /node-repository
parent: 2853733d3227818dd83e006e63a3c915679a7b53 (diff)
Increase container ideal disk utilization
Diffstat (limited to 'node-repository')
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java 17
-rw-r--r-- node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java 14
-rw-r--r-- node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java 8
-rw-r--r-- node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java 17
-rw-r--r-- node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java 4
-rw-r--r-- node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application1.json 2
6 files changed, 42 insertions, 20 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
index 1001ab83cc0..bf8a3374484 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
@@ -29,9 +29,11 @@ public class ClusterModel {
static final double idealQueryCpuLoad = 0.8;
static final double idealWriteCpuLoad = 0.95;
static final double idealMemoryLoad = 0.65;
- static final double idealDiskLoad = 0.6;
+ static final double idealContainerDiskLoad = 0.95;
+ static final double idealContentDiskLoad = 0.6;
private final Application application;
+ private final ClusterSpec clusterSpec;
private final Cluster cluster;
/** The current nodes of this cluster, or empty if this models a new cluster not yet deployed */
private final NodeList nodes;
@@ -51,6 +53,7 @@ public class ClusterModel {
MetricsDb metricsDb,
Clock clock) {
this.application = application;
+ this.clusterSpec = clusterSpec;
this.cluster = cluster;
this.nodes = clusterNodes;
this.clock = clock;
@@ -61,12 +64,14 @@ public class ClusterModel {
/** For testing */
ClusterModel(Application application,
+ ClusterSpec clusterSpec,
Cluster cluster,
Clock clock,
Duration scalingDuration,
ClusterTimeseries clusterTimeseries,
ClusterNodesTimeseries nodeTimeseries) {
this.application = application;
+ this.clusterSpec = clusterSpec;
this.cluster = cluster;
this.nodes = null;
this.clock = clock;
@@ -76,6 +81,8 @@ public class ClusterModel {
this.nodeTimeseries = nodeTimeseries;
}
+ public Application application() { return application; }
+ public ClusterSpec clusterSpec() { return clusterSpec; }
public Cluster cluster() { return cluster; }
/** Returns the predicted duration of a rescaling of this cluster */
@@ -107,7 +114,7 @@ public class ClusterModel {
public Load averageLoad() { return nodeTimeseries().averageLoad(clock.instant().minus(scalingDuration())); }
public Load idealLoad() {
- return new Load(idealCpuLoad(), idealMemoryLoad, idealDiskLoad);
+ return new Load(idealCpuLoad(), idealMemoryLoad, idealDiskLoad());
}
/** Ideal cpu load must take the application traffic fraction into account */
@@ -190,6 +197,12 @@ public class ClusterModel {
return duration;
}
+ private double idealDiskLoad() {
+ // Stateless clusters are not expected to consume more disk over time -
+ // if they do it is due to logs which will be rotated away right before the disk is full
+ return clusterSpec.isStateful() ? idealContentDiskLoad : idealContainerDiskLoad;
+ }
+
/**
* Creates a cluster model if possible; otherwise logs a warning and returns empty.
* This is useful in cases where it's possible to continue without the cluster model,
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
index 90ab5cba772..466db0a7e78 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
@@ -109,7 +109,7 @@ public class AutoscalingTest {
tester.clock().advance(Duration.ofMinutes(-10 * 5));
tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only
ClusterResources scaledResources = tester.assertResources("Scaling up since cpu usage is too high",
- 7, 1, 2.5, 80.0, 80.0,
+ 7, 1, 2.5, 80.0, 50.5,
tester.autoscale(application1, cluster1.id(), capacity).target());
tester.deploy(application1, cluster1, scaledResources);
@@ -119,7 +119,7 @@ public class AutoscalingTest {
tester.clock().advance(Duration.ofMinutes(-10 * 5));
tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only
tester.assertResources("Scaling down since cpu usage has gone down",
- 4, 1, 2.5, 68.6, 68.6,
+ 4, 1, 2.5, 68.6, 27.4,
tester.autoscale(application1, cluster1.id(), capacity).target());
}
@@ -204,7 +204,7 @@ public class AutoscalingTest {
tester.clock().advance(Duration.ofMinutes(-10 * 5));
tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only
tester.assertResources("Scaling up to limit since resource usage is too high",
- 6, 1, 2.4, 78.0, 79.0,
+ 6, 1, 2.4, 78.0, 70.0,
tester.autoscale(application1, cluster1.id(), capacity).target());
}
@@ -341,7 +341,7 @@ public class AutoscalingTest {
tester.clock().advance(Duration.ofMinutes(-10 * 5));
tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only
tester.assertResources("Scaling up since resource usage is too high",
- 7, 1, 2.5, 80.0, 80.0,
+ 7, 1, 2.5, 80.0, 50.5,
tester.suggest(application1, cluster1.id(), min, max).target());
}
@@ -398,7 +398,7 @@ public class AutoscalingTest {
tester.clock().advance(Duration.ofMinutes(-10 * 5));
tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only
tester.assertResources("Scaling up since resource usage is too high",
- 7, 7, 2.5, 80.0, 80.0,
+ 7, 7, 2.5, 80.0, 50.5,
tester.autoscale(application1, cluster1.id(), capacity).target());
}
@@ -421,7 +421,7 @@ public class AutoscalingTest {
t -> t == 0 ? 20.0 : 10.0,
t -> 1.0);
tester.assertResources("Scaling up since resource usage is too high, changing to 1 group is cheaper",
- 8, 1, 2.6, 83.3, 83.3,
+ 8, 1, 2.6, 83.3, 52.6,
tester.autoscale(application1, cluster1.id(), capacity).target());
}
@@ -445,7 +445,7 @@ public class AutoscalingTest {
t -> t == 0 ? 20.0 : 10.0,
t -> 100.0);
tester.assertResources("Scaling down since resource usage is too high, changing to 1 group is cheaper",
- 4, 1, 2.1, 83.3, 83.3,
+ 4, 1, 2.1, 83.3, 52.6,
tester.autoscale(application1, cluster1.id(), capacity).target());
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java
index 8d59181a027..95bf0f6b7ae 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java
@@ -142,7 +142,7 @@ class AutoscalingTester {
for (Node node : nodes) {
Load load = new Load(value,
ClusterModel.idealMemoryLoad * otherResourcesLoad,
- ClusterModel.idealDiskLoad * otherResourcesLoad).multiply(oneExtraNodeFactor);
+ ClusterModel.idealContentDiskLoad * otherResourcesLoad).multiply(oneExtraNodeFactor);
nodeMetricsDb().addNodeMetrics(List.of(new Pair<>(node.hostname(),
new NodeMetricSnapshot(clock().instant(),
load,
@@ -172,7 +172,7 @@ class AutoscalingTester {
clock().advance(Duration.ofMinutes(5));
for (Node node : nodes) {
Load load = new Load(ClusterModel.idealQueryCpuLoad * otherResourcesLoad,
- ClusterModel.idealDiskLoad * otherResourcesLoad,
+ ClusterModel.idealContentDiskLoad * otherResourcesLoad,
value).multiply(oneExtraNodeFactor);
nodeMetricsDb().addNodeMetrics(List.of(new Pair<>(node.hostname(),
new NodeMetricSnapshot(clock().instant(),
@@ -204,10 +204,10 @@ class AutoscalingTester {
for (Node node : nodes) {
float cpu = (float) 0.2 * otherResourcesLoad * oneExtraNodeFactor;
float memory = value * oneExtraNodeFactor;
- float disk = (float) ClusterModel.idealDiskLoad * otherResourcesLoad * oneExtraNodeFactor;
+ float disk = (float) ClusterModel.idealContentDiskLoad * otherResourcesLoad * oneExtraNodeFactor;
Load load = new Load(0.2 * otherResourcesLoad,
value,
- ClusterModel.idealDiskLoad * otherResourcesLoad).multiply(oneExtraNodeFactor);
+ ClusterModel.idealContentDiskLoad * otherResourcesLoad).multiply(oneExtraNodeFactor);
nodeMetricsDb().addNodeMetrics(List.of(new Pair<>(node.hostname(),
new NodeMetricSnapshot(clock().instant(),
load,
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java
index bd7300ad6bf..516a7a92d04 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java
@@ -30,19 +30,20 @@ public class ClusterModelTest {
public void test_traffic_headroom() {
ManualClock clock = new ManualClock();
Application application = Application.empty(ApplicationId.from("t1", "a1", "i1"));
+ ClusterSpec clusterSpec = clusterSpec();
Cluster cluster = cluster(new NodeResources(1, 10, 100, 1));
application = application.with(cluster);
// No current traffic share: Ideal load is low but capped
var model1 = new ClusterModel(application.with(new Status(0.0, 1.0)),
- cluster, clock, Duration.ofMinutes(10),
+ clusterSpec, cluster, clock, Duration.ofMinutes(10),
timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0, clock),
ClusterNodesTimeseries.empty());
assertEquals(0.131, model1.idealLoad().cpu(), delta);
// Almost no current traffic share: Ideal load is low but capped
var model2 = new ClusterModel(application.with(new Status(0.0001, 1.0)),
- cluster, clock, Duration.ofMinutes(10),
+ clusterSpec, cluster, clock, Duration.ofMinutes(10),
timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0, clock),
ClusterNodesTimeseries.empty());
assertEquals(0.131, model2.idealLoad().cpu(), delta);
@@ -53,24 +54,32 @@ public class ClusterModelTest {
ManualClock clock = new ManualClock();
Application application = Application.empty(ApplicationId.from("t1", "a1", "i1"));
+ ClusterSpec clusterSpec = clusterSpec();
Cluster cluster = cluster(new NodeResources(1, 10, 100, 1));
application = application.with(cluster);
// No current traffic: Ideal load is low but capped
var model1 = new ClusterModel(application,
- cluster, clock, Duration.ofMinutes(10),
+ clusterSpec, cluster, clock, Duration.ofMinutes(10),
timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0, clock),
ClusterNodesTimeseries.empty());
assertEquals(0.275, model1.idealLoad().cpu(), delta);
// Almost no current traffic: Ideal load is low but capped
var model2 = new ClusterModel(application.with(new Status(0.0001, 1.0)),
- cluster, clock, Duration.ofMinutes(10),
+ clusterSpec, cluster, clock, Duration.ofMinutes(10),
timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0001, t -> 0.0, clock),
ClusterNodesTimeseries.empty());
assertEquals(0.040, model2.idealLoad().cpu(), delta);
}
+ private ClusterSpec clusterSpec() {
+ return ClusterSpec.specification(ClusterSpec.Type.content, ClusterSpec.Id.from("test"))
+ .group(ClusterSpec.Group.from(0))
+ .vespaVersion("7.1.1")
+ .build();
+ }
+
private Cluster cluster(NodeResources resources) {
return Cluster.create(ClusterSpec.Id.from("test"),
false,
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java
index 03b41412896..2ab36ae7202 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java
@@ -68,7 +68,7 @@ public class ScalingSuggestionsMaintainerTest {
new TestMetric());
maintainer.maintain();
- assertEquals("11 nodes with [vcpu: 6.5, memory: 5.5 Gb, disk 15.0 Gb, bandwidth: 0.1 Gbps]",
+ assertEquals("12 nodes with [vcpu: 6.0, memory: 5.5 Gb, disk 10.0 Gb, bandwidth: 0.1 Gbps]",
suggestionOf(app1, cluster1, tester).get().resources().toString());
assertEquals("8 nodes with [vcpu: 11.0, memory: 4.4 Gb, disk 11.8 Gb, bandwidth: 0.1 Gbps]",
suggestionOf(app2, cluster2, tester).get().resources().toString());
@@ -78,7 +78,7 @@ public class ScalingSuggestionsMaintainerTest {
addMeasurements(0.10f, 0.10f, 0.10f, 0, 500, app1, tester.nodeRepository());
maintainer.maintain();
assertEquals("Suggestion stays at the peak value observed",
- "11 nodes with [vcpu: 6.5, memory: 5.5 Gb, disk 15.0 Gb, bandwidth: 0.1 Gbps]",
+ "12 nodes with [vcpu: 6.0, memory: 5.5 Gb, disk 10.0 Gb, bandwidth: 0.1 Gbps]",
suggestionOf(app1, cluster1, tester).get().resources().toString());
// Utilization is still way down and a week has passed
tester.clock().advance(Duration.ofDays(7));
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application1.json b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application1.json
index 689b6f3816b..fcdcdf1a8ca 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application1.json
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application1.json
@@ -72,7 +72,7 @@
"idealMemory": 0.65,
"currentMemory": 0.0,
"disk" : 0.0,
- "idealDisk": 0.6,
+ "idealDisk": 0.95,
"currentDisk": 0.0
},
"scalingEvents" : [