diff options
author | Jon Bratseth <bratseth@gmail.com> | 2021-12-09 07:30:00 +0100 |
---|---|---|
committer | Jon Bratseth <bratseth@gmail.com> | 2021-12-09 07:30:00 +0100 |
commit | ccc1235730c483c7367f3ee7a02aa2a24d0804b5 (patch) | |
tree | c73ab3707e69010bf9dadf015d9a066e501fd32b /node-repository | |
parent | 2853733d3227818dd83e006e63a3c915679a7b53 (diff) |
Increase container ideal disk utilization
Diffstat (limited to 'node-repository')
6 files changed, 42 insertions, 20 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java index 1001ab83cc0..bf8a3374484 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java @@ -29,9 +29,11 @@ public class ClusterModel { static final double idealQueryCpuLoad = 0.8; static final double idealWriteCpuLoad = 0.95; static final double idealMemoryLoad = 0.65; - static final double idealDiskLoad = 0.6; + static final double idealContainerDiskLoad = 0.95; + static final double idealContentDiskLoad = 0.6; private final Application application; + private final ClusterSpec clusterSpec; private final Cluster cluster; /** The current nodes of this cluster, or empty if this models a new cluster not yet deployed */ private final NodeList nodes; @@ -51,6 +53,7 @@ public class ClusterModel { MetricsDb metricsDb, Clock clock) { this.application = application; + this.clusterSpec = clusterSpec; this.cluster = cluster; this.nodes = clusterNodes; this.clock = clock; @@ -61,12 +64,14 @@ public class ClusterModel { /** For testing */ ClusterModel(Application application, + ClusterSpec clusterSpec, Cluster cluster, Clock clock, Duration scalingDuration, ClusterTimeseries clusterTimeseries, ClusterNodesTimeseries nodeTimeseries) { this.application = application; + this.clusterSpec = clusterSpec; this.cluster = cluster; this.nodes = null; this.clock = clock; @@ -76,6 +81,8 @@ public class ClusterModel { this.nodeTimeseries = nodeTimeseries; } + public Application application() { return application; } + public ClusterSpec clusterSpec() { return clusterSpec; } public Cluster cluster() { return cluster; } /** Returns the predicted duration of a rescaling of this cluster */ @@ -107,7 +114,7 @@ public class ClusterModel { public Load averageLoad() { return nodeTimeseries().averageLoad(clock.instant().minus(scalingDuration())); } public Load idealLoad() { - return new Load(idealCpuLoad(), idealMemoryLoad, idealDiskLoad); + return new Load(idealCpuLoad(), idealMemoryLoad, idealDiskLoad()); } /** Ideal cpu load must take the application traffic fraction into account */ @@ -190,6 +197,12 @@ public class ClusterModel { return duration; } + private double idealDiskLoad() { + // Stateless clusters are not expected to consume more disk over time - + // if they do it is due to logs which will be rotated away right before the disk is full + return clusterSpec.isStateful() ? idealContentDiskLoad : idealContainerDiskLoad; + } + /** * Create a cluster model if possible and logs a warning and returns empty otherwise. * This is useful in cases where it's possible to continue without the cluser model, diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java index 90ab5cba772..466db0a7e78 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java @@ -109,7 +109,7 @@ public class AutoscalingTest { tester.clock().advance(Duration.ofMinutes(-10 * 5)); tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only ClusterResources scaledResources = tester.assertResources("Scaling up since cpu usage is too high", - 7, 1, 2.5, 80.0, 80.0, + 7, 1, 2.5, 80.0, 50.5, tester.autoscale(application1, cluster1.id(), capacity).target()); tester.deploy(application1, cluster1, scaledResources); @@ -119,7 +119,7 @@ public class AutoscalingTest { tester.clock().advance(Duration.ofMinutes(-10 * 5)); tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.assertResources("Scaling down since cpu usage has gone down", - 4, 1, 2.5, 68.6, 68.6, + 4, 1, 2.5, 68.6, 27.4, tester.autoscale(application1, cluster1.id(), capacity).target()); } @@ -204,7 +204,7 @@ public class AutoscalingTest { tester.clock().advance(Duration.ofMinutes(-10 * 5)); tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.assertResources("Scaling up to limit since resource usage is too high", - 6, 1, 2.4, 78.0, 79.0, + 6, 1, 2.4, 78.0, 70.0, tester.autoscale(application1, cluster1.id(), capacity).target()); } @@ -341,7 +341,7 @@ public class AutoscalingTest { tester.clock().advance(Duration.ofMinutes(-10 * 5)); tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.assertResources("Scaling up since resource usage is too high", - 7, 1, 2.5, 80.0, 80.0, + 7, 1, 2.5, 80.0, 50.5, tester.suggest(application1, cluster1.id(), min, max).target()); } @@ -398,7 +398,7 @@ public class AutoscalingTest { tester.clock().advance(Duration.ofMinutes(-10 * 5)); tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.assertResources("Scaling up since resource usage is too high", - 7, 7, 2.5, 80.0, 80.0, + 7, 7, 2.5, 80.0, 50.5, tester.autoscale(application1, cluster1.id(), capacity).target()); } @@ -421,7 +421,7 @@ public class AutoscalingTest { t -> t == 0 ? 20.0 : 10.0, t -> 1.0); tester.assertResources("Scaling up since resource usage is too high, changing to 1 group is cheaper", - 8, 1, 2.6, 83.3, 83.3, + 8, 1, 2.6, 83.3, 52.6, tester.autoscale(application1, cluster1.id(), capacity).target()); } @@ -445,7 +445,7 @@ public class AutoscalingTest { t -> t == 0 ? 20.0 : 10.0, t -> 100.0); tester.assertResources("Scaling down since resource usage is too high, changing to 1 group is cheaper", - 4, 1, 2.1, 83.3, 83.3, + 4, 1, 2.1, 83.3, 52.6, tester.autoscale(application1, cluster1.id(), capacity).target()); } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java index 8d59181a027..95bf0f6b7ae 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java @@ -142,7 +142,7 @@ class AutoscalingTester { for (Node node : nodes) { Load load = new Load(value, ClusterModel.idealMemoryLoad * otherResourcesLoad, - ClusterModel.idealDiskLoad * otherResourcesLoad).multiply(oneExtraNodeFactor); + ClusterModel.idealContentDiskLoad * otherResourcesLoad).multiply(oneExtraNodeFactor); nodeMetricsDb().addNodeMetrics(List.of(new Pair<>(node.hostname(), new NodeMetricSnapshot(clock().instant(), load, @@ -172,7 +172,7 @@ class AutoscalingTester { clock().advance(Duration.ofMinutes(5)); for (Node node : nodes) { Load load = new Load(ClusterModel.idealQueryCpuLoad * otherResourcesLoad, - ClusterModel.idealDiskLoad * otherResourcesLoad, + ClusterModel.idealContentDiskLoad * otherResourcesLoad, value).multiply(oneExtraNodeFactor); nodeMetricsDb().addNodeMetrics(List.of(new Pair<>(node.hostname(), new NodeMetricSnapshot(clock().instant(), @@ -204,10 +204,10 @@ class AutoscalingTester { for (Node node : nodes) { float cpu = (float) 0.2 * otherResourcesLoad * oneExtraNodeFactor; float memory = value * oneExtraNodeFactor; - float disk = (float) ClusterModel.idealDiskLoad * otherResourcesLoad * oneExtraNodeFactor; + float disk = (float) ClusterModel.idealContentDiskLoad * otherResourcesLoad * oneExtraNodeFactor; Load load = new Load(0.2 * otherResourcesLoad, value, - ClusterModel.idealDiskLoad * otherResourcesLoad).multiply(oneExtraNodeFactor); + ClusterModel.idealContentDiskLoad * otherResourcesLoad).multiply(oneExtraNodeFactor); nodeMetricsDb().addNodeMetrics(List.of(new Pair<>(node.hostname(), new NodeMetricSnapshot(clock().instant(), load, diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java index bd7300ad6bf..516a7a92d04 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java @@ -30,19 +30,20 @@ public class ClusterModelTest { public void test_traffic_headroom() { ManualClock clock = new ManualClock(); Application application = Application.empty(ApplicationId.from("t1", "a1", "i1")); + ClusterSpec clusterSpec = clusterSpec(); Cluster cluster = cluster(new NodeResources(1, 10, 100, 1)); application = application.with(cluster); // No current traffic share: Ideal load is low but capped var model1 = new ClusterModel(application.with(new Status(0.0, 1.0)), - cluster, clock, Duration.ofMinutes(10), + clusterSpec, cluster, clock, Duration.ofMinutes(10), timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0, clock), ClusterNodesTimeseries.empty()); assertEquals(0.131, model1.idealLoad().cpu(), delta); // Almost no current traffic share: Ideal load is low but capped var model2 = new ClusterModel(application.with(new Status(0.0001, 1.0)), - cluster, clock, Duration.ofMinutes(10), + clusterSpec, cluster, clock, Duration.ofMinutes(10), timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0, clock), ClusterNodesTimeseries.empty()); assertEquals(0.131, model2.idealLoad().cpu(), delta); @@ -53,24 +54,32 @@ public class ClusterModelTest { ManualClock clock = new ManualClock(); Application application = Application.empty(ApplicationId.from("t1", "a1", "i1")); + ClusterSpec clusterSpec = clusterSpec(); Cluster cluster = cluster(new NodeResources(1, 10, 100, 1)); application = application.with(cluster); // No current traffic: Ideal load is low but capped var model1 = new ClusterModel(application, - cluster, clock, Duration.ofMinutes(10), + clusterSpec, cluster, clock, Duration.ofMinutes(10), timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0, clock), ClusterNodesTimeseries.empty()); assertEquals(0.275, model1.idealLoad().cpu(), delta); // Almost no current traffic: Ideal load is low but capped var model2 = new ClusterModel(application.with(new Status(0.0001, 1.0)), - cluster, clock, Duration.ofMinutes(10), + clusterSpec, cluster, clock, Duration.ofMinutes(10), timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0001, t -> 0.0, clock), ClusterNodesTimeseries.empty()); assertEquals(0.040, model2.idealLoad().cpu(), delta); } + private ClusterSpec clusterSpec() { + return ClusterSpec.specification(ClusterSpec.Type.content, ClusterSpec.Id.from("test")) + .group(ClusterSpec.Group.from(0)) + .vespaVersion("7.1.1") + .build(); + } + private Cluster cluster(NodeResources resources) { return Cluster.create(ClusterSpec.Id.from("test"), false, diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java index 03b41412896..2ab36ae7202 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java @@ -68,7 +68,7 @@ public class ScalingSuggestionsMaintainerTest { new TestMetric()); maintainer.maintain(); - assertEquals("11 nodes with [vcpu: 6.5, memory: 5.5 Gb, disk 15.0 Gb, bandwidth: 0.1 Gbps]", + assertEquals("12 nodes with [vcpu: 6.0, memory: 5.5 Gb, disk 10.0 Gb, bandwidth: 0.1 Gbps]", suggestionOf(app1, cluster1, tester).get().resources().toString()); assertEquals("8 nodes with [vcpu: 11.0, memory: 4.4 Gb, disk 11.8 Gb, bandwidth: 0.1 Gbps]", suggestionOf(app2, cluster2, tester).get().resources().toString()); @@ -78,7 +78,7 @@ public class ScalingSuggestionsMaintainerTest { addMeasurements(0.10f, 0.10f, 0.10f, 0, 500, app1, tester.nodeRepository()); maintainer.maintain(); assertEquals("Suggestion stays at the peak value observed", - "11 nodes with [vcpu: 6.5, memory: 5.5 Gb, disk 15.0 Gb, bandwidth: 0.1 Gbps]", + "12 nodes with [vcpu: 6.0, memory: 5.5 Gb, disk 10.0 Gb, bandwidth: 0.1 Gbps]", suggestionOf(app1, cluster1, tester).get().resources().toString()); // Utilization is still way down and a week has passed tester.clock().advance(Duration.ofDays(7)); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application1.json b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application1.json index 689b6f3816b..fcdcdf1a8ca 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application1.json +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application1.json @@ -72,7 +72,7 @@ "idealMemory": 0.65, "currentMemory": 0.0, "disk" : 0.0, - "idealDisk": 0.6, + "idealDisk": 0.95, "currentDisk": 0.0 }, "scalingEvents" : [ |