Increase container ideal disk utilization

author: Jon Bratseth <bratseth@gmail.com> 2021-12-09 07:30:00 +0100
committer: Jon Bratseth <bratseth@gmail.com> 2021-12-09 07:30:00 +0100
commit: ccc1235730c483c7367f3ee7a02aa2a24d0804b5 (patch)
tree: c73ab3707e69010bf9dadf015d9a066e501fd32b /node-repository
parent: 2853733d3227818dd83e006e63a3c915679a7b53 (diff)
6 files changed, 42 insertions, 20 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
index 1001ab83cc0..bf8a3374484 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
@@ -29,9 +29,11 @@ public class ClusterModel {
     static final double idealQueryCpuLoad = 0.8;
     static final double idealWriteCpuLoad = 0.95;
     static final double idealMemoryLoad = 0.65;
-    static final double idealDiskLoad = 0.6;
+    static final double idealContainerDiskLoad = 0.95;
+    static final double idealContentDiskLoad = 0.6;
 
     private final Application application;
+    private final ClusterSpec clusterSpec;
     private final Cluster cluster;
     /** The current nodes of this cluster, or empty if this models a new cluster not yet deployed */
     private final NodeList nodes;
@@ -51,6 +53,7 @@ public class ClusterModel {
                         MetricsDb metricsDb,
                         Clock clock) {
         this.application = application;
+        this.clusterSpec = clusterSpec;
         this.cluster = cluster;
         this.nodes = clusterNodes;
         this.clock = clock;
@@ -61,12 +64,14 @@ public class ClusterModel {
 
     /** For testing */
     ClusterModel(Application application,
+                 ClusterSpec clusterSpec,
                  Cluster cluster,
                  Clock clock,
                  Duration scalingDuration,
                  ClusterTimeseries clusterTimeseries,
                  ClusterNodesTimeseries nodeTimeseries) {
         this.application = application;
+        this.clusterSpec = clusterSpec;
         this.cluster = cluster;
         this.nodes = null;
         this.clock = clock;
@@ -76,6 +81,8 @@ public class ClusterModel {
         this.nodeTimeseries = nodeTimeseries;
     }
 
+    public Application application() { return application; }
+    public ClusterSpec clusterSpec() { return clusterSpec; }
     public Cluster cluster() { return cluster; }
 
     /** Returns the predicted duration of a rescaling of this cluster */
@@ -107,7 +114,7 @@ public class ClusterModel {
     public Load averageLoad() { return nodeTimeseries().averageLoad(clock.instant().minus(scalingDuration())); }
 
     public Load idealLoad() {
-        return new Load(idealCpuLoad(), idealMemoryLoad, idealDiskLoad);
+        return new Load(idealCpuLoad(), idealMemoryLoad, idealDiskLoad());
     }
 
     /** Ideal cpu load must take the application traffic fraction into account */
@@ -190,6 +197,12 @@ public class ClusterModel {
         return duration;
     }
 
+    private double idealDiskLoad() {
+        // Stateful clusters use the content ideal. Stateless clusters use the higher container ideal, as they are not expected
+        // to consume more disk over time - if they do, it is due to logs, which are rotated away right before the disk is full
+        return clusterSpec.isStateful() ? idealContentDiskLoad : idealContainerDiskLoad;
+    }
+
     /**
      * Create a cluster model if possible and logs a warning and returns empty otherwise.
      * This is useful in cases where it's possible to continue without the cluser model,
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
index 90ab5cba772..466db0a7e78 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
@@ -109,7 +109,7 @@ public class AutoscalingTest {
         tester.clock().advance(Duration.ofMinutes(-10 * 5));
         tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only
         ClusterResources scaledResources = tester.assertResources("Scaling up since cpu usage is too high",
-                                                                  7, 1, 2.5,  80.0, 80.0,
+                                                                  7, 1, 2.5,  80.0, 50.5,
                                                                   tester.autoscale(application1, cluster1.id(), capacity).target());
 
         tester.deploy(application1, cluster1, scaledResources);
@@ -119,7 +119,7 @@ public class AutoscalingTest {
         tester.clock().advance(Duration.ofMinutes(-10 * 5));
         tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only
         tester.assertResources("Scaling down since cpu usage has gone down",
-                               4, 1, 2.5, 68.6, 68.6,
+                               4, 1, 2.5, 68.6, 27.4,
                                tester.autoscale(application1, cluster1.id(), capacity).target());
     }
 
@@ -204,7 +204,7 @@ public class AutoscalingTest {
         tester.clock().advance(Duration.ofMinutes(-10 * 5));
         tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only
         tester.assertResources("Scaling up to limit since resource usage is too high",
-                               6, 1, 2.4,  78.0, 79.0,
+                               6, 1, 2.4,  78.0, 70.0,
                                tester.autoscale(application1, cluster1.id(), capacity).target());
     }
 
@@ -341,7 +341,7 @@ public class AutoscalingTest {
         tester.clock().advance(Duration.ofMinutes(-10 * 5));
         tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only
         tester.assertResources("Scaling up since resource usage is too high",
-                               7, 1, 2.5,  80.0, 80.0,
+                               7, 1, 2.5,  80.0, 50.5,
                                tester.suggest(application1, cluster1.id(), min, max).target());
     }
 
@@ -398,7 +398,7 @@ public class AutoscalingTest {
         tester.clock().advance(Duration.ofMinutes(-10 * 5));
         tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only
         tester.assertResources("Scaling up since resource usage is too high",
-                               7, 7, 2.5,  80.0, 80.0,
+                               7, 7, 2.5,  80.0, 50.5,
                                tester.autoscale(application1, cluster1.id(), capacity).target());
     }
 
@@ -421,7 +421,7 @@ public class AutoscalingTest {
                                    t -> t == 0 ? 20.0 : 10.0,
                                    t -> 1.0);
         tester.assertResources("Scaling up since resource usage is too high, changing to 1 group is cheaper",
-                               8, 1, 2.6,  83.3, 83.3,
+                               8, 1, 2.6,  83.3, 52.6,
                                tester.autoscale(application1, cluster1.id(), capacity).target());
     }
 
@@ -445,7 +445,7 @@ public class AutoscalingTest {
                                    t -> t == 0 ? 20.0 : 10.0,
                                    t -> 100.0);
         tester.assertResources("Scaling down since resource usage is too high, changing to 1 group is cheaper",
-                               4, 1, 2.1,  83.3, 83.3,
+                               4, 1, 2.1,  83.3, 52.6,
                                tester.autoscale(application1, cluster1.id(), capacity).target());
     }
 
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java
index 8d59181a027..95bf0f6b7ae 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java
@@ -142,7 +142,7 @@ class AutoscalingTester {
             for (Node node : nodes) {
                 Load load = new Load(value,
                                      ClusterModel.idealMemoryLoad * otherResourcesLoad,
-                                     ClusterModel.idealDiskLoad * otherResourcesLoad).multiply(oneExtraNodeFactor);
+                                     ClusterModel.idealContentDiskLoad * otherResourcesLoad).multiply(oneExtraNodeFactor);
                 nodeMetricsDb().addNodeMetrics(List.of(new Pair<>(node.hostname(),
                                                                   new NodeMetricSnapshot(clock().instant(),
                                                                                          load,
@@ -172,7 +172,7 @@ class AutoscalingTester {
             clock().advance(Duration.ofMinutes(5));
             for (Node node : nodes) {
                 Load load = new Load(ClusterModel.idealQueryCpuLoad * otherResourcesLoad,
-                                     ClusterModel.idealDiskLoad * otherResourcesLoad,
+                                     ClusterModel.idealContentDiskLoad * otherResourcesLoad,
                                      value).multiply(oneExtraNodeFactor);
                 nodeMetricsDb().addNodeMetrics(List.of(new Pair<>(node.hostname(),
                                                                   new NodeMetricSnapshot(clock().instant(),
@@ -204,10 +204,10 @@ class AutoscalingTester {
             for (Node node : nodes) {
                 float cpu  = (float) 0.2 * otherResourcesLoad * oneExtraNodeFactor;
                 float memory = value * oneExtraNodeFactor;
-                float disk = (float) ClusterModel.idealDiskLoad * otherResourcesLoad * oneExtraNodeFactor;
+                float disk = (float) ClusterModel.idealContentDiskLoad * otherResourcesLoad * oneExtraNodeFactor;
                 Load load = new Load(0.2 * otherResourcesLoad,
                                      value,
-                                     ClusterModel.idealDiskLoad * otherResourcesLoad).multiply(oneExtraNodeFactor);
+                                     ClusterModel.idealContentDiskLoad * otherResourcesLoad).multiply(oneExtraNodeFactor);
                 nodeMetricsDb().addNodeMetrics(List.of(new Pair<>(node.hostname(),
                                                                   new NodeMetricSnapshot(clock().instant(),
                                                                                          load,
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java
index bd7300ad6bf..516a7a92d04 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java
@@ -30,19 +30,20 @@ public class ClusterModelTest {
     public void test_traffic_headroom() {
         ManualClock clock = new ManualClock();
         Application application = Application.empty(ApplicationId.from("t1", "a1", "i1"));
+        ClusterSpec clusterSpec = clusterSpec();
         Cluster cluster = cluster(new NodeResources(1, 10, 100, 1));
         application = application.with(cluster);
 
         // No current traffic share: Ideal load is low but capped
         var model1 = new ClusterModel(application.with(new Status(0.0, 1.0)),
-                                      cluster, clock, Duration.ofMinutes(10),
+                                      clusterSpec, cluster, clock, Duration.ofMinutes(10),
                                       timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0, clock),
                                       ClusterNodesTimeseries.empty());
         assertEquals(0.131, model1.idealLoad().cpu(), delta);
 
         // Almost no current traffic share: Ideal load is low but capped
         var model2 = new ClusterModel(application.with(new Status(0.0001, 1.0)),
-                                      cluster, clock, Duration.ofMinutes(10),
+                                      clusterSpec, cluster, clock, Duration.ofMinutes(10),
                                       timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0, clock),
                                       ClusterNodesTimeseries.empty());
         assertEquals(0.131, model2.idealLoad().cpu(), delta);
@@ -53,24 +54,32 @@ public class ClusterModelTest {
         ManualClock clock = new ManualClock();
 
         Application application = Application.empty(ApplicationId.from("t1", "a1", "i1"));
+        ClusterSpec clusterSpec = clusterSpec();
         Cluster cluster = cluster(new NodeResources(1, 10, 100, 1));
         application = application.with(cluster);
 
         // No current traffic: Ideal load is low but capped
         var model1 = new ClusterModel(application,
-                                      cluster, clock, Duration.ofMinutes(10),
+                                      clusterSpec, cluster, clock, Duration.ofMinutes(10),
                                       timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0, clock),
                                       ClusterNodesTimeseries.empty());
         assertEquals(0.275, model1.idealLoad().cpu(), delta);
 
         // Almost no current traffic: Ideal load is low but capped
         var model2 = new ClusterModel(application.with(new Status(0.0001, 1.0)),
-                                      cluster, clock, Duration.ofMinutes(10),
+                                      clusterSpec, cluster, clock, Duration.ofMinutes(10),
                                       timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0001, t -> 0.0, clock),
                                       ClusterNodesTimeseries.empty());
         assertEquals(0.040, model2.idealLoad().cpu(), delta);
     }
 
+    private ClusterSpec clusterSpec() {
+        return ClusterSpec.specification(ClusterSpec.Type.content, ClusterSpec.Id.from("test"))
+                          .group(ClusterSpec.Group.from(0))
+                          .vespaVersion("7.1.1")
+                          .build();
+    }
+
     private Cluster cluster(NodeResources resources) {
         return Cluster.create(ClusterSpec.Id.from("test"),
                               false,
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java
index 03b41412896..2ab36ae7202 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java
@@ -68,7 +68,7 @@ public class ScalingSuggestionsMaintainerTest {
                                                                                    new TestMetric());
         maintainer.maintain();
 
-        assertEquals("11 nodes with [vcpu: 6.5, memory: 5.5 Gb, disk 15.0 Gb, bandwidth: 0.1 Gbps]",
+        assertEquals("12 nodes with [vcpu: 6.0, memory: 5.5 Gb, disk 10.0 Gb, bandwidth: 0.1 Gbps]",
                      suggestionOf(app1, cluster1, tester).get().resources().toString());
         assertEquals("8 nodes with [vcpu: 11.0, memory: 4.4 Gb, disk 11.8 Gb, bandwidth: 0.1 Gbps]",
                      suggestionOf(app2, cluster2, tester).get().resources().toString());
@@ -78,7 +78,7 @@ public class ScalingSuggestionsMaintainerTest {
         addMeasurements(0.10f, 0.10f, 0.10f, 0, 500, app1, tester.nodeRepository());
         maintainer.maintain();
         assertEquals("Suggestion stays at the peak value observed",
-                     "11 nodes with [vcpu: 6.5, memory: 5.5 Gb, disk 15.0 Gb, bandwidth: 0.1 Gbps]",
+                     "12 nodes with [vcpu: 6.0, memory: 5.5 Gb, disk 10.0 Gb, bandwidth: 0.1 Gbps]",
                      suggestionOf(app1, cluster1, tester).get().resources().toString());
         // Utilization is still way down and a week has passed
         tester.clock().advance(Duration.ofDays(7));
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application1.json b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application1.json
index 689b6f3816b..fcdcdf1a8ca 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application1.json
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application1.json
@@ -72,7 +72,7 @@
           "idealMemory": 0.65,
           "currentMemory": 0.0,
           "disk" : 0.0,
-          "idealDisk": 0.6,
+          "idealDisk": 0.95,
           "currentDisk": 0.0
       },
       "scalingEvents" : [
author	Jon Bratseth <bratseth@gmail.com>	2021-12-09 07:30:00 +0100
committer	Jon Bratseth <bratseth@gmail.com>	2021-12-09 07:30:00 +0100
commit	ccc1235730c483c7367f3ee7a02aa2a24d0804b5 (patch)
tree	c73ab3707e69010bf9dadf015d9a066e501fd32b /node-repository
parent	2853733d3227818dd83e006e63a3c915679a7b53 (diff)