author     Jon Bratseth <bratseth@gmail.com>   2022-08-04 13:02:52 +0200
committer  Jon Bratseth <bratseth@gmail.com>   2022-08-04 13:02:52 +0200
commit     5be0e33d7e749858f107e9ffa1446fb615801e69 (patch)
tree       528ccba047778f5965b0d1b5a7e888462dabb342 /node-repository
parent     bf103a2299acd66b399cd85c8acc2ac86efa6b98 (diff)
Scale up fast
Diffstat (limited to 'node-repository')
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java                         |  6
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java                       |  7
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java                     | 51
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java  |  2
4 files changed, 30 insertions, 36 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
index 7a02fa9eb7e..69271b9b45f 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
@@ -69,7 +69,7 @@ public class Autoscaler {
if ( ! clusterIsStable(clusterNodes, nodeRepository))
return Advice.none(Status.waiting, "Cluster change in progress");
-
+/*
if (scaledIn(clusterModel.scalingDuration(), cluster))
return Advice.dontScale(Status.waiting,
"Won't autoscale now: Less than " + clusterModel.scalingDuration() +
@@ -87,7 +87,7 @@ public class Autoscaler {
"Collecting more data before making new scaling decisions:" +
" Have measurements from " + clusterModel.nodeTimeseries().nodesMeasured() +
" nodes, but require from " + clusterNodes.size());
-
+*/
var currentAllocation = new AllocatableClusterResources(clusterNodes.asList(), nodeRepository);
Optional<AllocatableClusterResources> bestAllocation =
allocationOptimizer.findBestAllocation(clusterModel.loadAdjustment(), currentAllocation, clusterModel, limits);
@@ -134,7 +134,7 @@ public class Autoscaler {
return ! similar(from.cost(), to.cost(), costDifferenceWorthReallocation);
}
- private static boolean meaningfulIncrease(double from, double to) {
+ public static boolean meaningfulIncrease(double from, double to) {
return from < to && ! similar(from, to, resourceDifferenceWorthReallocation);
}
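
Note on the hunk above: meaningfulIncrease() is made public so it can be reused by the new peak-based scale-up path. Its contract is that an increase only counts when it exceeds a relative threshold. A minimal self-contained sketch of that contract follows; the similar() definition and the 3% threshold are assumptions for illustration, not the repository's actual constants:

    // Sketch only: the threshold value and the similar() definition are assumed,
    // not copied from the repository.
    public final class MeaningfulIncreaseSketch {

        private static final double resourceDifferenceWorthReallocation = 0.03; // assumed 3%

        // Assumed definition: relative difference below the threshold counts as "similar".
        private static boolean similar(double r1, double r2, double threshold) {
            return Math.abs(r1 - r2) / Math.max(r1, r2) < threshold;
        }

        public static boolean meaningfulIncrease(double from, double to) {
            return from < to && ! similar(from, to, resourceDifferenceWorthReallocation);
        }

        public static void main(String[] args) {
            System.out.println(meaningfulIncrease(1.00, 1.01)); // false: within the threshold
            System.out.println(meaningfulIncrease(1.00, 1.20)); // true: a real increase
        }
    }
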
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
index 6fe065bf48d..9b9b7dcecb0 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
@@ -94,16 +94,13 @@ public class ClusterModel {
/** Returns the relative load adjustment that should be made to this cluster given available measurements. */
public Load loadAdjustment() {
if (nodeTimeseries().measurementsPerNode() == 0) return Load.one(); // No info, no change
-
// Should we scale up?
- /*
Load relativePeak = nodeTimeseries().peakLoad().divide(idealLoad());
- System.out.println("Relative peak " + relativePeak);
- if (relativePeak.any(v -> v > 1))
+ if (relativePeak.any(v -> v > 1.01)) // "meaningful growth": 1% over status quo.
return relativePeak.max(Load.one()); // Don't downscale any dimension if we upscale
+
// Should we scale down?
// TODO
- */
return averageLoad().divide(idealLoad());
}
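
The new scale-up path above divides peak load by ideal load per resource dimension and acts as soon as any dimension is more than 1% above ideal; taking the max against Load.one() ensures no dimension is scaled down in the same adjustment. A minimal sketch of that arithmetic, assuming Load holds cpu/memory/disk and that divide/max/any operate per dimension (the record below is an illustrative stand-in, not Vespa's Load class):

    // Illustrative stand-in for Vespa's Load; the per-dimension helpers are assumed.
    record Load(double cpu, double memory, double disk) {
        static Load one() { return new Load(1, 1, 1); }
        Load divide(Load o) { return new Load(cpu / o.cpu, memory / o.memory, disk / o.disk); }
        Load max(Load o) { return new Load(Math.max(cpu, o.cpu), Math.max(memory, o.memory), Math.max(disk, o.disk)); }
        boolean any(java.util.function.DoublePredicate p) { return p.test(cpu) || p.test(memory) || p.test(disk); }
    }

    class LoadAdjustmentSketch {
        public static void main(String[] args) {
            Load peak  = new Load(0.9, 0.5, 0.4);   // observed peak utilization (example numbers)
            Load ideal = new Load(0.7, 0.65, 0.6);  // target utilization (example numbers)
            Load relativePeak = peak.divide(ideal); // cpu ~1.29 exceeds the 1.01 threshold
            if (relativePeak.any(v -> v > 1.01))    // "meaningful growth": 1% over status quo
                // Scale cpu up by ~1.29x; memory and disk clamp to 1, so nothing scales down.
                System.out.println(relativePeak.max(Load.one()));
        }
    }
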
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
index 393baa63c88..007b2629952 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
@@ -19,7 +19,6 @@ import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.Nodelike;
import com.yahoo.vespa.hosted.provision.provisioning.CapacityPolicies;
import com.yahoo.vespa.hosted.provision.provisioning.HostResourcesCalculator;
-import org.junit.Ignore;
import org.junit.Test;
import java.time.Duration;
@@ -38,10 +37,10 @@ public class AutoscalingTest {
var fixture = AutoscalingTester.fixture().build();
fixture.tester().clock().advance(Duration.ofDays(1));
- assertTrue("No measurements -> No change", fixture.autoscale().isEmpty());
+ assertTrue("No measurements -> No change", fixture.autoscale().target().isEmpty());
fixture.loader().applyCpuLoad(0.7f, 59);
- assertTrue("Too few measurements -> No change", fixture.autoscale().isEmpty());
+ assertTrue("Too few measurements -> No change", fixture.autoscale().target().isEmpty());
fixture.tester().clock().advance(Duration.ofDays(1));
fixture.loader().applyCpuLoad(0.7f, 120);
@@ -54,17 +53,13 @@ public class AutoscalingTest {
fixture.deactivateRetired(Capacity.from(scaledResources));
- fixture.tester().clock().advance(Duration.ofDays(2));
- fixture.loader().applyCpuLoad(0.8f, 3);
- assertTrue("Load change is large, but insufficient measurements for new config -> No change",
- fixture.autoscale().isEmpty());
-
fixture.loader().applyCpuLoad(0.19f, 100);
assertEquals("Load change is small -> No change", Optional.empty(), fixture.autoscale().target());
+ fixture.tester().clock().advance(Duration.ofDays(2));
fixture.loader().applyCpuLoad(0.1f, 120);
fixture.tester().assertResources("Scaling cpu down since usage has gone down significantly",
- 9, 1, 1.0, 5.0, 50.0,
+ 6, 1, 1.1, 8, 80.0,
fixture.autoscale());
}
@@ -78,12 +73,11 @@ public class AutoscalingTest {
/** Using too many resources for a short period is proof we should scale up regardless of the time that takes. */
@Test
- @Ignore
public void test_autoscaling_up_is_fast() {
var fixture = AutoscalingTester.fixture().build();
- fixture.loader().applyLoad(1.0, 1.0, 1.0, 6);
+ fixture.loader().applyLoad(1.0, 1.0, 1.0, 3);
fixture.tester().assertResources("Scaling up since resource usage is too high",
- 10, 1, 9.4, 8.5, 92.6,
+ 10, 1, 7.2, 8.5, 92.6,
fixture.autoscale());
}
@@ -91,14 +85,17 @@ public class AutoscalingTest {
@Test
public void test_autoscaling_single_container_group() {
var fixture = AutoscalingTester.fixture().clusterType(ClusterSpec.Type.container).build();
+// Optional<ClusterResources> initialResources = Optional.of(new ClusterResources(5, 1, new NodeResources(3, 10, 100, 1)));
+
fixture.loader().applyCpuLoad(0.25f, 120);
- ClusterResources scaledResources = fixture.tester().assertResources("Scaling up since cpu usage is too high",
- 5, 1, 3.8, 8.0, 50.5,
+ ClusterResources scaledResources = fixture.tester().assertResources("Scaling up since cpu, not scaling others down",
+ 4, 1, 5, 10.0, 100.0,
fixture.autoscale());
fixture.deploy(Capacity.from(scaledResources));
+ fixture.deactivateRetired(Capacity.from(scaledResources));
fixture.loader().applyCpuLoad(0.1f, 120);
fixture.tester().assertResources("Scaling down since cpu usage has gone down",
- 4, 1, 2.5, 6.4, 25.5,
+ 4, 1, 2.5, 7.5, 47.4,
fixture.autoscale());
}
@@ -218,9 +215,9 @@ public class AutoscalingTest {
.capacity(Capacity.from(min, max))
.build();
fixture.tester().clock().advance(Duration.ofDays(2));
- fixture.loader().applyCpuLoad(0.3, 240);
+ fixture.loader().applyCpuLoad(0.4, 240);
fixture.tester().assertResources("Scaling up",
- 6, 6, 3.8, 8.0, 10.0,
+ 6, 6, 5.0, 10.0, 10.0,
fixture.autoscale());
}
@@ -298,7 +295,7 @@ public class AutoscalingTest {
fixture.tester().clock().advance(Duration.ofDays(2));
fixture.loader().applyLoad(0.9, 0.6, 0.7, 1, false, true, 120);
assertTrue("Not scaling up since nodes were measured while cluster was out of service",
- fixture.autoscale().isEmpty());
+ fixture.autoscale().target().isEmpty());
}
@Test
@@ -306,8 +303,8 @@ public class AutoscalingTest {
var fixture = AutoscalingTester.fixture().build();
fixture.tester().clock().advance(Duration.ofDays(2));
fixture.loader().applyLoad(0.9, 0.6, 0.7, 1, true, false, 120);
- assertTrue("Not scaling up since nodes were measured while cluster was out of service",
- fixture.autoscale().isEmpty());
+ assertTrue("Not scaling up since nodes were measured while cluster was unstable",
+ fixture.autoscale().target().isEmpty());
}
@Test
@@ -321,8 +318,8 @@ public class AutoscalingTest {
.build();
fixture.tester().clock().advance(Duration.ofDays(2));
fixture.loader().applyCpuLoad(0.9, 120);
- fixture.tester().assertResources("Scaling the number of groups, but nothing requires us to stay with 1 node per group",
- 10, 5, 7.7, 40.0, 40.0,
+ fixture.tester().assertResources("Scaling up to 2 nodes, not scaling memory and disk down at the same time",
+ 10, 5, 7.7, 50.0, 50.0,
fixture.autoscale());
}
@@ -339,8 +336,8 @@ public class AutoscalingTest {
Duration timePassed = fixture.loader().addCpuMeasurements(0.25, 120);
fixture.tester().clock().advance(timePassed.negated());
fixture.loader().addLoadMeasurements(10, t -> t == 0 ? 20.0 : 10.0, t -> 1.0);
- fixture.tester().assertResources("Scaling up since resource usage is too high, changing to 1 group is cheaper",
- 10, 1, 2.3, 27.8, 27.8,
+ fixture.tester().assertResources("Scaling up cpu, not scaling others down, changing to 1 group is cheaper",
+ 10, 1, 2.3, 33.3, 33.3,
fixture.autoscale());
}
@@ -375,7 +372,7 @@ public class AutoscalingTest {
fixture.tester().clock().advance(Duration.ofDays(1));
fixture.loader().applyMemLoad(1.0, 1000);
fixture.tester().assertResources("Increase group size to reduce memory load",
- 8, 2, 6.5, 96.2, 62.5,
+ 8, 2, 7.8, 96.2, 75,
fixture.autoscale());
}
@@ -389,9 +386,9 @@ public class AutoscalingTest {
.capacity(Capacity.from(min, max))
.build();
fixture.tester().clock().advance(Duration.ofDays(2));
- fixture.loader().applyMemLoad(0.02, 120);
+ fixture.loader().applyLoad(0.16, 0.02, 0.5, 120);
fixture.tester().assertResources("Scaling down",
- 6, 1, 3.1, 4.0, 100.0,
+ 6, 1, 3.0, 4.0, 100.0,
fixture.autoscale());
}
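
Taken together, the disabled pre-checks in Autoscaler and the peak-based adjustment in ClusterModel mean a cluster pinned at full load needs only a handful of samples to trigger scale-up advice, as test_autoscaling_up_is_fast above now exercises with 3 measurements. A hedged sketch of that shape of test, reusing only fixture calls visible in this diff and assuming the same imports as AutoscalingTest; the method name is hypothetical and no specific resource numbers are asserted:

    // Sketch in the style of AutoscalingTest; not part of the real test suite.
    @Test
    public void scales_up_after_few_high_load_samples() {
        var fixture = AutoscalingTester.fixture().build();
        fixture.loader().applyLoad(1.0, 1.0, 1.0, 3);        // 3 samples of full cpu/memory/disk load
        assertTrue("Peak load above ideal triggers advice even with few samples",
                   fixture.autoscale().target().isPresent()); // target present = scale-up advised
    }
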
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java
index 6d007368db5..508168261df 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java
@@ -97,7 +97,7 @@ public class ScalingSuggestionsMaintainerTest {
var suggested = tester.nodeRepository().applications().get(app1).get().cluster(cluster1.id()).get().suggestedResources().get().resources();
tester.deploy(app1, cluster1, Capacity.from(suggested, suggested, false, true));
tester.clock().advance(Duration.ofDays(2));
- addMeasurements(0.2f, 0.7f, 0.6f,
+ addMeasurements(0.2f, 0.65f, 0.6f,
0, 500, app1, tester.nodeRepository());
maintainer.maintain();
assertEquals("Suggestion is to keep the current allocation",