diff options
author | Jon Bratseth <bratseth@gmail.com> | 2022-08-09 13:46:56 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@gmail.com> | 2022-08-09 13:46:56 +0200 |
commit | 180a265397ab329ae8f8f34f68cae09d48790785 (patch) | |
tree | edb0ebb37b9607b6011dc9e8c5bd9d5d986130ba /node-repository/src | |
parent | 5be0e33d7e749858f107e9ffa1446fb615801e69 (diff) |
Scale down when we have sufficient confidence
Diffstat (limited to 'node-repository/src')
5 files changed, 36 insertions, 25 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java index 9b9b7dcecb0..5dbc6465411 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java @@ -9,6 +9,7 @@ import com.yahoo.vespa.hosted.provision.applications.ScalingEvent; import java.time.Clock; import java.time.Duration; +import java.time.Instant; import java.util.Optional; import java.util.OptionalDouble; import java.util.logging.Level; @@ -94,15 +95,31 @@ public class ClusterModel { /** Returns the relative load adjustment that should be made to this cluster given available measurements. */ public Load loadAdjustment() { if (nodeTimeseries().measurementsPerNode() == 0) return Load.one(); // No info, no change + + Load peak = nodeTimeseries().peakLoad().divide(idealLoad()); // Peak relative to ideal + // Should we scale up? - Load relativePeak = nodeTimeseries().peakLoad().divide(idealLoad()); - if (relativePeak.any(v -> v > 1.01)) // "meaningful growth": 1% over status quo. - return relativePeak.max(Load.one()); // Don't downscale any dimension if we upscale + if (peak.any(v -> v > 1.01)) // "meaningful growth": 1% over status quo. + return peak.map(v -> v < 1 ? 1 : v); // Don't downscale any dimension if we upscale // Should we scale down? - // TODO + if (canScaleDown()) + return averageLoad().divide(idealLoad()); + + return Load.one(); + } + + /** Are we in a position to make decisions to scale down at this point? */ + private boolean canScaleDown() { + if (hasScaledIn(scalingDuration().multipliedBy(3))) return false; + if (nodeTimeseries().measurementsPerNode() < 4) return false; + if (nodeTimeseries().nodesMeasured() != nodeCount()) return false; + return true; + } - return averageLoad().divide(idealLoad()); + private boolean hasScaledIn(Duration period) { + return cluster.lastScalingEvent().map(event -> event.at()).orElse(Instant.MIN) + .isAfter(clock.instant().minus(period)); } /** Returns the predicted duration of a rescaling of this cluster */ diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Load.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Load.java index 88c7e70cd35..6ab5ff731d3 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Load.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Load.java @@ -54,11 +54,6 @@ public class Load { return new Load(divide(cpu, resources.vcpu()), divide(memory, resources.memoryGb()), divide(disk, resources.diskGb())); } - /** Returns the load having the max value of this and the given load in each dimension. */ - public Load max(Load other) { - return join(other, (a, b) -> Math.max(a, b)); - } - /** Returns the load where the given function is applied to each dimension of this. */ public Load map(DoubleUnaryOperator f) { return new Load(f.applyAsDouble(cpu), diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java index 007b2629952..6f3182b6e44 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java @@ -247,6 +247,7 @@ public class AutoscalingTest { fixture.tester().clock().advance(Duration.ofDays(2)); fixture.loader().applyLoad(0.01, 0.01, 0.01, 120); + System.out.println("Asking for suggestion ..."); Autoscaler.Advice suggestion = fixture.suggest(); fixture.tester().assertResources("Choosing the remote disk flavor as it has less disk", 2, 1, 3.0, 100.0, 10.0, @@ -464,19 +465,22 @@ public class AutoscalingTest { var fixture = AutoscalingTester.fixture() .capacity(Capacity.from(min, max)) .build(); + fixture.tester.clock().advance(Duration.ofDays(1)); fixture.loader().applyCpuLoad(0.25, 120); - // (no read share stored) fixture.tester().assertResources("Advice to scale up since we set aside for bcp by default", 7, 1, 3, 100, 100, fixture.autoscale()); + fixture.loader().applyCpuLoad(0.25, 120); fixture.storeReadShare(0.25, 0.5); fixture.tester().assertResources("Half of global share is the same as the default assumption used above", 7, 1, 3, 100, 100, fixture.autoscale()); + fixture.tester.clock().advance(Duration.ofDays(1)); + fixture.loader().applyCpuLoad(0.25, 120); fixture.storeReadShare(0.5, 0.5); fixture.tester().assertResources("Advice to scale down since we don't need room for bcp", 6, 1, 3, 100, 100, diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java index cee80459176..f74ace5bd3b 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java @@ -152,8 +152,9 @@ public class AutoscalingMaintainerTest { // deploy tester.deploy(app1, cluster1, app1Capacity); - tester.addQueryRateMeasurements(app1, cluster1.id(), 12, t -> t == 0 ? 20.0 : 10.0); + int measurements = 5; + Duration samplePeriod = Duration.ofSeconds(150); for (int i = 0; i < 20; i++) { // Record completion to keep scaling window at minimum tester.addMeasurements(0.1f, 0.1f, 0.1f, i, 1, app1); @@ -162,25 +163,20 @@ public class AutoscalingMaintainerTest { tester.clock().advance(Duration.ofDays(1)); if (i % 2 == 0) { // high load - for (int j = 0; j < 200; j++ ) { - tester.addMeasurements(0.99f, 0.99f, 0.99f, i, 1, app1); - tester.clock().advance(Duration.ofMinutes(1)); - } + tester.addMeasurements(0.99f, 0.99f, 0.99f, i, measurements, app1); } else { // low load - for (int j = 0; j < 200; j++ ) { - tester.addMeasurements(0.2f, 0.2f, 0.2f, i, 1, app1); - tester.clock().advance(Duration.ofMinutes(1)); - } + tester.addMeasurements(0.2f, 0.2f, 0.2f, i, measurements, app1); } - tester.addQueryRateMeasurements(app1, cluster1.id(), 2, t -> (t == 0 ? 20.0 : 10.0 )); + tester.clock().advance(samplePeriod.negated().multipliedBy(measurements)); + tester.addQueryRateMeasurements(app1, cluster1.id(), measurements, t -> (t == 0 ? 20.0 : 10.0 )); tester.maintainer().maintain(); } assertEquals(Cluster.maxScalingEvents, tester.cluster(app1, cluster1).scalingEvents().size()); assertEquals("The latest rescaling is the last event stored", tester.clock().instant(), - tester.cluster(app1, cluster1).scalingEvents().get(Cluster.maxScalingEvents - 1).at()); + tester.cluster(app1, cluster1).scalingEvents().get(Cluster.maxScalingEvents - 1).completion().get()); } @Test diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTester.java index e1a1a2af5fb..d921af9543e 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTester.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTester.java @@ -93,11 +93,10 @@ public class AutoscalingMaintainerTester { ClusterSpec.Id cluster, int measurements, IntFunction<Double> queryRate) { - Instant time = clock().instant(); for (int i = 0; i < measurements; i++) { nodeRepository().metricsDb().addClusterMetrics(application, - Map.of(cluster, new ClusterMetricSnapshot(time, queryRate.apply(i), 0.0))); - time = time.plus(Duration.ofMinutes(5)); + Map.of(cluster, new ClusterMetricSnapshot(clock().instant(), queryRate.apply(i), 0.0))); + clock().advance(Duration.ofSeconds(150)); } } |